LLVM 23.0.0git
VectorCombine.cpp
Go to the documentation of this file.
1//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass optimizes scalar/vector interactions using target cost models. The
10// transforms implemented here may not fit in traditional loop-based or SLP
11// vectorization passes.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/STLExtras.h"
18#include "llvm/ADT/ScopeExit.h"
20#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/Loads.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/Function.h"
32#include "llvm/IR/IRBuilder.h"
39#include <numeric>
40#include <optional>
41#include <queue>
42#include <set>
43
44#define DEBUG_TYPE "vector-combine"
46
47using namespace llvm;
48using namespace llvm::PatternMatch;
49
50STATISTIC(NumVecLoad, "Number of vector loads formed");
51STATISTIC(NumVecCmp, "Number of vector compares formed");
52STATISTIC(NumVecBO, "Number of vector binops formed");
53STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
54STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
55STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
56STATISTIC(NumScalarCmp, "Number of scalar compares formed");
57STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
58
60 "disable-vector-combine", cl::init(false), cl::Hidden,
61 cl::desc("Disable all vector combine transforms"));
62
64 "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
65 cl::desc("Disable binop extract to shuffle transforms"));
66
68 "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
69 cl::desc("Max number of instructions to scan for vector combining."));
70
71static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
72
73namespace {
74class VectorCombine {
75public:
76 VectorCombine(Function &F, const TargetTransformInfo &TTI,
79 bool TryEarlyFoldsOnly)
80 : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
81 DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), SQ(*DL),
82 TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
83
84 bool run();
85
86private:
87 Function &F;
89 const TargetTransformInfo &TTI;
90 const DominatorTree &DT;
91 AAResults &AA;
92 AssumptionCache &AC;
93 const DataLayout *DL;
94 TTI::TargetCostKind CostKind;
95 const SimplifyQuery SQ;
96
97 /// If true, only perform beneficial early IR transforms. Do not introduce new
98 /// vector operations.
99 bool TryEarlyFoldsOnly;
100
101 InstructionWorklist Worklist;
102
103 /// Next instruction to iterate. It will be updated when it is erased by
104 /// RecursivelyDeleteTriviallyDeadInstructions.
105 Instruction *NextInst;
106
107 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
108 // parameter. That should be updated to specific sub-classes because the
109 // run loop was changed to dispatch on opcode.
110 bool vectorizeLoadInsert(Instruction &I);
111 bool widenSubvectorLoad(Instruction &I);
112 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
113 ExtractElementInst *Ext1,
114 unsigned PreferredExtractIndex) const;
115 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
116 const Instruction &I,
117 ExtractElementInst *&ConvertToShuffle,
118 unsigned PreferredExtractIndex);
119 Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
120 Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
121 bool foldExtractExtract(Instruction &I);
122 bool foldInsExtFNeg(Instruction &I);
123 bool foldInsExtBinop(Instruction &I);
124 bool foldInsExtVectorToShuffle(Instruction &I);
125 bool foldBitOpOfCastops(Instruction &I);
126 bool foldBitOpOfCastConstant(Instruction &I);
127 bool foldBitcastShuffle(Instruction &I);
128 bool scalarizeOpOrCmp(Instruction &I);
129 bool scalarizeVPIntrinsic(Instruction &I);
130 bool foldExtractedCmps(Instruction &I);
131 bool foldSelectsFromBitcast(Instruction &I);
132 bool foldBinopOfReductions(Instruction &I);
133 bool foldSingleElementStore(Instruction &I);
134 bool scalarizeLoad(Instruction &I);
135 bool scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy, Value *Ptr);
136 bool scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy, Value *Ptr);
137 bool scalarizeExtExtract(Instruction &I);
138 bool foldConcatOfBoolMasks(Instruction &I);
139 bool foldPermuteOfBinops(Instruction &I);
140 bool foldShuffleOfBinops(Instruction &I);
141 bool foldShuffleOfSelects(Instruction &I);
142 bool foldShuffleOfCastops(Instruction &I);
143 bool foldShuffleOfShuffles(Instruction &I);
144 bool foldPermuteOfIntrinsic(Instruction &I);
145 bool foldShufflesOfLengthChangingShuffles(Instruction &I);
146 bool foldShuffleOfIntrinsics(Instruction &I);
147 bool foldShuffleToIdentity(Instruction &I);
148 bool foldShuffleFromReductions(Instruction &I);
149 bool foldShuffleChainsToReduce(Instruction &I);
150 bool foldCastFromReductions(Instruction &I);
151 bool foldSignBitReductionCmp(Instruction &I);
152 bool foldICmpEqZeroVectorReduce(Instruction &I);
153 bool foldEquivalentReductionCmp(Instruction &I);
154 bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
155 bool foldInterleaveIntrinsics(Instruction &I);
156 bool shrinkType(Instruction &I);
157 bool shrinkLoadForShuffles(Instruction &I);
158 bool shrinkPhiOfShuffles(Instruction &I);
159
  /// Replace all uses of \p Old with \p New and queue the affected values for
  /// reprocessing. If \p Erase is set and \p Old becomes trivially dead, it is
  /// deleted; otherwise \p Old is pushed back onto the worklist for revisit.
  void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
    LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
    LLVM_DEBUG(dbgs() << " With: " << New << '\n');
    Old.replaceAllUsesWith(&New);
    if (auto *NewI = dyn_cast<Instruction>(&New)) {
      // Transfer the old name for readable IR, then queue the replacement and
      // its users so follow-up folds can fire.
      New.takeName(&Old);
      Worklist.pushUsersToWorkList(*NewI);
      Worklist.pushValue(NewI);
    }
    if (Erase && isInstructionTriviallyDead(&Old)) {
      eraseInstruction(Old);
    } else {
      // Old still has uses (or caller asked us not to erase) - revisit later.
      Worklist.push(&Old);
    }
  }
175
176 void eraseInstruction(Instruction &I) {
177 LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
178 SmallVector<Value *> Ops(I.operands());
179 Worklist.remove(&I);
180 I.eraseFromParent();
181
182 // Push remaining users of the operands and then the operand itself - allows
183 // further folds that were hindered by OneUse limits.
184 SmallPtrSet<Value *, 4> Visited;
185 for (Value *Op : Ops) {
186 if (!Visited.contains(Op)) {
187 if (auto *OpI = dyn_cast<Instruction>(Op)) {
189 OpI, nullptr, nullptr, [&](Value *V) {
190 if (auto *I = dyn_cast<Instruction>(V)) {
191 LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
192 Worklist.remove(I);
193 if (I == NextInst)
194 NextInst = NextInst->getNextNode();
195 Visited.insert(I);
196 }
197 }))
198 continue;
199 Worklist.pushUsersToWorkList(*OpI);
200 Worklist.pushValue(OpI);
201 }
202 }
203 }
204 }
205};
206} // namespace
207
208/// Return the source operand of a potentially bitcasted value. If there is no
209/// bitcast, return the input value itself.
211 while (auto *BitCast = dyn_cast<BitCastInst>(V))
212 V = BitCast->getOperand(0);
213 return V;
214}
215
216static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
217 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
218 // The widened load may load data from dirty regions or create data races
219 // non-existent in the source.
220 if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
221 Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
223 return false;
224
225 // We are potentially transforming byte-sized (8-bit) memory accesses, so make
226 // sure we have all of our type-based constraints in place for this target.
227 Type *ScalarTy = Load->getType()->getScalarType();
228 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
229 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
230 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
231 ScalarSize % 8 != 0)
232 return false;
233
234 return true;
235}
236
237bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
238 // Match insert into fixed vector of scalar value.
239 // TODO: Handle non-zero insert index.
240 Value *Scalar;
241 if (!match(&I,
243 return false;
244
245 // Optionally match an extract from another vector.
246 Value *X;
247 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
248 if (!HasExtract)
249 X = Scalar;
250
251 auto *Load = dyn_cast<LoadInst>(X);
252 if (!canWidenLoad(Load, TTI))
253 return false;
254
255 Type *ScalarTy = Scalar->getType();
256 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
257 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
258
259 // Check safety of replacing the scalar load with a larger vector load.
260 // We use minimal alignment (maximum flexibility) because we only care about
261 // the dereferenceable region. When calculating cost and creating a new op,
262 // we may use a larger value based on alignment attributes.
263 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
264 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
265
266 unsigned MinVecNumElts = MinVectorSize / ScalarSize;
267 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
268 unsigned OffsetEltIndex = 0;
269 Align Alignment = Load->getAlign();
270 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
271 &DT)) {
272 // It is not safe to load directly from the pointer, but we can still peek
273 // through gep offsets and check if it safe to load from a base address with
274 // updated alignment. If it is, we can shuffle the element(s) into place
275 // after loading.
276 unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
277 APInt Offset(OffsetBitWidth, 0);
279
280 // We want to shuffle the result down from a high element of a vector, so
281 // the offset must be positive.
282 if (Offset.isNegative())
283 return false;
284
285 // The offset must be a multiple of the scalar element to shuffle cleanly
286 // in the element's size.
287 uint64_t ScalarSizeInBytes = ScalarSize / 8;
288 if (Offset.urem(ScalarSizeInBytes) != 0)
289 return false;
290
291 // If we load MinVecNumElts, will our target element still be loaded?
292 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
293 if (OffsetEltIndex >= MinVecNumElts)
294 return false;
295
296 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
297 &DT))
298 return false;
299
300 // Update alignment with offset value. Note that the offset could be negated
301 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
302 // negation does not change the result of the alignment calculation.
303 Alignment = commonAlignment(Alignment, Offset.getZExtValue());
304 }
305
306 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
307 // Use the greater of the alignment on the load or its source pointer.
308 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
309 Type *LoadTy = Load->getType();
310 unsigned AS = Load->getPointerAddressSpace();
311 InstructionCost OldCost =
312 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
313 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
314 OldCost +=
315 TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
316 /* Insert */ true, HasExtract, CostKind);
317
318 // New pattern: load VecPtr
319 InstructionCost NewCost =
320 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
321 // Optionally, we are shuffling the loaded vector element(s) into place.
322 // For the mask set everything but element 0 to undef to prevent poison from
323 // propagating from the extra loaded memory. This will also optionally
324 // shrink/grow the vector from the loaded size to the output size.
325 // We assume this operation has no cost in codegen if there was no offset.
326 // Note that we could use freeze to avoid poison problems, but then we might
327 // still need a shuffle to change the vector size.
328 auto *Ty = cast<FixedVectorType>(I.getType());
329 unsigned OutputNumElts = Ty->getNumElements();
330 SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
331 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
332 Mask[0] = OffsetEltIndex;
333 if (OffsetEltIndex)
334 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
335 CostKind);
336
337 // We can aggressively convert to the vector form because the backend can
338 // invert this transform if it does not result in a performance win.
339 if (OldCost < NewCost || !NewCost.isValid())
340 return false;
341
342 // It is safe and potentially profitable to load a vector directly:
343 // inselt undef, load Scalar, 0 --> load VecPtr
344 IRBuilder<> Builder(Load);
345 Value *CastedPtr =
346 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
347 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
348 VecLd = Builder.CreateShuffleVector(VecLd, Mask);
349
350 replaceValue(I, *VecLd);
351 ++NumVecLoad;
352 return true;
353}
354
355/// If we are loading a vector and then inserting it into a larger vector with
356/// undefined elements, try to load the larger vector and eliminate the insert.
357/// This removes a shuffle in IR and may allow combining of other loaded values.
/// If we are loading a vector and then inserting it into a larger vector with
/// undefined elements, try to load the larger vector and eliminate the insert.
/// This removes a shuffle in IR and may allow combining of other loaded values.
bool VectorCombine::widenSubvectorLoad(Instruction &I) {
  // Match subvector insert of fixed vector.
  auto *Shuf = cast<ShuffleVectorInst>(&I);
  if (!Shuf->isIdentityWithPadding())
    return false;

  // Allow a non-canonical shuffle mask that is choosing elements from op1.
  // NOTE: any_of deliberately yields the operand index here - false (0) when
  // every mask element selects from op0, true (1) when any selects from op1.
  unsigned NumOpElts =
      cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
  unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
    return M >= (int)(NumOpElts);
  });

  auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
  if (!canWidenLoad(Load, TTI))
    return false;

  // We use minimal alignment (maximum flexibility) because we only care about
  // the dereferenceable region. When calculating cost and creating a new op,
  // we may use a larger value based on alignment attributes.
  auto *Ty = cast<FixedVectorType>(I.getType());
  Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
  assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
  Align Alignment = Load->getAlign();
  if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, &AC, &DT))
    return false;

  // Use the greater of the alignment on the load or its source pointer.
  Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
  Type *LoadTy = Load->getType();
  unsigned AS = Load->getPointerAddressSpace();

  // Original pattern: insert_subvector (load PtrOp)
  // This conservatively assumes that the cost of a subvector insert into an
  // undef value is 0. We could add that cost if the cost model accurately
  // reflects the real cost of that operation.
  InstructionCost OldCost =
      TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);

  // New pattern: load PtrOp
  InstructionCost NewCost =
      TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);

  // We can aggressively convert to the vector form because the backend can
  // invert this transform if it does not result in a performance win.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  IRBuilder<> Builder(Load);
  Value *CastedPtr =
      Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
  Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
  replaceValue(I, *VecLd);
  ++NumVecLoad;
  return true;
}
413
414/// Determine which, if any, of the inputs should be replaced by a shuffle
415/// followed by extract from a different index.
416ExtractElementInst *VectorCombine::getShuffleExtract(
417 ExtractElementInst *Ext0, ExtractElementInst *Ext1,
418 unsigned PreferredExtractIndex = InvalidIndex) const {
419 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
420 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
421 assert(Index0C && Index1C && "Expected constant extract indexes");
422
423 unsigned Index0 = Index0C->getZExtValue();
424 unsigned Index1 = Index1C->getZExtValue();
425
426 // If the extract indexes are identical, no shuffle is needed.
427 if (Index0 == Index1)
428 return nullptr;
429
430 Type *VecTy = Ext0->getVectorOperand()->getType();
431 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
432 InstructionCost Cost0 =
433 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
434 InstructionCost Cost1 =
435 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
436
437 // If both costs are invalid no shuffle is needed
438 if (!Cost0.isValid() && !Cost1.isValid())
439 return nullptr;
440
441 // We are extracting from 2 different indexes, so one operand must be shuffled
442 // before performing a vector operation and/or extract. The more expensive
443 // extract will be replaced by a shuffle.
444 if (Cost0 > Cost1)
445 return Ext0;
446 if (Cost1 > Cost0)
447 return Ext1;
448
449 // If the costs are equal and there is a preferred extract index, shuffle the
450 // opposite operand.
451 if (PreferredExtractIndex == Index0)
452 return Ext1;
453 if (PreferredExtractIndex == Index1)
454 return Ext0;
455
456 // Otherwise, replace the extract with the higher index.
457 return Index0 > Index1 ? Ext0 : Ext1;
458}
459
460/// Compare the relative costs of 2 extracts followed by scalar operation vs.
461/// vector operation(s) followed by extract. Return true if the existing
462/// instructions are cheaper than a vector alternative. Otherwise, return false
463/// and if one of the extracts should be transformed to a shufflevector, set
464/// \p ConvertToShuffle to that extract instruction.
465bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
466 ExtractElementInst *Ext1,
467 const Instruction &I,
468 ExtractElementInst *&ConvertToShuffle,
469 unsigned PreferredExtractIndex) {
470 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
471 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
472 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
473
474 unsigned Opcode = I.getOpcode();
475 Value *Ext0Src = Ext0->getVectorOperand();
476 Value *Ext1Src = Ext1->getVectorOperand();
477 Type *ScalarTy = Ext0->getType();
478 auto *VecTy = cast<VectorType>(Ext0Src->getType());
479 InstructionCost ScalarOpCost, VectorOpCost;
480
481 // Get cost estimates for scalar and vector versions of the operation.
482 bool IsBinOp = Instruction::isBinaryOp(Opcode);
483 if (IsBinOp) {
484 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
485 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
486 } else {
487 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
488 "Expected a compare");
489 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
490 ScalarOpCost = TTI.getCmpSelInstrCost(
491 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
492 VectorOpCost = TTI.getCmpSelInstrCost(
493 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
494 }
495
496 // Get cost estimates for the extract elements. These costs will factor into
497 // both sequences.
498 unsigned Ext0Index = Ext0IndexC->getZExtValue();
499 unsigned Ext1Index = Ext1IndexC->getZExtValue();
500
501 InstructionCost Extract0Cost =
502 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
503 InstructionCost Extract1Cost =
504 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
505
506 // A more expensive extract will always be replaced by a splat shuffle.
507 // For example, if Ext0 is more expensive:
508 // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
509 // extelt (opcode (splat V0, Ext0), V1), Ext1
510 // TODO: Evaluate whether that always results in lowest cost. Alternatively,
511 // check the cost of creating a broadcast shuffle and shuffling both
512 // operands to element 0.
513 unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
514 unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
515 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
516
517 // Extra uses of the extracts mean that we include those costs in the
518 // vector total because those instructions will not be eliminated.
519 InstructionCost OldCost, NewCost;
520 if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) {
521 // Handle a special case. If the 2 extracts are identical, adjust the
522 // formulas to account for that. The extra use charge allows for either the
523 // CSE'd pattern or an unoptimized form with identical values:
524 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
525 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
526 : !Ext0->hasOneUse() || !Ext1->hasOneUse();
527 OldCost = CheapExtractCost + ScalarOpCost;
528 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
529 } else {
530 // Handle the general case. Each extract is actually a different value:
531 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
532 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
533 NewCost = VectorOpCost + CheapExtractCost +
534 !Ext0->hasOneUse() * Extract0Cost +
535 !Ext1->hasOneUse() * Extract1Cost;
536 }
537
538 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
539 if (ConvertToShuffle) {
540 if (IsBinOp && DisableBinopExtractShuffle)
541 return true;
542
543 // If we are extracting from 2 different indexes, then one operand must be
544 // shuffled before performing the vector operation. The shuffle mask is
545 // poison except for 1 lane that is being translated to the remaining
546 // extraction lane. Therefore, it is a splat shuffle. Ex:
547 // ShufMask = { poison, poison, 0, poison }
548 // TODO: The cost model has an option for a "broadcast" shuffle
549 // (splat-from-element-0), but no option for a more general splat.
550 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
551 SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
553 ShuffleMask[BestInsIndex] = BestExtIndex;
555 VecTy, VecTy, ShuffleMask, CostKind, 0,
556 nullptr, {ConvertToShuffle});
557 } else {
559 VecTy, VecTy, {}, CostKind, 0, nullptr,
560 {ConvertToShuffle});
561 }
562 }
563
564 // Aggressively form a vector op if the cost is equal because the transform
565 // may enable further optimization.
566 // Codegen can reverse this transform (scalarize) if it was not profitable.
567 return OldCost < NewCost;
568}
569
570/// Create a shuffle that translates (shifts) 1 element from the input vector
571/// to a new element location.
572static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
573 unsigned NewIndex, IRBuilderBase &Builder) {
574 // The shuffle mask is poison except for 1 lane that is being translated
575 // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
576 // ShufMask = { 2, poison, poison, poison }
577 auto *VecTy = cast<FixedVectorType>(Vec->getType());
578 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
579 ShufMask[NewIndex] = OldIndex;
580 return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
581}
582
583/// Given an extract element instruction with constant index operand, shuffle
584/// the source vector (shift the scalar element) to a NewIndex for extraction.
585/// Return null if the input can be constant folded, so that we are not creating
586/// unnecessary instructions.
587static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
588 IRBuilderBase &Builder) {
589 // Shufflevectors can only be created for fixed-width vectors.
590 Value *X = ExtElt->getVectorOperand();
591 if (!isa<FixedVectorType>(X->getType()))
592 return nullptr;
593
594 // If the extract can be constant-folded, this code is unsimplified. Defer
595 // to other passes to handle that.
596 Value *C = ExtElt->getIndexOperand();
597 assert(isa<ConstantInt>(C) && "Expected a constant index operand");
598 if (isa<Constant>(X))
599 return nullptr;
600
601 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
602 NewIndex, Builder);
603 return Shuf;
604}
605
606/// Try to reduce extract element costs by converting scalar compares to vector
607/// compares followed by extract.
608/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
609Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
610 Instruction &I) {
611 assert(isa<CmpInst>(&I) && "Expected a compare");
612
613 // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
614 // --> extelt (cmp Pred V0, V1), ExtIndex
615 ++NumVecCmp;
616 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
617 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
618 return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
619}
620
621/// Try to reduce extract element costs by converting scalar binops to vector
622/// binops followed by extract.
623/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
624Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
625 Instruction &I) {
626 assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
627
628 // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
629 // --> extelt (bo V0, V1), ExtIndex
630 ++NumVecBO;
631 Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
632 V1, "foldExtExtBinop");
633
634 // All IR flags are safe to back-propagate because any potential poison
635 // created in unused vector elements is discarded by the extract.
636 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
637 VecBOInst->copyIRFlags(&I);
638
639 return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
640}
641
642/// Match an instruction with extracted vector operands.
643bool VectorCombine::foldExtractExtract(Instruction &I) {
644 // It is not safe to transform things like div, urem, etc. because we may
645 // create undefined behavior when executing those on unknown vector elements.
647 return false;
648
649 Instruction *I0, *I1;
650 CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
651 if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
653 return false;
654
655 Value *V0, *V1;
656 uint64_t C0, C1;
657 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
658 !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
659 V0->getType() != V1->getType())
660 return false;
661
662 // If the scalar value 'I' is going to be re-inserted into a vector, then try
663 // to create an extract to that same element. The extract/insert can be
664 // reduced to a "select shuffle".
665 // TODO: If we add a larger pattern match that starts from an insert, this
666 // probably becomes unnecessary.
667 auto *Ext0 = cast<ExtractElementInst>(I0);
668 auto *Ext1 = cast<ExtractElementInst>(I1);
669 uint64_t InsertIndex = InvalidIndex;
670 if (I.hasOneUse())
671 match(I.user_back(),
672 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
673
674 ExtractElementInst *ExtractToChange;
675 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
676 return false;
677
678 Value *ExtOp0 = Ext0->getVectorOperand();
679 Value *ExtOp1 = Ext1->getVectorOperand();
680
681 if (ExtractToChange) {
682 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
683 Value *NewExtOp =
684 translateExtract(ExtractToChange, CheapExtractIdx, Builder);
685 if (!NewExtOp)
686 return false;
687 if (ExtractToChange == Ext0)
688 ExtOp0 = NewExtOp;
689 else
690 ExtOp1 = NewExtOp;
691 }
692
693 Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
694 : Ext0->getIndexOperand();
695 Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
696 ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
697 : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
698 Worklist.push(Ext0);
699 Worklist.push(Ext1);
700 replaceValue(I, *NewExt);
701 return true;
702}
703
704/// Try to replace an extract + scalar fneg + insert with a vector fneg +
705/// shuffle.
706bool VectorCombine::foldInsExtFNeg(Instruction &I) {
707 // Match an insert (op (extract)) pattern.
708 Value *DstVec;
709 uint64_t ExtIdx, InsIdx;
710 Instruction *FNeg;
711 if (!match(&I, m_InsertElt(m_Value(DstVec), m_OneUse(m_Instruction(FNeg)),
712 m_ConstantInt(InsIdx))))
713 return false;
714
715 // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
716 Value *SrcVec;
717 Instruction *Extract;
718 if (!match(FNeg, m_FNeg(m_CombineAnd(
719 m_Instruction(Extract),
720 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx))))))
721 return false;
722
723 auto *DstVecTy = cast<FixedVectorType>(DstVec->getType());
724 auto *DstVecScalarTy = DstVecTy->getScalarType();
725 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
726 if (!SrcVecTy || DstVecScalarTy != SrcVecTy->getScalarType())
727 return false;
728
729 // Ignore if insert/extract index is out of bounds or destination vector has
730 // one element
731 unsigned NumDstElts = DstVecTy->getNumElements();
732 unsigned NumSrcElts = SrcVecTy->getNumElements();
733 if (ExtIdx > NumSrcElts || InsIdx >= NumDstElts || NumDstElts == 1)
734 return false;
735
736 // We are inserting the negated element into the same lane that we extracted
737 // from. This is equivalent to a select-shuffle that chooses all but the
738 // negated element from the destination vector.
739 SmallVector<int> Mask(NumDstElts);
740 std::iota(Mask.begin(), Mask.end(), 0);
741 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
742 InstructionCost OldCost =
743 TTI.getArithmeticInstrCost(Instruction::FNeg, DstVecScalarTy, CostKind) +
744 TTI.getVectorInstrCost(I, DstVecTy, CostKind, InsIdx);
745
746 // If the extract has one use, it will be eliminated, so count it in the
747 // original cost. If it has more than one use, ignore the cost because it will
748 // be the same before/after.
749 if (Extract->hasOneUse())
750 OldCost += TTI.getVectorInstrCost(*Extract, SrcVecTy, CostKind, ExtIdx);
751
752 InstructionCost NewCost =
753 TTI.getArithmeticInstrCost(Instruction::FNeg, SrcVecTy, CostKind) +
755 DstVecTy, Mask, CostKind);
756
757 bool NeedLenChg = SrcVecTy->getNumElements() != NumDstElts;
758 // If the lengths of the two vectors are not equal,
759 // we need to add a length-change vector. Add this cost.
760 SmallVector<int> SrcMask;
761 if (NeedLenChg) {
762 SrcMask.assign(NumDstElts, PoisonMaskElem);
763 SrcMask[ExtIdx % NumDstElts] = ExtIdx;
765 DstVecTy, SrcVecTy, SrcMask, CostKind);
766 }
767
768 LLVM_DEBUG(dbgs() << "Found an insertion of (extract)fneg : " << I
769 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
770 << "\n");
771 if (NewCost > OldCost)
772 return false;
773
774 Value *NewShuf, *LenChgShuf = nullptr;
775 // insertelt DstVec, (fneg (extractelt SrcVec, Index)), Index
776 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
777 if (NeedLenChg) {
778 // shuffle DstVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
779 LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
780 NewShuf = Builder.CreateShuffleVector(DstVec, LenChgShuf, Mask);
781 Worklist.pushValue(LenChgShuf);
782 } else {
783 // shuffle DstVec, (fneg SrcVec), Mask
784 NewShuf = Builder.CreateShuffleVector(DstVec, VecFNeg, Mask);
785 }
786
787 Worklist.pushValue(VecFNeg);
788 replaceValue(I, *NewShuf);
789 return true;
790}
791
792/// Try to fold insert(binop(x,y),binop(a,b),idx)
793/// --> binop(insert(x,a,idx),insert(y,b,idx))
794bool VectorCombine::foldInsExtBinop(Instruction &I) {
795 BinaryOperator *VecBinOp, *SclBinOp;
796 uint64_t Index;
 // Match an insertelement whose base is a one-use vector binop and whose
 // inserted scalar is a one-use scalar binop, at a constant index.
797 if (!match(&I,
798 m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
799 m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
800 return false;
801
802 // TODO: Add support for addlike etc.
 // Both binops must share the same opcode so the pair can be merged into a
 // single vector binop after the inserts.
803 Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
804 if (BinOpcode != SclBinOp->getOpcode())
805 return false;
806
 // Scalable vectors are not handled; a fixed element count is required.
807 auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
808 if (!ResultTy)
809 return false;
810
811 // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
812 // shuffle?
813
 // NOTE(review): the original line(s) declaring OldCost (and a second
 // getInstructionCost term) were lost in extraction here (orig. lines
 // 814/816) -- restore from upstream before compiling.
815 TTI.getInstructionCost(VecBinOp, CostKind) +
817 InstructionCost NewCost =
818 TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
819 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
820 Index, VecBinOp->getOperand(0),
821 SclBinOp->getOperand(0)) +
822 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
823 Index, VecBinOp->getOperand(1),
824 SclBinOp->getOperand(1));
825
826 LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
827 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
828 << "\n");
 // Equal cost is accepted: the rewrite may enable further combines.
829 if (NewCost > OldCost)
830 return false;
831
 // Build insert(x,a,idx) and insert(y,b,idx), then one vector binop.
832 Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
833 SclBinOp->getOperand(0), Index);
834 Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
835 SclBinOp->getOperand(1), Index);
836 Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);
837
838 // Intersect flags from the old binops.
839 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
840 NewInst->copyIRFlags(VecBinOp);
841 NewInst->andIRFlags(SclBinOp);
842 }
843
 // Queue the new inserts for revisiting, then RAUW the original insert.
844 Worklist.pushValue(NewIns0);
845 Worklist.pushValue(NewIns1);
846 replaceValue(I, *NewBO);
847 return true;
848}
849
850/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
851/// Supports: bitcast, trunc, sext, zext
852bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
853 // Check if this is a bitwise logic operation
854 auto *BinOp = dyn_cast<BinaryOperator>(&I);
855 if (!BinOp || !BinOp->isBitwiseLogicOp())
856 return false;
857
858 // Get the cast instructions
859 auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
860 auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
861 if (!LHSCast || !RHSCast) {
862 LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
863 return false;
864 }
865
866 // Both casts must be the same type
867 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
868 if (CastOpcode != RHSCast->getOpcode())
869 return false;
870
871 // Only handle supported cast operations
872 switch (CastOpcode) {
873 case Instruction::BitCast:
874 case Instruction::Trunc:
875 case Instruction::SExt:
876 case Instruction::ZExt:
877 break;
878 default:
879 return false;
880 }
881
882 Value *LHSSrc = LHSCast->getOperand(0);
883 Value *RHSSrc = RHSCast->getOperand(0);
884
885 // Source types must match
886 if (LHSSrc->getType() != RHSSrc->getType())
887 return false;
888
889 auto *SrcTy = LHSSrc->getType();
890 auto *DstTy = I.getType();
891 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
892 // Other casts only handle vector types with integer elements.
893 if (CastOpcode != Instruction::BitCast &&
894 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
895 return false;
896
897 // Only integer scalar/vector values are legal for bitwise logic operations.
898 if (!SrcTy->getScalarType()->isIntegerTy() ||
899 !DstTy->getScalarType()->isIntegerTy())
900 return false;
901
902 // Cost Check :
903 // OldCost = bitlogic + 2*casts
904 // NewCost = bitlogic + cast
905
906 // Calculate specific costs for each cast with instruction context
 // NOTE(review): the declaration heads for LHSCastCost / RHSCastCost
 // (orig. lines 907 and 909, 'InstructionCost ... = TTI.getCastInstrCost(')
 // were lost in extraction -- only their argument lines remain below.
908 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
910 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);
911
912 InstructionCost OldCost =
913 TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
914 LHSCastCost + RHSCastCost;
915
916 // For new cost, we can't provide an instruction (it doesn't exist yet)
917 InstructionCost GenericCastCost = TTI.getCastInstrCost(
918 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
919
920 InstructionCost NewCost =
921 TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
922 GenericCastCost;
923
924 // Account for multi-use casts using specific costs
 // A cast with other users survives the transform, so its cost stays.
925 if (!LHSCast->hasOneUse())
926 NewCost += LHSCastCost;
927 if (!RHSCast->hasOneUse())
928 NewCost += RHSCastCost;
929
930 LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
931 << " NewCost=" << NewCost << "\n");
932
933 if (NewCost > OldCost)
934 return false;
935
936 // Create the operation on the source type
937 Value *NewOp = Builder.CreateBinOp(BinOp->getOpcode(), LHSSrc, RHSSrc,
938 BinOp->getName() + ".inner");
939 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
940 NewBinOp->copyIRFlags(BinOp);
941
942 Worklist.pushValue(NewOp);
943
944 // Create the cast operation directly to ensure we get a new instruction
945 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
946
947 // Preserve cast instruction flags
 // Intersect flags from both original casts so no flag is over-claimed.
948 NewCast->copyIRFlags(LHSCast);
949 NewCast->andIRFlags(RHSCast);
950
951 // Insert the new instruction
952 Value *Result = Builder.Insert(NewCast);
953
954 replaceValue(I, *Result);
955 return true;
956}
957
958/// Match:
959// bitop(castop(x), C) ->
960// bitop(castop(x), castop(InvC)) ->
961// castop(bitop(x, InvC))
962// Supports: bitcast
963bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
 // NOTE(review): the declaration of LHS (orig. line 964) was lost in
 // extraction; only the Constant *C declaration survives below.
965 Constant *C;
966
967 // Check if this is a bitwise logic operation
 // NOTE(review): the match() condition itself (orig. line 968, presumably
 // matching bitop(LHS, m_Constant(C))) was lost in extraction -- only the
 // early-return below remains.
969 return false;
970
971 // Get the cast instructions
972 auto *LHSCast = dyn_cast<CastInst>(LHS);
973 if (!LHSCast)
974 return false;
975
976 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
977
978 // Only handle supported cast operations
979 switch (CastOpcode) {
980 case Instruction::BitCast:
981 case Instruction::ZExt:
982 case Instruction::SExt:
983 case Instruction::Trunc:
984 break;
985 default:
986 return false;
987 }
988
989 Value *LHSSrc = LHSCast->getOperand(0);
990
991 auto *SrcTy = LHSSrc->getType();
992 auto *DstTy = I.getType();
993 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
994 // Other casts only handle vector types with integer elements.
995 if (CastOpcode != Instruction::BitCast &&
996 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
997 return false;
998
999 // Only integer scalar/vector values are legal for bitwise logic operations.
1000 if (!SrcTy->getScalarType()->isIntegerTy() ||
1001 !DstTy->getScalarType()->isIntegerTy())
1002 return false;
1003
1004 // Find the constant InvC, such that castop(InvC) equals to C.
1005 PreservedCastFlags RHSFlags;
1006 Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
1007 if (!InvC)
1008 return false;
1009
1010 // Cost Check :
1011 // OldCost = bitlogic + cast
1012 // NewCost = bitlogic + cast
1013
1014 // Calculate specific costs for each cast with instruction context
1015 InstructionCost LHSCastCost = TTI.getCastInstrCost(
1016 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
1017
1018 InstructionCost OldCost =
1019 TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;
1020
1021 // For new cost, we can't provide an instruction (it doesn't exist yet)
1022 InstructionCost GenericCastCost = TTI.getCastInstrCost(
1023 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
1024
1025 InstructionCost NewCost =
1026 TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
1027 GenericCastCost;
1028
1029 // Account for multi-use casts using specific costs
 // If the cast has other users it stays alive, so charge it to NewCost too.
1030 if (!LHSCast->hasOneUse())
1031 NewCost += LHSCastCost;
1032
1033 LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
1034 << " NewCost=" << NewCost << "\n");
1035
1036 if (NewCost > OldCost)
1037 return false;
1038
1039 // Create the operation on the source type
1040 Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
1041 LHSSrc, InvC, I.getName() + ".inner");
1042 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
1043 NewBinOp->copyIRFlags(&I);
1044
1045 Worklist.pushValue(NewOp);
1046
1047 // Create the cast operation directly to ensure we get a new instruction
1048 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
1049
1050 // Preserve cast instruction flags
 // First apply the flags proven valid for the inverse-cast of the constant,
 // then intersect with the original cast's flags.
1051 if (RHSFlags.NNeg)
1052 NewCast->setNonNeg();
1053 if (RHSFlags.NUW)
1054 NewCast->setHasNoUnsignedWrap();
1055 if (RHSFlags.NSW)
1056 NewCast->setHasNoSignedWrap();
1057
1058 NewCast->andIRFlags(LHSCast);
1059
1060 // Insert the new instruction
1061 Value *Result = Builder.Insert(NewCast);
1062
1063 replaceValue(I, *Result);
1064 return true;
1065}
1066
1067/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
1068/// destination type followed by shuffle. This can enable further transforms by
1069/// moving bitcasts or shuffles together.
1070bool VectorCombine::foldBitcastShuffle(Instruction &I) {
1071 Value *V0, *V1;
1072 ArrayRef<int> Mask;
 // Match bitcast(one-use shuffle(V0, V1, Mask)).
1073 if (!match(&I, m_BitCast(m_OneUse(
1074 m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
1075 return false;
1076
1077 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
1078 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
1079 // mask for scalable type is a splat or not.
1080 // 2) Disallow non-vector casts.
1081 // TODO: We could allow any shuffle.
1082 auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
1083 auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
1084 if (!DestTy || !SrcTy)
1085 return false;
1086
1087 unsigned DestEltSize = DestTy->getScalarSizeInBits();
1088 unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
 // The source vector's total width must be evenly divisible into the new
 // destination-sized elements.
1089 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
1090 return false;
1091
1092 bool IsUnary = isa<UndefValue>(V1);
1093
1094 // For binary shuffles, only fold bitcast(shuffle(X,Y))
1095 // if it won't increase the number of bitcasts.
1096 if (!IsUnary) {
 // NOTE(review): the declarations of BCTy0/BCTy1 (orig. lines 1097-1098,
 // apparently peeking through bitcasts of V0/V1) were lost in extraction.
1099 if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
1100 !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
1101 return false;
1102 }
1103
1104 SmallVector<int, 16> NewMask;
1105 if (DestEltSize <= SrcEltSize) {
1106 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
1107 // always be expanded to the equivalent form choosing narrower elements.
1108 if (SrcEltSize % DestEltSize != 0)
1109 return false;
1110 unsigned ScaleFactor = SrcEltSize / DestEltSize;
1111 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
1112 } else {
1113 // The bitcast is from narrow elements to wide elements. The shuffle mask
1114 // must choose consecutive elements to allow casting first.
1115 if (DestEltSize % SrcEltSize != 0)
1116 return false;
1117 unsigned ScaleFactor = DestEltSize / SrcEltSize;
1118 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
1119 return false;
1120 }
1121
1122 // Bitcast the shuffle src - keep its original width but using the destination
1123 // scalar type.
1124 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
1125 auto *NewShuffleTy =
1126 FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
1127 auto *OldShuffleTy =
1128 FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
1129 unsigned NumOps = IsUnary ? 1 : 2;
1130
1131 // The new shuffle must not cost more than the old shuffle.
 // NOTE(review): the line(s) selecting the shuffle kind SK (orig. lines
 // 1132-1134) were lost in extraction; SK is used in the costs below.
1135
1136 InstructionCost NewCost =
1137 TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
1138 (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
1139 TargetTransformInfo::CastContextHint::None,
1140 CostKind));
1141 InstructionCost OldCost =
1142 TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
1143 TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
1144 TargetTransformInfo::CastContextHint::None,
1145 CostKind);
1146
1147 LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
1148 << OldCost << " vs NewCost: " << NewCost << "\n");
1149
1150 if (NewCost > OldCost || !NewCost.isValid())
1151 return false;
1152
1153 // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
1154 ++NumShufOfBitcast;
1155 Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
1156 Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
1157 Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
1158 replaceValue(I, *Shuf);
1159 return true;
1160}
1161
1162/// VP Intrinsics whose vector operands are both splat values may be simplified
1163/// into the scalar version of the operation and the result splatted. This
1164/// can lead to scalarization down the line.
1165bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
1166 if (!isa<VPIntrinsic>(I))
1167 return false;
1168 VPIntrinsic &VPI = cast<VPIntrinsic>(I);
1169 Value *Op0 = VPI.getArgOperand(0);
1170 Value *Op1 = VPI.getArgOperand(1);
1171
 // Both vector operands must be splats for the op to scalarize.
1172 if (!isSplatValue(Op0) || !isSplatValue(Op1))
1173 return false;
1174
1175 // Check getSplatValue early in this function, to avoid doing unnecessary
1176 // work.
1177 Value *ScalarOp0 = getSplatValue(Op0);
1178 Value *ScalarOp1 = getSplatValue(Op1);
1179 if (!ScalarOp0 || !ScalarOp1)
1180 return false;
1181
1182 // For the binary VP intrinsics supported here, the result on disabled lanes
1183 // is a poison value. For now, only do this simplification if all lanes
1184 // are active.
1185 // TODO: Relax the condition that all lanes are active by using insertelement
1186 // on inactive lanes.
1187 auto IsAllTrueMask = [](Value *MaskVal) {
1188 if (Value *SplattedVal = getSplatValue(MaskVal))
1189 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
1190 return ConstValue->isAllOnesValue();
1191 return false;
1192 };
1193 if (!IsAllTrueMask(VPI.getArgOperand(2)))
1194 return false;
1195
1196 // Check to make sure we support scalarization of the intrinsic
1197 Intrinsic::ID IntrID = VPI.getIntrinsicID();
1198 if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
1199 return false;
1200
1201 // Calculate cost of splatting both operands into vectors and the vector
1202 // intrinsic
1203 VectorType *VecTy = cast<VectorType>(VPI.getType());
1204 SmallVector<int> Mask;
1205 if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
1206 Mask.resize(FVTy->getNumElements(), 0);
 // NOTE(review): a line of this splat-cost expression (orig. line 1209,
 // presumably the TTI.getShuffleCost(...) call head) was lost in extraction.
1207 InstructionCost SplatCost =
1208 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
1210 CostKind);
1211
1212 // Calculate the cost of the VP Intrinsic
 // NOTE(review): the declaration of Args (orig. line 1213) was lost in
 // extraction; it is populated with the argument types below.
1214 for (Value *V : VPI.args())
1215 Args.push_back(V->getType());
1216 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
1217 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1218 InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
1219
1220 // Determine scalar opcode
1221 std::optional<unsigned> FunctionalOpcode =
1222 VPI.getFunctionalOpcode();
1223 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
1224 if (!FunctionalOpcode) {
 // No plain IR opcode equivalent; fall back to a scalar intrinsic ID.
1225 ScalarIntrID = VPI.getFunctionalIntrinsicID();
1226 if (!ScalarIntrID)
1227 return false;
1228 }
1229
1230 // Calculate cost of scalarizing
1231 InstructionCost ScalarOpCost = 0;
1232 if (ScalarIntrID) {
1233 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
1234 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1235 } else {
1236 ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
1237 VecTy->getScalarType(), CostKind);
1238 }
1239
1240 // The existing splats may be kept around if other instructions use them.
1241 InstructionCost CostToKeepSplats =
1242 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
1243 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
1244
1245 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
1246 << "\n");
1247 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
1248 << ", Cost of scalarizing:" << NewCost << "\n");
1249
1250 // We want to scalarize unless the vector variant actually has lower cost.
1251 if (OldCost < NewCost || !NewCost.isValid())
1252 return false;
1253
1254 // Scalarize the intrinsic
1255 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
1256 Value *EVL = VPI.getArgOperand(3);
1257
1258 // If the VP op might introduce UB or poison, we can scalarize it provided
1259 // that we know the EVL > 0: If the EVL is zero, then the original VP op
1260 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
1261 // scalarizing it.
1262 bool SafeToSpeculate;
1263 if (ScalarIntrID)
1264 SafeToSpeculate = Intrinsic::getFnAttributes(I.getContext(), *ScalarIntrID)
1265 .hasAttribute(Attribute::AttrKind::Speculatable);
1266 else
 // NOTE(review): the call head assigning SafeToSpeculate (orig. line 1267)
 // was lost in extraction; only its trailing argument line remains.
1268 *FunctionalOpcode, &VPI, nullptr, &AC, &DT);
1269 if (!SafeToSpeculate &&
1270 !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI)))
1271 return false;
1272
 // Emit the scalar op (intrinsic call or plain binop) on the splatted values.
1273 Value *ScalarVal =
1274 ScalarIntrID
1275 ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
1276 {ScalarOp0, ScalarOp1})
1277 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
1278 ScalarOp0, ScalarOp1);
1279
 // Re-splat the scalar result to the original vector element count.
1280 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
1281 return true;
1282}
1283
1284/// Match a vector op/compare/intrinsic with at least one
1285/// inserted scalar operand and convert to scalar op/cmp/intrinsic followed
1286/// by insertelement.
1287bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
1288 auto *UO = dyn_cast<UnaryOperator>(&I);
1289 auto *BO = dyn_cast<BinaryOperator>(&I);
1290 auto *CI = dyn_cast<CmpInst>(&I);
1291 auto *II = dyn_cast<IntrinsicInst>(&I);
1292 if (!UO && !BO && !CI && !II)
1293 return false;
1294
1295 // TODO: Allow intrinsics with different argument types
1296 if (II) {
1297 if (!isTriviallyVectorizable(II->getIntrinsicID()))
1298 return false;
1299 for (auto [Idx, Arg] : enumerate(II->args()))
1300 if (Arg->getType() != II->getType() &&
1301 !isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, &TTI))
1302 return false;
1303 }
1304
1305 // Do not convert the vector condition of a vector select into a scalar
1306 // condition. That may cause problems for codegen because of differences in
1307 // boolean formats and register-file transfers.
1308 // TODO: Can we account for that in the cost model?
1309 if (CI)
1310 for (User *U : I.users())
1311 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
1312 return false;
1313
1314 // Match constant vectors or scalars being inserted into constant vectors:
1315 // vec_op [VecC0 | (inselt VecC0, V0, Index)], ...
1316 SmallVector<Value *> VecCs, ScalarOps;
1317 std::optional<uint64_t> Index;
1318
1319 auto Ops = II ? II->args() : I.operands();
1320 for (auto [OpNum, Op] : enumerate(Ops)) {
1321 Constant *VecC;
1322 Value *V;
1323 uint64_t InsIdx = 0;
1324 if (match(Op.get(), m_InsertElt(m_Constant(VecC), m_Value(V),
1325 m_ConstantInt(InsIdx)))) {
1326 // Bail if any inserts are out of bounds.
1327 VectorType *OpTy = cast<VectorType>(Op->getType());
1328 if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
1329 return false;
1330 // All inserts must have the same index.
1331 // TODO: Deal with mismatched index constants and variable indexes?
1332 if (!Index)
1333 Index = InsIdx;
1334 else if (InsIdx != *Index)
1335 return false;
1336 VecCs.push_back(VecC);
1337 ScalarOps.push_back(V);
1338 } else if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
1339 OpNum, &TTI)) {
 // Operand is already scalar by the intrinsic's contract; pass through.
1340 VecCs.push_back(Op.get());
1341 ScalarOps.push_back(Op.get());
1342 } else if (match(Op.get(), m_Constant(VecC))) {
 // Fully-constant vector operand; scalar element is extracted later.
1343 VecCs.push_back(VecC);
1344 ScalarOps.push_back(nullptr);
1345 } else {
1346 return false;
1347 }
1348 }
1349
1350 // Bail if all operands are constant.
1351 if (!Index.has_value())
1352 return false;
1353
1354 VectorType *VecTy = cast<VectorType>(I.getType());
1355 Type *ScalarTy = VecTy->getScalarType();
1356 assert(VecTy->isVectorTy() &&
1357 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
1358 ScalarTy->isPointerTy()) &&
1359 "Unexpected types for insert element into binop or cmp");
1360
 // Cost the operation in both scalar and vector form, by kind.
1361 unsigned Opcode = I.getOpcode();
1362 InstructionCost ScalarOpCost, VectorOpCost;
1363 if (CI) {
1364 CmpInst::Predicate Pred = CI->getPredicate();
1365 ScalarOpCost = TTI.getCmpSelInstrCost(
1366 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
1367 VectorOpCost = TTI.getCmpSelInstrCost(
1368 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1369 } else if (UO || BO) {
1370 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
1371 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
1372 } else {
1373 IntrinsicCostAttributes ScalarICA(
1374 II->getIntrinsicID(), ScalarTy,
1375 SmallVector<Type *>(II->arg_size(), ScalarTy));
1376 ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
1377 IntrinsicCostAttributes VectorICA(
1378 II->getIntrinsicID(), VecTy,
1379 SmallVector<Type *>(II->arg_size(), VecTy));
1380 VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
1381 }
1382
1383 // Fold the vector constants in the original vectors into a new base vector to
1384 // get more accurate cost modelling.
1385 Value *NewVecC = nullptr;
1386 if (CI)
1387 NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
1388 else if (UO)
1389 NewVecC =
1390 simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
1391 else if (BO)
1392 NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
1393 else if (II)
1394 NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
1395
 // If the constant operands do not fold to a single constant base vector,
 // the transform does not apply.
1396 if (!NewVecC)
1397 return false;
1398
1399 // Get cost estimate for the insert element. This cost will factor into
1400 // both sequences.
1401 InstructionCost OldCost = VectorOpCost;
1402 InstructionCost NewCost =
1403 ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
1404 CostKind, *Index, NewVecC);
1405
1406 for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
1407 if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
1408 II->getIntrinsicID(), Idx, &TTI)))
1409 continue;
 // NOTE(review): the declaration head of InsertCost (orig. line 1410,
 // 'InstructionCost InsertCost = TTI.getVectorInstrCost(') was lost in
 // extraction; only its argument line remains below.
1411 Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
1412 OldCost += InsertCost;
 // Multi-use inserts survive the transform, so they stay in NewCost too.
1413 NewCost += !Op->hasOneUse() * InsertCost;
1414 }
1415
1416 // We want to scalarize unless the vector variant actually has lower cost.
1417 if (OldCost < NewCost || !NewCost.isValid())
1418 return false;
1419
1420 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
1421 // inselt NewVecC, (scalar_op V0, V1), Index
1422 if (CI)
1423 ++NumScalarCmp;
1424 else if (UO || BO)
1425 ++NumScalarOps;
1426 else
1427 ++NumScalarIntrinsic;
1428
1429 // For constant cases, extract the scalar element, this should constant fold.
1430 for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
1431 if (!Scalar)
 // NOTE(review): the assignment head (orig. line 1432, presumably
 // 'ScalarOps[OpIdx] = Builder.CreateExtractElement(') was lost in
 // extraction; only its argument line remains below.
1433 cast<Constant>(VecC), Builder.getInt64(*Index));
1434
1435 Value *Scalar;
1436 if (CI)
1437 Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
1438 else if (UO || BO)
1439 Scalar = Builder.CreateNAryOp(Opcode, ScalarOps);
1440 else
1441 Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), ScalarOps);
1442
1443 Scalar->setName(I.getName() + ".scalar");
1444
1445 // All IR flags are safe to back-propagate. There is no potential for extra
1446 // poison to be created by the scalar instruction.
1447 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
1448 ScalarInst->copyIRFlags(&I);
1449
 // Insert the scalar result back into the folded constant base vector.
1450 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
1451 replaceValue(I, *Insert);
1452 return true;
1453}
1454
1455/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
1456/// a vector into vector operations followed by extract. Note: The SLP pass
1457/// may miss this pattern because of implementation problems.
1458bool VectorCombine::foldExtractedCmps(Instruction &I) {
1459 auto *BI = dyn_cast<BinaryOperator>(&I);
1460
1461 // We are looking for a scalar binop of booleans.
1462 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1463 if (!BI || !I.getType()->isIntegerTy(1))
1464 return false;
1465
1466 // The compare predicates should match, and each compare should have a
1467 // constant operand.
1468 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
1469 Instruction *I0, *I1;
1470 Constant *C0, *C1;
1471 CmpPredicate P0, P1;
1472 if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
1473 !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))))
1474 return false;
1475
1476 auto MatchingPred = CmpPredicate::getMatching(P0, P1);
1477 if (!MatchingPred)
1478 return false;
1479
1480 // The compare operands must be extracts of the same vector with constant
1481 // extract indexes.
1482 Value *X;
1483 uint64_t Index0, Index1;
1484 if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
1485 !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
1486 return false;
1487
1488 auto *Ext0 = cast<ExtractElementInst>(I0);
1489 auto *Ext1 = cast<ExtractElementInst>(I1);
 // Pick which of the two extracts should become a shuffle (the cheaper one
 // to keep is extracted directly; the other is shifted to its lane).
1490 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
1491 if (!ConvertToShuf)
1492 return false;
1493 assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
1494 "Unknown ExtractElementInst");
1495
1496 // The original scalar pattern is:
1497 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1498 CmpInst::Predicate Pred = *MatchingPred;
1499 unsigned CmpOpcode =
1500 CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
1501 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
1502 if (!VecTy)
1503 return false;
1504
1505 InstructionCost Ext0Cost =
1506 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
1507 InstructionCost Ext1Cost =
1508 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
 // NOTE(review): the declaration head of CmpCost (orig. line 1509,
 // 'InstructionCost CmpCost = TTI.getCmpSelInstrCost(') was lost in
 // extraction; only its argument lines remain below.
1510 CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
1511 CostKind);
1512
1513 InstructionCost OldCost =
1514 Ext0Cost + Ext1Cost + CmpCost * 2 +
1515 TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
1516
1517 // The proposed vector pattern is:
1518 // vcmp = cmp Pred X, VecC
1519 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
1520 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1521 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
 // NOTE(review): the lines declaring CmpTy and starting NewCost (orig.
 // lines 1522-1523) were lost in extraction; only the trailing arguments
 // of the NewCost initializer remain below.
1524 CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1525 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1526 ShufMask[CheapIndex] = ExpensiveIndex;
 // NOTE(review): the shuffle-cost call head (orig. line 1527) was lost in
 // extraction; only its argument line remains below.
1528 CmpTy, ShufMask, CostKind);
1529 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
1530 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
 // Extracts with other users must be kept, so charge them to the new cost.
1531 NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
1532 NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
1533
1534 // Aggressively form vector ops if the cost is equal because the transform
1535 // may enable further optimization.
1536 // Codegen can reverse this transform (scalarize) if it was not profitable.
1537 if (OldCost < NewCost || !NewCost.isValid())
1538 return false;
1539
1540 // Create a vector constant from the 2 scalar constants.
1541 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
1542 PoisonValue::get(VecTy->getElementType()));
1543 CmpC[Index0] = C0;
1544 CmpC[Index1] = C1;
1545 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
 // Shift the expensive lane onto the cheap lane so one extract suffices.
1546 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
1547 Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp;
1548 Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf;
1549 Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS);
1550 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
1551 replaceValue(I, *NewExt);
1552 ++NumVecCmpBO;
1553 return true;
1554}
1555
1556/// Try to fold scalar selects that select between extracted elements and zero
1557/// into extracting from a vector select. This is rooted at the bitcast.
1558///
1559/// This pattern arises when a vector is bitcast to a smaller element type,
1560/// elements are extracted, and then conditionally selected with zero:
1561///
1562/// %bc = bitcast <4 x i32> %src to <16 x i8>
1563/// %e0 = extractelement <16 x i8> %bc, i32 0
1564/// %s0 = select i1 %cond, i8 %e0, i8 0
1565/// %e1 = extractelement <16 x i8> %bc, i32 1
1566/// %s1 = select i1 %cond, i8 %e1, i8 0
1567/// ...
1568///
1569/// Transforms to:
1570/// %sel = select i1 %cond, <4 x i32> %src, <4 x i32> zeroinitializer
1571/// %bc = bitcast <4 x i32> %sel to <16 x i8>
1572/// %e0 = extractelement <16 x i8> %bc, i32 0
1573/// %e1 = extractelement <16 x i8> %bc, i32 1
1574/// ...
1575///
1576/// This is profitable because vector select on wider types produces fewer
1577/// select/cndmask instructions than scalar selects on each element.
1578bool VectorCombine::foldSelectsFromBitcast(Instruction &I) {
1579 auto *BC = dyn_cast<BitCastInst>(&I);
1580 if (!BC)
1581 return false;
1582
1583 FixedVectorType *SrcVecTy = dyn_cast<FixedVectorType>(BC->getSrcTy());
1584 FixedVectorType *DstVecTy = dyn_cast<FixedVectorType>(BC->getDestTy());
1585 if (!SrcVecTy || !DstVecTy)
1586 return false;
1587
1588 // Source must be 32-bit or 64-bit elements, destination must be smaller
1589 // integer elements. Zero in all these types is all-bits-zero.
1590 Type *SrcEltTy = SrcVecTy->getElementType();
1591 Type *DstEltTy = DstVecTy->getElementType();
1592 unsigned SrcEltBits = SrcEltTy->getPrimitiveSizeInBits();
1593 unsigned DstEltBits = DstEltTy->getPrimitiveSizeInBits();
1594
1595 if (SrcEltBits != 32 && SrcEltBits != 64)
1596 return false;
1597
1598 if (!DstEltTy->isIntegerTy() || DstEltBits >= SrcEltBits)
1599 return false;
1600
1601 // Check profitability using TTI before collecting users.
1602 Type *CondTy = CmpInst::makeCmpResultType(DstEltTy);
1603 Type *VecCondTy = CmpInst::makeCmpResultType(SrcVecTy);
1604
 // NOTE(review): the trailing argument lines of these two
 // getCmpSelInstrCost calls (orig. lines 1607 and 1610) were lost in
 // extraction -- both initializers below are missing their closing
 // arguments.
1605 InstructionCost ScalarSelCost =
1606 TTI.getCmpSelInstrCost(Instruction::Select, DstEltTy, CondTy,
1608 InstructionCost VecSelCost =
1609 TTI.getCmpSelInstrCost(Instruction::Select, SrcVecTy, VecCondTy,
1611
1612 // We need at least this many selects for vectorization to be profitable.
1613 // VecSelCost < ScalarSelCost * NumSelects => NumSelects > VecSelCost /
1614 // ScalarSelCost
1615 if (!ScalarSelCost.isValid() || ScalarSelCost == 0)
1616 return false;
1617
1618 unsigned MinSelects = (VecSelCost.getValue() / ScalarSelCost.getValue()) + 1;
1619
1620 // Quick check: if bitcast doesn't have enough users, bail early.
1621 if (!BC->hasNUsesOrMore(MinSelects))
1622 return false;
1623
1624 // Collect all select users that match the pattern, grouped by condition.
1625 // Pattern: select i1 %cond, (extractelement %bc, idx), 0
1626 DenseMap<Value *, SmallVector<SelectInst *, 8>> CondToSelects;
1627
1628 for (User *U : BC->users()) {
1629 auto *Ext = dyn_cast<ExtractElementInst>(U);
1630 if (!Ext)
1631 continue;
1632
1633 for (User *ExtUser : Ext->users()) {
1634 Value *Cond;
1635 // Match: select i1 %cond, %ext, 0
1636 if (match(ExtUser, m_Select(m_Value(Cond), m_Specific(Ext), m_Zero())) &&
1637 Cond->getType()->isIntegerTy(1))
1638 CondToSelects[Cond].push_back(cast<SelectInst>(ExtUser));
1639 }
1640 }
1641
1642 if (CondToSelects.empty())
1643 return false;
1644
1645 bool MadeChange = false;
1646 Value *SrcVec = BC->getOperand(0);
1647
1648 // Process each group of selects with the same condition.
1649 for (auto [Cond, Selects] : CondToSelects) {
1650 // Only profitable if vector select cost < total scalar select cost.
1651 if (Selects.size() < MinSelects) {
1652 LLVM_DEBUG(dbgs() << "VectorCombine: foldSelectsFromBitcast not "
1653 << "profitable (VecCost=" << VecSelCost
1654 << ", ScalarCost=" << ScalarSelCost
1655 << ", NumSelects=" << Selects.size() << ")\n");
1656 continue;
1657 }
1658
1659 // Create the vector select and bitcast once for this condition.
1660 auto InsertPt = std::next(BC->getIterator());
1661
 // Place the select after the condition if it is defined after the
 // bitcast, so the new select's operand dominates it.
1662 if (auto *CondInst = dyn_cast<Instruction>(Cond))
1663 if (DT.dominates(BC, CondInst))
1664 InsertPt = std::next(CondInst->getIterator());
1665
1666 Builder.SetInsertPoint(InsertPt);
1667 Value *VecSel =
1668 Builder.CreateSelect(Cond, SrcVec, Constant::getNullValue(SrcVecTy));
1669 Value *NewBC = Builder.CreateBitCast(VecSel, DstVecTy);
1670
1671 // Replace each scalar select with an extract from the new bitcast.
1672 for (SelectInst *Sel : Selects) {
1673 auto *Ext = cast<ExtractElementInst>(Sel->getTrueValue());
1674 Value *Idx = Ext->getIndexOperand();
1675
1676 Builder.SetInsertPoint(Sel);
1677 Value *NewExt = Builder.CreateExtractElement(NewBC, Idx);
1678 replaceValue(*Sel, *NewExt);
1679 MadeChange = true;
1680 }
1681
1682 LLVM_DEBUG(dbgs() << "VectorCombine: folded " << Selects.size()
1683 << " selects into vector select\n");
1684 }
1685
1686 return MadeChange;
1687}
1688
// Computes, for a vector reduction intrinsic II, the target cost of producing
// the reduction's vector input (CostBeforeReduction) and of the reduction
// itself (CostAfterReduction), recognizing fusible ext(...) and
// ext(mul(ext,ext)) operand patterns.
// NOTE(review): the opening line(s) of this signature (declaring II and
// CostKind) were lost in extraction — TODO restore from upstream
// VectorCombine.cpp; several call-argument continuation lines below are
// missing for the same reason.
1691 const TargetTransformInfo &TTI,
1692 InstructionCost &CostBeforeReduction,
1693 InstructionCost &CostAfterReduction) {
1694 Instruction *Op0, *Op1;
1695 auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
1696 auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
1697 unsigned ReductionOpc =
1698 getArithmeticReductionInstruction(II.getIntrinsicID());
// Case 1: reduce(ext(A)) — cost as an extended reduction.
1699 if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
1700 bool IsUnsigned = isa<ZExtInst>(RedOp);
1701 auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
1702
1703 CostBeforeReduction =
1704 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
1706 CostAfterReduction =
1707 TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
1708 ExtType, FastMathFlags(), CostKind);
1709 return;
1710 }
// Case 2: reduce.add(ext(mul(ext(A), ext(B)))) — cost as a
// multiply-accumulate reduction (e.g. dot-product style lowering).
// NOTE(review): the m_Mul-style sub-pattern line of this match() was lost
// in extraction.
1711 if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
1712 match(RedOp,
1714 match(Op0, m_ZExtOrSExt(m_Value())) &&
1715 Op0->getOpcode() == Op1->getOpcode() &&
1716 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
1717 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
1718 // Matched reduce.add(ext(mul(ext(A), ext(B)))
1719 bool IsUnsigned = isa<ZExtInst>(Op0);
1720 auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
1721 VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
1722
1723 InstructionCost ExtCost =
1724 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
1726 InstructionCost MulCost =
1727 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
1728 InstructionCost Ext2Cost =
1729 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
1731
// Two inner extends, one multiply, and the outer extend feed the reduce.
1732 CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
1733 CostAfterReduction = TTI.getMulAccReductionCost(
1734 IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
1735 return;
1736 }
// Fallback: plain arithmetic reduction; operand cost stays at its
// caller-initialized value.
1737 CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
1738 std::nullopt, CostKind);
1739}
1740
1741bool VectorCombine::foldBinopOfReductions(Instruction &I) {
1742 Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
1743 Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
1744 if (BinOpOpc == Instruction::Sub)
1745 ReductionIID = Intrinsic::vector_reduce_add;
1746 if (ReductionIID == Intrinsic::not_intrinsic)
1747 return false;
1748
1749 auto checkIntrinsicAndGetItsArgument = [](Value *V,
1750 Intrinsic::ID IID) -> Value * {
1751 auto *II = dyn_cast<IntrinsicInst>(V);
1752 if (!II)
1753 return nullptr;
1754 if (II->getIntrinsicID() == IID && II->hasOneUse())
1755 return II->getArgOperand(0);
1756 return nullptr;
1757 };
1758
1759 Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
1760 if (!V0)
1761 return false;
1762 Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
1763 if (!V1)
1764 return false;
1765
1766 auto *VTy = cast<VectorType>(V0->getType());
1767 if (V1->getType() != VTy)
1768 return false;
1769 const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
1770 const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
1771 unsigned ReductionOpc =
1772 getArithmeticReductionInstruction(II0.getIntrinsicID());
1773
1774 InstructionCost OldCost = 0;
1775 InstructionCost NewCost = 0;
1776 InstructionCost CostOfRedOperand0 = 0;
1777 InstructionCost CostOfRed0 = 0;
1778 InstructionCost CostOfRedOperand1 = 0;
1779 InstructionCost CostOfRed1 = 0;
1780 analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
1781 analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
1782 OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
1783 NewCost =
1784 CostOfRedOperand0 + CostOfRedOperand1 +
1785 TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
1786 TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
1787 if (NewCost >= OldCost || !NewCost.isValid())
1788 return false;
1789
1790 LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
1791 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1792 << "\n");
1793 Value *VectorBO;
1794 if (BinOpOpc == Instruction::Or)
1795 VectorBO = Builder.CreateOr(V0, V1, "",
1796 cast<PossiblyDisjointInst>(I).isDisjoint());
1797 else
1798 VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
1799
1800 Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
1801 replaceValue(I, *Rdx);
1802 return true;
1803}
1804
// Check if memory loc modified between two instrs in the same BB.
// Scans the iterator range and reports a (possible) modification of Loc;
// conservatively also returns true once more than MaxInstrsToScan
// instructions have been examined.
// NOTE(review): the signature head was lost in extraction — judging from the
// std::any_of call it takes an instruction iterator range (Begin, End) ahead
// of Loc/AA; TODO confirm against upstream VectorCombine.cpp.
1805// Check if memory loc modified between two instrs in the same BB
1808 const MemoryLocation &Loc, AAResults &AA) {
1809 unsigned NumScanned = 0;
1810 return std::any_of(Begin, End, [&](const Instruction &Instr) {
1811 return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1812 ++NumScanned > MaxInstrsToScan;
1813 });
1814}
1815
1816namespace {
1817/// Helper class to indicate whether a vector index can be safely scalarized and
1818/// if a freeze needs to be inserted.
1819class ScalarizationResult {
1820 enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1821
1822 StatusTy Status;
// Value that must be frozen before scalarizing; only set in the
// SafeWithFreeze state and cleared by freeze()/discard().
1823 Value *ToFreeze;
1824
1825 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1826 : Status(Status), ToFreeze(ToFreeze) {}
1827
1828public:
1829 ScalarizationResult(const ScalarizationResult &Other) = default;
// The destructor enforces the protocol: a pending freeze must be consumed
// via freeze() or explicitly dropped via discard() before destruction.
1830 ~ScalarizationResult() {
1831 assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1832 }
1833
1834 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1835 static ScalarizationResult safe() { return {StatusTy::Safe}; }
1836 static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1837 return {StatusTy::SafeWithFreeze, ToFreeze};
1838 }
1839
1840 /// Returns true if the index can be scalarized without requiring a freeze.
1841 bool isSafe() const { return Status == StatusTy::Safe; }
1842 /// Returns true if the index cannot be scalarized.
1843 bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1844 /// Returns true if the index can be scalarized, but requires inserting a
1845 /// freeze.
1846 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1847
1848 /// Reset the state to Unsafe and clear ToFreeze if set.
1849 void discard() {
1850 ToFreeze = nullptr;
1851 Status = StatusTy::Unsafe;
1852 }
1853
1854 /// Freeze ToFreeze and update the use in \p UserI to use the frozen value.
1855 void freeze(IRBuilderBase &Builder, Instruction &UserI) {
1856 assert(isSafeWithFreeze() &&
1857 "should only be used when freezing is required");
1858 assert(is_contained(ToFreeze->users(), &UserI) &&
1859 "UserI must be a user of ToFreeze");
1860 IRBuilder<>::InsertPointGuard Guard(Builder);
1861 Builder.SetInsertPoint(cast<Instruction>(&UserI));
1862 Value *Frozen =
1863 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
// Redirect every operand of UserI that referenced the unfrozen value.
1864 for (Use &U : make_early_inc_range((UserI.operands())))
1865 if (U.get() == ToFreeze)
1866 U.set(Frozen);
1867
1868 ToFreeze = nullptr;
1869 }
1870};
1871} // namespace
1872
1873/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1874/// Idx. \p Idx must access a valid vector element.
1875static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
1876 Instruction *CtxI,
1877 AssumptionCache &AC,
1878 const DominatorTree &DT) {
1879 // We do checks for both fixed vector types and scalable vector types.
1880 // This is the number of elements of fixed vector types,
1881 // or the minimum number of elements of scalable vector types.
1882 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1883 unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1884
1885 if (auto *C = dyn_cast<ConstantInt>(Idx)) {
1886 if (C->getValue().ult(NumElements))
1887 return ScalarizationResult::safe();
1888 return ScalarizationResult::unsafe();
1889 }
1890
1891 // Always unsafe if the index type can't handle all inbound values.
1892 if (!llvm::isUIntN(IntWidth, NumElements))
1893 return ScalarizationResult::unsafe();
1894
1895 APInt Zero(IntWidth, 0);
1896 APInt MaxElts(IntWidth, NumElements);
1897 ConstantRange ValidIndices(Zero, MaxElts);
1898 ConstantRange IdxRange(IntWidth, true);
1899
1900 if (isGuaranteedNotToBePoison(Idx, &AC)) {
1901 if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false,
1902 true, &AC, CtxI, &DT)))
1903 return ScalarizationResult::safe();
1904 return ScalarizationResult::unsafe();
1905 }
1906
1907 // If the index may be poison, check if we can insert a freeze before the
1908 // range of the index is restricted.
1909 Value *IdxBase;
1910 ConstantInt *CI;
1911 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
1912 IdxRange = IdxRange.binaryAnd(CI->getValue());
1913 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
1914 IdxRange = IdxRange.urem(CI->getValue());
1915 }
1916
1917 if (ValidIndices.contains(IdxRange))
1918 return ScalarizationResult::safeWithFreeze(IdxBase);
1919 return ScalarizationResult::unsafe();
1920}
1921
1922/// The memory operation on a vector of \p ScalarType had alignment of
1923/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1924/// alignment that will be valid for the memory operation on a single scalar
1925/// element of the same type with index \p Idx.
// NOTE(review): the signature head was lost in extraction — it presumably
// reads "static Align computeAlignmentAfterScalarization(Align
// VectorAlignment," based on the call sites below; TODO confirm upstream.
1927 Type *ScalarType, Value *Idx,
1928 const DataLayout &DL) {
// Known index: the element's byte offset is exact, so take the common
// alignment of the vector alignment and that offset.
1929 if (auto *C = dyn_cast<ConstantInt>(Idx))
1930 return commonAlignment(VectorAlignment,
1931 C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
// Unknown index: fall back to the alignment guaranteed at any element stride.
1932 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1933}
1934
1935// Combine patterns like:
1936//    %0 = load <4 x i32>, <4 x i32>* %a
1937//    %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1938//    store <4 x i32> %1, <4 x i32>* %a
1939// to:
1940//    %0 = bitcast <4 x i32>* %a to i32*
1941//    %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
1942//    store i32 %b, i32* %1
// NOTE(review): a few lines were lost in extraction — an early-bail guard
// after the opening brace (line 1944), the declaration of 'Source' (line
// 1952), and one line before 'return true' (line 1992). TODO restore from
// upstream VectorCombine.cpp.
1943bool VectorCombine::foldSingleElementStore(Instruction &I) {
1945 return false;
1946 auto *SI = cast<StoreInst>(&I);
1947 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1948 return false;
1949
1950 // TODO: Combine more complicated patterns (multiple insert) by referencing
1951 // TargetTransformInfo.
1953 Value *NewElement;
1954 Value *Idx;
1955 if (!match(SI->getValueOperand(),
1956 m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1957 m_Value(Idx))))
1958 return false;
1959
1960 if (auto *Load = dyn_cast<LoadInst>(Source)) {
1961 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1962 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1963 // Don't optimize for atomic/volatile load or store. Ensure memory is not
1964 // modified between, vector type matches store size, and index is inbounds.
1965 if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
1966 !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
1967 SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1968 return false;
1969
1970 auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
1971 if (ScalarizableIdx.isUnsafe() ||
1972 isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
1973 MemoryLocation::get(SI), AA))
1974 return false;
1975
1976 // Ensure we add the load back to the worklist BEFORE its users so they can
1977 // be erased in the correct order.
1978 Worklist.push(Load);
1979
1980 if (ScalarizableIdx.isSafeWithFreeze())
1981 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
1982 Value *GEP = Builder.CreateInBoundsGEP(
1983 SI->getValueOperand()->getType(), SI->getPointerOperand(),
1984 {ConstantInt::get(Idx->getType(), 0), Idx});
1985 StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
1986 NSI->copyMetadata(*SI);
// The scalar store's alignment is derived from the stronger of the two
// original vector accesses, narrowed to the accessed element.
1987 Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1988 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
1989 *DL);
1990 NSI->setAlignment(ScalarOpAlignment);
1991 replaceValue(I, *NSI);
1993 return true;
1994 }
1995
1996 return false;
1997}
1998
1999/// Try to scalarize vector loads feeding extractelement or bitcast
2000/// instructions.
2001bool VectorCombine::scalarizeLoad(Instruction &I) {
2002 Value *Ptr;
2003 if (!match(&I, m_Load(m_Value(Ptr))))
2004 return false;
2005
2006 auto *LI = cast<LoadInst>(&I);
2007 auto *VecTy = cast<VectorType>(LI->getType());
2008 if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
2009 return false;
2010
2011 bool AllExtracts = true;
2012 bool AllBitcasts = true;
2013 Instruction *LastCheckedInst = LI;
2014 unsigned NumInstChecked = 0;
2015
2016 // Check what type of users we have (must either all be extracts or
2017 // bitcasts) and ensure no memory modifications between the load and
2018 // its users.
2019 for (User *U : LI->users()) {
2020 auto *UI = dyn_cast<Instruction>(U);
2021 if (!UI || UI->getParent() != LI->getParent())
2022 return false;
2023
2024 // If any user is waiting to be erased, then bail out as this will
2025 // distort the cost calculation and possibly lead to infinite loops.
2026 if (UI->use_empty())
2027 return false;
2028
2029 if (!isa<ExtractElementInst>(UI))
2030 AllExtracts = false;
2031 if (!isa<BitCastInst>(UI))
2032 AllBitcasts = false;
2033
2034 // Check if any instruction between the load and the user may modify memory.
2035 if (LastCheckedInst->comesBefore(UI)) {
2036 for (Instruction &I :
2037 make_range(std::next(LI->getIterator()), UI->getIterator())) {
2038 // Bail out if we reached the check limit or the instruction may write
2039 // to memory.
2040 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
2041 return false;
2042 NumInstChecked++;
2043 }
2044 LastCheckedInst = UI;
2045 }
2046 }
2047
2048 if (AllExtracts)
2049 return scalarizeLoadExtract(LI, VecTy, Ptr);
2050 if (AllBitcasts)
2051 return scalarizeLoadBitcast(LI, VecTy, Ptr);
2052 return false;
2053}
2054
2055/// Try to scalarize vector loads feeding extractelement instructions.
// Replaces "load <N x T>; extractelement" chains with per-lane scalar loads
// through in-bounds GEPs when the target cost model says that is cheaper.
// NOTE(review): three lines were lost in extraction — an early-bail guard at
// line 2058 and cost-call continuation lines 2070/2091 (trailing CostKind
// arguments). TODO restore from upstream VectorCombine.cpp.
2056bool VectorCombine::scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy,
2057 Value *Ptr) {
2059 return false;
2060
2061 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
// If we bail out below, the pending ScalarizationResults must be discarded
// (their destructor asserts that freeze() or discard() was called).
2062 llvm::scope_exit FailureGuard([&]() {
2063 // If the transform is aborted, discard the ScalarizationResults.
2064 for (auto &Pair : NeedFreeze)
2065 Pair.second.discard();
2066 });
2067
2068 InstructionCost OriginalCost =
2069 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2071 InstructionCost ScalarizedCost = 0;
2072
// First pass: legality per extract (bounds-safe index?) and cost tally.
2073 for (User *U : LI->users()) {
2074 auto *UI = cast<ExtractElementInst>(U);
2075
2076 auto ScalarIdx =
2077 canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
2078 if (ScalarIdx.isUnsafe())
2079 return false;
2080 if (ScalarIdx.isSafeWithFreeze()) {
2081 NeedFreeze.try_emplace(UI, ScalarIdx);
2082 ScalarIdx.discard();
2083 }
2084
2085 auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
2086 OriginalCost +=
2087 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
2088 Index ? Index->getZExtValue() : -1);
2089 ScalarizedCost +=
2090 TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
2092 ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
2093 nullptr, nullptr, CostKind);
2094 }
2095
2096 LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << *LI
2097 << "\n LoadExtractCost: " << OriginalCost
2098 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2099
2100 if (ScalarizedCost >= OriginalCost)
2101 return false;
2102
2103 // Ensure we add the load back to the worklist BEFORE its users so they can
2104 // be erased in the correct order.
2105 Worklist.push(LI);
2106
2107 Type *ElemType = VecTy->getElementType();
2108
2109 // Replace extracts with narrow scalar loads.
2110 for (User *U : LI->users()) {
2111 auto *EI = cast<ExtractElementInst>(U);
2112 Value *Idx = EI->getIndexOperand();
2113
2114 // Insert 'freeze' for poison indexes.
2115 auto It = NeedFreeze.find(EI);
2116 if (It != NeedFreeze.end())
2117 It->second.freeze(Builder, *cast<Instruction>(Idx));
2118
2119 Builder.SetInsertPoint(EI);
2120 Value *GEP =
2121 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
2122 auto *NewLoad = cast<LoadInst>(
2123 Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
2124
2125 Align ScalarOpAlignment =
2126 computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
2127 NewLoad->setAlignment(ScalarOpAlignment);
2128
// For a constant lane we can carry the AA metadata over, shifted to the
// accessed sub-range of the original vector location.
2129 if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
2130 size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
2131 AAMDNodes OldAAMD = LI->getAAMetadata();
2132 NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
2133 }
2134
2135 replaceValue(*EI, *NewLoad, false);
2136 }
2137
// Success: keep the already-consumed ScalarizationResults from being
// discarded again.
2138 FailureGuard.release();
2139 return true;
2140}
2141
2142/// Try to scalarize vector loads feeding bitcast instructions.
// When every user bitcasts the whole vector to the same equal-width scalar
// type, load that scalar type directly instead of the vector.
// NOTE(review): three cost-call continuation lines (2147/2171/2180, trailing
// CostKind arguments) were lost in extraction. TODO restore from upstream.
2143bool VectorCombine::scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy,
2144 Value *Ptr) {
2145 InstructionCost OriginalCost =
2146 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2148
2149 Type *TargetScalarType = nullptr;
2150 unsigned VecBitWidth = DL->getTypeSizeInBits(VecTy);
2151
2152 for (User *U : LI->users()) {
2153 auto *BC = cast<BitCastInst>(U);
2154
// Only whole-vector casts to a plain int/fp scalar qualify.
2155 Type *DestTy = BC->getDestTy();
2156 if (!DestTy->isIntegerTy() && !DestTy->isFloatingPointTy())
2157 return false;
2158
2159 unsigned DestBitWidth = DL->getTypeSizeInBits(DestTy);
2160 if (DestBitWidth != VecBitWidth)
2161 return false;
2162
2163 // All bitcasts must target the same scalar type.
2164 if (!TargetScalarType)
2165 TargetScalarType = DestTy;
2166 else if (TargetScalarType != DestTy)
2167 return false;
2168
2169 OriginalCost +=
2170 TTI.getCastInstrCost(Instruction::BitCast, TargetScalarType, VecTy,
2172 }
2173
2174 if (!TargetScalarType)
2175 return false;
2176
2177 assert(!LI->user_empty() && "Unexpected load without bitcast users");
2178 InstructionCost ScalarizedCost =
2179 TTI.getMemoryOpCost(Instruction::Load, TargetScalarType, LI->getAlign(),
2181
2182 LLVM_DEBUG(dbgs() << "Found vector load feeding only bitcasts: " << *LI
2183 << "\n OriginalCost: " << OriginalCost
2184 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2185
2186 if (ScalarizedCost >= OriginalCost)
2187 return false;
2188
2189 // Ensure we add the load back to the worklist BEFORE its users so they can
2190 // be erased in the correct order.
2191 Worklist.push(LI);
2192
2193 Builder.SetInsertPoint(LI);
2194 auto *ScalarLoad =
2195 Builder.CreateLoad(TargetScalarType, Ptr, LI->getName() + ".scalar");
2196 ScalarLoad->setAlignment(LI->getAlign());
2197 ScalarLoad->copyMetadata(*LI);
2198
2199 // Replace all bitcast users with the scalar load.
2200 for (User *U : LI->users()) {
2201 auto *BC = cast<BitCastInst>(U);
2202 replaceValue(*BC, *ScalarLoad, false);
2203 }
2204
2205 return true;
2206}
2207
// Scalarize "extractelement (zext <N x small>), C" users: bitcast the source
// vector to one wide integer and replace each extract with a shift + mask.
// NOTE(review): several lines were lost in extraction — a guard after the
// opening brace (2209), cost-call continuation lines (2229, 2247-2248,
// 2250, 2252-2253), and the isGuaranteedToTransferExecutionToSuccessor-style
// range check at 2280. TODO restore from upstream VectorCombine.cpp.
2208bool VectorCombine::scalarizeExtExtract(Instruction &I) {
2210 return false;
2211 auto *Ext = dyn_cast<ZExtInst>(&I);
2212 if (!Ext)
2213 return false;
2214
2215 // Try to convert a vector zext feeding only extracts to a set of scalar
2216 // ops of the form (Src >> (Idx * EltBits)) & ((1 << EltBits) - 1),
2217 // if profitable.
2218 auto *SrcTy = dyn_cast<FixedVectorType>(Ext->getOperand(0)->getType());
2219 if (!SrcTy)
2220 return false;
2221 auto *DstTy = cast<FixedVectorType>(Ext->getType());
2222
// The source vector must pack exactly into one destination-element-sized
// integer for the bitcast trick to work.
2223 Type *ScalarDstTy = DstTy->getElementType();
2224 if (DL->getTypeSizeInBits(SrcTy) != DL->getTypeSizeInBits(ScalarDstTy))
2225 return false;
2226
2227 InstructionCost VectorCost =
2228 TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
2230 unsigned ExtCnt = 0;
2231 bool ExtLane0 = false;
2232 for (User *U : Ext->users()) {
2233 uint64_t Idx;
2234 if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx))))
2235 return false;
2236 if (cast<Instruction>(U)->use_empty())
2237 continue;
2238 ExtCnt += 1;
// Lane 0 needs no shift (on little-endian), only the mask.
2239 ExtLane0 |= !Idx;
2240 VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
2241 CostKind, Idx, U);
2242 }
2243
2244 InstructionCost ScalarCost =
2245 ExtCnt * TTI.getArithmeticInstrCost(
2246 Instruction::And, ScalarDstTy, CostKind,
2249 (ExtCnt - ExtLane0) *
2251 Instruction::LShr, ScalarDstTy, CostKind,
2254 if (ScalarCost > VectorCost)
2255 return false;
2256
2257 Value *ScalarV = Ext->getOperand(0);
2258 if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV),
2259 &DT)) {
2260 // Check whether all lanes are extracted, all extracts trigger UB
2261 // on poison, and the last extract (and hence all previous ones)
2262 // are guaranteed to execute if Ext executes. If so, we do not
2263 // need to insert a freeze.
2264 SmallDenseSet<ConstantInt *, 8> ExtractedLanes;
2265 bool AllExtractsTriggerUB = true;
2266 ExtractElementInst *LastExtract = nullptr;
2267 BasicBlock *ExtBB = Ext->getParent();
2268 for (User *U : Ext->users()) {
2269 auto *Extract = cast<ExtractElementInst>(U);
2270 if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) {
2271 AllExtractsTriggerUB = false;
2272 break;
2273 }
2274 ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand()));
2275 if (!LastExtract || LastExtract->comesBefore(Extract))
2276 LastExtract = Extract;
2277 }
2278 if (ExtractedLanes.size() != DstTy->getNumElements() ||
2279 !AllExtractsTriggerUB ||
2281 LastExtract->getIterator()))
2282 ScalarV = Builder.CreateFreeze(ScalarV);
2283 }
// View the whole source vector as one wide integer...
2284 ScalarV = Builder.CreateBitCast(
2285 ScalarV,
2286 IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
2287 uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
2288 uint64_t TotalBits = DL->getTypeSizeInBits(SrcTy);
2289 APInt EltBitMask = APInt::getLowBitsSet(TotalBits, SrcEltSizeInBits);
2290 Type *PackedTy = IntegerType::get(SrcTy->getContext(), TotalBits);
2291 Value *Mask = ConstantInt::get(PackedTy, EltBitMask);
// ...then rewrite each extract as shift (endian-aware lane offset) + mask.
2292 for (User *U : Ext->users()) {
2293 auto *Extract = cast<ExtractElementInst>(U);
2294 uint64_t Idx =
2295 cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
2296 uint64_t ShiftAmt =
2297 DL->isBigEndian()
2298 ? (TotalBits - SrcEltSizeInBits - Idx * SrcEltSizeInBits)
2299 : (Idx * SrcEltSizeInBits);
2300 Value *LShr = Builder.CreateLShr(ScalarV, ShiftAmt);
2301 Value *And = Builder.CreateAnd(LShr, Mask);
2302 U->replaceAllUsesWith(And);
2303 }
2304 return true;
2305}
2306
2307/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
2308/// to "(bitcast (concat X, Y))"
2309/// where X/Y are bitcasted from i1 mask vectors.
// NOTE(review): several lines were lost in extraction — the m_DisjointOr
// match at 2321, m_Shl sub-pattern lines (2330, 2338), and cost-call
// continuation lines around 2375-2386. TODO restore from upstream.
2310bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
2311 Type *Ty = I.getType();
2312 if (!Ty->isIntegerTy())
2313 return false;
2314
2315 // TODO: Add big endian test coverage
2316 if (DL->isBigEndian())
2317 return false;
2318
2319 // Restrict to disjoint cases so the mask vectors aren't overlapping.
2320 Instruction *X, *Y;
2322 return false;
2323
2324 // Allow both sources to contain shl, to handle more generic pattern:
2325 // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
2326 Value *SrcX;
2327 uint64_t ShAmtX = 0;
2328 if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
2329 !match(X, m_OneUse(
2331 m_ConstantInt(ShAmtX)))))
2332 return false;
2333
2334 Value *SrcY;
2335 uint64_t ShAmtY = 0;
2336 if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
2337 !match(Y, m_OneUse(
2339 m_ConstantInt(ShAmtY)))))
2340 return false;
2341
2342 // Canonicalize larger shift to the RHS.
2343 if (ShAmtX > ShAmtY) {
2344 std::swap(X, Y);
2345 std::swap(SrcX, SrcY);
2346 std::swap(ShAmtX, ShAmtY);
2347 }
2348
2349 // Ensure both sources are matching vXi1 bool mask types, and that the shift
2350 // difference is the mask width so they can be easily concatenated together.
2351 uint64_t ShAmtDiff = ShAmtY - ShAmtX;
2352 unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
2353 unsigned BitWidth = Ty->getPrimitiveSizeInBits();
2354 auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
2355 if (!MaskTy || SrcX->getType() != SrcY->getType() ||
2356 !MaskTy->getElementType()->isIntegerTy(1) ||
2357 MaskTy->getNumElements() != ShAmtDiff ||
2358 MaskTy->getNumElements() > (BitWidth / 2))
2359 return false;
2360
2361 auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
2362 auto *ConcatIntTy =
2363 Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
2364 auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
2365
// Identity mask across both inputs == simple concatenation shuffle.
2366 SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
2367 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2368
2369 // TODO: Is it worth supporting multi use cases?
2370 InstructionCost OldCost = 0;
2371 OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
2372 OldCost +=
2373 NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2374 OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
2376 OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
2378
2379 InstructionCost NewCost = 0;
2381 MaskTy, ConcatMask, CostKind);
2382 NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
2384 if (Ty != ConcatIntTy)
2385 NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
2387 if (ShAmtX > 0)
2388 NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2389
2390 LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
2391 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2392 << "\n");
2393
2394 if (NewCost > OldCost)
2395 return false;
2396
2397 // Build bool mask concatenation, bitcast back to scalar integer, and perform
2398 // any residual zero-extension or shifting.
2399 Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
2400 Worklist.pushValue(Concat);
2401
2402 Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
2403
2404 if (Ty != ConcatIntTy) {
2405 Worklist.pushValue(Result);
2406 Result = Builder.CreateZExt(Result, Ty);
2407 }
2408
2409 if (ShAmtX > 0) {
2410 Worklist.pushValue(Result);
2411 Result = Builder.CreateShl(Result, ShAmtX);
2412 }
2413
2414 replaceValue(I, *Result);
2415 return true;
2416}
2417
2418/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
2419///           --> "binop (shuffle), (shuffle)".
// Merges an outer permute into the operand shuffles of a binop, so the binop
// runs on pre-shuffled operands and the outer shuffle disappears.
// NOTE(review): a few getShuffleCost call-head/continuation lines were lost
// in extraction (2485, 2492, 2500, 2512, 2516). TODO restore from upstream.
2420bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
2421 BinaryOperator *BinOp;
2422 ArrayRef<int> OuterMask;
2423 if (!match(&I, m_Shuffle(m_BinOp(BinOp), m_Undef(), m_Mask(OuterMask))))
2424 return false;
2425
2426 // Don't introduce poison into div/rem.
2427 if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
2428 return false;
2429
2430 Value *Op00, *Op01, *Op10, *Op11;
2431 ArrayRef<int> Mask0, Mask1;
2432 bool Match0 = match(BinOp->getOperand(0),
2433 m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0)));
2434 bool Match1 = match(BinOp->getOperand(1),
2435 m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1)));
2436 if (!Match0 && !Match1)
2437 return false;
2438
// For an unmatched side, treat the operand itself as both shuffle sources
// (an identity "shuffle").
2439 Op00 = Match0 ? Op00 : BinOp->getOperand(0);
2440 Op01 = Match0 ? Op01 : BinOp->getOperand(0);
2441 Op10 = Match1 ? Op10 : BinOp->getOperand(1);
2442 Op11 = Match1 ? Op11 : BinOp->getOperand(1);
2443
2444 Instruction::BinaryOps Opcode = BinOp->getOpcode();
2445 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2446 auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType());
2447 auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType());
2448 auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType());
2449 if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty)
2450 return false;
2451
2452 unsigned NumSrcElts = BinOpTy->getNumElements();
2453
2454 // Don't accept shuffles that reference the second operand in
2455 // div/rem or if it's an undef arg.
2456 if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) &&
2457 any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2458 return false;
2459
2460 // Merge outer / inner (or identity if no match) shuffles.
2461 SmallVector<int> NewMask0, NewMask1;
2462 for (int M : OuterMask) {
2463 if (M < 0 || M >= (int)NumSrcElts) {
2464 NewMask0.push_back(PoisonMaskElem);
2465 NewMask1.push_back(PoisonMaskElem);
2466 } else {
2467 NewMask0.push_back(Match0 ? Mask0[M] : M);
2468 NewMask1.push_back(Match1 ? Mask1[M] : M);
2469 }
2470 }
2471
// If a merged mask degenerates to the identity we can use the shuffle
// source directly and skip emitting a new shuffle for that side.
2472 unsigned NumOpElts = Op0Ty->getNumElements();
2473 bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
2474 all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2475 ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
2476 bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
2477 all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2478 ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
2479
2480 InstructionCost NewCost = 0;
2481 // Try to merge shuffles across the binop if the new shuffles are not costly.
2482 InstructionCost BinOpCost =
2483 TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind);
2484 InstructionCost OldCost =
2486 ShuffleDstTy, BinOpTy, OuterMask, CostKind,
2487 0, nullptr, {BinOp}, &I);
// Multi-use values survive the fold, so their cost counts on both sides.
2488 if (!BinOp->hasOneUse())
2489 NewCost += BinOpCost;
2490
2491 if (Match0) {
2493 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
2494 0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2495 OldCost += Shuf0Cost;
2496 if (!BinOp->hasOneUse() || !BinOp->getOperand(0)->hasOneUse())
2497 NewCost += Shuf0Cost;
2498 }
2499 if (Match1) {
2501 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2502 0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2503 OldCost += Shuf1Cost;
2504 if (!BinOp->hasOneUse() || !BinOp->getOperand(1)->hasOneUse())
2505 NewCost += Shuf1Cost;
2506 }
2507
2508 NewCost += TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2509
2510 if (!IsIdentity0)
2511 NewCost +=
2513 Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
2514 if (!IsIdentity1)
2515 NewCost +=
2517 Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
2518
2519 LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
2520 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2521 << "\n");
2522
2523 // If costs are equal, still fold as we reduce instruction count.
2524 if (NewCost > OldCost)
2525 return false;
2526
2527 Value *LHS =
2528 IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
2529 Value *RHS =
2530 IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
2531 Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
2532
2533 // Intersect flags from the old binops.
2534 if (auto *NewInst = dyn_cast<Instruction>(NewBO))
2535 NewInst->copyIRFlags(BinOp);
2536
2537 Worklist.pushValue(LHS);
2538 Worklist.pushValue(RHS);
2539 replaceValue(I, *NewBO);
2540 return true;
2541}
2542
2543/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
2544/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
/// The fold is cost-model driven: the price of the original binops/cmps plus
/// the outer shuffle is compared against one or two operand shuffles feeding a
/// single binop/cmp. Returns true and replaces \p I when profitable.
/// NOTE(review): this listing is a Doxygen scrape; a few original source lines
/// were lost during extraction, so some statements below appear truncated.
2545bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
2546  ArrayRef<int> OldMask;
2547  Instruction *LHS, *RHS;
  // (The m_Shuffle match of &I binding LHS/RHS/OldMask was on the lost line.)
2549                        m_Mask(OldMask))))
2550    return false;
2551
2552  // TODO: Add support for addlike etc.
2553  if (LHS->getOpcode() != RHS->getOpcode())
2554    return false;
2555
2556  Value *X, *Y, *Z, *W;
2557  bool IsCommutative = false;
2558  CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
2559  CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
2560  if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
2561      match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
2562    auto *BO = cast<BinaryOperator>(LHS);
2563    // Don't introduce poison into div/rem.
2564    if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
2565      return false;
2566    IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
2567  } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
2568             match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
2569             (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
2570    IsCommutative = cast<CmpInst>(LHS)->isCommutative();
2571  } else
2572    return false;
2573
2574  auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2575  auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
2576  auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
2577  if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
2578    return false;
2579
2580  bool SameBinOp = LHS == RHS;
2581  unsigned NumSrcElts = BinOpTy->getNumElements();
2582
2583  // If we have something like "add X, Y" and "add Z, X", swap ops to match.
2584  if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
2585    std::swap(X, Y);
2586
  // Rewrite a two-source mask index so it references only the first source.
2587  auto ConvertToUnary = [NumSrcElts](int &M) {
2588    if (M >= (int)NumSrcElts)
2589      M -= NumSrcElts;
2590  };
2591
2592  SmallVector<int> NewMask0(OldMask);
2594  TTI::OperandValueInfo Op0Info = TTI.commonOperandInfo(X, Z);
  // Both binops read the same first operand: shrink to a one-source shuffle.
2595  if (X == Z) {
2596    llvm::for_each(NewMask0, ConvertToUnary);
2598    Z = PoisonValue::get(BinOpTy);
2599  }
2600
2601  SmallVector<int> NewMask1(OldMask);
2603  TTI::OperandValueInfo Op1Info = TTI.commonOperandInfo(Y, W);
2604  if (Y == W) {
2605    llvm::for_each(NewMask1, ConvertToUnary);
2607    W = PoisonValue::get(BinOpTy);
2608  }
2609
2610  // Try to replace a binop with a shuffle if the shuffle is not costly.
2611  // When SameBinOp, only count the binop cost once.
2614
2615  InstructionCost OldCost = LHSCost;
2616  if (!SameBinOp) {
2617    OldCost += RHSCost;
2618  }
2620                                ShuffleDstTy, BinResTy, OldMask, CostKind, 0,
2621                                nullptr, {LHS, RHS}, &I);
2622
2623  // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
2624  // where one use shuffles have gotten split across the binop/cmp. These
2625  // often allow a major reduction in total cost that wouldn't happen as
2626  // individual folds.
  // Fold a one-use inner shuffle of Op into the outer mask (in place) and
  // step through to the inner operand; returns true if a merge happened.
2627  auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
2628                        TTI::TargetCostKind CostKind) -> bool {
2629    Value *InnerOp;
2630    ArrayRef<int> InnerMask;
2631    if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
2632                                     m_Mask(InnerMask)))) &&
2633        InnerOp->getType() == Op->getType() &&
2634        all_of(InnerMask,
2635               [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
2636      for (int &M : Mask)
2637        if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
2638          M = InnerMask[M - Offset];
2639          M = 0 <= M ? M + Offset : M;
2640        }
2642      Op = InnerOp;
2643      return true;
2644    }
2645    return false;
2646  };
2647  bool ReducedInstCount = false;
2648  ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
2649  ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
2650  ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
2651  ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
2652  bool SingleSrcBinOp = (X == Y) && (Z == W) && (NewMask0 == NewMask1);
2653  // SingleSrcBinOp only reduces instruction count if we also eliminate the
2654  // original binop(s). If binops have multiple uses, they won't be eliminated.
2655  ReducedInstCount |= SingleSrcBinOp && LHS->hasOneUser() && RHS->hasOneUser();
2656
2657  auto *ShuffleCmpTy =
2658      FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
2660      SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z});
  // When both operand shuffles collapsed to one, the second shuffle is free.
2661  if (!SingleSrcBinOp)
2662    NewCost += TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1,
2663                                  CostKind, 0, nullptr, {Y, W});
2664
2665  if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
2666    NewCost += TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy,
2667                                          CostKind, Op0Info, Op1Info);
2668  } else {
2669    NewCost +=
2670        TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy, ShuffleDstTy,
2671                               PredLHS, CostKind, Op0Info, Op1Info);
2672  }
2673  // If LHS/RHS have other uses, we need to account for the cost of keeping
2674  // the original instructions. When SameBinOp, only add the cost once.
2675  if (!LHS->hasOneUser())
2676    NewCost += LHSCost;
2677  if (!SameBinOp && !RHS->hasOneUser())
2678    NewCost += RHSCost;
2679
2680  LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
2681                    << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
2682                    << "\n");
2683
2684  // If either shuffle will constant fold away, then fold for the same cost as
2685  // we will reduce the instruction count.
2686  ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
2687                      (isa<Constant>(Y) && isa<Constant>(W));
  // Demand a strict cost win unless the fold also removes instructions.
2688  if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
2689    return false;
2690
2691  Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
2692  Value *Shuf1 =
2693      SingleSrcBinOp ? Shuf0 : Builder.CreateShuffleVector(Y, W, NewMask1);
2694  Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
2695                     ? Builder.CreateBinOp(
2696                           cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
2697                     : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
2698
2699  // Intersect flags from the old binops.
2700  if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
2701    NewInst->copyIRFlags(LHS);
2702    NewInst->andIRFlags(RHS);
2703  }
2704
2705  Worklist.pushValue(Shuf0);
2706  Worklist.pushValue(Shuf1);
2707  replaceValue(I, *NewBO);
2708  return true;
2709}
2710
2711/// Try to convert,
2712/// (shuffle(select(c1,t1,f1)), (select(c2,t2,f2)), m) into
2713/// (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
/// Both selects must use vector conditions of the same fixed type and, for FP
/// selects, identical fast-math flags. Cost-model gated.
/// NOTE(review): Doxygen scrape — some original lines were lost in extraction.
2714bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
2715  ArrayRef<int> Mask;
2716  Value *C1, *T1, *F1, *C2, *T2, *F2;
2717  if (!match(&I, m_Shuffle(m_Select(m_Value(C1), m_Value(T1), m_Value(F1)),
2718                           m_Select(m_Value(C2), m_Value(T2), m_Value(F2)),
2719                           m_Mask(Mask))))
2720    return false;
2721
2722  auto *Sel1 = cast<Instruction>(I.getOperand(0));
2723  auto *Sel2 = cast<Instruction>(I.getOperand(1));
2724
  // Scalar (i1) select conditions fail the FixedVectorType casts and bail.
2725  auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
2726  auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
2727  if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
2728    return false;
2729
2730  auto *SI0FOp = dyn_cast<FPMathOperator>(I.getOperand(0));
2731  auto *SI1FOp = dyn_cast<FPMathOperator>(I.getOperand(1));
2732  // SelectInsts must have the same FMF.
2733  if (((SI0FOp == nullptr) != (SI1FOp == nullptr)) ||
2734      ((SI0FOp != nullptr) &&
2735       (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
2736    return false;
2737
2738  auto *SrcVecTy = cast<FixedVectorType>(T1->getType());
2739  auto *DstVecTy = cast<FixedVectorType>(I.getType());
2741  auto SelOp = Instruction::Select;
2742
  // (Declarations of the two original-select cost queries were on
  // extraction-lost lines.)
2744      SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2746      SelOp, SrcVecTy, C2VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2747
2748  InstructionCost OldCost =
2749      CostSel1 + CostSel2 +
2750      TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
2751                         {I.getOperand(0), I.getOperand(1)}, &I);
2752
  // New cost: three shuffles (cond/true/false) plus one wider select.
2754      SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
2755      Mask, CostKind, 0, nullptr, {C1, C2});
2756  NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2757                                nullptr, {T1, T2});
2758  NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2759                                nullptr, {F1, F2});
2760  auto *C1C2ShuffledVecTy = cast<FixedVectorType>(
2761      toVectorTy(Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements()));
2762  NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
2764
  // A select with other users must be kept alive, so keep paying its cost.
2765  if (!Sel1->hasOneUse())
2766    NewCost += CostSel1;
2767  if (!Sel2->hasOneUse())
2768    NewCost += CostSel2;
2769
2770  LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
2771                    << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
2772                    << "\n");
2773  if (NewCost > OldCost)
2774    return false;
2775
2776  Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
2777  Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
2778  Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
2779  Value *NewSel;
2780  // We presuppose that the SelectInsts have the same FMF.
2781  if (SI0FOp)
2782    NewSel = Builder.CreateSelectFMF(ShuffleCmp, ShuffleTrue, ShuffleFalse,
2783                                     SI0FOp->getFastMathFlags());
2784  else
2785    NewSel = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
2786
2787  Worklist.pushValue(ShuffleCmp);
2788  Worklist.pushValue(ShuffleTrue);
2789  Worklist.pushValue(ShuffleFalse);
2790  replaceValue(I, *NewSel);
2791  return true;
2792}
2793
2794/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
2795/// into "castop (shuffle)".
/// Handles both unary (permute of one cast) and binary (shuffle of two casts
/// with matching source types) forms; bitcasts may change element counts, so
/// the mask is narrowed/widened to match the cast source type.
/// NOTE(review): Doxygen scrape — some original lines were lost in extraction.
2796bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
2797  Value *V0, *V1;
2798  ArrayRef<int> OldMask;
2799  if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
2800    return false;
2801
2802  // Check whether this is a binary shuffle.
2803  bool IsBinaryShuffle = !isa<UndefValue>(V1);
2804
2805  auto *C0 = dyn_cast<CastInst>(V0);
2806  auto *C1 = dyn_cast<CastInst>(V1);
2807  if (!C0 || (IsBinaryShuffle && !C1))
2808    return false;
2809
2810  Instruction::CastOps Opcode = C0->getOpcode();
2811
2812  // If this is allowed, foldShuffleOfCastops can get stuck in a loop
2813  // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle.
2814  if (!IsBinaryShuffle && Opcode == Instruction::BitCast)
2815    return false;
2816
2817  if (IsBinaryShuffle) {
2818    if (C0->getSrcTy() != C1->getSrcTy())
2819      return false;
2820    // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
2821    if (Opcode != C1->getOpcode()) {
2822      if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
2823        Opcode = Instruction::SExt;
2824      else
2825        return false;
2826    }
2827  }
2828
2829  auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2830  auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
2831  auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
2832  if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
2833    return false;
2834
2835  unsigned NumSrcElts = CastSrcTy->getNumElements();
2836  unsigned NumDstElts = CastDstTy->getNumElements();
2837  assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
2838         "Only bitcasts expected to alter src/dst element counts");
2839
2840  // Check for bitcasting of unscalable vector types.
2841  // e.g. <32 x i40> -> <40 x i32>
2842  if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
2843      (NumDstElts % NumSrcElts) != 0)
2844    return false;
2845
2846  SmallVector<int, 16> NewMask;
2847  if (NumSrcElts >= NumDstElts) {
2848    // The bitcast is from wide to narrow/equal elements. The shuffle mask can
2849    // always be expanded to the equivalent form choosing narrower elements.
2850    assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
2851    unsigned ScaleFactor = NumSrcElts / NumDstElts;
2852    narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
2853  } else {
2854    // The bitcast is from narrow elements to wide elements. The shuffle mask
2855    // must choose consecutive elements to allow casting first.
2856    assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
2857    unsigned ScaleFactor = NumDstElts / NumSrcElts;
2858    if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
2859      return false;
2860  }
2861
2862  auto *NewShuffleDstTy =
2863      FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
2864
2865  // Try to replace a castop with a shuffle if the shuffle is not costly.
2866  InstructionCost CostC0 =
2867      TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
2869
2871  if (IsBinaryShuffle)
2873  else
2875
2876  InstructionCost OldCost = CostC0;
2877  OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask,
2878                                CostKind, 0, nullptr, {}, &I);
2879
2880  InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy,
2881                                               CastSrcTy, NewMask, CostKind);
2882  NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
  // Multi-use casts must stay, so keep charging for them in the new cost.
2884  if (!C0->hasOneUse())
2885    NewCost += CostC0;
2886  if (IsBinaryShuffle) {
2887    InstructionCost CostC1 =
2888        TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
2890    OldCost += CostC1;
2891    if (!C1->hasOneUse())
2892      NewCost += CostC1;
2893  }
2894
2895  LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
2896                    << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
2897                    << "\n");
2898  if (NewCost > OldCost)
2899    return false;
2900
2901  Value *Shuf;
2902  if (IsBinaryShuffle)
2903    Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0),
2904                                       NewMask);
2905  else
2906    Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask);
2907
2908  Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
2909
2910  // Intersect flags from the old casts.
2911  if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
2912    NewInst->copyIRFlags(C0);
2913    if (IsBinaryShuffle)
2914      NewInst->andIRFlags(C1);
2915  }
2916
2917  Worklist.pushValue(Shuf);
2918  replaceValue(I, *Cast);
2919  return true;
2920}
2921
2922/// Try to convert any of:
2923/// "shuffle (shuffle x, y), (shuffle y, x)"
2924/// "shuffle (shuffle x, undef), (shuffle y, undef)"
2925/// "shuffle (shuffle x, undef), y"
2926/// "shuffle x, (shuffle y, undef)"
2927/// into "shuffle x, y".
/// Merges a two-level shuffle tree into a single shuffle over at most two
/// ultimate sources, then lets the cost model decide.
/// NOTE(review): Doxygen scrape — some original lines were lost in extraction.
2928bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
2929  ArrayRef<int> OuterMask;
2930  Value *OuterV0, *OuterV1;
2931  if (!match(&I,
2932             m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
2933    return false;
2934
2935  ArrayRef<int> InnerMask0, InnerMask1;
2936  Value *X0, *X1, *Y0, *Y1;
2937  bool Match0 =
2938      match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
2939  bool Match1 =
2940      match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
2941  if (!Match0 && !Match1)
2942    return false;
2943
2944  // If the outer shuffle is a permute, then create a fake inner all-poison
2945  // shuffle. This is easier than accounting for length-changing shuffles below.
2946  SmallVector<int, 16> PoisonMask1;
2947  if (!Match1 && isa<PoisonValue>(OuterV1)) {
2948    X1 = X0;
2949    Y1 = Y0;
2950    PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
2951    InnerMask1 = PoisonMask1;
2952    Match1 = true; // fake match
2953  }
2954
  // For an unmatched side, treat the outer operand itself as both "sources".
2955  X0 = Match0 ? X0 : OuterV0;
2956  Y0 = Match0 ? Y0 : OuterV0;
2957  X1 = Match1 ? X1 : OuterV1;
2958  Y1 = Match1 ? Y1 : OuterV1;
2959  auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2960  auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
2961  auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
2962  if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
2963      X0->getType() != X1->getType())
2964    return false;
2965
2966  unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
2967  unsigned NumImmElts = ShuffleImmTy->getNumElements();
2968
2969  // Attempt to merge shuffles, matching upto 2 source operands.
2970  // Replace index to a poison arg with PoisonMaskElem.
2971  // Bail if either inner masks reference an undef arg.
2972  SmallVector<int, 16> NewMask(OuterMask);
2973  Value *NewX = nullptr, *NewY = nullptr;
2974  for (int &M : NewMask) {
2975    Value *Src = nullptr;
2976    if (0 <= M && M < (int)NumImmElts) {
2977      Src = OuterV0;
2978      if (Match0) {
2979        M = InnerMask0[M];
2980        Src = M >= (int)NumSrcElts ? Y0 : X0;
2981        M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2982      }
2983    } else if (M >= (int)NumImmElts) {
2984      Src = OuterV1;
2985      M -= NumImmElts;
2986      if (Match1) {
2987        M = InnerMask1[M];
2988        Src = M >= (int)NumSrcElts ? Y1 : X1;
2989        M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2990      }
2991    }
2992    if (Src && M != PoisonMaskElem) {
2993      assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
2994      if (isa<UndefValue>(Src)) {
2995        // We've referenced an undef element - if its poison, update the shuffle
2996        // mask, else bail.
2997        if (!isa<PoisonValue>(Src))
2998          return false;
2999        M = PoisonMaskElem;
3000        continue;
3001      }
      // Assign Src to one of the (at most two) merged sources.
3002      if (!NewX || NewX == Src) {
3003        NewX = Src;
3004        continue;
3005      }
3006      if (!NewY || NewY == Src) {
3007        M += NumSrcElts;
3008        NewY = Src;
3009        continue;
3010      }
      // More than two distinct sources - cannot merge.
3011      return false;
3012    }
3013  }
3014
  // NOTE(review): this returns a Value* from a bool-returning function (the
  // non-null pointer converts to `true`) without calling replaceValue(I, ...).
  // It looks like the all-poison result should replace I - verify upstream
  // intent.
3015  if (!NewX)
3016    return PoisonValue::get(ShuffleDstTy);
3017  if (!NewY)
3018    NewY = PoisonValue::get(ShuffleSrcTy);
3019
3020  // Have we folded to an Identity shuffle?
3021  if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
3022    replaceValue(I, *NewX);
3023    return true;
3024  }
3025
3026  // Try to merge the shuffles if the new shuffle is not costly.
3027  InstructionCost InnerCost0 = 0;
3028  if (Match0)
3029    InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
3030
3031  InstructionCost InnerCost1 = 0;
3032  if (Match1)
3033    InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
3034
3036
3037  InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
3038
3039  bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
3043  InstructionCost NewCost =
3044      TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
3045                         nullptr, {NewX, NewY});
  // Inner shuffles with other users must be kept; keep charging their cost.
3046  if (!OuterV0->hasOneUse())
3047    NewCost += InnerCost0;
3048  if (!OuterV1->hasOneUse())
3049    NewCost += InnerCost1;
3050
3051  LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
3052                    << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
3053                    << "\n");
3054  if (NewCost > OldCost)
3055    return false;
3056
3057  Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
3058  replaceValue(I, *Shuf);
3059  return true;
3060}
3061
3062/// Try to convert a chain of length-preserving shuffles that are fed by
3063/// length-changing shuffles from the same source, e.g. a chain of length 3:
3064///
3065/// "shuffle (shuffle (shuffle x, (shuffle y, undef)),
3066///                   (shuffle y, undef)),
3067// (shuffle y, undef)"
3068///
3069/// into a single shuffle fed by a length-changing shuffle:
3070///
3071/// "shuffle x, (shuffle y, undef)"
3072///
3073/// Such chains arise e.g. from folding extract/insert sequences.
/// Walks up the trunk one link at a time, accumulating a combined trunk mask
/// (Mask) and a combined leaf mask (YMask), stopping when masks conflict or
/// when the cost model says another step would not pay off.
/// NOTE(review): Doxygen scrape — some original lines were lost in extraction.
3074bool VectorCombine::foldShufflesOfLengthChangingShuffles(Instruction &I) {
3075  FixedVectorType *TrunkType = dyn_cast<FixedVectorType>(I.getType());
3076  if (!TrunkType)
3077    return false;
3078
3079  unsigned ChainLength = 0;
3080  SmallVector<int> Mask;
3081  SmallVector<int> YMask;
3082  InstructionCost OldCost = 0;
3083  InstructionCost NewCost = 0;
3084  Value *Trunk = &I;
3085  unsigned NumTrunkElts = TrunkType->getNumElements();
  // Common source vector feeding every leaf shuffle; discovered on the fly.
3086  Value *Y = nullptr;
3087
3088  for (;;) {
3089    // Match the current trunk against (commutations of) the pattern
3090    // "shuffle trunk', (shuffle y, undef)"
3091    ArrayRef<int> OuterMask;
3092    Value *OuterV0, *OuterV1;
3093    if (ChainLength != 0 && !Trunk->hasOneUse())
3094      break;
3095    if (!match(Trunk, m_Shuffle(m_Value(OuterV0), m_Value(OuterV1),
3096                                m_Mask(OuterMask))))
3097      break;
3098    if (OuterV0->getType() != TrunkType) {
3099      // This shuffle is not length-preserving, so it cannot be part of the
3100      // chain.
3101      break;
3102    }
3103
3104    ArrayRef<int> InnerMask0, InnerMask1;
3105    Value *A0, *A1, *B0, *B1;
3106    bool Match0 =
3107        match(OuterV0, m_Shuffle(m_Value(A0), m_Value(B0), m_Mask(InnerMask0)));
3108    bool Match1 =
3109        match(OuterV1, m_Shuffle(m_Value(A1), m_Value(B1), m_Mask(InnerMask1)));
    // A "leaf" is an inner shuffle whose source type differs from the trunk
    // type, i.e. a length-changing shuffle.
3110    bool Match0Leaf = Match0 && A0->getType() != I.getType();
3111    bool Match1Leaf = Match1 && A1->getType() != I.getType();
3112    if (Match0Leaf == Match1Leaf) {
3113      // Only handle the case of exactly one leaf in each step. The "two leaves"
3114      // case is handled by foldShuffleOfShuffles.
3115      break;
3116    }
3117
3118    SmallVector<int> CommutedOuterMask;
3119    if (Match0Leaf) {
      // Canonicalize: operand 0 continues the trunk, operand 1 is the leaf.
3120      std::swap(OuterV0, OuterV1);
3121      std::swap(InnerMask0, InnerMask1);
3122      std::swap(A0, A1);
3123      std::swap(B0, B1);
3124      llvm::append_range(CommutedOuterMask, OuterMask);
3125      for (int &M : CommutedOuterMask) {
3126        if (M == PoisonMaskElem)
3127          continue;
3128        if (M < (int)NumTrunkElts)
3129          M += NumTrunkElts;
3130        else
3131          M -= NumTrunkElts;
3132      }
3133      OuterMask = CommutedOuterMask;
3134    }
3135    if (!OuterV1->hasOneUse())
3136      break;
3137
    // All leaves must read from one common source vector Y.
3138    if (!isa<UndefValue>(A1)) {
3139      if (!Y)
3140        Y = A1;
3141      else if (Y != A1)
3142        break;
3143    }
3144    if (!isa<UndefValue>(B1)) {
3145      if (!Y)
3146        Y = B1;
3147      else if (Y != B1)
3148        break;
3149    }
3150
3151    auto *YType = cast<FixedVectorType>(A1->getType());
3152    int NumLeafElts = YType->getNumElements();
    // Normalize the leaf mask to single-source indices into Y.
3153    SmallVector<int> LocalYMask(InnerMask1);
3154    for (int &M : LocalYMask) {
3155      if (M >= NumLeafElts)
3156        M -= NumLeafElts;
3157    }
3158
3159    InstructionCost LocalOldCost =
3162
3163    // Handle the initial (start of chain) case.
3164    if (!ChainLength) {
3165      Mask.assign(OuterMask);
3166      YMask.assign(LocalYMask);
3167      OldCost = NewCost = LocalOldCost;
3168      Trunk = OuterV0;
3169      ChainLength++;
3170      continue;
3171    }
3172
3173    // For the non-root case, first attempt to combine masks.
3174    SmallVector<int> NewYMask(YMask);
3175    bool Valid = true;
3176    for (auto [CombinedM, LeafM] : llvm::zip(NewYMask, LocalYMask)) {
3177      if (LeafM == -1 || CombinedM == LeafM)
3178        continue;
3179      if (CombinedM == -1) {
3180        CombinedM = LeafM;
3181      } else {
        // Conflicting demands on this leaf lane; cannot merge further.
3182        Valid = false;
3183        break;
3184      }
3185    }
3186    if (!Valid)
3187      break;
3188
    // Compose this link's outer mask into the accumulated trunk mask.
3189    SmallVector<int> NewMask;
3190    NewMask.reserve(NumTrunkElts);
3191    for (int M : Mask) {
3192      if (M < 0 || M >= static_cast<int>(NumTrunkElts))
3193        NewMask.push_back(M);
3194      else
3195        NewMask.push_back(OuterMask[M]);
3196    }
3197
3198    // Break the chain if adding this new step complicates the shuffles such
3199    // that it would increase the new cost by more than the old cost of this
3200    // step.
3201    InstructionCost LocalNewCost =
3203                           YType, NewYMask, CostKind) +
3205                           TrunkType, NewMask, CostKind);
3206
3207    if (LocalNewCost >= NewCost && LocalOldCost < LocalNewCost - NewCost)
3208      break;
3209
3210    LLVM_DEBUG({
3211      if (ChainLength == 1) {
3212        dbgs() << "Found chain of shuffles fed by length-changing shuffles: "
3213               << I << '\n';
3214      }
3215      dbgs() << "  next chain link: " << *Trunk << '\n'
3216             << "  old cost: " << (OldCost + LocalOldCost)
3217             << "  new cost: " << LocalNewCost << '\n';
3218    });
3219
    // Commit this link and continue up the trunk.
3220    Mask = NewMask;
3221    YMask = NewYMask;
3222    OldCost += LocalOldCost;
3223    NewCost = LocalNewCost;
3224    Trunk = OuterV0;
3225    ChainLength++;
3226  }
3227  if (ChainLength <= 1)
3228    return false;
3229
3230  if (llvm::all_of(Mask, [&](int M) {
3231        return M < 0 || M >= static_cast<int>(NumTrunkElts);
3232      })) {
3233    // Produce a canonical simplified form if all elements are sourced from Y.
3234    for (int &M : Mask) {
3235      if (M >= static_cast<int>(NumTrunkElts))
3236        M = YMask[M - NumTrunkElts];
3237    }
3238    Value *Root =
3239        Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), Mask);
3240    replaceValue(I, *Root);
3241    return true;
3242  }
3243
3244  Value *Leaf =
3245      Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), YMask);
3246  Value *Root = Builder.CreateShuffleVector(Trunk, Leaf, Mask);
3247  replaceValue(I, *Root);
3248  return true;
3249}
3250
3251/// Try to convert
3252/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
/// Both calls must be the same trivially-vectorizable intrinsic; operand pairs
/// are shuffled together (deduplicated via a cache so a repeated pair is only
/// costed/emitted once). Cost-model gated.
/// NOTE(review): Doxygen scrape — some guard lines were lost in extraction.
3253bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
3254  Value *V0, *V1;
3255  ArrayRef<int> OldMask;
3256  if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
3257    return false;
3258
3259  auto *II0 = dyn_cast<IntrinsicInst>(V0);
3260  auto *II1 = dyn_cast<IntrinsicInst>(V1);
3261  if (!II0 || !II1)
3262    return false;
3263
3264  Intrinsic::ID IID = II0->getIntrinsicID();
3265  if (IID != II1->getIntrinsicID())
3266    return false;
3267  InstructionCost CostII0 =
3268      TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
3269  InstructionCost CostII1 =
3270      TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind);
3271
3272  auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3273  auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
3274  if (!ShuffleDstTy || !II0Ty)
3275    return false;
3276
3277  if (!isTriviallyVectorizable(IID))
3278    return false;
3279
  // Scalar arguments cannot be shuffled, so both calls must agree on them.
  // (The scalar-operand guard condition was on an extraction-lost line.)
3280  for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
3282        II0->getArgOperand(I) != II1->getArgOperand(I))
3283      return false;
3284
3285  InstructionCost OldCost =
3286      CostII0 + CostII1 +
3288                         II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
3289
3290  SmallVector<Type *> NewArgsTy;
3291  InstructionCost NewCost = 0;
3292  SmallDenseSet<std::pair<Value *, Value *>> SeenOperandPairs;
3293  for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3295      NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3296    } else {
3297      auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3298      auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3299                                         ShuffleDstTy->getNumElements());
3300      NewArgsTy.push_back(ArgTy);
3301      std::pair<Value *, Value *> OperandPair =
3302          std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3303      if (!SeenOperandPairs.insert(OperandPair).second) {
3304        // We've already computed the cost for this operand pair.
3305        continue;
3306      }
3307      NewCost += TTI.getShuffleCost(
3308          TargetTransformInfo::SK_PermuteTwoSrc, ArgTy, VecTy, OldMask,
3309          CostKind, 0, nullptr, {II0->getArgOperand(I), II1->getArgOperand(I)});
3310    }
3311  }
3312  IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3313
3314  NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
  // Multi-use calls must be kept around, so keep charging their cost.
3315  if (!II0->hasOneUse())
3316    NewCost += CostII0;
3317  if (II1 != II0 && !II1->hasOneUse())
3318    NewCost += CostII1;
3319
3320  LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
3321                    << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
3322                    << "\n");
3323
3324  if (NewCost > OldCost)
3325    return false;
3326
3327  SmallVector<Value *> NewArgs;
3328  SmallDenseMap<std::pair<Value *, Value *>, Value *> ShuffleCache;
3329  for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
3331      NewArgs.push_back(II0->getArgOperand(I));
3332    } else {
3333      std::pair<Value *, Value *> OperandPair =
3334          std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3335      auto It = ShuffleCache.find(OperandPair);
3336      if (It != ShuffleCache.end()) {
3337        // Reuse previously created shuffle for this operand pair.
3338        NewArgs.push_back(It->second);
3339        continue;
3340      }
3341      Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
3342                                                II1->getArgOperand(I), OldMask);
3343      ShuffleCache[OperandPair] = Shuf;
3344      NewArgs.push_back(Shuf);
3345      Worklist.pushValue(Shuf);
3346    }
3347  Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3348
3349  // Intersect flags from the old intrinsics.
3350  if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
3351    NewInst->copyIRFlags(II0);
3352    NewInst->andIRFlags(II1);
3353  }
3354
3355  replaceValue(I, *NewIntrinsic);
3356  return true;
3357}
3358
3359/// Try to convert
3360/// "shuffle (intrinsic), (poison/undef)" into "intrinsic (shuffle)".
/// Unary-permute analogue of foldShuffleOfIntrinsics: permute each vector
/// argument of a trivially-vectorizable intrinsic instead of its result.
/// NOTE(review): Doxygen scrape — some guard lines were lost in extraction.
3361bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
3362  Value *V0;
3363  ArrayRef<int> Mask;
3364  if (!match(&I, m_Shuffle(m_Value(V0), m_Undef(), m_Mask(Mask))))
3365    return false;
3366
3367  auto *II0 = dyn_cast<IntrinsicInst>(V0);
3368  if (!II0)
3369    return false;
3370
3371  auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3372  auto *IntrinsicSrcTy = dyn_cast<FixedVectorType>(II0->getType());
3373  if (!ShuffleDstTy || !IntrinsicSrcTy)
3374    return false;
3375
3376  // Validate it's a pure permute, mask should only reference the first vector
3377  unsigned NumSrcElts = IntrinsicSrcTy->getNumElements();
3378  if (any_of(Mask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
3379    return false;
3380
3381  Intrinsic::ID IID = II0->getIntrinsicID();
3382  if (!isTriviallyVectorizable(IID))
3383    return false;
3384
3385  // Cost analysis
3387      TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
3388  InstructionCost OldCost =
3391                         IntrinsicSrcTy, Mask, CostKind, 0, nullptr, {V0}, &I);
3392
3393  SmallVector<Type *> NewArgsTy;
3394  InstructionCost NewCost = 0;
3395  for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
    // Scalar args pass through unchanged; vector args get a permuted type.
    // (The scalar-operand test on the lost line selects the branch.)
3397      NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3398    } else {
3399      auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3400      auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3401                                         ShuffleDstTy->getNumElements());
3402      NewArgsTy.push_back(ArgTy);
3404                                    ArgTy, VecTy, Mask, CostKind, 0, nullptr,
3405                                    {II0->getArgOperand(I)});
3406    }
3407  }
3408  IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3409  NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
3410
3411  // If the intrinsic has multiple uses, we need to account for the cost of
3412  // keeping the original intrinsic around.
3413  if (!II0->hasOneUse())
3414    NewCost += IntrinsicCost;
3415
3416  LLVM_DEBUG(dbgs() << "Found a permute of intrinsic: " << I << "\n  OldCost: "
3417                    << OldCost << " vs NewCost: " << NewCost << "\n");
3418
3419  if (NewCost > OldCost)
3420    return false;
3421
3422  // Transform
3423  SmallVector<Value *> NewArgs;
3424  for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3426      NewArgs.push_back(II0->getArgOperand(I));
3427    } else {
3428      Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I), Mask);
3429      NewArgs.push_back(Shuf);
3430      Worklist.pushValue(Shuf);
3431    }
3432  }
3433
3434  Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3435
  // Only one source call, so its flags carry over directly.
3436  if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic))
3437    NewInst->copyIRFlags(II0);
3438
3439  replaceValue(I, *NewIntrinsic);
3440  return true;
3441}
3442
3443using InstLane = std::pair<Use *, int>;
3444
3445static InstLane lookThroughShuffles(Use *U, int Lane) {
3446 while (auto *SV = dyn_cast<ShuffleVectorInst>(U->get())) {
3447 unsigned NumElts =
3448 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
3449 int M = SV->getMaskValue(Lane);
3450 if (M < 0)
3451 return {nullptr, PoisonMaskElem};
3452 if (static_cast<unsigned>(M) < NumElts) {
3453 U = &SV->getOperandUse(0);
3454 Lane = M;
3455 } else {
3456 U = &SV->getOperandUse(1);
3457 Lane = M - NumElts;
3458 }
3459 }
3460 return InstLane{U, Lane};
3461}
3462
// Map each (use, lane) in Item through operand Op of its instruction, looking
// through any intervening shuffles, and collect the resulting lanes in NItem.
// NOTE(review): the function signature and the NItem declaration were on lines
// lost by the Doxygen extraction (original lines 3463-3465).
3466  for (InstLane IL : Item) {
3467    auto [U, Lane] = IL;
    // Null uses stand for poison lanes and are propagated unchanged.
3468    InstLane OpLane =
3469        U ? lookThroughShuffles(&cast<Instruction>(U->get())->getOperandUse(Op),
3470                                Lane)
3471          : InstLane{nullptr, PoisonMaskElem};
3472    NItem.emplace_back(OpLane);
3473  }
3474  return NItem;
3475}
3476
3477/// Detect concat of multiple values into a vector
/// Returns true when Item is a power-of-two sequence of identity slices of
/// whole source vectors of the same type, and TTI reports the two-source
/// concat shuffle as free (cost 0).
/// NOTE(review): the rest of the signature was on an extraction-lost line.
3479                         const TargetTransformInfo &TTI) {
3480  auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
3481  unsigned NumElts = Ty->getNumElements();
  // Must be a strict widening: more lanes than one source, an exact multiple.
3482  if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
3483    return false;
3484
3485  // Check that the concat is free, usually meaning that the type will be split
3486  // during legalization.
3487  SmallVector<int, 16> ConcatMask(NumElts * 2);
3488  std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
3489  if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
3490                         FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
3491                         Ty, ConcatMask, CostKind) != 0)
3492    return false;
3493
3494  unsigned NumSlices = Item.size() / NumElts;
3495  // Currently we generate a tree of shuffles for the concats, which limits us
3496  // to a power2.
3497  if (!isPowerOf2_32(NumSlices))
3498    return false;
  // Every slice must be lanes 0..NumElts-1 of a single source of type Ty.
3499  for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
3500    Use *SliceV = Item[Slice * NumElts].first;
3501    if (!SliceV || SliceV->get()->getType() != Ty)
3502      return false;
3503    for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
3504      auto [V, Lane] = Item[Slice * NumElts + Elt];
3505      if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get())
3506        return false;
3507    }
3508  }
3509  return true;
3510}
3511
3513 const SmallPtrSet<Use *, 4> &IdentityLeafs,
3514 const SmallPtrSet<Use *, 4> &SplatLeafs,
3515 const SmallPtrSet<Use *, 4> &ConcatLeafs,
3516 IRBuilderBase &Builder,
3517 const TargetTransformInfo *TTI) {
3518 auto [FrontU, FrontLane] = Item.front();
3519
  // Leaf cases: the classification done by the caller tells us how to
  // materialize this item without any per-lane shuffling.
3520 if (IdentityLeafs.contains(FrontU)) {
3521 return FrontU->get();
3522 }
3523 if (SplatLeafs.contains(FrontU)) {
  // Splat leaf: broadcast the single source lane across the result width.
3524 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
3525 return Builder.CreateShuffleVector(FrontU->get(), Mask);
3526 }
3527 if (ConcatLeafs.contains(FrontU)) {
  // Concat leaf: join the slice sources pairwise with identity two-source
  // shuffles, forming a balanced tree (the slice count is a power of 2,
  // checked in isFreeConcat).
3528 unsigned NumElts =
3529 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements();
3530 SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
3531 for (unsigned S = 0; S < Values.size(); ++S)
3532 Values[S] = Item[S * NumElts].first->get();
3533
3534 while (Values.size() > 1) {
3535 NumElts *= 2;
3536 SmallVector<int, 16> Mask(NumElts, 0);
3537 std::iota(Mask.begin(), Mask.end(), 0);
3538 SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
3539 for (unsigned S = 0; S < NewValues.size(); ++S)
3540 NewValues[S] =
3541 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
3542 Values = NewValues;
3543 }
3544 return Values[0];
3545 }
3546
  // Interior node: rebuild the instruction over recursively generated
  // operands. For intrinsics the trailing callee operand is not an argument.
3547 auto *I = cast<Instruction>(FrontU->get());
3548 auto *II = dyn_cast<IntrinsicInst>(I);
3549 unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
3551 for (unsigned Idx = 0; Idx < NumOps; Idx++) {
  // Scalar intrinsic operands are uniform across lanes and passed through.
3552 if (II &&
3553 isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
3554 Ops[Idx] = II->getOperand(Idx);
3555 continue;
3556 }
3558 Ty, IdentityLeafs, SplatLeafs, ConcatLeafs,
3559 Builder, TTI);
3560 }
3561
  // Collect the per-lane source values so IR flags on the rebuilt
  // instruction are the intersection of all of them.
3562 SmallVector<Value *, 8> ValueList;
3563 for (const auto &Lane : Item)
3564 if (Lane.first)
3565 ValueList.push_back(Lane.first->get());
3566
3567 Type *DstTy =
3568 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
3569 if (auto *BI = dyn_cast<BinaryOperator>(I)) {
3570 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
3571 Ops[0], Ops[1]);
3572 propagateIRFlags(Value, ValueList);
3573 return Value;
3574 }
3575 if (auto *CI = dyn_cast<CmpInst>(I)) {
3576 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
3577 propagateIRFlags(Value, ValueList);
3578 return Value;
3579 }
3580 if (auto *SI = dyn_cast<SelectInst>(I)) {
3581 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
3582 propagateIRFlags(Value, ValueList);
3583 return Value;
3584 }
3585 if (auto *CI = dyn_cast<CastInst>(I)) {
3586 auto *Value = Builder.CreateCast(CI->getOpcode(), Ops[0], DstTy);
3587 propagateIRFlags(Value, ValueList);
3588 return Value;
3589 }
3590 if (II) {
3591 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
3592 propagateIRFlags(Value, ValueList);
3593 return Value;
3594 }
3595 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
3596 auto *Value =
3597 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
3598 propagateIRFlags(Value, ValueList);
3599 return Value;
3600}
3601
3602// Starting from a shuffle, look up through operands tracking the shuffled index
3603// of each lane. If we can simplify away the shuffles to identities then
3604// do so.
3605bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
3606 auto *Ty = dyn_cast<FixedVectorType>(I.getType());
3607 if (!Ty || I.use_empty())
3608 return false;
3609
  // Seed: trace every result lane of I back through feeding shuffles.
3610 SmallVector<InstLane> Start(Ty->getNumElements());
3611 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
3612 Start[M] = lookThroughShuffles(&*I.use_begin(), M);
3613
3615 Worklist.push_back(Start);
3616 SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs;
3617 unsigned NumVisited = 0;
3618
  // Classify each worklist item as an identity / splat / constant-splat /
  // concat leaf, or descend into its operands; anything unclassifiable
  // aborts the whole fold.
3619 while (!Worklist.empty()) {
3620 if (++NumVisited > MaxInstrsToScan)
3621 return false;
3622
3623 SmallVector<InstLane> Item = Worklist.pop_back_val();
3624 auto [FrontU, FrontLane] = Item.front();
3625
3626 // If we found an undef first lane then bail out to keep things simple.
3627 if (!FrontU)
3628 return false;
3629
3630 // Helper to peek through bitcasts to the same value.
3631 auto IsEquiv = [&](Value *X, Value *Y) {
3632 return X->getType() == Y->getType() &&
3634 };
3635
3636 // Look for an identity value.
3637 if (FrontLane == 0 &&
3638 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements() ==
3639 Ty->getNumElements() &&
3640 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
3641 Value *FrontV = Item.front().first->get();
3642 return !E.value().first || (IsEquiv(E.value().first->get(), FrontV) &&
3643 E.value().second == (int)E.index());
3644 })) {
3645 IdentityLeafs.insert(FrontU);
3646 continue;
3647 }
3648 // Look for constants, for the moment only supporting constant splats.
3649 if (auto *C = dyn_cast<Constant>(FrontU);
3650 C && C->getSplatValue() &&
3651 all_of(drop_begin(Item), [Item](InstLane &IL) {
3652 Value *FrontV = Item.front().first->get();
3653 Use *U = IL.first;
3654 return !U || (isa<Constant>(U->get()) &&
3655 cast<Constant>(U->get())->getSplatValue() ==
3656 cast<Constant>(FrontV)->getSplatValue());
3657 })) {
3658 SplatLeafs.insert(FrontU);
3659 continue;
3660 }
3661 // Look for a splat value.
3662 if (all_of(drop_begin(Item), [Item](InstLane &IL) {
3663 auto [FrontU, FrontLane] = Item.front();
3664 auto [U, Lane] = IL;
3665 return !U || (U->get() == FrontU->get() && Lane == FrontLane);
3666 })) {
3667 SplatLeafs.insert(FrontU);
3668 continue;
3669 }
3670
3671 // We need each element to be the same type of value, and check that each
3672 // element has a single use.
3673 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
3674 Value *FrontV = Item.front().first->get();
3675 if (!IL.first)
3676 return true;
3677 Value *V = IL.first->get();
3678 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
3679 return false;
3680 if (V->getValueID() != FrontV->getValueID())
3681 return false;
3682 if (auto *CI = dyn_cast<CmpInst>(V))
3683 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
3684 return false;
3685 if (auto *CI = dyn_cast<CastInst>(V))
3686 if (CI->getSrcTy()->getScalarType() !=
3687 cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
3688 return false;
3689 if (auto *SI = dyn_cast<SelectInst>(V))
3690 if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
3691 SI->getOperand(0)->getType() !=
3692 cast<SelectInst>(FrontV)->getOperand(0)->getType())
3693 return false;
3694 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
3695 return false;
3696 auto *II = dyn_cast<IntrinsicInst>(V);
3697 return !II || (isa<IntrinsicInst>(FrontV) &&
3698 II->getIntrinsicID() ==
3699 cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
3700 !II->hasOperandBundles());
3701 };
3702 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
3703 // Check the operator is one that we support.
3704 if (isa<BinaryOperator, CmpInst>(FrontU)) {
3705 // We exclude div/rem in case they hit UB from poison lanes.
3706 if (auto *BO = dyn_cast<BinaryOperator>(FrontU);
3707 BO && BO->isIntDivRem())
3708 return false;
3711 continue;
3712 } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
3713 FPToUIInst, SIToFPInst, UIToFPInst>(FrontU)) {
3715 continue;
3716 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) {
3717 // TODO: Handle vector widening/narrowing bitcasts.
3718 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
3719 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
3720 if (DstTy && SrcTy &&
3721 SrcTy->getNumElements() == DstTy->getNumElements()) {
3723 continue;
3724 }
3725 } else if (isa<SelectInst>(FrontU)) {
3729 continue;
3730 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontU);
3731 II && isTriviallyVectorizable(II->getIntrinsicID()) &&
3732 !II->hasOperandBundles()) {
3733 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
  // Scalar intrinsic operands must be identical across all lanes,
  // since they are passed through unchanged when regenerating.
3734 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
3735 &TTI)) {
3736 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
3737 Value *FrontV = Item.front().first->get();
3738 Use *U = IL.first;
3739 return !U || (cast<Instruction>(U->get())->getOperand(Op) ==
3740 cast<Instruction>(FrontV)->getOperand(Op));
3741 }))
3742 return false;
3743 continue;
3744 }
3746 }
3747 continue;
3748 }
3749 }
3750
3751 if (isFreeConcat(Item, CostKind, TTI)) {
3752 ConcatLeafs.insert(FrontU);
3753 continue;
3754 }
3755
3756 return false;
3757 }
3758
  // Visiting only the initial item means no shuffle would actually be
  // removed, so there is nothing to gain.
3759 if (NumVisited <= 1)
3760 return false;
3761
3762 LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
3763
3764 // If we got this far, we know the shuffles are superfluous and can be
3765 // removed. Scan through again and generate the new tree of instructions.
3766 Builder.SetInsertPoint(&I);
3767 Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs,
3768 ConcatLeafs, Builder, &TTI);
3769 replaceValue(I, *V);
3770 return true;
3771}
3772
3773/// Given a commutative reduction, the order of the input lanes does not alter
3774/// the results. We can use this to remove certain shuffles feeding the
3775/// reduction, removing the need to shuffle at all.
3776bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
3777 auto *II = dyn_cast<IntrinsicInst>(&I);
3778 if (!II)
3779 return false;
  // Only lane-order-insensitive integer reductions qualify.
3780 switch (II->getIntrinsicID()) {
3781 case Intrinsic::vector_reduce_add:
3782 case Intrinsic::vector_reduce_mul:
3783 case Intrinsic::vector_reduce_and:
3784 case Intrinsic::vector_reduce_or:
3785 case Intrinsic::vector_reduce_xor:
3786 case Intrinsic::vector_reduce_smin:
3787 case Intrinsic::vector_reduce_smax:
3788 case Intrinsic::vector_reduce_umin:
3789 case Intrinsic::vector_reduce_umax:
3790 break;
3791 default:
3792 return false;
3793 }
3794
3795 // Find all the inputs when looking through operations that do not alter the
3796 // lane order (binops, for example). Currently we look for a single shuffle,
3797 // and can ignore splat values.
3798 std::queue<Value *> Worklist;
3799 SmallPtrSet<Value *, 4> Visited;
3800 ShuffleVectorInst *Shuffle = nullptr;
3801 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
3802 Worklist.push(Op);
3803
3804 while (!Worklist.empty()) {
3805 Value *CV = Worklist.front();
3806 Worklist.pop();
3807 if (Visited.contains(CV))
3808 continue;
3809
3810 // Splats don't change the order, so can be safely ignored.
3811 if (isSplatValue(CV))
3812 continue;
3813
3814 Visited.insert(CV);
3815
3816 if (auto *CI = dyn_cast<Instruction>(CV)) {
3817 if (CI->isBinaryOp()) {
3818 for (auto *Op : CI->operand_values())
3819 Worklist.push(Op);
3820 continue;
3821 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
  // Only a single shuffle feeding the reduction is supported.
3822 if (Shuffle && Shuffle != SV)
3823 return false;
3824 Shuffle = SV;
3825 continue;
3826 }
3827 }
3828
3829 // Anything else is currently an unknown node.
3830 return false;
3831 }
3832
3833 if (!Shuffle)
3834 return false;
3835
3836 // Check all uses of the binary ops and shuffles are also included in the
3837 // lane-invariant operations (Visited should be the list of lanewise
3838 // instructions, including the shuffle that we found).
3839 for (auto *V : Visited)
3840 for (auto *U : V->users())
3841 if (!Visited.contains(U) && U != &I)
3842 return false;
3843
3844 FixedVectorType *VecType =
3845 dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
3846 if (!VecType)
3847 return false;
3848 FixedVectorType *ShuffleInputType =
3850 if (!ShuffleInputType)
3851 return false;
3852 unsigned NumInputElts = ShuffleInputType->getNumElements();
3853
3854 // Find the mask from sorting the lanes into order. This is most likely to
3855 // become a identity or concat mask. Undef elements are pushed to the end.
  // The unsigned compare pushes -1 (undef) mask elements to the back.
3856 SmallVector<int> ConcatMask;
3857 Shuffle->getShuffleMask(ConcatMask);
3858 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
3859 bool UsesSecondVec =
3860 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
3862
3863 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3864 ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
3866 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3867 ShuffleInputType, ConcatMask, CostKind);
3868
3869 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
3870 << "\n");
3871 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
3872 << "\n");
3873 bool MadeChanges = false;
3874 if (NewCost < OldCost) {
3875 Builder.SetInsertPoint(Shuffle);
3876 Value *NewShuffle = Builder.CreateShuffleVector(
3877 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
3878 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
3879 replaceValue(*Shuffle, *NewShuffle);
3880 return true;
3881 }
3882
3883 // See if we can re-use foldSelectShuffle, getting it to reduce the size of
3884 // the shuffle into a nicer order, as it can ignore the order of the shuffles.
3885 MadeChanges |= foldSelectShuffle(*Shuffle, true);
3886 return MadeChanges;
3887}
3888
3889/// For a given chain of patterns of the following form:
3890///
3891/// ```
3892/// %1 = shufflevector <n x ty1> %0, <n x ty1> poison <n x ty2> mask
3893///
3894/// %2 = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %0, <n x
3895/// ty1> %1)
3896/// OR
3897/// %2 = add/mul/or/and/xor <n x ty1> %0, %1
3898///
3899/// %3 = shufflevector <n x ty1> %2, <n x ty1> poison <n x ty2> mask
3900/// ...
3901/// ...
3902/// %(i - 1) = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %(i -
3903/// 3), <n x ty1> %(i - 2)
3904/// OR
3905/// %(i - 1) = add/mul/or/and/xor <n x ty1> %(i - 3), %(i - 2)
3906///
3907/// %(i) = extractelement <n x ty1> %(i - 1), 0
3908/// ```
3909///
3910/// Where:
3911/// `mask` follows a partition pattern:
3912///
3913/// Ex:
3914/// [n = 8, p = poison]
3915///
3916/// 4 5 6 7 | p p p p
3917/// 2 3 | p p p p p p
3918/// 1 | p p p p p p p
3919///
3920/// For powers of 2, there's a consistent pattern, but for other cases
3921/// the parity of the current half value at each step decides the
3922/// next partition half (see `ExpectedParityMask` for more logical details
3923/// in generalising this).
3924///
3925/// Ex:
3926/// [n = 6]
3927///
3928/// 3 4 5 | p p p
3929/// 1 2 | p p p p
3930/// 1 | p p p p p
3931bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3932 // Going bottom-up for the pattern.
3933 std::queue<Value *> InstWorklist;
3934 InstructionCost OrigCost = 0;
3935
3936 // Common instruction operation after each shuffle op.
  // Exactly one of CommonCallOp/CommonBinOp ends up set: the chain must use
  // a single min/max intrinsic OR a single binary opcode throughout.
3937 std::optional<unsigned int> CommonCallOp = std::nullopt;
3938 std::optional<Instruction::BinaryOps> CommonBinOp = std::nullopt;
3939
3940 bool IsFirstCallOrBinInst = true;
3941 bool ShouldBeCallOrBinInst = true;
3942
3943 // This stores the last used instructions for shuffle/common op.
3944 //
3945 // PrevVecV[0] / PrevVecV[1] store the last two simultaneous
3946 // instructions from either shuffle/common op.
3947 SmallVector<Value *, 2> PrevVecV(2, nullptr);
3948
  // The chain must terminate in extractelement of lane 0.
3949 Value *VecOpEE;
3950 if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
3951 return false;
3952
3953 auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
3954 if (!FVT)
3955 return false;
3956
3957 int64_t VecSize = FVT->getNumElements();
3958 if (VecSize < 2)
3959 return false;
3960
3961 // Number of levels would be ~log2(n), considering we always partition
3962 // by half for this fold pattern.
3963 unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
3964 int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
3965
3966 // This is how we generalise for all element sizes.
3967 // At each step, if vector size is odd, we need non-poison
3968 // values to cover the dominant half so we don't miss out on any element.
3969 //
3970 // This mask will help us retrieve this as we go from bottom to top:
3971 //
3972 // Mask Set -> N = N * 2 - 1
3973 // Mask Unset -> N = N * 2
3974 for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
3975 Cur = (Cur + 1) / 2, --Mask) {
3976 if (Cur & 1)
3977 ExpectedParityMask |= (1ll << Mask);
3978 }
3979
3980 InstWorklist.push(VecOpEE);
3981
  // Walk the chain bottom-up, alternating between the common op and its
  // feeding shuffle, accumulating the cost of everything we would remove.
3982 while (!InstWorklist.empty()) {
3983 Value *CI = InstWorklist.front();
3984 InstWorklist.pop();
3985
3986 if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
3987 if (!ShouldBeCallOrBinInst)
3988 return false;
3989
3990 if (!IsFirstCallOrBinInst && any_of(PrevVecV, equal_to(nullptr)))
3991 return false;
3992
3993 // For the first found call/bin op, the vector has to come from the
3994 // extract element op.
3995 if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3996 return false;
3997 IsFirstCallOrBinInst = false;
3998
3999 if (!CommonCallOp)
4000 CommonCallOp = II->getIntrinsicID();
4001 if (II->getIntrinsicID() != *CommonCallOp)
4002 return false;
4003
4004 switch (II->getIntrinsicID()) {
4005 case Intrinsic::umin:
4006 case Intrinsic::umax:
4007 case Intrinsic::smin:
4008 case Intrinsic::smax: {
4009 auto *Op0 = II->getOperand(0);
4010 auto *Op1 = II->getOperand(1);
4011 PrevVecV[0] = Op0;
4012 PrevVecV[1] = Op1;
4013 break;
4014 }
4015 default:
4016 return false;
4017 }
4018 ShouldBeCallOrBinInst ^= 1;
4019
4020 IntrinsicCostAttributes ICA(
4021 *CommonCallOp, II->getType(),
4022 {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
4023 OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
4024
4025 // We may need a swap here since it can be (a, b) or (b, a)
4026 // and accordingly change as we go up.
4027 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
4028 std::swap(PrevVecV[0], PrevVecV[1]);
4029 InstWorklist.push(PrevVecV[1]);
4030 InstWorklist.push(PrevVecV[0]);
4031 } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
4032 // Similar logic for bin ops.
4033
4034 if (!ShouldBeCallOrBinInst)
4035 return false;
4036
4037 if (!IsFirstCallOrBinInst && any_of(PrevVecV, equal_to(nullptr)))
4038 return false;
4039
4040 if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
4041 return false;
4042 IsFirstCallOrBinInst = false;
4043
4044 if (!CommonBinOp)
4045 CommonBinOp = BinOp->getOpcode();
4046
4047 if (BinOp->getOpcode() != *CommonBinOp)
4048 return false;
4049
4050 switch (*CommonBinOp) {
4051 case BinaryOperator::Add:
4052 case BinaryOperator::Mul:
4053 case BinaryOperator::Or:
4054 case BinaryOperator::And:
4055 case BinaryOperator::Xor: {
4056 auto *Op0 = BinOp->getOperand(0);
4057 auto *Op1 = BinOp->getOperand(1);
4058 PrevVecV[0] = Op0;
4059 PrevVecV[1] = Op1;
4060 break;
4061 }
4062 default:
4063 return false;
4064 }
4065 ShouldBeCallOrBinInst ^= 1;
4066
4067 OrigCost +=
4068 TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind);
4069
4070 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
4071 std::swap(PrevVecV[0], PrevVecV[1]);
4072 InstWorklist.push(PrevVecV[1]);
4073 InstWorklist.push(PrevVecV[0]);
4074 } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
4075 // We shouldn't have any null values in the previous vectors,
4076 // if so, there was a mismatch in pattern.
4077 if (ShouldBeCallOrBinInst || any_of(PrevVecV, equal_to(nullptr)))
4078 return false;
4079
4080 if (SVInst != PrevVecV[1])
4081 return false;
4082
  // The shuffle must take the other operand of the common op and poison,
  // i.e. it only rearranges lanes of the same source vector.
4083 ArrayRef<int> CurMask;
4084 if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(),
4085 m_Mask(CurMask))))
4086 return false;
4087
4088 // Subtract the parity mask when checking the condition.
4089 for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
4090 if (Mask < ShuffleMaskHalf &&
4091 CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
4092 return false;
4093 if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
4094 return false;
4095 }
4096
4097 // Update mask values.
4098 ShuffleMaskHalf *= 2;
4099 ShuffleMaskHalf -= (ExpectedParityMask & 1);
4100 ExpectedParityMask >>= 1;
4101
4103 SVInst->getType(), SVInst->getType(),
4104 CurMask, CostKind);
4105
  // Done once every level of the log2-deep partition has been matched.
4106 VisitedCnt += 1;
4107 if (!ExpectedParityMask && VisitedCnt == NumLevels)
4108 break;
4109
4110 ShouldBeCallOrBinInst ^= 1;
4111 } else {
4112 return false;
4113 }
4114 }
4115
4116 // Pattern should end with a shuffle op.
4117 if (ShouldBeCallOrBinInst)
4118 return false;
4119
4120 assert(VecSize != -1 && "Expected Match for Vector Size");
4121
4122 Value *FinalVecV = PrevVecV[0];
4123 if (!FinalVecV)
4124 return false;
4125
4126 auto *FinalVecVTy = cast<FixedVectorType>(FinalVecV->getType());
4127
  // Map the common op to the equivalent single reduction intrinsic.
4128 Intrinsic::ID ReducedOp =
4129 (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
4130 : getReductionForBinop(*CommonBinOp));
4131 if (!ReducedOp)
4132 return false;
4133
4134 IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
4136
4137 if (NewCost >= OrigCost)
4138 return false;
4139
4140 auto *ReducedResult =
4141 Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
4142 replaceValue(I, *ReducedResult);
4143
4144 return true;
4145}
4146
4147/// Determine if it's more efficient to fold:
4148/// reduce(trunc(x)) -> trunc(reduce(x)).
4149/// reduce(sext(x)) -> sext(reduce(x)).
4150/// reduce(zext(x)) -> zext(reduce(x)).
4151bool VectorCombine::foldCastFromReductions(Instruction &I) {
4152 auto *II = dyn_cast<IntrinsicInst>(&I);
4153 if (!II)
4154 return false;
4155
  // zext/sext can only be hoisted past bitwise reductions; add/mul would
  // change the result if widened, so they only allow trunc.
4156 bool TruncOnly = false;
4157 Intrinsic::ID IID = II->getIntrinsicID();
4158 switch (IID) {
4159 case Intrinsic::vector_reduce_add:
4160 case Intrinsic::vector_reduce_mul:
4161 TruncOnly = true;
4162 break;
4163 case Intrinsic::vector_reduce_and:
4164 case Intrinsic::vector_reduce_or:
4165 case Intrinsic::vector_reduce_xor:
4166 break;
4167 default:
4168 return false;
4169 }
4170
4171 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
4172 Value *ReductionSrc = I.getOperand(0);
4173
  // The cast must be the reduction's only user so it can be removed.
4174 Value *Src;
4175 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
4176 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
4177 return false;
4178
4179 auto CastOpc =
4180 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
4181
4182 auto *SrcTy = cast<VectorType>(Src->getType());
4183 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
4184 Type *ResultTy = I.getType();
4186
4187 ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
4188 OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
4190 cast<CastInst>(ReductionSrc));
  // New form: reduce over the uncast source type, then a single scalar cast.
4191 InstructionCost NewCost =
4192 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
4193 CostKind) +
4194 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
4196
4197 if (OldCost <= NewCost || !NewCost.isValid())
4198 return false;
4199
4200 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
4201 II->getIntrinsicID(), {Src});
4202 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
4203 replaceValue(I, *NewCast);
4204 return true;
4205}
4206
4207/// Fold:
4208/// icmp pred (reduce.{add,or,and,umax,umin}(signbit_extract(x))), C
4209/// into:
4210/// icmp sgt/slt (reduce.{or,umax,and,umin}(x)), -1/0
4211///
4212/// Sign-bit reductions produce values with known semantics:
4213/// - reduce.{or,umax}: 0 if no element is negative, 1 if any is
4214/// - reduce.{and,umin}: 1 if all elements are negative, 0 if any isn't
4215/// - reduce.add: count of negative elements (0 to NumElts)
4216///
4217/// Both lshr and ashr are supported:
4218/// - lshr produces 0 or 1, so reduce.add range is [0, N]
4219/// - ashr produces 0 or -1, so reduce.add range is [-N, 0]
4220///
4221/// The fold generalizes to multiple source vectors combined with the same
4222/// operation as the reduction. For example:
4223/// reduce.or(or(shr A, shr B)) conceptually extends the vector
4224/// For reduce.add, this changes the count to M*N where M is the number of
4225/// source vectors.
4226///
4227/// We transform to a direct sign check on the original vector using
4228/// reduce.{or,umax} or reduce.{and,umin}.
4229///
4230/// In spirit, it's similar to foldSignBitCheck in InstCombine.
4231bool VectorCombine::foldSignBitReductionCmp(Instruction &I) {
4232 CmpPredicate Pred;
4233 IntrinsicInst *ReduceOp;
4234 const APInt *CmpVal;
4235 if (!match(&I,
4236 m_ICmp(Pred, m_OneUse(m_AnyIntrinsic(ReduceOp)), m_APInt(CmpVal))))
4237 return false;
4238
4239 Intrinsic::ID OrigIID = ReduceOp->getIntrinsicID();
4240 switch (OrigIID) {
4241 case Intrinsic::vector_reduce_or:
4242 case Intrinsic::vector_reduce_umax:
4243 case Intrinsic::vector_reduce_and:
4244 case Intrinsic::vector_reduce_umin:
4245 case Intrinsic::vector_reduce_add:
4246 break;
4247 default:
4248 return false;
4249 }
4250
4251 Value *ReductionSrc = ReduceOp->getArgOperand(0);
4252 auto *VecTy = dyn_cast<FixedVectorType>(ReductionSrc->getType());
4253 if (!VecTy)
4254 return false;
4255
4256 unsigned BitWidth = VecTy->getScalarSizeInBits();
4257 if (BitWidth == 1)
4258 return false;
4259
4260 unsigned NumElts = VecTy->getNumElements();
4261
4262 // Determine the expected tree opcode for multi-vector patterns.
4263 // The tree opcode must match the reduction's underlying operation.
4264 //
4265 // TODO: for pairs of equivalent operators, we should match both,
4266 // not only the most common.
4267 Instruction::BinaryOps TreeOpcode;
4268 switch (OrigIID) {
4269 case Intrinsic::vector_reduce_or:
4270 case Intrinsic::vector_reduce_umax:
4271 TreeOpcode = Instruction::Or;
4272 break;
4273 case Intrinsic::vector_reduce_and:
4274 case Intrinsic::vector_reduce_umin:
4275 TreeOpcode = Instruction::And;
4276 break;
4277 case Intrinsic::vector_reduce_add:
4278 TreeOpcode = Instruction::Add;
4279 break;
4280 default:
4281 llvm_unreachable("Unexpected intrinsic");
4282 }
4283
4284 // Collect sign-bit extraction leaves from an associative tree of TreeOpcode.
4285 // The tree conceptually extends the vector being reduced.
4286 SmallVector<Value *, 8> Worklist;
4287 SmallVector<Value *, 8> Sources; // Original vectors (X in shr X, BW-1)
4288 Worklist.push_back(ReductionSrc);
4289 std::optional<bool> IsAShr;
4290 constexpr unsigned MaxSources = 8;
4291
4292 // Calculate old cost: all shifts + tree ops + reduction
4293 InstructionCost OldCost = TTI.getInstructionCost(ReduceOp, CostKind);
4294
4295 while (!Worklist.empty() && Worklist.size() <= MaxSources &&
4296 Sources.size() <= MaxSources) {
4297 Value *V = Worklist.pop_back_val();
4298
4299 // Try to match sign-bit extraction: shr X, (bitwidth-1)
4300 Value *X;
4301 if (match(V, m_OneUse(m_Shr(m_Value(X), m_SpecificInt(BitWidth - 1))))) {
4302 auto *Shr = cast<Instruction>(V);
4303
4304 // All shifts must be the same type (all lshr or all ashr)
4305 bool ThisIsAShr = Shr->getOpcode() == Instruction::AShr;
4306 if (!IsAShr)
4307 IsAShr = ThisIsAShr;
4308 else if (*IsAShr != ThisIsAShr)
4309 return false;
4310
4311 Sources.push_back(X);
4312
4313 // As part of the fold, we remove all of the shifts, so we need to keep
4314 // track of their costs.
4315 OldCost += TTI.getInstructionCost(Shr, CostKind);
4316
4317 continue;
4318 }
4319
4320 // Try to extend through a tree node of the expected opcode
4321 Value *A, *B;
4322 if (!match(V, m_OneUse(m_BinOp(TreeOpcode, m_Value(A), m_Value(B)))))
4323 return false;
4324
4325 // We are potentially replacing these operations as well, so we add them
4326 // to the costs.
4328
4329 Worklist.push_back(A);
4330 Worklist.push_back(B);
4331 }
4332
4333 // Must have at least one source and not exceed limit
4334 if (Sources.empty() || Sources.size() > MaxSources ||
4335 Worklist.size() > MaxSources || !IsAShr)
4336 return false;
4337
4338 unsigned NumSources = Sources.size();
4339
4340 // For reduce.add, the total count must fit as a signed integer.
4341 // Range is [0, M*N] for lshr or [-M*N, 0] for ashr.
4342 if (OrigIID == Intrinsic::vector_reduce_add &&
4343 !isIntN(BitWidth, NumSources * NumElts))
4344 return false;
4345
4346 // Compute the boundary value when all elements are negative:
4347 // - Per-element contribution: 1 for lshr, -1 for ashr
4348 // - For add: M*N (total elements across all sources); for others: just 1
4349 unsigned Count =
4350 (OrigIID == Intrinsic::vector_reduce_add) ? NumSources * NumElts : 1;
4351 APInt NegativeVal(CmpVal->getBitWidth(), Count);
4352 if (*IsAShr)
4353 NegativeVal.negate();
4354
4355 // Range is [min(0, AllNegVal), max(0, AllNegVal)]
4356 APInt Zero = APInt::getZero(CmpVal->getBitWidth());
4357 APInt RangeLow = APIntOps::smin(Zero, NegativeVal);
4358 APInt RangeHigh = APIntOps::smax(Zero, NegativeVal);
4359
4360 // Determine comparison semantics:
4361 // - IsEq: true for equality test, false for inequality
4362 // - TestsNegative: true if testing against AllNegVal, false for zero
4363 //
4364 // In addition to EQ/NE against 0 or AllNegVal, we support inequalities
4365 // that fold to boundary tests given the narrow value range:
4366 // < RangeHigh -> != RangeHigh
4367 // > RangeHigh-1 -> == RangeHigh
4368 // > RangeLow -> != RangeLow
4369 // < RangeLow+1 -> == RangeLow
4370 //
4371 // For inequalities, we work with signed predicates only. Unsigned predicates
4372 // are canonicalized to signed when the range is non-negative (where they are
4373 // equivalent). When the range includes negative values, unsigned predicates
4374 // would have different semantics due to wrap-around, so we reject them.
4375 if (!ICmpInst::isEquality(Pred) && !ICmpInst::isSigned(Pred)) {
4376 if (RangeLow.isNegative())
4377 return false;
4378 Pred = ICmpInst::getSignedPredicate(Pred);
4379 }
4380
4381 bool IsEq;
4382 bool TestsNegative;
4383 if (ICmpInst::isEquality(Pred)) {
4384 if (CmpVal->isZero()) {
4385 TestsNegative = false;
4386 } else if (*CmpVal == NegativeVal) {
4387 TestsNegative = true;
4388 } else {
4389 return false;
4390 }
4391 IsEq = Pred == ICmpInst::ICMP_EQ;
4392 } else if (Pred == ICmpInst::ICMP_SLT && *CmpVal == RangeHigh) {
4393 IsEq = false;
4394 TestsNegative = (RangeHigh == NegativeVal);
4395 } else if (Pred == ICmpInst::ICMP_SGT && *CmpVal == RangeHigh - 1) {
4396 IsEq = true;
4397 TestsNegative = (RangeHigh == NegativeVal);
4398 } else if (Pred == ICmpInst::ICMP_SGT && *CmpVal == RangeLow) {
4399 IsEq = false;
4400 TestsNegative = (RangeLow == NegativeVal);
4401 } else if (Pred == ICmpInst::ICMP_SLT && *CmpVal == RangeLow + 1) {
4402 IsEq = true;
4403 TestsNegative = (RangeLow == NegativeVal);
4404 } else {
4405 return false;
4406 }
4407
4408 // For this fold we support four types of checks:
4409 //
4410 // 1. All lanes are negative - AllNeg
4411 // 2. All lanes are non-negative - AllNonNeg
4412 // 3. At least one negative lane - AnyNeg
4413 // 4. At least one non-negative lane - AnyNonNeg
4414 //
4415 // For each case, we can generate the following code:
4416 //
4417 // 1. AllNeg - reduce.and/umin(X) < 0
4418 // 2. AllNonNeg - reduce.or/umax(X) > -1
4419 // 3. AnyNeg - reduce.or/umax(X) < 0
4420 // 4. AnyNonNeg - reduce.and/umin(X) > -1
4421 //
4422 // The table below shows the aggregation of all supported cases
4423 // using these four cases.
4424 //
4425 // Reduction | == 0 | != 0 | == MAX | != MAX
4426 // ------------+-----------+-----------+-----------+-----------
4427 // or/umax | AllNonNeg | AnyNeg | AnyNeg | AllNonNeg
4428 // and/umin | AnyNonNeg | AllNeg | AllNeg | AnyNonNeg
4429 // add | AllNonNeg | AnyNeg | AllNeg | AnyNonNeg
4430 //
4431 // NOTE: MAX = 1 for or/and/umax/umin, and the vector size N for add
4432 //
4433 // For easier codegen and check inversion, we use the following encoding:
4434 //
4435 // 1. Bit-3 === requires or/umax (1) or and/umin (0) check
4436 // 2. Bit-2 === checks < 0 (1) or > -1 (0)
4437 // 3. Bit-1 === universal (1) or existential (0) check
4438 //
4439 // AnyNeg = 0b110: uses or/umax, checks negative, any-check
4440 // AllNonNeg = 0b101: uses or/umax, checks non-neg, all-check
4441 // AnyNonNeg = 0b000: uses and/umin, checks non-neg, any-check
4442 // AllNeg = 0b011: uses and/umin, checks negative, all-check
4443 //
4444 // XOR with 0b011 inverts the check (swaps all/any and neg/non-neg).
4445 //
4446 enum CheckKind : unsigned {
4447 AnyNonNeg = 0b000,
4448 AllNeg = 0b011,
4449 AllNonNeg = 0b101,
4450 AnyNeg = 0b110,
4451 };
4452 // Return true if we fold this check into or/umax and false for and/umin
4453 auto RequiresOr = [](CheckKind C) -> bool { return C & 0b100; };
4454 // Return true if we should check if result is negative and false otherwise
4455 auto IsNegativeCheck = [](CheckKind C) -> bool { return C & 0b010; };
4456 // Logically invert the check
4457 auto Invert = [](CheckKind C) { return CheckKind(C ^ 0b011); };
4458
4459 CheckKind Base;
4460 switch (OrigIID) {
4461 case Intrinsic::vector_reduce_or:
4462 case Intrinsic::vector_reduce_umax:
4463 Base = TestsNegative ? AnyNeg : AllNonNeg;
4464 break;
4465 case Intrinsic::vector_reduce_and:
4466 case Intrinsic::vector_reduce_umin:
4467 Base = TestsNegative ? AllNeg : AnyNonNeg;
4468 break;
4469 case Intrinsic::vector_reduce_add:
4470 Base = TestsNegative ? AllNeg : AllNonNeg;
4471 break;
4472 default:
4473 llvm_unreachable("Unexpected intrinsic");
4474 }
4475
4476 CheckKind Check = IsEq ? Base : Invert(Base);
4477
4478 auto PickCheaper = [&](Intrinsic::ID Arith, Intrinsic::ID MinMax) {
4479 InstructionCost ArithCost =
4481 VecTy, std::nullopt, CostKind);
4482 InstructionCost MinMaxCost =
4484 FastMathFlags(), CostKind);
4485 return ArithCost <= MinMaxCost ? std::make_pair(Arith, ArithCost)
4486 : std::make_pair(MinMax, MinMaxCost);
4487 };
4488
4489 // Choose output reduction based on encoding's MSB
4490 auto [NewIID, NewCost] = RequiresOr(Check)
4491 ? PickCheaper(Intrinsic::vector_reduce_or,
4492 Intrinsic::vector_reduce_umax)
4493 : PickCheaper(Intrinsic::vector_reduce_and,
4494 Intrinsic::vector_reduce_umin);
4495
4496 // Add cost of combining multiple sources with or/and
4497 if (NumSources > 1) {
4498 unsigned CombineOpc =
4499 RequiresOr(Check) ? Instruction::Or : Instruction::And;
4500 NewCost += TTI.getArithmeticInstrCost(CombineOpc, VecTy, CostKind) *
4501 (NumSources - 1);
4502 }
4503
4504 LLVM_DEBUG(dbgs() << "Found sign-bit reduction cmp: " << I << "\n OldCost: "
4505 << OldCost << " vs NewCost: " << NewCost << "\n");
4506
4507 if (NewCost > OldCost)
4508 return false;
4509
4510 // Generate the combined input and reduction
4511 Builder.SetInsertPoint(&I);
4512 Type *ScalarTy = VecTy->getScalarType();
4513
4514 Value *Input;
4515 if (NumSources == 1) {
4516 Input = Sources[0];
4517 } else {
4518 // Combine sources with or/and based on check type
4519 Input = RequiresOr(Check) ? Builder.CreateOr(Sources)
4520 : Builder.CreateAnd(Sources);
4521 }
4522
4523 Value *NewReduce = Builder.CreateIntrinsic(ScalarTy, NewIID, {Input});
4524 Value *NewCmp = IsNegativeCheck(Check) ? Builder.CreateIsNeg(NewReduce)
4525 : Builder.CreateIsNotNeg(NewReduce);
4526 replaceValue(I, *NewCmp);
4527 return true;
4528}
4529
4530/// vector.reduce.OP f(X_i) == 0 -> vector.reduce.OP X_i == 0
4531///
4532/// We can prove it for cases when:
4533///
4534/// 1. OP X_i == 0 <=> \forall i \in [1, N] X_i == 0
4535/// 1'. OP X_i == 0 <=> \exists j \in [1, N] X_j == 0
4536/// 2. f(x) == 0 <=> x == 0
4537///
4538/// From 1 and 2 (or 1' and 2), we can infer that
4539///
4540/// OP f(X_i) == 0 <=> OP X_i == 0.
4541///
4542/// (1)
4543/// OP f(X_i) == 0 <=> \forall i \in [1, N] f(X_i) == 0
4544/// (2)
4545/// <=> \forall i \in [1, N] X_i == 0
4546/// (1)
4547/// <=> OP(X_i) == 0
4548///
4549/// For some of the OP's and f's, we need to have domain constraints on X
4550/// to ensure properties 1 (or 1') and 2.
4551 bool VectorCombine::foldICmpEqZeroVectorReduce(Instruction &I) {
  // Match (icmp eq/ne Op, 0); only equality predicates preserve the
  // lane-wise reasoning documented in the function comment above.
4552 CmpPredicate Pred;
4553 Value *Op;
4554 if (!match(&I, m_ICmp(Pred, m_Value(Op), m_Zero())) ||
4555 !ICmpInst::isEquality(Pred))
4556 return false;
4557
  // The compared value must itself be one of the supported reduction
  // intrinsics (property 1 / 1' holds for these, possibly under the
  // domain constraints checked further below).
4558 auto *II = dyn_cast<IntrinsicInst>(Op);
4559 if (!II)
4560 return false;
4561
4562 switch (II->getIntrinsicID()) {
4563 case Intrinsic::vector_reduce_add:
4564 case Intrinsic::vector_reduce_or:
4565 case Intrinsic::vector_reduce_umin:
4566 case Intrinsic::vector_reduce_umax:
4567 case Intrinsic::vector_reduce_smin:
4568 case Intrinsic::vector_reduce_smax:
4569 break;
4570 default:
4571 return false;
4572 }
4573
4574 Value *InnerOp = II->getArgOperand(0);
4575
4576 // TODO: fixed vector type might be too restrictive
4577 if (!II->hasOneUse() || !isa<FixedVectorType>(InnerOp->getType()))
4578 return false;
4579
4580 Value *X = nullptr;
4581
4582 // Check for zero-preserving operations where f(x) = 0 <=> x = 0
4583 //
4584 // 1. f(x) = shl nuw x, y for arbitrary y
4585 // 2. f(x) = mul nuw x, c for defined c != 0
4586 // 3. f(x) = zext x
4587 // 4. f(x) = sext x
4588 // 5. f(x) = neg x
4589 //
4590 if (!(match(InnerOp, m_NUWShl(m_Value(X), m_Value())) || // Case 1
4591 match(InnerOp, m_NUWMul(m_Value(X), m_NonZeroInt())) || // Case 2
4592 match(InnerOp, m_ZExt(m_Value(X))) || // Case 3
4593 match(InnerOp, m_SExt(m_Value(X))) || // Case 4
4594 match(InnerOp, m_Neg(m_Value(X))) // Case 5
4595 ))
4596 return false;
4597
4598 SimplifyQuery S = SQ.getWithInstruction(&I);
4599 auto *XTy = cast<FixedVectorType>(X->getType());
4600
4601 // Check for domain constraints for all supported reductions.
4602 //
4603 // a. OR X_i - has property 1 for every X
4604 // b. UMAX X_i - has property 1 for every X
4605 // c. UMIN X_i - has property 1' for every X
4606 // d. SMAX X_i - has property 1 for X >= 0
4607 // e. SMIN X_i - has property 1' for X >= 0
4608 // f. ADD X_i - has property 1 for X >= 0 && ADD X_i doesn't sign wrap
4609 //
4610 // In order for the proof to work, we need 1 (or 1') to be true for both
4611 // OP f(X_i) and OP X_i and that's why below we check constraints twice.
4612 //
4613 // NOTE: ADD X_i holds property 1 for a mirror case as well, i.e. when
4614 // X <= 0 && ADD X_i doesn't sign wrap. However, due to the nature
4615 // of known bits, we can't reasonably hold knowledge of "either 0
4616 // or negative".
4617 switch (II->getIntrinsicID()) {
4618 case Intrinsic::vector_reduce_add: {
4619 // We need to check that both X_i and f(X_i) have enough leading
4620 // zeros to not overflow.
4621 KnownBits KnownX = computeKnownBits(X, S);
4622 KnownBits KnownFX = computeKnownBits(InnerOp, S);
4623 unsigned NumElems = XTy->getNumElements();
4624 // Adding N elements loses at most ceil(log2(N)) leading bits.
4625 unsigned LostBits = Log2_32_Ceil(NumElems);
4626 unsigned LeadingZerosX = KnownX.countMinLeadingZeros();
4627 unsigned LeadingZerosFX = KnownFX.countMinLeadingZeros();
4628 // Need at least one leading zero left after summation to ensure no overflow
4629 if (LeadingZerosX <= LostBits || LeadingZerosFX <= LostBits)
4630 return false;
4631
4632 // We are not checking whether X or f(X) are positive explicitly because
4633 // we implicitly checked for it when we checked if both cases have enough
4634 // leading zeros to not wrap addition.
4635 break;
4636 }
4637 case Intrinsic::vector_reduce_smin:
4638 case Intrinsic::vector_reduce_smax:
4639 // Check whether X >= 0 and f(X) >= 0
4640 if (!isKnownNonNegative(InnerOp, S) || !isKnownNonNegative(X, S))
4641 return false;
4642
4643 break;
4644 default:
  // or/umin/umax need no extra domain constraints (cases a-c above).
4645 break;
4646 };
4647
4648 LLVM_DEBUG(dbgs() << "Found a reduction to 0 comparison with removable op: "
4649 << *II << "\n");
4650
4651 // For zext/sext, check if the transform is profitable using cost model.
4652 // For other operations (shl, mul, neg), we're removing an instruction
4653 // while keeping the same reduction type, so it's always profitable.
4654 if (isa<ZExtInst>(InnerOp) || isa<SExtInst>(InnerOp)) {
4655 auto *FXTy = cast<FixedVectorType>(InnerOp->getType());
4656 Intrinsic::ID IID = II->getIntrinsicID();
  // NOTE(review): the next statement is truncated in this listing — the
  // surrounding lines (presumably the `InstructionCost ExtCost =
  // TTI.getCastInstrCost(` call start and its closing arguments) were lost
  // in extraction; only fragments remain below. Confirm against upstream
  // VectorCombine.cpp before relying on this listing.
4658
4659 cast<CastInst>(InnerOp)->getOpcode(), FXTy, XTy,
4661
4662 InstructionCost OldReduceCost, NewReduceCost;
4663 switch (IID) {
4664 case Intrinsic::vector_reduce_add:
4665 case Intrinsic::vector_reduce_or:
4666 OldReduceCost = TTI.getArithmeticReductionCost(
4667 getArithmeticReductionInstruction(IID), FXTy, std::nullopt, CostKind);
4668 NewReduceCost = TTI.getArithmeticReductionCost(
4669 getArithmeticReductionInstruction(IID), XTy, std::nullopt, CostKind);
4670 break;
4671 case Intrinsic::vector_reduce_umin:
4672 case Intrinsic::vector_reduce_umax:
4673 case Intrinsic::vector_reduce_smin:
4674 case Intrinsic::vector_reduce_smax:
4675 OldReduceCost = TTI.getMinMaxReductionCost(
4676 getMinMaxReductionIntrinsicOp(IID), FXTy, FastMathFlags(), CostKind);
4677 NewReduceCost = TTI.getMinMaxReductionCost(
4678 getMinMaxReductionIntrinsicOp(IID), XTy, FastMathFlags(), CostKind);
4679 break;
4680 default:
4681 llvm_unreachable("Unexpected reduction");
4682 }
4683
  // New cost only pays for the extension when f(X) has other users and
  // therefore cannot be removed.
4684 InstructionCost OldCost = OldReduceCost + ExtCost;
4685 InstructionCost NewCost =
4686 NewReduceCost + (InnerOp->hasOneUse() ? 0 : ExtCost);
4687
4688 LLVM_DEBUG(dbgs() << "Found a removable extension before reduction: "
4689 << *InnerOp << "\n OldCost: " << OldCost
4690 << " vs NewCost: " << NewCost << "\n");
4691
4692 // We consider transformation to still be potentially beneficial even
4693 // when the costs are the same because we might remove a use from f(X)
4694 // and unlock other optimizations. Equal costs would just mean that we
4695 // didn't make it worse in the worst case.
4696 if (NewCost > OldCost)
4697 return false;
4698 }
4699
4700 // Since we support zext and sext as f, we might change the scalar type
4701 // of the intrinsic.
4702 Type *Ty = XTy->getScalarType();
4703 Value *NewReduce = Builder.CreateIntrinsic(Ty, II->getIntrinsicID(), {X});
4704 Value *NewCmp =
4705 Builder.CreateICmp(Pred, NewReduce, ConstantInt::getNullValue(Ty));
4706 replaceValue(I, *NewCmp);
4707 return true;
4708}
4709
4710/// Fold comparisons of reduce.or/reduce.and with reduce.umax/reduce.umin
4711/// based on cost, preserving the comparison semantics.
4712///
4713/// We use two fundamental properties for each pair:
4714///
4715/// 1. or(X) == 0 <=> umax(X) == 0
4716/// 2. or(X) == 1 <=> umax(X) == 1
4717/// 3. sign(or(X)) == sign(umax(X))
4718///
4719/// 1. and(X) == -1 <=> umin(X) == -1
4720/// 2. and(X) == -2 <=> umin(X) == -2
4721/// 3. sign(and(X)) == sign(umin(X))
4722///
4723/// From these we can infer the following transformations:
4724/// a. or(X) ==/!= 0 <-> umax(X) ==/!= 0
4725/// b. or(X) s< 0 <-> umax(X) s< 0
4726/// c. or(X) s> -1 <-> umax(X) s> -1
4727/// d. or(X) s< 1 <-> umax(X) s< 1
4728/// e. or(X) ==/!= 1 <-> umax(X) ==/!= 1
4729/// f. or(X) s< 2 <-> umax(X) s< 2
4730/// g. and(X) ==/!= -1 <-> umin(X) ==/!= -1
4731/// h. and(X) s< 0 <-> umin(X) s< 0
4732/// i. and(X) s> -1 <-> umin(X) s> -1
4733/// j. and(X) s> -2 <-> umin(X) s> -2
4734/// k. and(X) ==/!= -2 <-> umin(X) ==/!= -2
4735/// l. and(X) s> -3 <-> umin(X) s> -3
4736///
4737 bool VectorCombine::foldEquivalentReductionCmp(Instruction &I) {
  // Match (icmp pred ReduceOp, C) with a constant-int RHS; the specific
  // (reduction, predicate, C) combinations accepted are cases a-l above.
4738 CmpPredicate Pred;
4739 Value *ReduceOp;
4740 const APInt *CmpVal;
4741 if (!match(&I, m_ICmp(Pred, m_Value(ReduceOp), m_APInt(CmpVal))))
4742 return false;
4743
4744 auto *II = dyn_cast<IntrinsicInst>(ReduceOp);
4745 if (!II || !II->hasOneUse())
4746 return false;
4747
  // Returns true if swapping or <-> umax preserves this comparison.
4748 const auto IsValidOrUmaxCmp = [&]() {
4749 // or === umax for i1
4750 if (CmpVal->getBitWidth() == 1)
4751 return true;
4752
4753 // Cases a and e
4754 bool IsEquality =
4755 (CmpVal->isZero() || CmpVal->isOne()) && ICmpInst::isEquality(Pred);
4756 // Case c
4757 bool IsPositive = CmpVal->isAllOnes() && Pred == ICmpInst::ICMP_SGT;
4758 // Cases b, d, and f
4759 bool IsNegative = (CmpVal->isZero() || CmpVal->isOne() || *CmpVal == 2) &&
4760 Pred == ICmpInst::ICMP_SLT;
4761 return IsEquality || IsPositive || IsNegative;
4762 };
4763
  // Returns true if swapping and <-> umin preserves this comparison.
4764 const auto IsValidAndUminCmp = [&]() {
4765 // and === umin for i1
4766 if (CmpVal->getBitWidth() == 1)
4767 return true;
4768
4769 const auto LeadingOnes = CmpVal->countl_one();
4770
4771 // Cases g and k
4772 bool IsEquality =
4773 (CmpVal->isAllOnes() || LeadingOnes + 1 == CmpVal->getBitWidth()) &&
  // NOTE(review): listing line 4774 was lost in extraction — presumably the
  // closing `ICmpInst::isEquality(Pred);` conjunct of IsEquality; confirm
  // against upstream VectorCombine.cpp.
4775 // Case h
4776 bool IsNegative = CmpVal->isZero() && Pred == ICmpInst::ICMP_SLT;
4777 // Cases i, j, and l
4778 bool IsPositive =
4779 // if the number has at least N - 2 leading ones
4780 // and the two LSBs are:
4781 // - 1 x 1 -> -1
4782 // - 1 x 0 -> -2
4783 // - 0 x 1 -> -3
4784 LeadingOnes + 2 >= CmpVal->getBitWidth() &&
4785 ((*CmpVal)[0] || (*CmpVal)[1]) && Pred == ICmpInst::ICMP_SGT;
4786 return IsEquality || IsNegative || IsPositive;
4787 };
4788
4789 Intrinsic::ID OriginalIID = II->getIntrinsicID();
4790 Intrinsic::ID AlternativeIID;
4791
4792 // Check if this is a valid comparison pattern and determine the alternate
4793 // reduction intrinsic.
4794 switch (OriginalIID) {
4795 case Intrinsic::vector_reduce_or:
4796 if (!IsValidOrUmaxCmp())
4797 return false;
4798 AlternativeIID = Intrinsic::vector_reduce_umax;
4799 break;
4800 case Intrinsic::vector_reduce_umax:
4801 if (!IsValidOrUmaxCmp())
4802 return false;
4803 AlternativeIID = Intrinsic::vector_reduce_or;
4804 break;
4805 case Intrinsic::vector_reduce_and:
4806 if (!IsValidAndUminCmp())
4807 return false;
4808 AlternativeIID = Intrinsic::vector_reduce_umin;
4809 break;
4810 case Intrinsic::vector_reduce_umin:
4811 if (!IsValidAndUminCmp())
4812 return false;
4813 AlternativeIID = Intrinsic::vector_reduce_and;
4814 break;
4815 default:
4816 return false;
4817 }
4818
4819 Value *X = II->getArgOperand(0);
4820 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
4821 if (!VecTy)
4822 return false;
4823
  // Cost of performing reduction IID over VecTy; min/max reductions are
  // priced via the min/max cost hook, arithmetic ones via the arithmetic
  // reduction hook.
4824 const auto GetReductionCost = [&](Intrinsic::ID IID) -> InstructionCost {
4825 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
4826 if (ReductionOpc != Instruction::ICmp)
4827 return TTI.getArithmeticReductionCost(ReductionOpc, VecTy, std::nullopt,
4828 CostKind);
  // NOTE(review): listing line 4829 was lost in extraction — presumably the
  // `return TTI.getMinMaxReductionCost(getMinMaxReductionIntrinsicOp(IID),
  // VecTy,` call start matching the surviving argument line below; confirm
  // against upstream VectorCombine.cpp.
4830 FastMathFlags(), CostKind);
4831 };
4832
4833 InstructionCost OrigCost = GetReductionCost(OriginalIID);
4834 InstructionCost AltCost = GetReductionCost(AlternativeIID);
4835
4836 LLVM_DEBUG(dbgs() << "Found equivalent reduction cmp: " << I
4837 << "\n OrigCost: " << OrigCost
4838 << " vs AltCost: " << AltCost << "\n");
4839
  // Only rewrite when the alternative is strictly cheaper; ties keep the
  // original form (also avoids flip-flopping between equivalents).
4840 if (AltCost >= OrigCost)
4841 return false;
4842
4843 Builder.SetInsertPoint(&I);
4844 Type *ScalarTy = VecTy->getScalarType();
4845 Value *NewReduce = Builder.CreateIntrinsic(ScalarTy, AlternativeIID, {X});
4846 Value *NewCmp =
4847 Builder.CreateICmp(Pred, NewReduce, ConstantInt::get(ScalarTy, *CmpVal));
4848
4849 replaceValue(I, *NewCmp);
4850 return true;
4851}
4852
4853/// Returns true if this ShuffleVectorInst eventually feeds into a
4854/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
4855/// chains of shuffles and binary operators (in any combination/order).
4856/// The search does not go deeper than the given Depth.
  // NOTE(review): this listing is missing the function's opening lines
  // (doxygen lines 4856 and 4858-4860) — presumably the signature
  // `static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {` (per
  // the doc comment above) and the declarations of the `Visited` set and
  // `WorkList` vector used below; restore from upstream VectorCombine.cpp.
4857 constexpr unsigned MaxVisited = 32;
4861 bool FoundReduction = false;
4862
  // Bounded forward walk over users of SVI, accepting only chains of
  // shuffles/binops that terminate in exactly one reduction intrinsic.
4863 WorkList.push_back(SVI);
4864 while (!WorkList.empty()) {
4865 Instruction *I = WorkList.pop_back_val();
4866 for (User *U : I->users()) {
  // NOTE(review): cast<> never returns null, so the `!UI` check below is
  // dead; `dyn_cast` may have been intended — confirm against upstream.
4867 auto *UI = cast<Instruction>(U);
4868 if (!UI || !Visited.insert(UI).second)
4869 continue;
  // Give up once the explored user graph exceeds MaxVisited nodes.
4870 if (Visited.size() > MaxVisited)
4871 return false;
4872 if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
4873 // More than one reduction reached
4874 if (FoundReduction)
4875 return false;
4876 switch (II->getIntrinsicID()) {
4877 case Intrinsic::vector_reduce_add:
4878 case Intrinsic::vector_reduce_mul:
4879 case Intrinsic::vector_reduce_and:
4880 case Intrinsic::vector_reduce_or:
4881 case Intrinsic::vector_reduce_xor:
4882 case Intrinsic::vector_reduce_smin:
4883 case Intrinsic::vector_reduce_smax:
4884 case Intrinsic::vector_reduce_umin:
4885 case Intrinsic::vector_reduce_umax:
4886 FoundReduction = true;
4887 continue;
4888 default:
  // Any other intrinsic breaks the shuffle/binop-only chain.
4889 return false;
4890 }
4891 }
4892
  // NOTE(review): listing line 4893 was lost in extraction — presumably
  // the guard condition (rejecting users that are neither binary operators
  // nor shufflevectors) for the `return false;` below, since an
  // unconditional return would make the emplace_back unreachable; confirm
  // against upstream VectorCombine.cpp.
4894 return false;
4895
4896 WorkList.emplace_back(UI);
4897 }
4898 }
4899 return FoundReduction;
4900}
4901
4902/// This method looks for groups of shuffles acting on binops, of the form:
4903/// %x = shuffle ...
4904/// %y = shuffle ...
4905/// %a = binop %x, %y
4906/// %b = binop %x, %y
4907/// shuffle %a, %b, selectmask
4908/// We may, especially if the shuffle is wider than legal, be able to convert
4909/// the shuffle to a form where only parts of a and b need to be computed. On
4910/// architectures with no obvious "select" shuffle, this can reduce the total
4911/// number of operations if the target reports them as cheaper.
4912 bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
  // Match: two distinct binops Op0/Op1 of the shuffle's result type feeding
  // the select-like shuffle `I`.
4913 auto *SVI = cast<ShuffleVectorInst>(&I);
4914 auto *VT = cast<FixedVectorType>(I.getType());
4915 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
4916 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
4917 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
4918 VT != Op0->getType())
4919 return false;
4920
  // The binop operands are (possibly) the four input shuffles %x/%y of the
  // pattern in the function comment.
4921 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
4922 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
4923 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
4924 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
4925 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
  // Reject inputs with uses outside the binops / known input shuffles /
  // trivially dead shuffles, so rewriting them is safe.
4926 auto checkSVNonOpUses = [&](Instruction *I) {
4927 if (!I || I->getOperand(0)->getType() != VT)
4928 return true;
4929 return any_of(I->users(), [&](User *U) {
4930 return U != Op0 && U != Op1 &&
4931 !(isa<ShuffleVectorInst>(U) &&
4932 (InputShuffles.contains(cast<Instruction>(U)) ||
4933 isInstructionTriviallyDead(cast<Instruction>(U))));
4934 });
4935 };
4936 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
4937 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
4938 return false;
4939
4940 // Collect all the uses that are shuffles that we can transform together. We
4941 // may not have a single shuffle, but a group that can all be transformed
4942 // together profitably.
  // NOTE(review): listing line 4943 was lost in extraction — presumably the
  // declaration of the `Shuffles` vector (of ShuffleVectorInst*) populated
  // below; confirm against upstream VectorCombine.cpp.
4944 auto collectShuffles = [&](Instruction *I) {
4945 for (auto *U : I->users()) {
4946 auto *SV = dyn_cast<ShuffleVectorInst>(U);
4947 if (!SV || SV->getType() != VT)
4948 return false;
4949 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
4950 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
4951 return false;
4952 if (!llvm::is_contained(Shuffles, SV))
4953 Shuffles.push_back(SV);
4954 }
4955 return true;
4956 };
4957 if (!collectShuffles(Op0) || !collectShuffles(Op1))
4958 return false;
4959 // From a reduction, we need to be processing a single shuffle, otherwise the
4960 // other uses will not be lane-invariant.
4961 if (FromReduction && Shuffles.size() > 1)
4962 return false;
4963
4964 // Add any shuffle uses for the shuffles we have found, to include them in our
4965 // cost calculations.
4966 if (!FromReduction) {
4967 for (ShuffleVectorInst *SV : Shuffles) {
4968 for (auto *U : SV->users()) {
4969 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
4970 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
4971 Shuffles.push_back(SSV);
4972 }
4973 }
4974 }
4975
4976 // For each of the output shuffles, we try to sort all the first vector
4977 // elements to the beginning, followed by the second array elements at the
4978 // end. If the binops are legalized to smaller vectors, this may reduce total
4979 // number of binops. We compute the ReconstructMask mask needed to convert
4980 // back to the original lane order.
  // NOTE(review): listing line 4981 was lost in extraction — presumably the
  // declarations of the `V1`/`V2` vectors of (original lane, packed lane)
  // pairs used below; confirm against upstream VectorCombine.cpp.
4982 SmallVector<SmallVector<int>> OrigReconstructMasks;
4983 int MaxV1Elt = 0, MaxV2Elt = 0;
4984 unsigned NumElts = VT->getNumElements();
4985 for (ShuffleVectorInst *SVN : Shuffles) {
4986 SmallVector<int> Mask;
4987 SVN->getShuffleMask(Mask);
4988
4989 // Check the operands are the same as the original, or reversed (in which
4990 // case we need to commute the mask).
4991 Value *SVOp0 = SVN->getOperand(0);
4992 Value *SVOp1 = SVN->getOperand(1);
  // A single-operand shuffle-of-shuffle is looked through by composing its
  // mask into Mask.
4993 if (isa<UndefValue>(SVOp1)) {
4994 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
4995 SVOp0 = SSV->getOperand(0);
4996 SVOp1 = SSV->getOperand(1);
4997 for (int &Elem : Mask) {
4998 if (Elem >= static_cast<int>(SSV->getShuffleMask().size()))
4999 return false;
5000 Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem);
5001 }
5002 }
5003 if (SVOp0 == Op1 && SVOp1 == Op0) {
5004 std::swap(SVOp0, SVOp1);
  // NOTE(review): listing line 5005 was lost in extraction — presumably the
  // statement commuting Mask to match the swapped operands; confirm against
  // upstream VectorCombine.cpp.
5006 }
5007 if (SVOp0 != Op0 || SVOp1 != Op1)
5008 return false;
5009
5010 // Calculate the reconstruction mask for this shuffle, as the mask needed to
5011 // take the packed values from Op0/Op1 and reconstructing to the original
5012 // order.
5013 SmallVector<int> ReconstructMask;
5014 for (unsigned I = 0; I < Mask.size(); I++) {
5015 if (Mask[I] < 0) {
5016 ReconstructMask.push_back(-1);
5017 } else if (Mask[I] < static_cast<int>(NumElts)) {
5018 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
5019 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
5020 return Mask[I] == A.first;
5021 });
5022 if (It != V1.end())
5023 ReconstructMask.push_back(It - V1.begin());
5024 else {
5025 ReconstructMask.push_back(V1.size());
5026 V1.emplace_back(Mask[I], V1.size());
5027 }
5028 } else {
5029 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
5030 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
5031 return Mask[I] - static_cast<int>(NumElts) == A.first;
5032 });
5033 if (It != V2.end())
5034 ReconstructMask.push_back(NumElts + It - V2.begin());
5035 else {
5036 ReconstructMask.push_back(NumElts + V2.size());
5037 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
5038 }
5039 }
5040 }
5041
5042 // For reductions, we know that the lane ordering out doesn't alter the
5043 // result. In-order can help simplify the shuffle away.
5044 if (FromReduction)
5045 sort(ReconstructMask)
5046 OrigReconstructMasks.push_back(std::move(ReconstructMask));
5047 }
5048
5049 // If the maximum element used from V1 and V2 are not larger than the new
5050 // vectors, the vectors are already packed and performing the optimization
5051 // again will likely not help any further. This also prevents us from getting
5052 // stuck in a cycle in case the costs do not also rule it out.
5053 if (V1.empty() || V2.empty() ||
5054 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
5055 MaxV2Elt == static_cast<int>(V2.size()) - 1))
5056 return false;
5057
5058 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
5059 // shuffle of another shuffle, or not a shuffle (that is treated like an
5060 // identity shuffle).
5061 auto GetBaseMaskValue = [&](Instruction *I, int M) {
5062 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5063 if (!SV)
5064 return M;
5065 if (isa<UndefValue>(SV->getOperand(1)))
5066 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
5067 if (InputShuffles.contains(SSV))
5068 return SSV->getMaskValue(SV->getMaskValue(M));
5069 return SV->getMaskValue(M);
5070 };
5071
5072 // Attempt to sort the inputs by ascending mask values to make simpler input
5073 // shuffles and push complex shuffles down to the uses. We sort on the first
5074 // of the two input shuffle orders, to try and get at least one input into a
5075 // nice order.
5076 auto SortBase = [&](Instruction *A, std::pair<int, int> X,
5077 std::pair<int, int> Y) {
5078 int MXA = GetBaseMaskValue(A, X.first);
5079 int MYA = GetBaseMaskValue(A, Y.first);
5080 return MXA < MYA;
5081 };
5082 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
5083 return SortBase(SVI0A, A, B);
5084 });
5085 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
5086 return SortBase(SVI1A, A, B);
5087 });
5088 // Calculate our ReconstructMasks from the OrigReconstructMasks and the
5089 // modified order of the input shuffles.
5090 SmallVector<SmallVector<int>> ReconstructMasks;
5091 for (const auto &Mask : OrigReconstructMasks) {
5092 SmallVector<int> ReconstructMask;
5093 for (int M : Mask) {
5094 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
5095 auto It = find_if(V, [M](auto A) { return A.second == M; });
5096 assert(It != V.end() && "Expected all entries in Mask");
5097 return std::distance(V.begin(), It);
5098 };
5099 if (M < 0)
5100 ReconstructMask.push_back(-1);
5101 else if (M < static_cast<int>(NumElts)) {
5102 ReconstructMask.push_back(FindIndex(V1, M));
5103 } else {
5104 ReconstructMask.push_back(NumElts + FindIndex(V2, M));
5105 }
5106 }
5107 ReconstructMasks.push_back(std::move(ReconstructMask));
5108 }
5109
5110 // Calculate the masks needed for the new input shuffles, which get padded
5111 // with undef
5112 SmallVector<int> V1A, V1B, V2A, V2B;
5113 for (unsigned I = 0; I < V1.size(); I++) {
5114 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
5115 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
5116 }
5117 for (unsigned I = 0; I < V2.size(); I++) {
5118 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
5119 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
5120 }
5121 while (V1A.size() < NumElts) {
  // NOTE(review): listing lines 5122-5123 were lost in extraction —
  // presumably pushing undef (-1) sentinels into V1A/V1B to pad them to
  // NumElts; confirm against upstream VectorCombine.cpp.
5124 }
5125 while (V2A.size() < NumElts) {
  // NOTE(review): listing lines 5126-5127 were lost in extraction —
  // presumably the matching padding of V2A/V2B; confirm against upstream.
5128 }
5129
  // Accumulator: adds the TTI cost of an existing shuffle instruction.
5130 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
5131 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5132 if (!SV)
5133 return C;
5134 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
  // NOTE(review): listing lines 5135-5136 were lost in extraction —
  // presumably the shuffle-kind selection (single-source vs two-source
  // permute) completing this conditional; confirm against upstream.
5137 VT, VT, SV->getShuffleMask(), CostKind);
5138 };
  // Accumulator: adds the TTI cost of a shuffle described only by a mask.
5139 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
5140 return C +
  // NOTE(review): listing lines 5140-5141 are truncated in extraction —
  // presumably the TTI.getShuffleCost(...) call over Mask; confirm
  // against upstream VectorCombine.cpp.
5142 };
5143
5144 unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
5145 unsigned MaxVectorSize =
  // NOTE(review): listing line 5146 was lost in extraction — presumably the
  // TTI register-bit-width query initializing MaxVectorSize; confirm
  // against upstream VectorCombine.cpp.
5147 unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
  // Guard the division below against targets reporting tiny registers.
5148 if (MaxElementsInVector == 0)
5149 return false;
5150 // When there are multiple shufflevector operations on the same input,
5151 // especially when the vector length is larger than the register size,
5152 // identical shuffle patterns may occur across different groups of elements.
5153 // To avoid overestimating the cost by counting these repeated shuffles more
5154 // than once, we only account for unique shuffle patterns. This adjustment
5155 // prevents inflated costs in the cost model for wide vectors split into
5156 // several register-sized groups.
5157 std::set<SmallVector<int, 4>> UniqueShuffles;
5158 auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
5159 // Compute the cost for performing the shuffle over the full vector.
5160 auto ShuffleCost =
  // NOTE(review): listing line 5161 was lost in extraction — presumably the
  // TTI.getShuffleCost(...) call initializing ShuffleCost; confirm against
  // upstream VectorCombine.cpp.
5162 unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
5163 if (NumFullVectors < 2)
5164 return C + ShuffleCost;
5165 SmallVector<int, 4> SubShuffle(MaxElementsInVector);
5166 unsigned NumUniqueGroups = 0;
5167 unsigned NumGroups = Mask.size() / MaxElementsInVector;
5168 // For each group of MaxElementsInVector contiguous elements,
5169 // collect their shuffle pattern and insert into the set of unique patterns.
5170 for (unsigned I = 0; I < NumFullVectors; ++I) {
5171 for (unsigned J = 0; J < MaxElementsInVector; ++J)
5172 SubShuffle[J] = Mask[MaxElementsInVector * I + J];
5173 if (UniqueShuffles.insert(SubShuffle).second)
5174 NumUniqueGroups += 1;
5175 }
5176 return C + ShuffleCost * NumUniqueGroups / NumGroups;
5177 };
5178 auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
5179 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5180 if (!SV)
5181 return C;
5182 SmallVector<int, 16> Mask;
5183 SV->getShuffleMask(Mask);
5184 return AddShuffleMaskAdjustedCost(C, Mask);
5185 };
5186 // Check that input consists of ShuffleVectors applied to the same input
5187 auto AllShufflesHaveSameOperands =
5188 [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
5189 if (InputShuffles.size() < 2)
5190 return false;
5191 ShuffleVectorInst *FirstSV =
5192 dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
5193 if (!FirstSV)
5194 return false;
5195
5196 Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
5197 return std::all_of(
5198 std::next(InputShuffles.begin()), InputShuffles.end(),
5199 [&](Instruction *I) {
5200 ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
5201 return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
5202 });
5203 };
5204
5205 // Get the costs of the shuffles + binops before and after with the new
5206 // shuffle masks.
5207 InstructionCost CostBefore =
5208 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
5209 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
5210 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
5211 InstructionCost(0), AddShuffleCost);
5212 if (AllShufflesHaveSameOperands(InputShuffles)) {
5213 UniqueShuffles.clear();
5214 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
5215 InstructionCost(0), AddShuffleAdjustedCost);
5216 } else {
5217 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
5218 InstructionCost(0), AddShuffleCost);
5219 }
5220
5221 // The new binops will be unused for lanes past the used shuffle lengths.
5222 // These types attempt to get the correct cost for that from the target.
5223 FixedVectorType *Op0SmallVT =
5224 FixedVectorType::get(VT->getScalarType(), V1.size());
5225 FixedVectorType *Op1SmallVT =
5226 FixedVectorType::get(VT->getScalarType(), V2.size());
5227 InstructionCost CostAfter =
5228 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
5229 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
5230 UniqueShuffles.clear();
5231 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
5232 InstructionCost(0), AddShuffleMaskAdjustedCost);
5233 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
5234 CostAfter +=
5235 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
5236 InstructionCost(0), AddShuffleMaskCost);
5237
5238 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
5239 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
5240 << " vs CostAfter: " << CostAfter << "\n");
  // On a cost tie, only proceed when the shuffle ultimately feeds a vector
  // reduction (where the packed form tends to unlock further folds).
5241 if (CostBefore < CostAfter ||
5242 (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
5243 return false;
5244
5245 // The cost model has passed, create the new instructions.
5246 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
5247 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5248 if (!SV)
5249 return I;
5250 if (isa<UndefValue>(SV->getOperand(1)))
5251 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
5252 if (InputShuffles.contains(SSV))
5253 return SSV->getOperand(Op);
5254 return SV->getOperand(Op);
5255 };
5256 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
5257 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
5258 GetShuffleOperand(SVI0A, 1), V1A);
5259 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
5260 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
5261 GetShuffleOperand(SVI0B, 1), V1B);
5262 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
5263 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
5264 GetShuffleOperand(SVI1A, 1), V2A);
5265 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
5266 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
5267 GetShuffleOperand(SVI1B, 1), V2B);
5268 Builder.SetInsertPoint(Op0);
5269 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
5270 NSV0A, NSV0B);
5271 if (auto *I = dyn_cast<Instruction>(NOp0))
5272 I->copyIRFlags(Op0, true);
5273 Builder.SetInsertPoint(Op1);
5274 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
5275 NSV1A, NSV1B);
5276 if (auto *I = dyn_cast<Instruction>(NOp1))
5277 I->copyIRFlags(Op1, true);
5278
  // Rebuild each original output shuffle from the packed binop results.
5279 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
5280 Builder.SetInsertPoint(Shuffles[S]);
5281 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
5282 replaceValue(*Shuffles[S], *NSV, false);
5283 }
5284
5285 Worklist.pushValue(NSV0A);
5286 Worklist.pushValue(NSV0B);
5287 Worklist.pushValue(NSV1A);
5288 Worklist.pushValue(NSV1B);
5289 return true;
5290}
5291
5292 /// Check if instruction depends on ZExt and this ZExt can be moved after the
5293 /// instruction. Move ZExt if it is profitable. For example:
5294 /// logic(zext(x),y) -> zext(logic(x,trunc(y)))
5295 /// lshr(zext(x),y) -> zext(lshr(x,trunc(y)))
5296 /// Cost model calculation takes into account if zext(x) has other users and
5297 /// whether it can be propagated through them too.
5298 bool VectorCombine::shrinkType(Instruction &I) {
// Match either a commutative bitwise-logic op or an lshr where one operand is
// a zext. ZExted is the narrow pre-extension value; OtherOperand the other side.
5299 Value *ZExted, *OtherOperand;
5300 if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
5301 m_Value(OtherOperand))) &&
5302 !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
5303 return false;
5304
// The zext itself, as an operand of I; needed below to walk its other users.
5305 Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);
5306
5307 auto *BigTy = cast<FixedVectorType>(I.getType());
5308 auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
// BW = element bit-width of the narrow (pre-zext) vector type.
5309 unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();
5310
5311 if (I.getOpcode() == Instruction::LShr) {
5312 // Check that the shift amount is less than the number of bits in the
5313 // smaller type. Otherwise, the smaller lshr will return a poison value.
5314 KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
5315 if (ShAmtKB.getMaxValue().uge(BW))
5316 return false;
5317 } else {
5318 // Check that the expression overall uses at most the same number of bits as
5319 // ZExted
5320 KnownBits KB = computeKnownBits(&I, *DL);
5321 if (KB.countMaxActiveBits() > BW)
5322 return false;
5323 }
5324
5325 // Calculate costs of leaving current IR as it is and moving ZExt operation
5326 // later, along with adding truncates if needed
// NOTE(review): a source line is missing here in this extraction (doc line
// 5327, presumably `InstructionCost ZExtCost = TTI.getCastInstrCost(` whose
// arguments continue below) -- verify against the original file.
5328 Instruction::ZExt, BigTy, SmallTy,
5329 TargetTransformInfo::CastContextHint::None, CostKind);
5330 InstructionCost CurrentCost = ZExtCost;
5331 InstructionCost ShrinkCost = 0;
5332
5333 // Calculate total cost and check that we can propagate through all ZExt users
5334 for (User *U : ZExtOperand->users()) {
5335 auto *UI = cast<Instruction>(U);
// I itself: both variants pay the binop; the shrunk variant also pays one
// zext of the final result.
5336 if (UI == &I) {
5337 CurrentCost +=
5338 TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
5339 ShrinkCost +=
5340 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
5341 ShrinkCost += ZExtCost;
5342 continue;
5343 }
5344
// Only binary ops are known to be safe to narrow here.
5345 if (!Instruction::isBinaryOp(UI->getOpcode()))
5346 return false;
5347
5348 // Check if we can propagate ZExt through its other users
5349 KnownBits KB = computeKnownBits(UI, *DL);
5350 if (KB.countMaxActiveBits() > BW)
5351 return false;
5352
5353 CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
5354 ShrinkCost +=
5355 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
5356 ShrinkCost += ZExtCost;
5357 }
5358
5359 // If the other instruction operand is not a constant, we'll need to
5360 // generate a truncate instruction. So we have to adjust cost
5361 if (!isa<Constant>(OtherOperand))
5362 ShrinkCost += TTI.getCastInstrCost(
5363 Instruction::Trunc, SmallTy, BigTy,
5364 TargetTransformInfo::CastContextHint::None, CostKind);
5365
5366 // If the cost of shrinking types and leaving the IR is the same, we'll lean
5367 // towards modifying the IR because shrinking opens opportunities for other
5368 // shrinking optimisations.
5369 if (ShrinkCost > CurrentCost)
5370 return false;
5371
// Profitable: rebuild as zext(binop(narrow, trunc(other))).
5372 Builder.SetInsertPoint(&I);
5373 Value *Op0 = ZExted;
5374 Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
5375 // Keep the order of operands the same
5376 if (I.getOperand(0) == OtherOperand)
5377 std::swap(Op0, Op1);
5378 Value *NewBinOp =
5379 Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
5380 cast<Instruction>(NewBinOp)->copyIRFlags(&I);
5381 cast<Instruction>(NewBinOp)->copyMetadata(I);
5382 Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
5383 replaceValue(I, *NewZExtr);
5384 return true;
5385}
5386
5387 /// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
5388 /// shuffle (DstVec, SrcVec, Mask)
5389 bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
5390 Value *DstVec, *SrcVec;
5391 uint64_t ExtIdx, InsIdx;
5392 if (!match(&I,
5393 m_InsertElt(m_Value(DstVec),
5394 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
5395 m_ConstantInt(InsIdx))))
5396 return false;
5397
5398 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
5399 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
5400 // We can try combining vectors with different element sizes.
5401 if (!DstVecTy || !SrcVecTy ||
5402 SrcVecTy->getElementType() != DstVecTy->getElementType())
5403 return false;
5404
5405 unsigned NumDstElts = DstVecTy->getNumElements();
5406 unsigned NumSrcElts = SrcVecTy->getNumElements();
// Reject out-of-range indices and trivial single-element destinations.
5407 if (InsIdx >= NumDstElts || ExtIdx >= NumSrcElts || NumDstElts == 1)
5408 return false;
5409
5410 // Insertion into poison is a cheaper single operand shuffle.
// NOTE(review): doc line 5411 is missing from this extraction (presumably the
// declaration of the shuffle kind `SK` referenced in the cost code below) --
// verify against the original file.
5412 SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
5413
5414 bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
5415 bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
5416 if (NeedDstSrcSwap) {
// NOTE(review): doc line 5417 is missing from this extraction.
5418 Mask[InsIdx] = ExtIdx % NumDstElts;
5419 std::swap(DstVec, SrcVec);
5420 } else {
// NOTE(review): doc line 5421 is missing from this extraction.
5422 std::iota(Mask.begin(), Mask.end(), 0);
5423 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
5424 }
5425
5426 // Cost
5427 auto *Ins = cast<InsertElementInst>(&I);
5428 auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
5429 InstructionCost InsCost =
5430 TTI.getVectorInstrCost(*Ins, DstVecTy, CostKind, InsIdx);
5431 InstructionCost ExtCost =
5432 TTI.getVectorInstrCost(*Ext, DstVecTy, CostKind, ExtIdx);
5433 InstructionCost OldCost = ExtCost + InsCost;
5434
5435 InstructionCost NewCost = 0;
5436 SmallVector<int> ExtToVecMask;
5437 if (!NeedExpOrNarrow) {
5438 // Ignore 'free' identity insertion shuffle.
5439 // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
5440 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
5441 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
5442 nullptr, {DstVec, SrcVec});
5443 } else {
5444 // When creating a length-changing-vector, always try to keep the relevant
5445 // element in an equivalent position, so that bulk shuffles are more likely
5446 // to be useful.
5447 ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
5448 ExtToVecMask[ExtIdx % NumDstElts] = ExtIdx;
5449 // Add cost for expanding or narrowing
// NOTE(review): doc line 5450 is missing here (the start of the shuffle-cost
// call whose trailing arguments appear on the next line).
5451 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
5452 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
5453 }
5454
// If the extract has other users it must be kept, so its cost remains.
5455 if (!Ext->hasOneUse())
5456 NewCost += ExtCost;
5457
5458 LLVM_DEBUG(dbgs() << "Found a insert/extract shuffle-like pair: " << I
5459 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
5460 << "\n");
5461
5462 if (OldCost < NewCost)
5463 return false;
5464
// Widen/narrow the source (or swapped destination) to match the destination
// length before forming the final two-operand shuffle.
5465 if (NeedExpOrNarrow) {
5466 if (!NeedDstSrcSwap)
5467 SrcVec = Builder.CreateShuffleVector(SrcVec, ExtToVecMask);
5468 else
5469 DstVec = Builder.CreateShuffleVector(DstVec, ExtToVecMask);
5470 }
5471
5472 // Canonicalize undef param to RHS to help further folds.
5473 if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
5474 ShuffleVectorInst::commuteShuffleMask(Mask, NumDstElts);
5475 std::swap(DstVec, SrcVec);
5476 }
5477
5478 Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
5479 replaceValue(I, *Shuf);
5480
5481 return true;
5482}
5483
5484 /// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
5485 /// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
5486 /// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
5487 /// before casting it back into `<vscale x 16 x i32>`.
5488 bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
5489 const APInt *SplatVal0, *SplatVal1;
// NOTE(review): doc line 5490 is missing from this extraction (presumably the
// `if (!match(&I, ...` line matching the interleave2 intrinsic whose splat
// arguments are captured below) -- verify against the original file.
5491 m_APInt(SplatVal0), m_APInt(SplatVal1))))
5492 return false;
5493
5494 LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
5495 << "\n");
5496
// VTy: element vector type of the first intrinsic argument; ExtVTy has
// double-width elements (half the element count per the lane math below).
5497 auto *VTy =
5498 cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
5499 auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
5500 unsigned Width = VTy->getElementType()->getIntegerBitWidth();
5501
5502 // Just in case the cost of interleave2 intrinsic and bitcast are both
5503 // invalid, in which case we want to bail out, we use <= rather
5504 // than < here. Even if they both have valid and equal costs, it's probably
5505 // not a good idea to emit a high-cost constant splat.
// NOTE(review): doc lines 5506 and 5508 are missing from this extraction
// (the head and tail of the cost comparison guarding this early return).
5507 TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
5509 LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
5510 << *I.getType() << " is too high.\n");
5511 return false;
5512 }
5513
// Build the wide splat value: (SplatVal1 << Width) | SplatVal0.
5514 APInt NewSplatVal = SplatVal1->zext(Width * 2);
5515 NewSplatVal <<= Width;
5516 NewSplatVal |= SplatVal0->zext(Width * 2);
5517 auto *NewSplat = ConstantVector::getSplat(
5518 ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
5519
5520 IRBuilder<> Builder(&I);
5521 replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
5522 return true;
5523}
5524
5525 // Attempt to shrink loads that are only used by shufflevector instructions.
5526 bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
5527 auto *OldLoad = dyn_cast<LoadInst>(&I);
5528 if (!OldLoad || !OldLoad->isSimple())
5529 return false;
5530
5531 auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
5532 if (!OldLoadTy)
5533 return false;
5534
5535 unsigned const OldNumElements = OldLoadTy->getNumElements();
5536
5537 // Search all uses of load. If all uses are shufflevector instructions, and
5538 // the second operands are all poison values, find the minimum and maximum
5539 // indices of the vector elements referenced by all shuffle masks.
5540 // Otherwise return `std::nullopt`.
5541 using IndexRange = std::pair<int, int>;
5542 auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
// Start with an inverted range; it stays inverted iff no in-bounds index
// is ever recorded, which is detected after the loop.
5543 IndexRange OutputRange = IndexRange(OldNumElements, -1);
5544 for (llvm::Use &Use : I.uses()) {
5545 // Ensure all uses match the required pattern.
5546 User *Shuffle = Use.getUser();
5547 ArrayRef<int> Mask;
5548
5549 if (!match(Shuffle,
5550 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
5551 return std::nullopt;
5552
5553 // Ignore shufflevector instructions that have no uses.
5554 if (Shuffle->use_empty())
5555 continue;
5556
5557 // Find the min and max indices used by the shufflevector instruction.
5558 for (int Index : Mask) {
5559 if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
5560 OutputRange.first = std::min(Index, OutputRange.first);
5561 OutputRange.second = std::max(Index, OutputRange.second);
5562 }
5563 }
5564 }
5565
5566 if (OutputRange.second < OutputRange.first)
5567 return std::nullopt;
5568
5569 return OutputRange;
5570 };
5571
5572 // Get the range of vector elements used by shufflevector instructions.
5573 if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
5574 unsigned const NewNumElements = Indices->second + 1u;
5575
5576 // If the range of vector elements is smaller than the full load, attempt
5577 // to create a smaller load.
5578 if (NewNumElements < OldNumElements) {
5579 IRBuilder Builder(&I);
5580 Builder.SetCurrentDebugLocation(I.getDebugLoc());
5581
5582 // Calculate costs of old and new ops.
5583 Type *ElemTy = OldLoadTy->getElementType();
5584 FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
5585 Value *PtrOp = OldLoad->getPointerOperand();
5586
// NOTE(review): doc line 5587 is missing from this extraction (the start of
// the old-load cost computation, presumably `InstructionCost OldCost =
// TTI.getMemoryOpCost(`).
5588 Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
5589 OldLoad->getPointerAddressSpace(), CostKind);
5590 InstructionCost NewCost =
5591 TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
5592 OldLoad->getPointerAddressSpace(), CostKind);
5593
5594 using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
// NOTE(review): doc line 5595 is missing (presumably the declaration of the
// `NewUses` container populated below).
5596 unsigned const MaxIndex = NewNumElements * 2u;
5597
5598 for (llvm::Use &Use : I.uses()) {
5599 auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
5600
5601 // Ignore shufflevector instructions that have no uses.
5602 if (Shuffle->use_empty())
5603 continue;
5604
5605 ArrayRef<int> OldMask = Shuffle->getShuffleMask();
5606
5607 // Create entry for new use.
5608 NewUses.push_back({Shuffle, OldMask});
5609
5610 // Validate mask indices.
5611 for (int Index : OldMask) {
5612 if (Index >= static_cast<int>(MaxIndex))
5613 return false;
5614 }
5615
5616 // Update costs.
// NOTE(review): doc lines 5618 and 5621 are missing (the heads of the two
// shuffle-cost calls whose trailing arguments appear below).
5617 OldCost +=
5619 OldLoadTy, OldMask, CostKind);
5620 NewCost +=
5622 NewLoadTy, OldMask, CostKind);
5623 }
5624
5625 LLVM_DEBUG(
5626 dbgs() << "Found a load used only by shufflevector instructions: "
5627 << I << "\n OldCost: " << OldCost
5628 << " vs NewCost: " << NewCost << "\n");
5629
5630 if (OldCost < NewCost || !NewCost.isValid())
5631 return false;
5632
5633 // Create new load of smaller vector.
5634 auto *NewLoad = cast<LoadInst>(
5635 Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
5636 NewLoad->copyMetadata(I);
5637
5638 // Replace all uses.
5639 for (UseEntry &Use : NewUses) {
5640 ShuffleVectorInst *Shuffle = Use.first;
5641 std::vector<int> &NewMask = Use.second;
5642
5643 Builder.SetInsertPoint(Shuffle);
5644 Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
5645 Value *NewShuffle = Builder.CreateShuffleVector(
5646 NewLoad, PoisonValue::get(NewLoadTy), NewMask);
5647
5648 replaceValue(*Shuffle, *NewShuffle, false);
5649 }
5650
5651 return true;
5652 }
5653 }
5654 return false;
5655}
5656
5657// Attempt to narrow a phi of shufflevector instructions where the two incoming
5658// values have the same operands but different masks. If the two shuffle masks
5659// are offsets of one another we can use one branch to rotate the incoming
5660// vector and perform one larger shuffle after the phi.
5661bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
5662 auto *Phi = dyn_cast<PHINode>(&I);
5663 if (!Phi || Phi->getNumIncomingValues() != 2u)
5664 return false;
5665
5666 Value *Op = nullptr;
5667 ArrayRef<int> Mask0;
5668 ArrayRef<int> Mask1;
5669
5670 if (!match(Phi->getOperand(0u),
5671 m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
5672 !match(Phi->getOperand(1u),
5673 m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1)))))
5674 return false;
5675
5676 auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
5677
5678 // Ensure result vectors are wider than the argument vector.
5679 auto *InputVT = cast<FixedVectorType>(Op->getType());
5680 auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
5681 auto const InputNumElements = InputVT->getNumElements();
5682
5683 if (InputNumElements >= ResultVT->getNumElements())
5684 return false;
5685
5686 // Take the difference of the two shuffle masks at each index. Ignore poison
5687 // values at the same index in both masks.
5688 SmallVector<int, 16> NewMask;
5689 NewMask.reserve(Mask0.size());
5690
5691 for (auto [M0, M1] : zip(Mask0, Mask1)) {
5692 if (M0 >= 0 && M1 >= 0)
5693 NewMask.push_back(M0 - M1);
5694 else if (M0 == -1 && M1 == -1)
5695 continue;
5696 else
5697 return false;
5698 }
5699
5700 // Ensure all elements of the new mask are equal. If the difference between
5701 // the incoming mask elements is the same, the two must be constant offsets
5702 // of one another.
5703 if (NewMask.empty() || !all_equal(NewMask))
5704 return false;
5705
5706 // Create new mask using difference of the two incoming masks.
5707 int MaskOffset = NewMask[0u];
5708 unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
5709 NewMask.clear();
5710
5711 for (unsigned I = 0u; I < InputNumElements; ++I) {
5712 NewMask.push_back(Index);
5713 Index = (Index + 1u) % InputNumElements;
5714 }
5715
5716 // Calculate costs for worst cases and compare.
5717 auto const Kind = TTI::SK_PermuteSingleSrc;
5718 auto OldCost =
5719 std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
5720 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
5721 auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
5722 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
5723
5724 LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
5725 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
5726 << "\n");
5727
5728 if (NewCost > OldCost)
5729 return false;
5730
5731 // Create new shuffles and narrowed phi.
5732 auto Builder = IRBuilder(Shuf);
5733 Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
5734 auto *PoisonVal = PoisonValue::get(InputVT);
5735 auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
5736 Worklist.push(cast<Instruction>(NewShuf0));
5737
5738 Builder.SetInsertPoint(Phi);
5739 Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
5740 auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
5741 NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
5742 NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
5743
5744 Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
5745 PoisonVal = PoisonValue::get(NewPhi->getType());
5746 auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
5747
5748 replaceValue(*Phi, *NewShuf1);
5749 return true;
5750}
5751
5752 /// This is the entry point for all transforms. Pass manager differences are
5753 /// handled in the callers of this function.
5754 bool VectorCombine::run() {
// NOTE(review): doc line 5755 is missing from this extraction (presumably the
// `if (DisableVectorCombine)` guard for the early return below).
5756 return false;
5757
5758 // Don't attempt vectorization if the target does not support vectors.
5759 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
5760 return false;
5761
5762 LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
5763
// Dispatch a single instruction to the applicable fold routines; returns
// true as soon as one of them changes the IR.
5764 auto FoldInst = [this](Instruction &I) {
5765 Builder.SetInsertPoint(&I);
5766 bool IsVectorType = isa<VectorType>(I.getType());
5767 bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
5768 auto Opcode = I.getOpcode();
5769
5770 LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
5771
5772 // These folds should be beneficial regardless of when this pass is run
5773 // in the optimization pipeline.
5774 // The type checking is for run-time efficiency. We can avoid wasting time
5775 // dispatching to folding functions if there's no chance of matching.
5776 if (IsFixedVectorType) {
5777 switch (Opcode) {
5778 case Instruction::InsertElement:
5779 if (vectorizeLoadInsert(I))
5780 return true;
5781 break;
5782 case Instruction::ShuffleVector:
5783 if (widenSubvectorLoad(I))
5784 return true;
5785 break;
5786 default:
5787 break;
5788 }
5789 }
5790
5791 // This transform works with scalable and fixed vectors
5792 // TODO: Identify and allow other scalable transforms
5793 if (IsVectorType) {
5794 if (scalarizeOpOrCmp(I))
5795 return true;
5796 if (scalarizeLoad(I))
5797 return true;
5798 if (scalarizeExtExtract(I))
5799 return true;
5800 if (scalarizeVPIntrinsic(I))
5801 return true;
5802 if (foldInterleaveIntrinsics(I))
5803 return true;
5804 }
5805
5806 if (Opcode == Instruction::Store)
5807 if (foldSingleElementStore(I))
5808 return true;
5809
5810 // If this is an early pipeline invocation of this pass, we are done.
5811 if (TryEarlyFoldsOnly)
5812 return false;
5813
5814 // Otherwise, try folds that improve codegen but may interfere with
5815 // early IR canonicalizations.
5816 // The type checking is for run-time efficiency. We can avoid wasting time
5817 // dispatching to folding functions if there's no chance of matching.
5818 if (IsFixedVectorType) {
5819 switch (Opcode) {
5820 case Instruction::InsertElement:
5821 if (foldInsExtFNeg(I))
5822 return true;
5823 if (foldInsExtBinop(I))
5824 return true;
5825 if (foldInsExtVectorToShuffle(I))
5826 return true;
5827 break;
5828 case Instruction::ShuffleVector:
5829 if (foldPermuteOfBinops(I))
5830 return true;
5831 if (foldShuffleOfBinops(I))
5832 return true;
5833 if (foldShuffleOfSelects(I))
5834 return true;
5835 if (foldShuffleOfCastops(I))
5836 return true;
5837 if (foldShuffleOfShuffles(I))
5838 return true;
5839 if (foldPermuteOfIntrinsic(I))
5840 return true;
5841 if (foldShufflesOfLengthChangingShuffles(I))
5842 return true;
5843 if (foldShuffleOfIntrinsics(I))
5844 return true;
5845 if (foldSelectShuffle(I))
5846 return true;
5847 if (foldShuffleToIdentity(I))
5848 return true;
5849 break;
5850 case Instruction::Load:
5851 if (shrinkLoadForShuffles(I))
5852 return true;
5853 break;
5854 case Instruction::BitCast:
5855 if (foldBitcastShuffle(I))
5856 return true;
5857 if (foldSelectsFromBitcast(I))
5858 return true;
5859 break;
5860 case Instruction::And:
5861 case Instruction::Or:
5862 case Instruction::Xor:
5863 if (foldBitOpOfCastops(I))
5864 return true;
5865 if (foldBitOpOfCastConstant(I))
5866 return true;
5867 break;
5868 case Instruction::PHI:
5869 if (shrinkPhiOfShuffles(I))
5870 return true;
5871 break;
5872 default:
5873 if (shrinkType(I))
5874 return true;
5875 break;
5876 }
5877 } else {
5878 switch (Opcode) {
5879 case Instruction::Call:
5880 if (foldShuffleFromReductions(I))
5881 return true;
5882 if (foldCastFromReductions(I))
5883 return true;
5884 break;
5885 case Instruction::ExtractElement:
5886 if (foldShuffleChainsToReduce(I))
5887 return true;
5888 break;
5889 case Instruction::ICmp:
5890 if (foldSignBitReductionCmp(I))
5891 return true;
5892 if (foldICmpEqZeroVectorReduce(I))
5893 return true;
5894 if (foldEquivalentReductionCmp(I))
5895 return true;
5896 [[fallthrough]];
5897 case Instruction::FCmp:
5898 if (foldExtractExtract(I))
5899 return true;
5900 break;
5901 case Instruction::Or:
5902 if (foldConcatOfBoolMasks(I))
5903 return true;
5904 [[fallthrough]];
5905 default:
5906 if (Instruction::isBinaryOp(Opcode)) {
5907 if (foldExtractExtract(I))
5908 return true;
5909 if (foldExtractedCmps(I))
5910 return true;
5911 if (foldBinopOfReductions(I))
5912 return true;
5913 }
5914 break;
5915 }
5916 }
5917 return false;
5918 };
5919
5920 bool MadeChange = false;
5921 for (BasicBlock &BB : F) {
5922 // Ignore unreachable basic blocks.
5923 if (!DT.isReachableFromEntry(&BB))
5924 continue;
5925 // Use early increment range so that we can erase instructions in loop.
5926 // make_early_inc_range is not applicable here, as the next iterator may
5927 // be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
5928 // We manually maintain the next instruction and update it when it is about
5929 // to be deleted.
5930 Instruction *I = &BB.front();
5931 while (I) {
5932 NextInst = I->getNextNode();
5933 if (!I->isDebugOrPseudoInst())
5934 MadeChange |= FoldInst(*I);
5935 I = NextInst;
5936 }
5937 }
5938
5939 NextInst = nullptr;
5940
// Re-process instructions that earlier folds queued on the worklist.
5941 while (!Worklist.isEmpty()) {
5942 Instruction *I = Worklist.removeOne();
5943 if (!I)
5944 continue;
5945
// NOTE(review): doc lines 5946-5947 are missing from this extraction
// (presumably the branch erasing trivially-dead instructions before the
// `continue` below).
5948 continue;
5949 }
5950
5951 MadeChange |= FoldInst(*I);
5952 }
5953
5954 return MadeChange;
5955}
5956
// NOTE(review): the function signature (doc lines 5957-5958, presumably
// `PreservedAnalyses VectorCombinePass::run(Function &F,
// FunctionAnalysisManager &FAM) {`) is missing from this extraction.
5959 auto &AC = FAM.getResult<AssumptionAnalysis>(F);
// NOTE(review): doc line 5960 is missing (presumably the TargetTransformInfo
// `TTI` analysis lookup used when constructing Combiner below).
5961 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
5962 AAResults &AA = FAM.getResult<AAManager>(F);
5963 const DataLayout *DL = &F.getDataLayout();
5964 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
5965 TryEarlyFoldsOnly);
// If the combiner changed nothing, every analysis remains valid.
5966 if (!Combiner.run())
5967 return PreservedAnalyses::all();
// NOTE(review): doc lines 5968-5969 are missing (presumably constructing
// `PreservedAnalyses PA;` and preserving the CFG analysis set).
5970 return PA;
5971}
assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< unsigned > MaxInstrsToScan("aggressive-instcombine-max-scan-instrs", cl::init(64), cl::Hidden, cl::desc("Max number of instructions to scan for aggressive instcombine."))
This is the interface for LLVM's primary stateless and local alias analysis.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
This file defines the DenseMap class.
#define Check(C,...)
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Definition LICM.cpp:1449
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define T1
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
FunctionAnalysisManager FAM
const SmallVectorImpl< MachineOperand > & Cond
unsigned OpIndex
This file contains some templates that are useful if you are working with the STL at all.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static Value * generateNewInstTree(ArrayRef< InstLane > Item, FixedVectorType *Ty, const SmallPtrSet< Use *, 4 > &IdentityLeafs, const SmallPtrSet< Use *, 4 > &SplatLeafs, const SmallPtrSet< Use *, 4 > &ConcatLeafs, IRBuilderBase &Builder, const TargetTransformInfo *TTI)
static bool isFreeConcat(ArrayRef< InstLane > Item, TTI::TargetCostKind CostKind, const TargetTransformInfo &TTI)
Detect concat of multiple values into a vector.
static void analyzeCostOfVecReduction(const IntrinsicInst &II, TTI::TargetCostKind CostKind, const TargetTransformInfo &TTI, InstructionCost &CostBeforeReduction, InstructionCost &CostAfterReduction)
static SmallVector< InstLane > generateInstLaneVectorFromOperand(ArrayRef< InstLane > Item, int Op)
static Value * createShiftShuffle(Value *Vec, unsigned OldIndex, unsigned NewIndex, IRBuilderBase &Builder)
Create a shuffle that translates (shifts) 1 element from the input vector to a new element location.
static Align computeAlignmentAfterScalarization(Align VectorAlignment, Type *ScalarType, Value *Idx, const DataLayout &DL)
The memory operation on a vector of ScalarType had alignment of VectorAlignment.
static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI)
Returns true if this ShuffleVectorInst eventually feeds into a vector reduction intrinsic (e....
static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx, Instruction *CtxI, AssumptionCache &AC, const DominatorTree &DT)
Check if it is legal to scalarize a memory access to VecTy at index Idx.
static cl::opt< bool > DisableVectorCombine("disable-vector-combine", cl::init(false), cl::Hidden, cl::desc("Disable all vector combine transforms"))
static InstLane lookThroughShuffles(Use *U, int Lane)
static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI)
static const unsigned InvalidIndex
std::pair< Use *, int > InstLane
static Value * translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex, IRBuilderBase &Builder)
Given an extract element instruction with constant index operand, shuffle the source vector (shift th...
static cl::opt< unsigned > MaxInstrsToScan("vector-combine-max-scan-instrs", cl::init(30), cl::Hidden, cl::desc("Max number of instructions to scan for vector combining."))
static cl::opt< bool > DisableBinopExtractShuffle("disable-binop-extract-shuffle", cl::init(false), cl::Hidden, cl::desc("Disable binop extract to shuffle transforms"))
static bool isMemModifiedBetween(BasicBlock::iterator Begin, BasicBlock::iterator End, const MemoryLocation &Loc, AAResults &AA)
static constexpr int Concat[]
Value * RHS
Value * LHS
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1023
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
unsigned countl_one() const
Count the number of leading one bits.
Definition APInt.h:1630
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:390
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Return true if the attribute exists in this set.
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
BinaryOps getOpcode() const
Definition InstrTypes.h:374
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
bool isFPPredicate() const
Definition InstrTypes.h:782
static LLVM_ABI std::optional< CmpPredicate > getMatching(CmpPredicate A, CmpPredicate B)
Compares two CmpPredicates taking samesign into account and returns the canonicalized CmpPredicate if...
Combiner implementation.
Definition Combiner.h:34
static LLVM_ABI Constant * getExtractElement(Constant *Vec, Constant *Idx, Type *OnlyIfReducedTy=nullptr)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This class represents a range of values.
LLVM_ABI ConstantRange urem(const ConstantRange &Other) const
Return a new range representing the possible values resulting from an unsigned remainder operation of...
LLVM_ABI ConstantRange binaryAnd(const ConstantRange &Other) const
Return a new range representing the possible values resulting from a binary-and of a value in this ra...
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
bool empty() const
Definition DenseMap.h:109
iterator end()
Definition DenseMap.h:81
Analysis pass which computes a DominatorTree.
Definition Dominators.h:283
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
This instruction extracts a single (scalar) element from a VectorType value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
Predicate getSignedPredicate() const
For example, EQ->EQ, SLE->SLE, UGT->SGT, etc.
bool isEquality() const
Return true if this predicate is either EQ or NE.
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2561
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2549
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition IRBuilder.h:1871
LLVM_ABI Value * CreateSelectFMF(Value *C, Value *True, Value *False, FMFSource FMFSource, const Twine &Name="", Instruction *MDFrom=nullptr)
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements (a splat).
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2627
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1516
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition IRBuilder.h:2210
Value * CreateIsNotNeg(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg > -1.
Definition IRBuilder.h:2651
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition IRBuilder.h:1952
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2235
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:522
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2442
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2473
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition IRBuilder.h:172
Value * CreateIsNeg(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg < 0.
Definition IRBuilder.h:2646
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2176
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1854
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1495
LLVM_ABI Value * CreateNAryOp(unsigned Opc, ArrayRef< Value * > Ops, const Twine &Name="", MDNode *FPMathTag=nullptr)
Create either a UnaryOperator or BinaryOperator depending on Opc.
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2054
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2583
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1554
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1867
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition IRBuilder.h:2040
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition IRBuilder.h:604
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1711
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateFNegFMF(Value *V, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1802
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2418
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1576
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
void push(Instruction *I)
Push the instruction onto the worklist stack.
LLVM_ABI void setHasNoUnsignedWrap(bool b=true)
Set or clear the nuw flag on this instruction, which must be an operator which supports this flag.
LLVM_ABI void copyIRFlags(const Value *V, bool IncludeWrapFlags=true)
Convenience method to copy supported exact, fast-math, and (optionally) wrapping flags from V to this...
LLVM_ABI void setHasNoSignedWrap(bool b=true)
Set or clear the nsw flag on this instruction, which must be an operator which supports this flag.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void andIRFlags(const Value *V)
Logical 'and' of any supported wrapping, exact, and fast-math flags of V and this instruction.
bool isBinaryOp() const
LLVM_ABI void setNonNeg(bool b=true)
Set or clear the nneg flag on this instruction, which must be a zext instruction.
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
LLVM_ABI AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAlignment(Align Align)
Type * getPointerOperandType() const
Align getAlign() const
Return the alignment of the access that is being performed.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
const SDValue & getOperand(unsigned Num) const
This instruction constructs a fixed permutation of two input vectors.
int getMaskValue(unsigned Elt) const
Return the shuffle mask value of this instruction for the given element index.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static void commuteShuffleMask(MutableArrayRef< int > Mask, unsigned InVecNumElts)
Change values in a shuffle permute mask assuming the two vector operands of length InVecNumElts have ...
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
void setAlignment(Align Align)
Analysis pass providing the TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
@ None
The insert/extract is not used with a load/store.
LLVM_ABI InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI TypeSize getRegisterBitWidth(RegisterKind K) const
static LLVM_ABI OperandValueInfo commonOperandInfo(const Value *X, const Value *Y)
Collect common data between two OperandValueInfo inputs.
LLVM_ABI InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI bool allowVectorElementIndexingUsingGEP() const
Returns true if GEP should not be used to index into vectors for this target.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, const Value *Op0=nullptr, const Value *Op1=nullptr, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
LLVM_ABI unsigned getMinVectorRegisterBitWidth() const
LLVM_ABI InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
Estimate the overhead of scalarizing an instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
op_range operands()
Definition User.h:267
Value * getOperand(unsigned i) const
Definition User.h:207
static LLVM_ABI bool isVPBinOp(Intrinsic::ID ID)
std::optional< unsigned > getFunctionalIntrinsicID() const
std::optional< unsigned > getFunctionalOpcode() const
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition Value.h:761
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:166
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
iterator_range< user_iterator > users()
Definition Value.h:427
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:967
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:544
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
bool use_empty() const
Definition Value.h:347
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
bool user_empty() const
Definition Value.h:390
PreservedAnalyses run(Function &F, FunctionAnalysisManager &)
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type size() const
Definition DenseSet.h:87
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
const APInt & smin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be signed.
Definition APInt.h:2263
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition APInt.h:2268
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
class_match< PoisonValue > m_Poison()
Match an arbitrary poison constant.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::URem > m_URem(const LHS &L, const RHS &R)
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
class_match< IntrinsicInst > m_AnyIntrinsic()
Matches any intrinsic call and ignore it.
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
DisjointOr_match< LHS, RHS > m_DisjointOr(const LHS &L, const RHS &R)
BinOpPred_match< LHS, RHS, is_right_shift_op > m_Shr(const LHS &L, const RHS &R)
Matches logical shift operations.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
match_combine_and< LTy, RTy > m_CombineAnd(const LTy &L, const RTy &R)
Combine two pattern matchers matching L && R.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
cst_pred_ty< is_non_zero_int > m_NonZeroInt()
Match a non-zero integer or a vector with all non-zero elements.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
OverflowingBinaryOp_match< LHS, RHS, Instruction::Shl, OverflowingBinaryOperator::NoUnsignedWrap > m_NUWShl(const LHS &L, const RHS &R)
OverflowingBinaryOp_match< LHS, RHS, Instruction::Mul, OverflowingBinaryOperator::NoUnsignedWrap > m_NUWMul(const LHS &L, const RHS &R)
BinOpPred_match< LHS, RHS, is_bitwiselogic_op, true > m_c_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations in either order.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
match_combine_or< CastInst_match< OpTy, SExtInst >, NNegZExt_match< OpTy > > m_SExtLike(const OpTy &Op)
Match either "sext" or "zext nneg".
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
@ Valid
The data is already valid.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
@ User
could "use" a pointer
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2116
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:538
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:350
LLVM_ABI Value * simplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q)
Given operand for a UnaryOperator, fold the result or return null.
scope_exit(Callable) -> scope_exit< Callable >
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID)
Returns the arithmetic instruction opcode used when expanding a reduction.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:243
LLVM_ABI Value * simplifyCall(CallBase *Call, Value *Callee, ArrayRef< Value * > Args, const SimplifyQuery &Q)
Given a callsite, callee, and arguments, fold the result or return null.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
LLVM_ABI bool mustSuppressSpeculation(const LoadInst &LI)
Return true if speculation of the given load must be suppressed to avoid ordering or interfering with...
Definition Loads.cpp:431
LLVM_ABI bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
LLVM_ABI ConstantRange computeConstantRange(const Value *V, bool ForSigned, bool UseInstrInfo=true, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Determine the possible constant range of an integer or vector of integer value.
unsigned M1(unsigned Val)
Definition VE.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:406
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool isModSet(const ModRefInfo MRI)
Definition ModRef.h:49
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI bool programUndefinedIfPoison(const Instruction *Inst)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:446
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ABI bool isKnownNonZero(const Value *V, const SimplifyQuery &Q, unsigned Depth=0)
Return true if the given value is known to be non-zero when defined.
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
LLVM_ABI bool isSafeToSpeculativelyExecuteWithOpcode(unsigned Opcode, const Instruction *Inst, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
This returns the same result as isSafeToSpeculativelyExecute if Opcode is the actual opcode of Inst.
@ Other
Any other memory.
Definition ModRef.h:68
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
LLVM_ABI Intrinsic::ID getReductionForBinop(Instruction::BinaryOps Opc)
Returns the reduction intrinsic id corresponding to the binary operation.
@ And
Bitwise or logical AND of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
LLVM_ABI Constant * getLosslessInvCast(Constant *C, Type *InvCastTo, unsigned CastOp, const DataLayout &DL, PreservedCastFlags *Flags=nullptr)
Try to cast C to InvC losslessly, satisfying CastOp(InvC) equals C, or CastOp(InvC) is a refined valu...
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2166
LLVM_ABI Value * simplifyCmpInst(CmpPredicate Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a CmpInst, fold the result or return null.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicID(Intrinsic::ID IID)
Returns the llvm.vector.reduce min/max intrinsic that corresponds to the intrinsic op.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
LLVM_ABI AAMDNodes adjustForAccess(unsigned AccessSize)
Create a new AAMDNode for accessing AccessSize bytes of this AAMDNode.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:312
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:264
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:148
SimplifyQuery getWithInstruction(const Instruction *I) const