LLVM 23.0.0git
AArch64TargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
25#include "llvm/Support/Debug.h"
30#include <algorithm>
31#include <optional>
32using namespace llvm;
33using namespace llvm::PatternMatch;
34
35#define DEBUG_TYPE "aarch64tti"
36
37static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
38 cl::init(true), cl::Hidden);
39
41 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
42
43static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
45
46static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
47 cl::init(10), cl::Hidden);
48
49static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
50 cl::init(15), cl::Hidden);
51
53 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
55
57 "call-penalty-sm-change", cl::init(5), cl::Hidden,
59 "Penalty of calling a function that requires a change to PSTATE.SM"));
60
62 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
63 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
64
65static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
66 cl::init(true), cl::Hidden);
67
68static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
69 cl::init(true), cl::Hidden);
70
71// A complete guess as to a reasonable cost.
73 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
74 cl::desc("The cost of a histcnt instruction"));
75
77 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
78 cl::desc("The number of instructions to search for a redundant dmb"));
79
81 "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
82 cl::desc("Threshold for forced unrolling of small loops in AArch64"));
83
84namespace {
85class TailFoldingOption {
86 // These bitfields will only ever be set to something non-zero in operator=,
87 // when setting the -sve-tail-folding option. This option should always be of
88 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
89 // InitialBits is one of (disabled|all|simple). EnableBits represents
90 // additional flags we're enabling, and DisableBits for those flags we're
91 // disabling. The default flag is tracked in the variable NeedsDefault, since
92 // at the time of setting the option we may not know what the default value
93 // for the CPU is.
 // NOTE(review): the declarations of the InitialBits/EnableBits/DisableBits
 // members (original lines 94-96) were lost in extraction here — restore from
 // the upstream file before compiling.
97
98 // This value needs to be initialised to true in case the user does not
99 // explicitly set the -sve-tail-folding option.
100 bool NeedsDefault = true;
101
 // Record the base (Initial) setting chosen by the first option token.
102 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
103
104 void setNeedsDefault(bool V) { NeedsDefault = V; }
105
 // Turn a flag on; an enable always cancels a previous disable of the same
 // flag (the two bitsets are kept disjoint).
106 void setEnableBit(TailFoldingOpts Bit) {
107 EnableBits |= Bit;
108 DisableBits &= ~Bit;
109 }
110
 // Turn a flag off; a disable always cancels a previous enable of the same
 // flag.
111 void setDisableBit(TailFoldingOpts Bit) {
112 EnableBits &= ~Bit;
113 DisableBits |= Bit;
114 }
115
 // Combine the parsed option state with the CPU's default bits: start from
 // either the default or the explicit Initial setting, then apply the
 // explicit +flag / +noflag adjustments on top.
116 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
117 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
118
 // An explicit Initial setting and "default" are mutually exclusive.
119 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
120 "Initial bits should only include one of "
121 "(disabled|all|simple|default)");
122 Bits = NeedsDefault ? DefaultBits : InitialBits;
123 Bits |= EnableBits;
124 Bits &= ~DisableBits;
125
126 return Bits;
127 }
128
 // Print a usage message for a malformed -sve-tail-folding value and abort.
129 void reportError(std::string Opt) {
130 errs() << "invalid argument '" << Opt
131 << "' to -sve-tail-folding=; the option should be of the form\n"
132 " (disabled|all|default|simple)[+(reductions|recurrences"
133 "|reverse|noreductions|norecurrences|noreverse)]\n";
134 report_fatal_error("Unrecognised tail-folding option");
135 }
136
137public:
138
 // Parse the -sve-tail-folding option string. The first '+'-separated token
 // may be an Initial setting (disabled|all|default|simple); if it is not,
 // parsing restarts at token 0 with Initial treated as "disabled", so a value
 // like "reductions+reverse" is also accepted.
139 void operator=(const std::string &Val) {
140 // If the user explicitly sets -sve-tail-folding= then treat as an error.
141 if (Val.empty()) {
142 reportError("");
143 return;
144 }
145
146 // Since the user is explicitly setting the option we don't automatically
147 // need the default unless they require it.
148 setNeedsDefault(false);
149
150 SmallVector<StringRef, 4> TailFoldTypes;
151 StringRef(Val).split(TailFoldTypes, '+', -1, false);
152
153 unsigned StartIdx = 1;
154 if (TailFoldTypes[0] == "disabled")
155 setInitialBits(TailFoldingOpts::Disabled);
156 else if (TailFoldTypes[0] == "all")
157 setInitialBits(TailFoldingOpts::All);
158 else if (TailFoldTypes[0] == "default")
159 setNeedsDefault(true);
160 else if (TailFoldTypes[0] == "simple")
161 setInitialBits(TailFoldingOpts::Simple);
162 else {
 // First token is a flag, not an Initial setting: reparse it below.
163 StartIdx = 0;
164 setInitialBits(TailFoldingOpts::Disabled);
165 }
166
 // Remaining tokens toggle individual tail-folding flags; anything
 // unrecognised is a fatal error.
167 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
168 if (TailFoldTypes[I] == "reductions")
169 setEnableBit(TailFoldingOpts::Reductions);
170 else if (TailFoldTypes[I] == "recurrences")
171 setEnableBit(TailFoldingOpts::Recurrences);
172 else if (TailFoldTypes[I] == "reverse")
173 setEnableBit(TailFoldingOpts::Reverse);
174 else if (TailFoldTypes[I] == "noreductions")
175 setDisableBit(TailFoldingOpts::Reductions);
176 else if (TailFoldTypes[I] == "norecurrences")
177 setDisableBit(TailFoldingOpts::Recurrences);
178 else if (TailFoldTypes[I] == "noreverse")
179 setDisableBit(TailFoldingOpts::Reverse);
180 else
181 reportError(Val);
182 }
183 }
184
 // True if every flag in Required is present in the effective option bits
 // computed against the given CPU defaults.
185 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
186 return (getBits(DefaultBits) & Required) == Required;
187 }
188};
189} // namespace
190
// Global storage for the parsed -sve-tail-folding option; populated through
// TailFoldingOption::operator= when the option is set on the command line.
191TailFoldingOption TailFoldingOptionLoc;
192
194 "sve-tail-folding",
195 cl::desc(
196 "Control the use of vectorisation using tail-folding for SVE where the"
197 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
198 "\ndisabled (Initial) No loop types will vectorize using "
199 "tail-folding"
200 "\ndefault (Initial) Uses the default tail-folding settings for "
201 "the target CPU"
202 "\nall (Initial) All legal loop types will vectorize using "
203 "tail-folding"
204 "\nsimple (Initial) Use tail-folding for simple loops (not "
205 "reductions or recurrences)"
206 "\nreductions Use tail-folding for loops containing reductions"
207 "\nnoreductions Inverse of above"
208 "\nrecurrences Use tail-folding for loops containing fixed order "
209 "recurrences"
210 "\nnorecurrences Inverse of above"
211 "\nreverse Use tail-folding for loops requiring reversed "
212 "predicates"
213 "\nnoreverse Inverse of above"),
215
216// Experimental option that will only be fully functional when the
217// code-generator is changed to use SVE instead of NEON for all fixed-width
218// operations.
220 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
221
222// Experimental option that will only be fully functional when the cost-model
223// and code-generator have been changed to avoid using scalable vector
224// instructions that are not legal in streaming SVE mode.
226 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
227
228static bool isSMEABIRoutineCall(const CallInst &CI,
229 const AArch64TargetLowering &TLI) {
230 const auto *F = CI.getCalledFunction();
231 return F &&
233}
234
235/// Returns true if the function has explicit operations that can only be
236/// lowered using incompatible instructions for the selected mode. This also
237/// returns true if the function F may use or modify ZA state.
239 const AArch64TargetLowering &TLI) {
240 for (const BasicBlock &BB : *F) {
241 for (const Instruction &I : BB) {
242 // Be conservative for now and assume that any call to inline asm or to
243 // intrinsics could could result in non-streaming ops (e.g. calls to
244 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
245 // all native LLVM instructions can be lowered to compatible instructions.
246 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
247 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
249 return true;
250 }
251 }
252 return false;
253}
254
256 SmallVectorImpl<StringRef> &Features) {
257 StringRef AttributeStr =
258 TTI->isMultiversionedFunction(F) ? "fmv-features" : "target-features";
259 StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
260 FeatureStr.split(Features, ",");
261}
262
265 extractAttrFeatures(F, this, Features);
266 return AArch64::getCpuSupportsMask(Features);
267}
268
271 extractAttrFeatures(F, this, Features);
272 return AArch64::getFMVPriority(Features);
273}
274
276 return F.hasFnAttribute("fmv-features");
277}
278
// Features whose bits are inverted before the inline-compatibility subset
// check: these encode restrictions rather than capabilities, e.g. a callee
// built with +execute-only may be inlined into a caller without it, but not
// the other way around.
279const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
280 AArch64::FeatureExecuteOnly,
281};
282
284 const Function *Callee) const {
285 SMECallAttrs CallAttrs(*Caller, *Callee);
286
287 // Never inline a function explicitly marked as being streaming,
288 // into a non-streaming function. Assume it was marked as streaming
289 // for a reason.
290 if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
292 return false;
293
294 // When inlining, we should consider the body of the function, not the
295 // interface.
296 if (CallAttrs.callee().hasStreamingBody()) {
297 CallAttrs.callee().set(SMEAttrs::SM_Compatible, false);
298 CallAttrs.callee().set(SMEAttrs::SM_Enabled, true);
299 }
300
301 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
302 return false;
303
304 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
305 CallAttrs.requiresPreservingZT0() ||
306 CallAttrs.requiresPreservingAllZAState()) {
307 if (hasPossibleIncompatibleOps(Callee, *getTLI()))
308 return false;
309 }
310
311 const TargetMachine &TM = getTLI()->getTargetMachine();
312 const FeatureBitset &CallerBits =
313 TM.getSubtargetImpl(*Caller)->getFeatureBits();
314 const FeatureBitset &CalleeBits =
315 TM.getSubtargetImpl(*Callee)->getFeatureBits();
316 // Adjust the feature bitsets by inverting some of the bits. This is needed
317 // for target features that represent restrictions rather than capabilities,
318 // for example a "+execute-only" callee can be inlined into a caller without
319 // "+execute-only", but not vice versa.
320 FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
321 FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;
322
323 return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
324}
325
327 const Function *Callee,
328 ArrayRef<Type *> Types) const {
329 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
330 return false;
331
332 // We need to ensure that argument promotion does not attempt to promote
333 // pointers to fixed-length vector types larger than 128 bits like
334 // <8 x float> (and pointers to aggregate types which have such fixed-length
335 // vector type members) into the values of the pointees. Such vector types
336 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
337 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
338 // types can be safely treated as 128-bit NEON types and they cannot be
339 // distinguished in IR.
340 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
341 auto FVTy = dyn_cast<FixedVectorType>(Ty);
342 return FVTy &&
343 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
344 }))
345 return false;
346
347 return true;
348}
349
350unsigned
352 unsigned DefaultCallPenalty) const {
353 // This function calculates a penalty for executing Call in F.
354 //
355 // There are two ways this function can be called:
356 // (1) F:
357 // call from F -> G (the call here is Call)
358 //
359 // For (1), Call.getCaller() == F, so it will always return a high cost if
360 // a streaming-mode change is required (thus promoting the need to inline the
361 // function)
362 //
363 // (2) F:
364 // call from F -> G (the call here is not Call)
365 // G:
366 // call from G -> H (the call here is Call)
367 //
368 // For (2), if after inlining the body of G into F the call to H requires a
369 // streaming-mode change, and the call to G from F would also require a
370 // streaming-mode change, then there is benefit to do the streaming-mode
371 // change only once and avoid inlining of G into F.
372
373 SMEAttrs FAttrs(*F);
374 SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());
375
376 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
377 if (F == Call.getCaller()) // (1)
378 return CallPenaltyChangeSM * DefaultCallPenalty;
379 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
380 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
381 }
382
383 return DefaultCallPenalty;
384}
385
389
390 if (K == TargetTransformInfo::RGK_FixedWidthVector && ST->isNeonAvailable())
391 return true;
392
394 ST->isSVEorStreamingSVEAvailable() &&
395 !ST->disableMaximizeScalableBandwidth();
396}
397
398/// Calculate the cost of materializing a 64-bit value. This helper
399/// method might only calculate a fraction of a larger immediate. Therefore it
400/// is valid to return a cost of ZERO.
402 // Check if the immediate can be encoded within an instruction.
403 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
404 return 0;
405
406 if (Val < 0)
407 Val = ~Val;
408
409 // Calculate how many moves we will need to materialize this constant.
411 AArch64_IMM::expandMOVImm(Val, 64, Insn);
412 return Insn.size();
413}
414
415/// Calculate the cost of materializing the given constant.
419 assert(Ty->isIntegerTy());
420
421 unsigned BitSize = Ty->getPrimitiveSizeInBits();
422 if (BitSize == 0)
423 return ~0U;
424
425 // Sign-extend all constants to a multiple of 64-bit.
426 APInt ImmVal = Imm;
427 if (BitSize & 0x3f)
428 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
429
430 // Split the constant into 64-bit chunks and calculate the cost for each
431 // chunk.
433 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
434 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
435 int64_t Val = Tmp.getSExtValue();
436 Cost += getIntImmCost(Val);
437 }
438 // We need at least one instruction to materialze the constant.
439 return std::max<InstructionCost>(1, Cost);
440}
441
443 const APInt &Imm, Type *Ty,
445 Instruction *Inst) const {
446 assert(Ty->isIntegerTy());
447
448 unsigned BitSize = Ty->getPrimitiveSizeInBits();
449 // There is no cost model for constants with a bit size of 0. Return TCC_Free
450 // here, so that constant hoisting will ignore this constant.
451 if (BitSize == 0)
452 return TTI::TCC_Free;
453
454 unsigned ImmIdx = ~0U;
455 switch (Opcode) {
456 default:
457 return TTI::TCC_Free;
458 case Instruction::GetElementPtr:
459 // Always hoist the base address of a GetElementPtr.
460 if (Idx == 0)
461 return 2 * TTI::TCC_Basic;
462 return TTI::TCC_Free;
463 case Instruction::Store:
464 ImmIdx = 0;
465 break;
466 case Instruction::Add:
467 case Instruction::Sub:
468 case Instruction::Mul:
469 case Instruction::UDiv:
470 case Instruction::SDiv:
471 case Instruction::URem:
472 case Instruction::SRem:
473 case Instruction::And:
474 case Instruction::Or:
475 case Instruction::Xor:
476 case Instruction::ICmp:
477 ImmIdx = 1;
478 break;
479 // Always return TCC_Free for the shift value of a shift instruction.
480 case Instruction::Shl:
481 case Instruction::LShr:
482 case Instruction::AShr:
483 if (Idx == 1)
484 return TTI::TCC_Free;
485 break;
486 case Instruction::Trunc:
487 case Instruction::ZExt:
488 case Instruction::SExt:
489 case Instruction::IntToPtr:
490 case Instruction::PtrToInt:
491 case Instruction::BitCast:
492 case Instruction::PHI:
493 case Instruction::Call:
494 case Instruction::Select:
495 case Instruction::Ret:
496 case Instruction::Load:
497 break;
498 }
499
500 if (Idx == ImmIdx) {
501 int NumConstants = (BitSize + 63) / 64;
503 return (Cost <= NumConstants * TTI::TCC_Basic)
504 ? static_cast<int>(TTI::TCC_Free)
505 : Cost;
506 }
508}
509
512 const APInt &Imm, Type *Ty,
514 assert(Ty->isIntegerTy());
515
516 unsigned BitSize = Ty->getPrimitiveSizeInBits();
517 // There is no cost model for constants with a bit size of 0. Return TCC_Free
518 // here, so that constant hoisting will ignore this constant.
519 if (BitSize == 0)
520 return TTI::TCC_Free;
521
522 // Most (all?) AArch64 intrinsics do not support folding immediates into the
523 // selected instruction, so we compute the materialization cost for the
524 // immediate directly.
525 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
527
528 switch (IID) {
529 default:
530 return TTI::TCC_Free;
531 case Intrinsic::sadd_with_overflow:
532 case Intrinsic::uadd_with_overflow:
533 case Intrinsic::ssub_with_overflow:
534 case Intrinsic::usub_with_overflow:
535 case Intrinsic::smul_with_overflow:
536 case Intrinsic::umul_with_overflow:
537 if (Idx == 1) {
538 int NumConstants = (BitSize + 63) / 64;
540 return (Cost <= NumConstants * TTI::TCC_Basic)
541 ? static_cast<int>(TTI::TCC_Free)
542 : Cost;
543 }
544 break;
545 case Intrinsic::experimental_stackmap:
546 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
547 return TTI::TCC_Free;
548 break;
549 case Intrinsic::experimental_patchpoint_void:
550 case Intrinsic::experimental_patchpoint:
551 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
552 return TTI::TCC_Free;
553 break;
554 case Intrinsic::experimental_gc_statepoint:
555 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
556 return TTI::TCC_Free;
557 break;
558 }
560}
561
563AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
564 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
565 if (TyWidth == 32 || TyWidth == 64)
567 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
568 return TTI::PSK_Software;
569}
570
571static bool isUnpackedVectorVT(EVT VecVT) {
572 return VecVT.isScalableVector() &&
574}
575
577 const IntrinsicCostAttributes &ICA) {
578 // We need to know at least the number of elements in the vector of buckets
579 // and the size of each element to update.
580 if (ICA.getArgTypes().size() < 2)
582
583 // Only interested in costing for the hardware instruction from SVE2.
584 if (!ST->hasSVE2())
586
587 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
588 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
589 unsigned TotalHistCnts = 1;
590
591 unsigned EltSize = EltTy->getScalarSizeInBits();
592 // Only allow (up to 64b) integers or pointers
593 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
595
596 // FIXME: We should be able to generate histcnt for fixed-length vectors
597 // using ptrue with a specific VL.
598 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
599 unsigned EC = VTy->getElementCount().getKnownMinValue();
600 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
602
603 // HistCnt only supports 32b and 64b element types
604 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
605
606 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
608
609 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
610 TotalHistCnts = EC / NaturalVectorWidth;
611
612 return InstructionCost(BaseHistCntCost * TotalHistCnts);
613 }
614
616}
617
621 // The code-generator is currently not able to handle scalable vectors
622 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
623 // it. This change will be removed when code-generation for these types is
624 // sufficiently reliable.
625 auto *RetTy = ICA.getReturnType();
626 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
627 if (VTy->getElementCount() == ElementCount::getScalable(1))
629
630 switch (ICA.getID()) {
631 case Intrinsic::experimental_vector_histogram_add: {
632 InstructionCost HistCost = getHistogramCost(ST, ICA);
633 // If the cost isn't valid, we may still be able to scalarize
634 if (HistCost.isValid())
635 return HistCost;
636 break;
637 }
638 case Intrinsic::umin:
639 case Intrinsic::umax:
640 case Intrinsic::smin:
641 case Intrinsic::smax: {
642 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
643 MVT::v8i16, MVT::v2i32, MVT::v4i32,
644 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
645 MVT::nxv2i64};
646 auto LT = getTypeLegalizationCost(RetTy);
647 // v2i64 types get converted to cmp+bif hence the cost of 2
648 if (LT.second == MVT::v2i64)
649 return LT.first * 2;
650 if (any_of(ValidMinMaxTys, equal_to(LT.second)))
651 return LT.first;
652 break;
653 }
654 case Intrinsic::scmp:
655 case Intrinsic::ucmp: {
656 static const CostTblEntry BitreverseTbl[] = {
657 {Intrinsic::scmp, MVT::i32, 3}, // cmp+cset+csinv
658 {Intrinsic::scmp, MVT::i64, 3}, // cmp+cset+csinv
659 {Intrinsic::scmp, MVT::v8i8, 3}, // cmgt+cmgt+sub
660 {Intrinsic::scmp, MVT::v16i8, 3}, // cmgt+cmgt+sub
661 {Intrinsic::scmp, MVT::v4i16, 3}, // cmgt+cmgt+sub
662 {Intrinsic::scmp, MVT::v8i16, 3}, // cmgt+cmgt+sub
663 {Intrinsic::scmp, MVT::v2i32, 3}, // cmgt+cmgt+sub
664 {Intrinsic::scmp, MVT::v4i32, 3}, // cmgt+cmgt+sub
665 {Intrinsic::scmp, MVT::v1i64, 3}, // cmgt+cmgt+sub
666 {Intrinsic::scmp, MVT::v2i64, 3}, // cmgt+cmgt+sub
667 };
668 const auto LT = getTypeLegalizationCost(RetTy);
669 const auto *Entry =
670 CostTableLookup(BitreverseTbl, Intrinsic::scmp, LT.second);
671 if (Entry)
672 return Entry->Cost * LT.first;
673 break;
674 }
675 case Intrinsic::sadd_sat:
676 case Intrinsic::ssub_sat:
677 case Intrinsic::uadd_sat:
678 case Intrinsic::usub_sat: {
679 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
680 MVT::v8i16, MVT::v2i32, MVT::v4i32,
681 MVT::v2i64};
682 auto LT = getTypeLegalizationCost(RetTy);
683 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
684 // need to extend the type, as it uses shr(qadd(shl, shl)).
685 unsigned Instrs =
686 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
687 if (any_of(ValidSatTys, equal_to(LT.second)))
688 return LT.first * Instrs;
689
691 uint64_t VectorSize = TS.getKnownMinValue();
692
693 if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(VectorSize))
694 return LT.first * Instrs;
695
696 break;
697 }
698 case Intrinsic::abs: {
699 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
700 MVT::v8i16, MVT::v2i32, MVT::v4i32,
701 MVT::v2i64, MVT::nxv16i8, MVT::nxv8i16,
702 MVT::nxv4i32, MVT::nxv2i64};
703 auto LT = getTypeLegalizationCost(RetTy);
704 if (any_of(ValidAbsTys, equal_to(LT.second)))
705 return LT.first;
706 break;
707 }
708 case Intrinsic::bswap: {
709 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
710 MVT::v4i32, MVT::v2i64};
711 auto LT = getTypeLegalizationCost(RetTy);
712 if (any_of(ValidAbsTys, equal_to(LT.second)) &&
713 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
714 return LT.first;
715 break;
716 }
717 case Intrinsic::fma:
718 case Intrinsic::fmuladd: {
719 // Given a fma or fmuladd, cost it the same as a fmul instruction which are
720 // usually the same for costs. TODO: Add fp16 and bf16 expansion costs.
721 Type *EltTy = RetTy->getScalarType();
722 if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
723 (EltTy->isHalfTy() && ST->hasFullFP16()))
724 return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
725 break;
726 }
727 case Intrinsic::stepvector: {
728 InstructionCost Cost = 1; // Cost of the `index' instruction
729 auto LT = getTypeLegalizationCost(RetTy);
730 // Legalisation of illegal vectors involves an `index' instruction plus
731 // (LT.first - 1) vector adds.
732 if (LT.first > 1) {
733 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
734 InstructionCost AddCost =
735 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
736 Cost += AddCost * (LT.first - 1);
737 }
738 return Cost;
739 }
740 case Intrinsic::vector_extract:
741 case Intrinsic::vector_insert: {
742 // If both the vector and subvector types are legal types and the index
743 // is 0, then this should be a no-op or simple operation; return a
744 // relatively low cost.
745
746 // If arguments aren't actually supplied, then we cannot determine the
747 // value of the index. We also want to skip predicate types.
748 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
750 break;
751
752 LLVMContext &C = RetTy->getContext();
753 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
754 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
755 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
756 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
757 // Skip this if either the vector or subvector types are unpacked
758 // SVE types; they may get lowered to stack stores and loads.
759 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
760 break;
761
763 getTLI()->getTypeConversion(C, SubVecVT);
765 getTLI()->getTypeConversion(C, VecVT);
766 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
767 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
768 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
769 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
770 return TTI::TCC_Free;
771 break;
772 }
773 case Intrinsic::bitreverse: {
774 static const CostTblEntry BitreverseTbl[] = {
775 {Intrinsic::bitreverse, MVT::i32, 1},
776 {Intrinsic::bitreverse, MVT::i64, 1},
777 {Intrinsic::bitreverse, MVT::v8i8, 1},
778 {Intrinsic::bitreverse, MVT::v16i8, 1},
779 {Intrinsic::bitreverse, MVT::v4i16, 2},
780 {Intrinsic::bitreverse, MVT::v8i16, 2},
781 {Intrinsic::bitreverse, MVT::v2i32, 2},
782 {Intrinsic::bitreverse, MVT::v4i32, 2},
783 {Intrinsic::bitreverse, MVT::v1i64, 2},
784 {Intrinsic::bitreverse, MVT::v2i64, 2},
785 };
786 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
787 const auto *Entry =
788 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
789 if (Entry) {
790 // Cost Model is using the legal type(i32) that i8 and i16 will be
791 // converted to +1 so that we match the actual lowering cost
792 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
793 TLI->getValueType(DL, RetTy, true) == MVT::i16)
794 return LegalisationCost.first * Entry->Cost + 1;
795
796 return LegalisationCost.first * Entry->Cost;
797 }
798 break;
799 }
800 case Intrinsic::ctpop: {
801 if (!ST->hasNEON()) {
802 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
803 return getTypeLegalizationCost(RetTy).first * 12;
804 }
805 static const CostTblEntry CtpopCostTbl[] = {
806 {ISD::CTPOP, MVT::v2i64, 4},
807 {ISD::CTPOP, MVT::v4i32, 3},
808 {ISD::CTPOP, MVT::v8i16, 2},
809 {ISD::CTPOP, MVT::v16i8, 1},
810 {ISD::CTPOP, MVT::i64, 4},
811 {ISD::CTPOP, MVT::v2i32, 3},
812 {ISD::CTPOP, MVT::v4i16, 2},
813 {ISD::CTPOP, MVT::v8i8, 1},
814 {ISD::CTPOP, MVT::i32, 5},
815 };
816 auto LT = getTypeLegalizationCost(RetTy);
817 MVT MTy = LT.second;
818 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
819 // Extra cost of +1 when illegal vector types are legalized by promoting
820 // the integer type.
821 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
822 RetTy->getScalarSizeInBits()
823 ? 1
824 : 0;
825 return LT.first * Entry->Cost + ExtraCost;
826 }
827 break;
828 }
829 case Intrinsic::sadd_with_overflow:
830 case Intrinsic::uadd_with_overflow:
831 case Intrinsic::ssub_with_overflow:
832 case Intrinsic::usub_with_overflow:
833 case Intrinsic::smul_with_overflow:
834 case Intrinsic::umul_with_overflow: {
835 static const CostTblEntry WithOverflowCostTbl[] = {
836 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
837 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
838 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
839 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
840 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
841 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
842 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
843 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
844 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
845 {Intrinsic::usub_with_overflow, MVT::i8, 3},
846 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
847 {Intrinsic::usub_with_overflow, MVT::i16, 3},
848 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
849 {Intrinsic::usub_with_overflow, MVT::i32, 1},
850 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
851 {Intrinsic::usub_with_overflow, MVT::i64, 1},
852 {Intrinsic::smul_with_overflow, MVT::i8, 5},
853 {Intrinsic::umul_with_overflow, MVT::i8, 4},
854 {Intrinsic::smul_with_overflow, MVT::i16, 5},
855 {Intrinsic::umul_with_overflow, MVT::i16, 4},
856 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
857 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
858 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
859 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
860 };
861 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
862 if (MTy.isSimple())
863 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
864 MTy.getSimpleVT()))
865 return Entry->Cost;
866 break;
867 }
868 case Intrinsic::fptosi_sat:
869 case Intrinsic::fptoui_sat: {
870 if (ICA.getArgTypes().empty())
871 break;
872 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
873 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
874 EVT MTy = TLI->getValueType(DL, RetTy);
875 // Check for the legal types, which are where the size of the input and the
876 // output are the same, or we are using cvt f64->i32 or f32->i64.
877 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
878 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
879 LT.second == MVT::v2f64)) {
880 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
881 (LT.second == MVT::f64 && MTy == MVT::i32) ||
882 (LT.second == MVT::f32 && MTy == MVT::i64)))
883 return LT.first;
884 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
885 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
886 MTy.getScalarSizeInBits() == 64)
887 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
888 }
889 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
890 // f32.
891 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
892 return LT.first + getIntrinsicInstrCost(
893 {ICA.getID(),
894 RetTy,
895 {ICA.getArgTypes()[0]->getWithNewType(
896 Type::getFloatTy(RetTy->getContext()))}},
897 CostKind);
898 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
899 (LT.second == MVT::f16 && MTy == MVT::i64) ||
900 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
901 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
902 return LT.first;
903 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
904 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
905 MTy.getScalarSizeInBits() == 32)
906 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
907 // Extending vector types v8f16->v8i32. These current scalarize but the
908 // codegen could be better.
909 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
910 MTy.getScalarSizeInBits() == 64)
911 return MTy.getVectorNumElements() * 3;
912
913 // If we can we use a legal convert followed by a min+max
914 if ((LT.second.getScalarType() == MVT::f32 ||
915 LT.second.getScalarType() == MVT::f64 ||
916 LT.second.getScalarType() == MVT::f16) &&
917 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
918 Type *LegalTy =
919 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
920 if (LT.second.isVector())
921 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
923 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
924 LegalTy, {LegalTy, LegalTy});
926 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
927 LegalTy, {LegalTy, LegalTy});
929 return LT.first * Cost +
930 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
931 : 1);
932 }
933 // Otherwise we need to follow the default expansion that clamps the value
934 // using a float min/max with a fcmp+sel for nan handling when signed.
935 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
936 RetTy = RetTy->getScalarType();
937 if (LT.second.isVector()) {
938 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
939 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
940 }
941 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
943 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
945 Cost +=
946 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
948 if (IsSigned) {
949 Type *CondTy = RetTy->getWithNewBitWidth(1);
950 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
952 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
954 }
955 return LT.first * Cost;
956 }
957 case Intrinsic::fshl:
958 case Intrinsic::fshr: {
959 if (ICA.getArgs().empty())
960 break;
961
962 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
963
964 // ROTR / ROTL is a funnel shift with equal first and second operand. For
965 // ROTR on integer registers (i32/i64) this can be done in a single ror
966 // instruction. A fshl with a non-constant shift uses a neg + ror.
967 if (RetTy->isIntegerTy() && ICA.getArgs()[0] == ICA.getArgs()[1] &&
968 (RetTy->getPrimitiveSizeInBits() == 32 ||
969 RetTy->getPrimitiveSizeInBits() == 64)) {
970 InstructionCost NegCost =
971 (ICA.getID() == Intrinsic::fshl && !OpInfoZ.isConstant()) ? 1 : 0;
972 return 1 + NegCost;
973 }
974
975 // TODO: Add handling for fshl where third argument is not a constant.
976 if (!OpInfoZ.isConstant())
977 break;
978
979 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
980 if (OpInfoZ.isUniform()) {
981 static const CostTblEntry FshlTbl[] = {
982 {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
983 {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
984 {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
985 {Intrinsic::fshl, MVT::v8i8, 2}, {Intrinsic::fshl, MVT::v4i16, 2}};
986 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
987 // to avoid having to duplicate the costs.
988 const auto *Entry =
989 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
990 if (Entry)
991 return LegalisationCost.first * Entry->Cost;
992 }
993
994 auto TyL = getTypeLegalizationCost(RetTy);
995 if (!RetTy->isIntegerTy())
996 break;
997
998 // Estimate cost manually, as types like i8 and i16 will get promoted to
999 // i32 and CostTableLookup will ignore the extra conversion cost.
1000 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
1001 RetTy->getScalarSizeInBits() < 64) ||
1002 (RetTy->getScalarSizeInBits() % 64 != 0);
1003 unsigned ExtraCost = HigherCost ? 1 : 0;
1004 if (RetTy->getScalarSizeInBits() == 32 ||
1005 RetTy->getScalarSizeInBits() == 64)
1006 ExtraCost = 0; // fhsl/fshr for i32 and i64 can be lowered to a single
1007 // extr instruction.
1008 else if (HigherCost)
1009 ExtraCost = 1;
1010 else
1011 break;
1012 return TyL.first + ExtraCost;
1013 }
1014 case Intrinsic::get_active_lane_mask: {
1015 auto RetTy = cast<VectorType>(ICA.getReturnType());
1016 EVT RetVT = getTLI()->getValueType(DL, RetTy);
1017 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1018 if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT))
1019 break;
1020
1021 if (RetTy->isScalableTy()) {
1022 if (TLI->getTypeAction(RetTy->getContext(), RetVT) !=
1024 break;
1025
1026 auto LT = getTypeLegalizationCost(RetTy);
1027 InstructionCost Cost = LT.first;
1028 // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost
1029 // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g.
1030 // nxv32i1 = get_active_lane_mask(base, idx) ->
1031 // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
1032 if (ST->hasSVE2p1() || ST->hasSME2()) {
1033 Cost /= 2;
1034 if (Cost == 1)
1035 return Cost;
1036 }
1037
1038 // If more than one whilelo intrinsic is required, include the extra cost
1039 // required by the saturating add & select required to increment the
1040 // start value after the first intrinsic call.
1041 Type *OpTy = ICA.getArgTypes()[0];
1042 IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
1043 InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind);
1044 Type *CondTy = OpTy->getWithNewBitWidth(1);
1045 SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy,
1047 return Cost + (SplitCost * (Cost - 1));
1048 } else if (!getTLI()->isTypeLegal(RetVT)) {
1049 // We don't have enough context at this point to determine if the mask
1050 // is going to be kept live after the block, which will force the vXi1
1051 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
1052 // For now, we just assume the vectorizer created this intrinsic and
1053 // the result will be the input for a PHI. In this case the cost will
1054 // be extremely high for fixed-width vectors.
1055 // NOTE: getScalarizationOverhead returns a cost that's far too
1056 // pessimistic for the actual generated codegen. In reality there are
1057 // two instructions generated per lane.
1058 return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
1059 }
1060 break;
1061 }
1062 case Intrinsic::experimental_vector_match: {
1063 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
1064 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1065 unsigned SearchSize = NeedleTy->getNumElements();
1066 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
1067 // Base cost for MATCH instructions. At least on the Neoverse V2 and
1068 // Neoverse V3, these are cheap operations with the same latency as a
1069 // vector ADD. In most cases, however, we also need to do an extra DUP.
1070 // For fixed-length vectors we currently need an extra five--six
1071 // instructions besides the MATCH.
1073 if (isa<FixedVectorType>(RetTy))
1074 Cost += 10;
1075 return Cost;
1076 }
1077 break;
1078 }
1079 case Intrinsic::experimental_cttz_elts: {
1080 EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1081 if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
1082 // This will consist of a SVE brkb and a cntp instruction. These
1083 // typically have the same latency and half the throughput as a vector
1084 // add instruction.
1085 return 4;
1086 }
1087 break;
1088 }
1089 case Intrinsic::loop_dependence_raw_mask:
1090 case Intrinsic::loop_dependence_war_mask: {
1091 // The whilewr/rw instructions require SVE2 or SME.
1092 if (ST->hasSVE2() || ST->hasSME()) {
1093 EVT VecVT = getTLI()->getValueType(DL, RetTy);
1094 unsigned EltSizeInBytes =
1095 cast<ConstantInt>(ICA.getArgs()[2])->getZExtValue();
1096 if (!is_contained({1u, 2u, 4u, 8u}, EltSizeInBytes) ||
1097 VecVT.getVectorMinNumElements() != (16 / EltSizeInBytes))
1098 break;
1099 // For fixed-vector types we need to AND the mask with a ptrue vl<N>.
1100 return isa<FixedVectorType>(RetTy) ? 2 : 1;
1101 }
1102 break;
1103 }
1104 case Intrinsic::experimental_vector_extract_last_active:
1105 if (ST->isSVEorStreamingSVEAvailable()) {
1106 auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1107 // This should turn into chained clastb instructions.
1108 return LegalCost;
1109 }
1110 break;
1111 case Intrinsic::pow: {
1112 EVT VT = getTLI()->getValueType(DL, RetTy);
1113 RTLIB::Libcall LC = RTLIB::getPOW(VT);
1114 if (getTLI()->getLibcallImpl(LC) != RTLIB::Unsupported)
1115 return getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind);
1116 break;
1117 }
1118 case Intrinsic::sqrt:
1119 case Intrinsic::fabs:
1120 case Intrinsic::ceil:
1121 case Intrinsic::floor:
1122 case Intrinsic::nearbyint:
1123 case Intrinsic::round:
1124 case Intrinsic::rint:
1125 case Intrinsic::roundeven:
1126 case Intrinsic::trunc:
1127 case Intrinsic::minnum:
1128 case Intrinsic::maxnum:
1129 case Intrinsic::minimum:
1130 case Intrinsic::maximum: {
1131 if (isa<ScalableVectorType>(RetTy) && ST->isSVEorStreamingSVEAvailable()) {
1132 auto LT = getTypeLegalizationCost(RetTy);
1133 return LT.first;
1134 }
1135 break;
1136 }
1137 default:
1138 break;
1139 }
1141}
1142
1143/// The function will remove redundant reinterprets casting in the presence
1144/// of the control flow
1145static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
1146 IntrinsicInst &II) {
1148 auto RequiredType = II.getType();
1149
1150 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
1151 assert(PN && "Expected Phi Node!");
1152
1153 // Don't create a new Phi unless we can remove the old one.
1154 if (!PN->hasOneUse())
1155 return std::nullopt;
1156
1157 for (Value *IncValPhi : PN->incoming_values()) {
1158 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
1159 if (!Reinterpret ||
1160 Reinterpret->getIntrinsicID() !=
1161 Intrinsic::aarch64_sve_convert_to_svbool ||
1162 RequiredType != Reinterpret->getArgOperand(0)->getType())
1163 return std::nullopt;
1164 }
1165
1166 // Create the new Phi
1167 IC.Builder.SetInsertPoint(PN);
1168 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
1169 Worklist.push_back(PN);
1170
1171 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1172 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
1173 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
1174 Worklist.push_back(Reinterpret);
1175 }
1176
1177 // Cleanup Phi Node and reinterprets
1178 return IC.replaceInstUsesWith(II, NPN);
1179}
1180
1181// A collection of properties common to SVE intrinsics that allow for combines
1182// to be written without needing to know the specific intrinsic.
1184 //
1185 // Helper routines for common intrinsic definitions.
1186 //
1187
1188 // e.g. llvm.aarch64.sve.add pg, op1, op2
1189 // with IID ==> llvm.aarch64.sve.add_u
1190 static SVEIntrinsicInfo
1197
1198 // e.g. llvm.aarch64.sve.neg inactive, pg, op
1205
1206 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1212
1213 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
1219
1220 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1221 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1222 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1223 return SVEIntrinsicInfo()
1226 }
1227
1228 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1229 // llvm.aarch64.sve.ld1 pg, ptr
1236
1237 // All properties relate to predication and thus having a general predicate
1238 // is the minimum requirement to say there is intrinsic info to act on.
1239 explicit operator bool() const { return hasGoverningPredicate(); }
1240
1241 //
1242 // Properties relating to the governing predicate.
1243 //
1244
1246 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1247 }
1248
1250 assert(hasGoverningPredicate() && "Propery not set!");
1251 return GoverningPredicateIdx;
1252 }
1253
1255 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1256 GoverningPredicateIdx = Index;
1257 return *this;
1258 }
1259
1260 //
1261 // Properties relating to operations the intrinsic could be transformed into.
1262 // NOTE: This does not mean such a transformation is always possible, but the
1263 // knowledge makes it possible to reuse existing optimisations without needing
1264 // to embed specific handling for each intrinsic. For example, instruction
1265 // simplification can be used to optimise an intrinsic's active lanes.
1266 //
1267
1269 return UndefIntrinsic != Intrinsic::not_intrinsic;
1270 }
1271
1273 assert(hasMatchingUndefIntrinsic() && "Propery not set!");
1274 return UndefIntrinsic;
1275 }
1276
1278 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1279 UndefIntrinsic = IID;
1280 return *this;
1281 }
1282
1283 bool hasMatchingIROpode() const { return IROpcode != 0; }
1284
1285 unsigned getMatchingIROpode() const {
1286 assert(hasMatchingIROpode() && "Propery not set!");
1287 return IROpcode;
1288 }
1289
1291 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1292 IROpcode = Opcode;
1293 return *this;
1294 }
1295
1296 //
1297 // Properties relating to the result of inactive lanes.
1298 //
1299
1301 return ResultLanes == InactiveLanesTakenFromOperand;
1302 }
1303
1305 assert(inactiveLanesTakenFromOperand() && "Propery not set!");
1306 return OperandIdxForInactiveLanes;
1307 }
1308
1310 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1311 ResultLanes = InactiveLanesTakenFromOperand;
1312 OperandIdxForInactiveLanes = Index;
1313 return *this;
1314 }
1315
1317 return ResultLanes == InactiveLanesAreNotDefined;
1318 }
1319
1321 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1322 ResultLanes = InactiveLanesAreNotDefined;
1323 return *this;
1324 }
1325
1327 return ResultLanes == InactiveLanesAreUnused;
1328 }
1329
1331 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1332 ResultLanes = InactiveLanesAreUnused;
1333 return *this;
1334 }
1335
1336 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1337 // inactiveLanesAreZeroed =
1338 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1339 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1340
1342 ResultIsZeroInitialized = true;
1343 return *this;
1344 }
1345
1346 //
1347 // The first operand of unary merging operations is typically only used to
1348 // set the result for inactive lanes. Knowing this allows us to deadcode the
1349 // operand when we can prove there are no inactive lanes.
1350 //
1351
1353 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1354 }
1355
1357 assert(hasOperandWithNoActiveLanes() && "Propery not set!");
1358 return OperandIdxWithNoActiveLanes;
1359 }
1360
1362 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1363 OperandIdxWithNoActiveLanes = Index;
1364 return *this;
1365 }
1366
1367private:
1368 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1369
1370 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1371 unsigned IROpcode = 0;
1372
1373 enum PredicationStyle {
1375 InactiveLanesTakenFromOperand,
1376 InactiveLanesAreNotDefined,
1377 InactiveLanesAreUnused
1378 } ResultLanes = Uninitialized;
1379
1380 bool ResultIsZeroInitialized = false;
1381 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1382 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1383};
1384
1386 // Some SVE intrinsics do not use scalable vector types, but since they are
1387 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1388 if (!isa<ScalableVectorType>(II.getType()) &&
1389 all_of(II.args(), [&](const Value *V) {
1390 return !isa<ScalableVectorType>(V->getType());
1391 }))
1392 return SVEIntrinsicInfo();
1393
1394 Intrinsic::ID IID = II.getIntrinsicID();
1395 switch (IID) {
1396 default:
1397 break;
1398 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1399 case Intrinsic::aarch64_sve_fcvt_f16f32:
1400 case Intrinsic::aarch64_sve_fcvt_f16f64:
1401 case Intrinsic::aarch64_sve_fcvt_f32f16:
1402 case Intrinsic::aarch64_sve_fcvt_f32f64:
1403 case Intrinsic::aarch64_sve_fcvt_f64f16:
1404 case Intrinsic::aarch64_sve_fcvt_f64f32:
1405 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1406 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1407 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1408 case Intrinsic::aarch64_sve_fcvtzs:
1409 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1410 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1411 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1412 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1413 case Intrinsic::aarch64_sve_fcvtzu:
1414 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1415 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1416 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1417 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1418 case Intrinsic::aarch64_sve_scvtf:
1419 case Intrinsic::aarch64_sve_scvtf_f16i32:
1420 case Intrinsic::aarch64_sve_scvtf_f16i64:
1421 case Intrinsic::aarch64_sve_scvtf_f32i64:
1422 case Intrinsic::aarch64_sve_scvtf_f64i32:
1423 case Intrinsic::aarch64_sve_ucvtf:
1424 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1425 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1426 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1427 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1429
1430 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1431 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1432 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1433 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1435
1436 case Intrinsic::aarch64_sve_fabd:
1437 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
1438 case Intrinsic::aarch64_sve_fadd:
1439 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u)
1440 .setMatchingIROpcode(Instruction::FAdd);
1441 case Intrinsic::aarch64_sve_fdiv:
1442 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u)
1443 .setMatchingIROpcode(Instruction::FDiv);
1444 case Intrinsic::aarch64_sve_fmax:
1445 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u);
1446 case Intrinsic::aarch64_sve_fmaxnm:
1447 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u);
1448 case Intrinsic::aarch64_sve_fmin:
1449 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u);
1450 case Intrinsic::aarch64_sve_fminnm:
1451 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u);
1452 case Intrinsic::aarch64_sve_fmla:
1453 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u);
1454 case Intrinsic::aarch64_sve_fmls:
1455 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
1456 case Intrinsic::aarch64_sve_fmul:
1457 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
1458 .setMatchingIROpcode(Instruction::FMul);
1459 case Intrinsic::aarch64_sve_fmulx:
1460 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
1461 case Intrinsic::aarch64_sve_fnmla:
1462 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u);
1463 case Intrinsic::aarch64_sve_fnmls:
1464 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u);
1465 case Intrinsic::aarch64_sve_fsub:
1466 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u)
1467 .setMatchingIROpcode(Instruction::FSub);
1468 case Intrinsic::aarch64_sve_add:
1469 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
1470 .setMatchingIROpcode(Instruction::Add);
1471 case Intrinsic::aarch64_sve_mla:
1472 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u);
1473 case Intrinsic::aarch64_sve_mls:
1474 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
1475 case Intrinsic::aarch64_sve_mul:
1476 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
1477 .setMatchingIROpcode(Instruction::Mul);
1478 case Intrinsic::aarch64_sve_sabd:
1479 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
1480 case Intrinsic::aarch64_sve_sdiv:
1481 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u)
1482 .setMatchingIROpcode(Instruction::SDiv);
1483 case Intrinsic::aarch64_sve_smax:
1484 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u);
1485 case Intrinsic::aarch64_sve_smin:
1486 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u);
1487 case Intrinsic::aarch64_sve_smulh:
1488 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u);
1489 case Intrinsic::aarch64_sve_sub:
1490 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u)
1491 .setMatchingIROpcode(Instruction::Sub);
1492 case Intrinsic::aarch64_sve_uabd:
1493 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u);
1494 case Intrinsic::aarch64_sve_udiv:
1495 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u)
1496 .setMatchingIROpcode(Instruction::UDiv);
1497 case Intrinsic::aarch64_sve_umax:
1498 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u);
1499 case Intrinsic::aarch64_sve_umin:
1500 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u);
1501 case Intrinsic::aarch64_sve_umulh:
1502 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u);
1503 case Intrinsic::aarch64_sve_asr:
1504 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u)
1505 .setMatchingIROpcode(Instruction::AShr);
1506 case Intrinsic::aarch64_sve_lsl:
1507 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u)
1508 .setMatchingIROpcode(Instruction::Shl);
1509 case Intrinsic::aarch64_sve_lsr:
1510 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u)
1511 .setMatchingIROpcode(Instruction::LShr);
1512 case Intrinsic::aarch64_sve_and:
1513 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u)
1514 .setMatchingIROpcode(Instruction::And);
1515 case Intrinsic::aarch64_sve_bic:
1516 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u);
1517 case Intrinsic::aarch64_sve_eor:
1518 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u)
1519 .setMatchingIROpcode(Instruction::Xor);
1520 case Intrinsic::aarch64_sve_orr:
1521 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
1522 .setMatchingIROpcode(Instruction::Or);
1523 case Intrinsic::aarch64_sve_shsub:
1524 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_shsub_u);
1525 case Intrinsic::aarch64_sve_shsubr:
1527 case Intrinsic::aarch64_sve_sqrshl:
1528 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqrshl_u);
1529 case Intrinsic::aarch64_sve_sqshl:
1530 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqshl_u);
1531 case Intrinsic::aarch64_sve_sqsub:
1532 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
1533 case Intrinsic::aarch64_sve_srshl:
1534 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_srshl_u);
1535 case Intrinsic::aarch64_sve_uhsub:
1536 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uhsub_u);
1537 case Intrinsic::aarch64_sve_uhsubr:
1539 case Intrinsic::aarch64_sve_uqrshl:
1540 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqrshl_u);
1541 case Intrinsic::aarch64_sve_uqshl:
1542 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqshl_u);
1543 case Intrinsic::aarch64_sve_uqsub:
1544 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
1545 case Intrinsic::aarch64_sve_urshl:
1546 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_urshl_u);
1547
1548 case Intrinsic::aarch64_sve_add_u:
1550 Instruction::Add);
1551 case Intrinsic::aarch64_sve_and_u:
1553 Instruction::And);
1554 case Intrinsic::aarch64_sve_asr_u:
1556 Instruction::AShr);
1557 case Intrinsic::aarch64_sve_eor_u:
1559 Instruction::Xor);
1560 case Intrinsic::aarch64_sve_fadd_u:
1562 Instruction::FAdd);
1563 case Intrinsic::aarch64_sve_fdiv_u:
1565 Instruction::FDiv);
1566 case Intrinsic::aarch64_sve_fmul_u:
1568 Instruction::FMul);
1569 case Intrinsic::aarch64_sve_fsub_u:
1571 Instruction::FSub);
1572 case Intrinsic::aarch64_sve_lsl_u:
1574 Instruction::Shl);
1575 case Intrinsic::aarch64_sve_lsr_u:
1577 Instruction::LShr);
1578 case Intrinsic::aarch64_sve_mul_u:
1580 Instruction::Mul);
1581 case Intrinsic::aarch64_sve_orr_u:
1583 Instruction::Or);
1584 case Intrinsic::aarch64_sve_sdiv_u:
1586 Instruction::SDiv);
1587 case Intrinsic::aarch64_sve_sub_u:
1589 Instruction::Sub);
1590 case Intrinsic::aarch64_sve_udiv_u:
1592 Instruction::UDiv);
1593
1594 case Intrinsic::aarch64_sve_addqv:
1595 case Intrinsic::aarch64_sve_and_z:
1596 case Intrinsic::aarch64_sve_bic_z:
1597 case Intrinsic::aarch64_sve_brka_z:
1598 case Intrinsic::aarch64_sve_brkb_z:
1599 case Intrinsic::aarch64_sve_brkn_z:
1600 case Intrinsic::aarch64_sve_brkpa_z:
1601 case Intrinsic::aarch64_sve_brkpb_z:
1602 case Intrinsic::aarch64_sve_cntp:
1603 case Intrinsic::aarch64_sve_compact:
1604 case Intrinsic::aarch64_sve_eor_z:
1605 case Intrinsic::aarch64_sve_eorv:
1606 case Intrinsic::aarch64_sve_eorqv:
1607 case Intrinsic::aarch64_sve_nand_z:
1608 case Intrinsic::aarch64_sve_nor_z:
1609 case Intrinsic::aarch64_sve_orn_z:
1610 case Intrinsic::aarch64_sve_orr_z:
1611 case Intrinsic::aarch64_sve_orv:
1612 case Intrinsic::aarch64_sve_orqv:
1613 case Intrinsic::aarch64_sve_pnext:
1614 case Intrinsic::aarch64_sve_rdffr_z:
1615 case Intrinsic::aarch64_sve_saddv:
1616 case Intrinsic::aarch64_sve_uaddv:
1617 case Intrinsic::aarch64_sve_umaxv:
1618 case Intrinsic::aarch64_sve_umaxqv:
1619 case Intrinsic::aarch64_sve_cmpeq:
1620 case Intrinsic::aarch64_sve_cmpeq_wide:
1621 case Intrinsic::aarch64_sve_cmpge:
1622 case Intrinsic::aarch64_sve_cmpge_wide:
1623 case Intrinsic::aarch64_sve_cmpgt:
1624 case Intrinsic::aarch64_sve_cmpgt_wide:
1625 case Intrinsic::aarch64_sve_cmphi:
1626 case Intrinsic::aarch64_sve_cmphi_wide:
1627 case Intrinsic::aarch64_sve_cmphs:
1628 case Intrinsic::aarch64_sve_cmphs_wide:
1629 case Intrinsic::aarch64_sve_cmple_wide:
1630 case Intrinsic::aarch64_sve_cmplo_wide:
1631 case Intrinsic::aarch64_sve_cmpls_wide:
1632 case Intrinsic::aarch64_sve_cmplt_wide:
1633 case Intrinsic::aarch64_sve_cmpne:
1634 case Intrinsic::aarch64_sve_cmpne_wide:
1635 case Intrinsic::aarch64_sve_facge:
1636 case Intrinsic::aarch64_sve_facgt:
1637 case Intrinsic::aarch64_sve_fcmpeq:
1638 case Intrinsic::aarch64_sve_fcmpge:
1639 case Intrinsic::aarch64_sve_fcmpgt:
1640 case Intrinsic::aarch64_sve_fcmpne:
1641 case Intrinsic::aarch64_sve_fcmpuo:
1642 case Intrinsic::aarch64_sve_ld1:
1643 case Intrinsic::aarch64_sve_ld1_gather:
1644 case Intrinsic::aarch64_sve_ld1_gather_index:
1645 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1646 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1647 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1648 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1649 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1650 case Intrinsic::aarch64_sve_ld1q_gather_index:
1651 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1652 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1653 case Intrinsic::aarch64_sve_ld1ro:
1654 case Intrinsic::aarch64_sve_ld1rq:
1655 case Intrinsic::aarch64_sve_ld1udq:
1656 case Intrinsic::aarch64_sve_ld1uwq:
1657 case Intrinsic::aarch64_sve_ld2_sret:
1658 case Intrinsic::aarch64_sve_ld2q_sret:
1659 case Intrinsic::aarch64_sve_ld3_sret:
1660 case Intrinsic::aarch64_sve_ld3q_sret:
1661 case Intrinsic::aarch64_sve_ld4_sret:
1662 case Intrinsic::aarch64_sve_ld4q_sret:
1663 case Intrinsic::aarch64_sve_ldff1:
1664 case Intrinsic::aarch64_sve_ldff1_gather:
1665 case Intrinsic::aarch64_sve_ldff1_gather_index:
1666 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1667 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1668 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1669 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1670 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1671 case Intrinsic::aarch64_sve_ldnf1:
1672 case Intrinsic::aarch64_sve_ldnt1:
1673 case Intrinsic::aarch64_sve_ldnt1_gather:
1674 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1675 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1676 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1678
1679 case Intrinsic::aarch64_sve_prf:
1680 case Intrinsic::aarch64_sve_prfb_gather_index:
1681 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1682 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1683 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1684 case Intrinsic::aarch64_sve_prfd_gather_index:
1685 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1686 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1687 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1688 case Intrinsic::aarch64_sve_prfh_gather_index:
1689 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1690 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1691 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1692 case Intrinsic::aarch64_sve_prfw_gather_index:
1693 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1694 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1695 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1697
1698 case Intrinsic::aarch64_sve_st1_scatter:
1699 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1700 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1701 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1702 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1703 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1704 case Intrinsic::aarch64_sve_st1dq:
1705 case Intrinsic::aarch64_sve_st1q_scatter_index:
1706 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1707 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1708 case Intrinsic::aarch64_sve_st1wq:
1709 case Intrinsic::aarch64_sve_stnt1:
1710 case Intrinsic::aarch64_sve_stnt1_scatter:
1711 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1712 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1713 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1715 case Intrinsic::aarch64_sve_st2:
1716 case Intrinsic::aarch64_sve_st2q:
1718 case Intrinsic::aarch64_sve_st3:
1719 case Intrinsic::aarch64_sve_st3q:
1721 case Intrinsic::aarch64_sve_st4:
1722 case Intrinsic::aarch64_sve_st4q:
1724 }
1725
1726 return SVEIntrinsicInfo();
1727}
1728
1729static bool isAllActivePredicate(Value *Pred) {
1730 Value *UncastedPred;
1731
1732 // Look through predicate casts that only remove lanes.
1734 m_Value(UncastedPred)))) {
1735 auto *OrigPredTy = cast<ScalableVectorType>(Pred->getType());
1736 Pred = UncastedPred;
1737
1739 m_Value(UncastedPred))))
1740 // If the predicate has the same or less lanes than the uncasted predicate
1741 // then we know the casting has no effect.
1742 if (OrigPredTy->getMinNumElements() <=
1743 cast<ScalableVectorType>(UncastedPred->getType())
1744 ->getMinNumElements())
1745 Pred = UncastedPred;
1746 }
1747
1748 auto *C = dyn_cast<Constant>(Pred);
1749 return C && C->isAllOnesValue();
1750}
1751
1752// Simplify `V` by only considering the operations that affect active lanes.
1753// This function should only return existing Values or newly created Constants.
1754static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1755 auto *Dup = dyn_cast<IntrinsicInst>(V);
1756 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1757 Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
1759 cast<VectorType>(V->getType())->getElementCount(),
1760 cast<Constant>(Dup->getOperand(2)));
1761
1762 return V;
1763}
1764
// Try to simplify a predicated SVE binary intrinsic through its matching IR
// opcode, honouring the intrinsic's inactive-lane semantics from IInfo.
static std::optional<Instruction *>
                          const SVEIntrinsicInfo &IInfo) {
  const unsigned Opc = IInfo.getMatchingIROpode();
  assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");

  // Operand layout: governing predicate, then the two data operands.
  Value *Pg = II.getOperand(0);
  Value *Op1 = II.getOperand(1);
  Value *Op2 = II.getOperand(2);
  const DataLayout &DL = II.getDataLayout();

  // Canonicalise constants to the RHS.
      isa<Constant>(Op1) && !isa<Constant>(Op2)) {
    IC.replaceOperand(II, 1, Op2);
    IC.replaceOperand(II, 2, Op1);
    return &II;
  }

  // Only active lanes matter when simplifying the operation.
  Op1 = stripInactiveLanes(Op1, Pg);
  Op2 = stripInactiveLanes(Op2, Pg);

  // Defer to the generic IR simplifier, propagating fast-math flags when the
  // intrinsic is a floating-point operation.
  Value *SimpleII;
  if (auto FII = dyn_cast<FPMathOperator>(&II))
    SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
  else
    SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);

  // An SVE intrinsic's result is always defined. However, this is not the case
  // for its equivalent IR instruction (e.g. when shifting by an amount more
  // than the data's bitwidth). Simplifications to an undefined result must be
  // ignored to preserve the intrinsic's expected behaviour.
  if (!SimpleII || isa<UndefValue>(SimpleII))
    return std::nullopt;

  // Inactive lanes carry no obligation: the simplified value can be used
  // directly.
  if (IInfo.inactiveLanesAreNotDefined())
    return IC.replaceInstUsesWith(II, SimpleII);

  Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());

  // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
  if (SimpleII == Inactive)
    return IC.replaceInstUsesWith(II, SimpleII);

  // Inactive lanes must be preserved.
  SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
  return IC.replaceInstUsesWith(II, SimpleII);
}
1814
// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
// to operations with less strict inactive lane requirements.
static std::optional<Instruction *>
                     const SVEIntrinsicInfo &IInfo) {
  // Without a governing predicate there is nothing predication-specific to do.
  if (!IInfo.hasGoverningPredicate())
    return std::nullopt;

  auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx());

  // If there are no active lanes.
  if (match(OpPredicate, m_ZeroInt())) {
      return IC.replaceInstUsesWith(
          II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()));

    // When inactive lanes are unused, the whole call is dead (or a known
    // zero-initialised result).
    if (IInfo.inactiveLanesAreUnused()) {
      if (IInfo.resultIsZeroInitialized())

      return IC.eraseInstFromFunction(II);
    }
  }

  // If there are no inactive lanes.
  if (isAllActivePredicate(OpPredicate)) {
    // An operand that only feeds inactive lanes is irrelevant here.
    if (IInfo.hasOperandWithNoActiveLanes()) {
      unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
      if (!isa<UndefValue>(II.getOperand(OpIdx)))
        return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType()));
    }

    // Prefer the "_u" (undef inactive lanes) variant, which places fewer
    // constraints on later codegen.
    if (IInfo.hasMatchingUndefIntrinsic()) {
      auto *NewDecl = Intrinsic::getOrInsertDeclaration(
          II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()});
      II.setCalledFunction(NewDecl);
      return &II;
    }
  }

  // Operation specific simplifications.
  if (IInfo.hasMatchingIROpode() &&
    return simplifySVEIntrinsicBinOp(IC, II, IInfo);

  return std::nullopt;
}
1862
1863// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
1864// => (binop (pred) (from_svbool _) (from_svbool _))
1865//
1866// The above transformation eliminates a `to_svbool` in the predicate
1867// operand of bitwise operation `binop` by narrowing the vector width of
1868// the operation. For example, it would convert a `<vscale x 16 x i1>
1869// and` into a `<vscale x 4 x i1> and`. This is profitable because
1870// to_svbool must zero the new lanes during widening, whereas
1871// from_svbool is free.
1872static std::optional<Instruction *>
1874 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
1875 if (!BinOp)
1876 return std::nullopt;
1877
1878 auto IntrinsicID = BinOp->getIntrinsicID();
1879 switch (IntrinsicID) {
1880 case Intrinsic::aarch64_sve_and_z:
1881 case Intrinsic::aarch64_sve_bic_z:
1882 case Intrinsic::aarch64_sve_eor_z:
1883 case Intrinsic::aarch64_sve_nand_z:
1884 case Intrinsic::aarch64_sve_nor_z:
1885 case Intrinsic::aarch64_sve_orn_z:
1886 case Intrinsic::aarch64_sve_orr_z:
1887 break;
1888 default:
1889 return std::nullopt;
1890 }
1891
1892 auto BinOpPred = BinOp->getOperand(0);
1893 auto BinOpOp1 = BinOp->getOperand(1);
1894 auto BinOpOp2 = BinOp->getOperand(2);
1895
1896 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1897 if (!PredIntr ||
1898 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1899 return std::nullopt;
1900
1901 auto PredOp = PredIntr->getOperand(0);
1902 auto PredOpTy = cast<VectorType>(PredOp->getType());
1903 if (PredOpTy != II.getType())
1904 return std::nullopt;
1905
1906 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1907 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1908 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1909 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1910 if (BinOpOp1 == BinOpOp2)
1911 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1912 else
1913 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1914 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1915
1916 auto NarrowedBinOp =
1917 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1918 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1919}
1920
1921static std::optional<Instruction *>
1923 // If the reinterpret instruction operand is a PHI Node
1924 if (isa<PHINode>(II.getArgOperand(0)))
1925 return processPhiNode(IC, II);
1926
1927 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1928 return BinOpCombine;
1929
1930 // Ignore converts to/from svcount_t.
1931 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1932 isa<TargetExtType>(II.getType()))
1933 return std::nullopt;
1934
1935 SmallVector<Instruction *, 32> CandidatesForRemoval;
1936 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1937
1938 const auto *IVTy = cast<VectorType>(II.getType());
1939
1940 // Walk the chain of conversions.
1941 while (Cursor) {
1942 // If the type of the cursor has fewer lanes than the final result, zeroing
1943 // must take place, which breaks the equivalence chain.
1944 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
1945 if (CursorVTy->getElementCount().getKnownMinValue() <
1946 IVTy->getElementCount().getKnownMinValue())
1947 break;
1948
1949 // If the cursor has the same type as I, it is a viable replacement.
1950 if (Cursor->getType() == IVTy)
1951 EarliestReplacement = Cursor;
1952
1953 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
1954
1955 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1956 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1957 Intrinsic::aarch64_sve_convert_to_svbool ||
1958 IntrinsicCursor->getIntrinsicID() ==
1959 Intrinsic::aarch64_sve_convert_from_svbool))
1960 break;
1961
1962 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1963 Cursor = IntrinsicCursor->getOperand(0);
1964 }
1965
1966 // If no viable replacement in the conversion chain was found, there is
1967 // nothing to do.
1968 if (!EarliestReplacement)
1969 return std::nullopt;
1970
1971 return IC.replaceInstUsesWith(II, EarliestReplacement);
1972}
1973
1974static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1975 IntrinsicInst &II) {
1976 // svsel(ptrue, x, y) => x
1977 auto *OpPredicate = II.getOperand(0);
1978 if (isAllActivePredicate(OpPredicate))
1979 return IC.replaceInstUsesWith(II, II.getOperand(1));
1980
1981 auto Select =
1982 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
1983 return IC.replaceInstUsesWith(II, Select);
1984}
1985
// Simplify sve.dup (predicated merge of a scalar into a vector).
static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
                                                      IntrinsicInst &II) {
  Value *Pg = II.getOperand(1);

  // sve.dup(V, all_active, X) ==> splat(X)
  if (isAllActivePredicate(Pg)) {
    auto *RetTy = cast<ScalableVectorType>(II.getType());
    Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
                                                II.getArgOperand(2));
    return IC.replaceInstUsesWith(II, Splat);
  }

  // Bail unless the predicate is a ptrue limited to the first lane (vl1).
                 m_SpecificInt(AArch64SVEPredPattern::vl1))))
    return std::nullopt;

  // sve.dup(V, sve.ptrue(vl1), X) ==> insertelement V, X, 0
  Value *Insert = IC.Builder.CreateInsertElement(
      II.getArgOperand(0), II.getArgOperand(2), uint64_t(0));
  return IC.replaceInstUsesWith(II, Insert);
}
2007
2008static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
2009 IntrinsicInst &II) {
2010 // Replace DupX with a regular IR splat.
2011 auto *RetTy = cast<ScalableVectorType>(II.getType());
2012 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
2013 II.getArgOperand(0));
2014 Splat->takeName(&II);
2015 return IC.replaceInstUsesWith(II, Splat);
2016}
2017
// Fold an all-active sve.cmpne of a dupq'd constant vector against zero into
// a constant predicate (ptrue of the widest element type that matches).
static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
                                                        IntrinsicInst &II) {
  LLVMContext &Ctx = II.getContext();

  // The fold only holds when every lane participates in the compare.
  if (!isAllActivePredicate(II.getArgOperand(0)))
    return std::nullopt;

  // Check that we have a compare of zero..
  auto *SplatValue =
  if (!SplatValue || !SplatValue->isZero())
    return std::nullopt;

  // ..against a dupq
  auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
  if (!DupQLane ||
      DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
    return std::nullopt;

  // Where the dupq is a lane 0 replicate of a vector insert
  auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
  if (!DupQLaneIdx || !DupQLaneIdx->isZero())
    return std::nullopt;

  auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
  if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
    return std::nullopt;

  // Where the vector insert is a fixed constant vector insert into undef at
  // index zero
  if (!isa<UndefValue>(VecIns->getArgOperand(0)))
    return std::nullopt;

  if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
    return std::nullopt;

  auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
  if (!ConstVec)
    return std::nullopt;

  // The fixed vector must cover exactly one 128-bit block of the result.
  auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
  auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
  if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
    return std::nullopt;

  unsigned NumElts = VecTy->getNumElements();
  unsigned PredicateBits = 0;

  // Expand intrinsic operands to a 16-bit byte level predicate
  for (unsigned I = 0; I < NumElts; ++I) {
    auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
    if (!Arg)
      return std::nullopt;
    if (!Arg->isZero())
      PredicateBits |= 1 << (I * (16 / NumElts));
  }

  // If all bits are zero bail early with an empty predicate
  if (PredicateBits == 0) {
    auto *PFalse = Constant::getNullValue(II.getType());
    PFalse->takeName(&II);
    return IC.replaceInstUsesWith(II, PFalse);
  }

  // Calculate largest predicate type used (where byte predicate is largest)
  unsigned Mask = 8;
  for (unsigned I = 0; I < 16; ++I)
    if ((PredicateBits & (1 << I)) != 0)
      Mask |= (I % 8);

  // Lowest set bit of Mask gives the element size (in bytes) to use.
  unsigned PredSize = Mask & -Mask;
  auto *PredType = ScalableVectorType::get(
      Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));

  // Ensure all relevant bits are set
  for (unsigned I = 0; I < 16; I += PredSize)
    if ((PredicateBits & (1 << I)) == 0)
      return std::nullopt;

  // Materialise an all-true predicate of the deduced width and reinterpret
  // it to the comparison's result type via svbool.
  auto *PTruePat =
      ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
  auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
                                           {PredType}, {PTruePat});
  auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
      Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
  auto *ConvertFromSVBool =
      IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
                                 {II.getType()}, {ConvertToSVBool});

  ConvertFromSVBool->takeName(&II);
  return IC.replaceInstUsesWith(II, ConvertFromSVBool);
}
2110
// Simplify sve.lasta/sve.lastb (extract last active element of a vector).
static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
                                                       IntrinsicInst &II) {
  Value *Pg = II.getArgOperand(0);
  Value *Vec = II.getArgOperand(1);
  auto IntrinsicID = II.getIntrinsicID();
  bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;

  // lastX(splat(X)) --> X
  if (auto *SplatVal = getSplatValue(Vec))
    return IC.replaceInstUsesWith(II, SplatVal);

  // If x and/or y is a splat value then:
  // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
  Value *LHS, *RHS;
  if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
    if (isSplatValue(LHS) || isSplatValue(RHS)) {
      auto *OldBinOp = cast<BinaryOperator>(Vec);
      auto OpC = OldBinOp->getOpcode();
      auto *NewLHS =
          IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
      auto *NewRHS =
          IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
          OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
      return IC.replaceInstUsesWith(II, NewBinOp);
    }
  }

  auto *C = dyn_cast<Constant>(Pg);
  if (IsAfter && C && C->isNullValue()) {
    // The intrinsic is extracting lane 0 so use an extract instead.
    auto *IdxTy = Type::getInt64Ty(II.getContext());
    auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
    Extract->insertBefore(II.getIterator());
    Extract->takeName(&II);
    return IC.replaceInstUsesWith(II, Extract);
  }

  // The remaining folds need a ptrue predicate with a known pattern.
  auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
  if (!IntrPG)
    return std::nullopt;

  if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return std::nullopt;

  const auto PTruePattern =
      cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();

  // Can the intrinsic's predicate be converted to a known constant index?
  unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
  if (!MinNumElts)
    return std::nullopt;

  unsigned Idx = MinNumElts - 1;
  // Increment the index if extracting the element after the last active
  // predicate element.
  if (IsAfter)
    ++Idx;

  // Ignore extracts whose index is larger than the known minimum vector
  // length. NOTE: This is an artificial constraint where we prefer to
  // maintain what the user asked for until an alternative is proven faster.
  auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
  if (Idx >= PgVTy->getMinNumElements())
    return std::nullopt;

  // The intrinsic is extracting a fixed lane so use an extract instead.
  auto *IdxTy = Type::getInt64Ty(II.getContext());
  auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
  Extract->insertBefore(II.getIterator());
  Extract->takeName(&II);
  return IC.replaceInstUsesWith(II, Extract);
}
2184
2185static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2186 IntrinsicInst &II) {
2187 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2188 // integer variant across a variety of micro-architectures. Replace scalar
2189 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2190 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2191 // depending on the micro-architecture, but has been observed as generally
2192 // being faster, particularly when the CLAST[AB] op is a loop-carried
2193 // dependency.
2194 Value *Pg = II.getArgOperand(0);
2195 Value *Fallback = II.getArgOperand(1);
2196 Value *Vec = II.getArgOperand(2);
2197 Type *Ty = II.getType();
2198
2199 if (!Ty->isIntegerTy())
2200 return std::nullopt;
2201
2202 Type *FPTy;
2203 switch (cast<IntegerType>(Ty)->getBitWidth()) {
2204 default:
2205 return std::nullopt;
2206 case 16:
2207 FPTy = IC.Builder.getHalfTy();
2208 break;
2209 case 32:
2210 FPTy = IC.Builder.getFloatTy();
2211 break;
2212 case 64:
2213 FPTy = IC.Builder.getDoubleTy();
2214 break;
2215 }
2216
2217 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
2218 auto *FPVTy = VectorType::get(
2219 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
2220 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
2221 auto *FPII = IC.Builder.CreateIntrinsic(
2222 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2223 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
2224 return IC.replaceInstUsesWith(II, FPIItoInt);
2225}
2226
2227static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2228 IntrinsicInst &II) {
2229 LLVMContext &Ctx = II.getContext();
2230 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2231 // can work with RDFFR_PP for ptest elimination.
2232 auto *AllPat =
2233 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2234 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2235 {II.getType()}, {AllPat});
2236 auto *RDFFR =
2237 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {PTrue});
2238 RDFFR->takeName(&II);
2239 return IC.replaceInstUsesWith(II, RDFFR);
2240}
2241
// Fold an SVE element-count intrinsic into a constant (or a generic element
// count) when its predicate pattern pins down the value.
static std::optional<Instruction *>
  const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();

  // Pattern "all" counts every implemented element: emit a scalable count.
  if (Pattern == AArch64SVEPredPattern::all) {
        II.getType(), ElementCount::getScalable(NumElts));
    Cnt->takeName(&II);
    return IC.replaceInstUsesWith(II, Cnt);
  }

  unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);

  // A fixed pattern folds to its constant count provided the vector is
  // guaranteed to hold at least that many elements.
  return MinNumElts && NumElts >= MinNumElts
             ? std::optional<Instruction *>(IC.replaceInstUsesWith(
                   II, ConstantInt::get(II.getType(), MinNumElts)))
             : std::nullopt;
}
2260
// Streaming-mode fold for an SME counting intrinsic.
static std::optional<Instruction *>
                     const AArch64Subtarget *ST) {
  // Outside streaming mode the SME count is not statically known here.
  if (!ST->isStreaming())
    return std::nullopt;

  // In streaming-mode, aarch64_sme_cntds is equivalent to aarch64_sve_cntd
  // with SVEPredPattern::all
  Value *Cnt =
  Cnt->takeName(&II);
  return IC.replaceInstUsesWith(II, Cnt);
}
2274
// Simplify sve.ptest_{any,first,last} so later passes can eliminate the
// PTEST by reusing flag-setting forms of the producing instruction.
static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
                                                        IntrinsicInst &II) {
  Value *PgVal = II.getArgOperand(0);
  Value *OpVal = II.getArgOperand(1);

  // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
  // Later optimizations prefer this form.
  if (PgVal == OpVal &&
      (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
       II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
    Value *Ops[] = {PgVal, OpVal};
    Type *Tys[] = {PgVal->getType()};

    auto *PTest =
        IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
    PTest->takeName(&II);

    return IC.replaceInstUsesWith(II, PTest);
  }


  if (!Pg || !Op)
    return std::nullopt;

  Intrinsic::ID OpIID = Op->getIntrinsicID();

  // Both operands widened from the same narrower predicate type: test the
  // un-widened predicates directly.
  if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
      OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
      Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
    Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
    Type *Tys[] = {Pg->getArgOperand(0)->getType()};

    auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);

    PTest->takeName(&II);
    return IC.replaceInstUsesWith(II, PTest);
  }

  // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
  // Later optimizations may rewrite sequence to use the flag-setting variant
  // of instruction X to remove PTEST.
  if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
      ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
       (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
       (OpIID == Intrinsic::aarch64_sve_and_z) ||
       (OpIID == Intrinsic::aarch64_sve_bic_z) ||
       (OpIID == Intrinsic::aarch64_sve_eor_z) ||
       (OpIID == Intrinsic::aarch64_sve_nand_z) ||
       (OpIID == Intrinsic::aarch64_sve_nor_z) ||
       (OpIID == Intrinsic::aarch64_sve_orn_z) ||
       (OpIID == Intrinsic::aarch64_sve_orr_z))) {
    Value *Ops[] = {Pg->getArgOperand(0), Pg};
    Type *Tys[] = {Pg->getType()};

    auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
    PTest->takeName(&II);

    return IC.replaceInstUsesWith(II, PTest);
  }

  return std::nullopt;
}
2342
// Fuse a MulOpc multiply feeding this add/sub-style intrinsic into the single
// FuseOpc multiply-accumulate form. MergeIntoAddendOp selects whether operand
// 1 (true) or operand 2 (false) is treated as the addend.
template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
static std::optional<Instruction *>
                                   bool MergeIntoAddendOp) {
  Value *P = II.getOperand(0);
  Value *MulOp0, *MulOp1, *AddendOp, *Mul;
  if (MergeIntoAddendOp) {
    AddendOp = II.getOperand(1);
    Mul = II.getOperand(2);
  } else {
    AddendOp = II.getOperand(2);
    Mul = II.getOperand(1);
  }

  // The multiply must match MulOpc (under the same predicate, per the
  // matcher above this line).
                               m_Value(MulOp1))))
    return std::nullopt;

  // Fusing duplicates the multiply's work unless this is its only use.
  if (!Mul->hasOneUse())
    return std::nullopt;

  Instruction *FMFSource = nullptr;
  if (II.getType()->isFPOrFPVectorTy()) {
    llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
    // Stop the combine when the flags on the inputs differ in case dropping
    // flags would lead to us missing out on more beneficial optimizations.
    if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
      return std::nullopt;
    if (!FAddFlags.allowContract())
      return std::nullopt;
    FMFSource = &II;
  }

  // Build the fused intrinsic, keeping the addend in the position FuseOpc
  // expects for the chosen accumulate form.
  CallInst *Res;
  if (MergeIntoAddendOp)
    Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
                                     {P, AddendOp, MulOp0, MulOp1}, FMFSource);
  else
    Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
                                     {P, MulOp0, MulOp1, AddendOp}, FMFSource);

  return IC.replaceInstUsesWith(II, Res);
}
2386
2387static std::optional<Instruction *>
2389 Value *Pred = II.getOperand(0);
2390 Value *PtrOp = II.getOperand(1);
2391 Type *VecTy = II.getType();
2392
2393 if (isAllActivePredicate(Pred)) {
2394 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
2395 Load->copyMetadata(II);
2396 return IC.replaceInstUsesWith(II, Load);
2397 }
2398
2399 CallInst *MaskedLoad =
2400 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
2401 Pred, ConstantAggregateZero::get(VecTy));
2402 MaskedLoad->copyMetadata(II);
2403 return IC.replaceInstUsesWith(II, MaskedLoad);
2404}
2405
2406static std::optional<Instruction *>
2408 Value *VecOp = II.getOperand(0);
2409 Value *Pred = II.getOperand(1);
2410 Value *PtrOp = II.getOperand(2);
2411
2412 if (isAllActivePredicate(Pred)) {
2413 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
2414 Store->copyMetadata(II);
2415 return IC.eraseInstFromFunction(II);
2416 }
2417
2418 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2419 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
2420 MaskedStore->copyMetadata(II);
2421 return IC.eraseInstFromFunction(II);
2422}
2423
2425 switch (Intrinsic) {
2426 case Intrinsic::aarch64_sve_fmul_u:
2427 return Instruction::BinaryOps::FMul;
2428 case Intrinsic::aarch64_sve_fadd_u:
2429 return Instruction::BinaryOps::FAdd;
2430 case Intrinsic::aarch64_sve_fsub_u:
2431 return Instruction::BinaryOps::FSub;
2432 default:
2433 return Instruction::BinaryOpsEnd;
2434 }
2435}
2436
2437static std::optional<Instruction *>
2439 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2440 if (II.isStrictFP())
2441 return std::nullopt;
2442
2443 auto *OpPredicate = II.getOperand(0);
2444 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
2445 if (BinOpCode == Instruction::BinaryOpsEnd ||
2446 !isAllActivePredicate(OpPredicate))
2447 return std::nullopt;
2448 auto BinOp = IC.Builder.CreateBinOpFMF(
2449 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
2450 return IC.replaceInstUsesWith(II, BinOp);
2451}
2452
2453static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2454 IntrinsicInst &II) {
2455 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2456 Intrinsic::aarch64_sve_mla>(
2457 IC, II, true))
2458 return MLA;
2459 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2460 Intrinsic::aarch64_sve_mad>(
2461 IC, II, false))
2462 return MAD;
2463 return std::nullopt;
2464}
2465
2466static std::optional<Instruction *>
2468 if (auto FMLA =
2469 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2470 Intrinsic::aarch64_sve_fmla>(IC, II,
2471 true))
2472 return FMLA;
2473 if (auto FMAD =
2474 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2475 Intrinsic::aarch64_sve_fmad>(IC, II,
2476 false))
2477 return FMAD;
2478 if (auto FMLA =
2479 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2480 Intrinsic::aarch64_sve_fmla>(IC, II,
2481 true))
2482 return FMLA;
2483 return std::nullopt;
2484}
2485
2486static std::optional<Instruction *>
2488 if (auto FMLA =
2489 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2490 Intrinsic::aarch64_sve_fmla>(IC, II,
2491 true))
2492 return FMLA;
2493 if (auto FMAD =
2494 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2495 Intrinsic::aarch64_sve_fmad>(IC, II,
2496 false))
2497 return FMAD;
2498 if (auto FMLA_U =
2499 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2500 Intrinsic::aarch64_sve_fmla_u>(
2501 IC, II, true))
2502 return FMLA_U;
2503 return instCombineSVEVectorBinOp(IC, II);
2504}
2505
2506static std::optional<Instruction *>
2508 if (auto FMLS =
2509 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2510 Intrinsic::aarch64_sve_fmls>(IC, II,
2511 true))
2512 return FMLS;
2513 if (auto FMSB =
2514 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2515 Intrinsic::aarch64_sve_fnmsb>(
2516 IC, II, false))
2517 return FMSB;
2518 if (auto FMLS =
2519 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2520 Intrinsic::aarch64_sve_fmls>(IC, II,
2521 true))
2522 return FMLS;
2523 return std::nullopt;
2524}
2525
2526static std::optional<Instruction *>
2528 if (auto FMLS =
2529 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2530 Intrinsic::aarch64_sve_fmls>(IC, II,
2531 true))
2532 return FMLS;
2533 if (auto FMSB =
2534 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2535 Intrinsic::aarch64_sve_fnmsb>(
2536 IC, II, false))
2537 return FMSB;
2538 if (auto FMLS_U =
2539 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2540 Intrinsic::aarch64_sve_fmls_u>(
2541 IC, II, true))
2542 return FMLS_U;
2543 return instCombineSVEVectorBinOp(IC, II);
2544}
2545
2546static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2547 IntrinsicInst &II) {
2548 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2549 Intrinsic::aarch64_sve_mls>(
2550 IC, II, true))
2551 return MLS;
2552 return std::nullopt;
2553}
2554
2555static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2556 IntrinsicInst &II) {
2557 Value *UnpackArg = II.getArgOperand(0);
2558 auto *RetTy = cast<ScalableVectorType>(II.getType());
2559 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2560 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2561
2562 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2563 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2564 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
2565 ScalarArg =
2566 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
2567 Value *NewVal =
2568 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
2569 NewVal->takeName(&II);
2570 return IC.replaceInstUsesWith(II, NewVal);
2571 }
2572
2573 return std::nullopt;
2574}
2575static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2576 IntrinsicInst &II) {
2577 auto *OpVal = II.getOperand(0);
2578 auto *OpIndices = II.getOperand(1);
2579 VectorType *VTy = cast<VectorType>(II.getType());
2580
2581 // Check whether OpIndices is a constant splat value < minimal element count
2582 // of result.
2583 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
2584 if (!SplatValue ||
2585 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
2586 return std::nullopt;
2587
2588 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
2589 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2590 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
2591 auto *VectorSplat =
2592 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
2593
2594 VectorSplat->takeName(&II);
2595 return IC.replaceInstUsesWith(II, VectorSplat);
2596}
2597
// Recognise uzp1 of two svbool-widened halves as a simple concatenation of
// the original (narrower) predicates.
static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
                                                       IntrinsicInst &II) {
  Value *A, *B;
  Type *RetTy = II.getType();
  constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
  constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;

  // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
  // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
  if ((match(II.getArgOperand(0),
       match(II.getArgOperand(1),
      (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
       match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
    auto *TyA = cast<ScalableVectorType>(A->getType());
    // Both halves must share one type (that together fills the result).
    if (TyA == B->getType() &&
      // Build the concatenation: A into the low half, B into the high half.
      auto *SubVec = IC.Builder.CreateInsertVector(
          RetTy, PoisonValue::get(RetTy), A, uint64_t(0));
      auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B,
                                                      TyA->getMinNumElements());
      ConcatVec->takeName(&II);
      return IC.replaceInstUsesWith(II, ConcatVec);
    }
  }

  return std::nullopt;
}
2627
// zip of matching uzp halves reconstructs one of the original inputs.
static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
                                                      IntrinsicInst &II) {
  // zip1(uzp1(A, B), uzp2(A, B)) --> A
  // zip2(uzp1(A, B), uzp2(A, B)) --> B
  Value *A, *B;
  if (match(II.getArgOperand(0),
                        m_Specific(A), m_Specific(B))))
    return IC.replaceInstUsesWith(
        II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));

  return std::nullopt;
}
2642
// Turn a unit-stride gather load into an ordinary masked load.
static std::optional<Instruction *>
  Value *Mask = II.getOperand(0);
  Value *BasePtr = II.getOperand(1);
  Value *Index = II.getOperand(2);
  Type *Ty = II.getType();
  // Inactive lanes of the gather read as zero, so use a zero passthru.
  Value *PassThru = ConstantAggregateZero::get(Ty);

  // Contiguous gather => masked load.
  // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
  // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
  Value *IndexBase;
                       m_Value(IndexBase), m_SpecificInt(1)))) {
    Align Alignment =
        BasePtr->getPointerAlignment(II.getDataLayout());

    Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
                                      BasePtr, IndexBase);
    CallInst *MaskedLoad =
        IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
    MaskedLoad->takeName(&II);
    return IC.replaceInstUsesWith(II, MaskedLoad);
  }

  return std::nullopt;
}
2670
// Turn a unit-stride scatter store into an ordinary masked store.
static std::optional<Instruction *>
  Value *Val = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *BasePtr = II.getOperand(2);
  Value *Index = II.getOperand(3);
  Type *Ty = Val->getType();

  // Contiguous scatter => masked store.
  // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
  // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
  Value *IndexBase;
                       m_Value(IndexBase), m_SpecificInt(1)))) {
    Align Alignment =
        BasePtr->getPointerAlignment(II.getDataLayout());

    Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
                                      BasePtr, IndexBase);
    // The masked store's value is unused; the scatter is fully replaced.
    (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);

    return IC.eraseInstFromFunction(II);
  }

  return std::nullopt;
}
2697
// Simplify a signed divide by a constant splat: powers of two become an
// arithmetic shift-round (ASRD), negated powers of two an ASRD plus negate.
static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
                                                       IntrinsicInst &II) {
  Value *Pred = II.getOperand(0);
  Value *Vec = II.getOperand(1);
  Value *DivVec = II.getOperand(2);

  // Only divides by a splatted constant integer are handled.
  Value *SplatValue = getSplatValue(DivVec);
  ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
  if (!SplatConstantInt)
    return std::nullopt;

  APInt Divisor = SplatConstantInt->getValue();
  const int64_t DivisorValue = Divisor.getSExtValue();
  if (DivisorValue == -1)
    return std::nullopt;
  // NOTE(review): no `return` after this replacement, so control falls
  // through into the power-of-two path for divisor 1 — confirm this is the
  // intended behaviour.
  if (DivisorValue == 1)
    IC.replaceInstUsesWith(II, Vec);

  // x / 2^k  ==>  asrd(x, k) (rounding arithmetic shift right).
  if (Divisor.isPowerOf2()) {
    Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
    auto ASRD = IC.Builder.CreateIntrinsic(
        Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
    return IC.replaceInstUsesWith(II, ASRD);
  }
  // x / -2^k  ==>  neg(asrd(x, k)).
  if (Divisor.isNegatedPowerOf2()) {
    Divisor.negate();
    Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
    auto ASRD = IC.Builder.CreateIntrinsic(
        Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
    auto NEG = IC.Builder.CreateIntrinsic(
        Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
    return IC.replaceInstUsesWith(II, NEG);
  }

  return std::nullopt;
}
2735
2736bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2737 size_t VecSize = Vec.size();
2738 if (VecSize == 1)
2739 return true;
2740 if (!isPowerOf2_64(VecSize))
2741 return false;
2742 size_t HalfVecSize = VecSize / 2;
2743
2744 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2745 RHS != Vec.end(); LHS++, RHS++) {
2746 if (*LHS != nullptr && *RHS != nullptr) {
2747 if (*LHS == *RHS)
2748 continue;
2749 else
2750 return false;
2751 }
2752 if (!AllowPoison)
2753 return false;
2754 if (*LHS == nullptr && *RHS != nullptr)
2755 *LHS = *RHS;
2756 }
2757
2758 Vec.resize(HalfVecSize);
2759 SimplifyValuePattern(Vec, AllowPoison);
2760 return true;
2761}
2762
2763 // Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2764 // to dupqlane(f64(C)) where C is A concatenated with B
// NOTE(review): the m_Intrinsic match line (2769) is elided in this
// rendering; the match below binds the insert-element chain and its default.
2765 static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2766 IntrinsicInst &II) {
2767 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2768 if (!match(II.getOperand(0),
2770 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2771 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2772 return std::nullopt;
2773 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2774
2775 // Insert the scalars into a container ordered by InsertElement index
// Walk the insertelement chain backwards, recording each scalar by its
// lane index; unvisited lanes stay nullptr (treated as poison below).
2776 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2777 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2778 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2779 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2780 CurrentInsertElt = InsertElt->getOperand(0);
2781 }
2782
// Poison lanes may only be merged when both the chain base and the dupqlane
// default are poison.
2783 bool AllowPoison =
2784 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2785 if (!SimplifyValuePattern(Elts, AllowPoison))
2786 return std::nullopt;
2787
2788 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2789 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2790 for (size_t I = 0; I < Elts.size(); I++) {
2791 if (Elts[I] == nullptr)
2792 continue;
2793 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2794 IC.Builder.getInt64(I));
2795 }
2796 if (InsertEltChain == nullptr)
2797 return std::nullopt;
2798
2799 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2800 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2801 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2802 // be narrowed back to the original type.
2803 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2804 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2805 IIScalableTy->getMinNumElements() /
2806 PatternWidth;
2807
// Wide integer scalable type holding one repetition per element, and a
// matching all-zero shuffle mask used to splat element 0.
2808 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2809 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2810 auto *WideShuffleMaskTy =
2811 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2812
2813 auto InsertSubvector = IC.Builder.CreateInsertVector(
2814 II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
2815 uint64_t(0));
2816 auto WideBitcast =
2817 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2818 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2819 auto WideShuffle = IC.Builder.CreateShuffleVector(
2820 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2821 auto NarrowBitcast =
2822 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2823
2824 return IC.replaceInstUsesWith(II, NarrowBitcast);
2825 }
2826
2827static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2828 IntrinsicInst &II) {
2829 Value *A = II.getArgOperand(0);
2830 Value *B = II.getArgOperand(1);
2831 if (A == B)
2832 return IC.replaceInstUsesWith(II, A);
2833
2834 return std::nullopt;
2835}
2836
// Replace sve.srshl (rounding shift) with the plain sve.lsl when the shifted
// value comes from an sve.abs and the shift amount is provably non-negative,
// so SRSHL's rounding behaviour can never be observed.
// NOTE(review): the two m_Intrinsic match lines (2845/2847, presumably
// matching abs/abs_u feeding Vec) are elided in this rendering.
2837 static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2838 IntrinsicInst &II) {
2839 Value *Pred = II.getOperand(0);
2840 Value *Vec = II.getOperand(1);
2841 Value *Shift = II.getOperand(2);
2842
2843 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2844 Value *AbsPred, *MergedValue;
2846 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2848 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2849
2850 return std::nullopt;
2851
2852 // Transform is valid if any of the following are true:
2853 // * The ABS merge value is an undef or non-negative
2854 // * The ABS predicate is all active
2855 // * The ABS predicate and the SRSHL predicates are the same
2856 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2857 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2858 return std::nullopt;
2859
2860 // Only valid when the shift amount is non-negative, otherwise the rounding
2861 // behaviour of SRSHL cannot be ignored.
2862 if (!match(Shift, m_NonNegative()))
2863 return std::nullopt;
2864
2865 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2866 {II.getType()}, {Pred, Vec, Shift});
2867
2868 return IC.replaceInstUsesWith(II, LSL);
2869 }
2870
2871static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2872 IntrinsicInst &II) {
2873 Value *Vec = II.getOperand(0);
2874
2875 if (getSplatValue(Vec) == II.getOperand(1))
2876 return IC.replaceInstUsesWith(II, Vec);
2877
2878 return std::nullopt;
2879}
2880
// Remove an aarch64.dmb barrier when an identical barrier follows it with
// nothing in between that could touch memory or have side effects.
2881 static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2882 IntrinsicInst &II) {
2883 // If this barrier is post-dominated by identical one we can remove it
2884 auto *NI = II.getNextNode();
// Bound the scan so the walk stays cheap on long blocks.
2885 unsigned LookaheadThreshold = DMBLookaheadThreshold;
// Instructions that neither access memory nor have side effects cannot
// observe or be ordered by the barrier, so they may be skipped over.
2886 auto CanSkipOver = [](Instruction *I) {
2887 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2888 };
2889 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2890 auto *NIBB = NI->getParent();
2891 NI = NI->getNextNode();
// Ran off the end of the block: continue into a unique successor (its
// first real instruction), otherwise give up.
2892 if (!NI) {
2893 if (auto *SuccBB = NIBB->getUniqueSuccessor())
2894 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
2895 else
2896 break;
2897 }
2898 }
// Only erase when the next interesting instruction is a byte-for-byte
// identical intrinsic call (same barrier kind/operands).
2899 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2900 if (NextII && II.isIdenticalTo(NextII))
2901 return IC.eraseInstFromFunction(II);
2902
2903 return std::nullopt;
2904 }
2905
2906static std::optional<Instruction *> instCombineWhilelo(InstCombiner &IC,
2907 IntrinsicInst &II) {
2908 return IC.replaceInstUsesWith(
2909 II,
2910 IC.Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
2911 {II.getType(), II.getOperand(0)->getType()},
2912 {II.getOperand(0), II.getOperand(1)}));
2913}
2914
// Fold sve.ptrue to a constant all-ones predicate. The guarding condition
// (on elided line 2917) selects the case where every lane is known active —
// presumably the SV_ALL pattern; confirm against the full source.
2915 static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2916 IntrinsicInst &II) {
2918 return IC.replaceInstUsesWith(II, Constant::getAllOnesValue(II.getType()));
2919 return std::nullopt;
2920 }
2921
// Fold the SVE unsigned-extend intrinsics (uxtb/uxth/uxtw, NumBits = 8/16/32)
// into an AND with a low-bits mask, which is only legal when the passthru
// lanes are irrelevant: passthru is undef or the predicate is all active.
// (The IntrinsicInst &II parameter line, 2923, is elided in this rendering.)
2922 static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
2924 unsigned NumBits) {
2925 Value *Passthru = II.getOperand(0);
2926 Value *Pg = II.getOperand(1);
2927 Value *Op = II.getOperand(2);
2928
2929 // Convert UXT[BHW] to AND.
2930 if (isa<UndefValue>(Passthru) || isAllActivePredicate(Pg)) {
2931 auto *Ty = cast<VectorType>(II.getType());
// Mask keeps the low NumBits of each element, clearing the rest.
2932 auto MaskValue = APInt::getLowBitsSet(Ty->getScalarSizeInBits(), NumBits);
2933 auto *Mask = ConstantInt::get(Ty, MaskValue);
2934 auto *And = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_and_u, {Ty},
2935 {Pg, Op, Mask});
2936 return IC.replaceInstUsesWith(II, And);
2937 }
2938
2939 return std::nullopt;
2940 }
2941
// Fold sme.in.streaming.mode to a compile-time constant when the enclosing
// function's SME attributes pin the answer: a function with a streaming
// interface or body is always in streaming mode, and one without a
// streaming-compatible interface never is; only streaming-compatible
// functions must query at run time. (The function-name line, 2943, is
// elided in this rendering.)
2942 static std::optional<Instruction *>
2944 SMEAttrs FnSMEAttrs(*II.getFunction());
2945 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
2946 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
2947 return IC.replaceInstUsesWith(
2948 II, ConstantInt::getBool(II.getType(), IsStreaming));
2949 return std::nullopt;
2950 }
2951
// Target hook: AArch64-specific InstCombine for intrinsic calls. First runs
// the table-driven SVE simplifications, then dispatches on the intrinsic ID
// to the dedicated instCombine* helpers above. Returns the replacement
// instruction, or std::nullopt when no fold applies.
// (The signature line 2953 and the IInfo lookup line 2955 are elided in
// this rendering.)
2952 std::optional<Instruction *>
2954 IntrinsicInst &II) const {
// Generic SVE simplification (driven by per-intrinsic info) gets first shot.
2956 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
2957 return I;
2958
2959 Intrinsic::ID IID = II.getIntrinsicID();
2960 switch (IID) {
2961 default:
2962 break;
2963 case Intrinsic::aarch64_dmb:
2964 return instCombineDMB(IC, II);
2965 case Intrinsic::aarch64_neon_fmaxnm:
2966 case Intrinsic::aarch64_neon_fminnm:
2967 return instCombineMaxMinNM(IC, II);
2968 case Intrinsic::aarch64_sve_convert_from_svbool:
2969 return instCombineConvertFromSVBool(IC, II);
2970 case Intrinsic::aarch64_sve_dup:
2971 return instCombineSVEDup(IC, II);
2972 case Intrinsic::aarch64_sve_dup_x:
2973 return instCombineSVEDupX(IC, II);
2974 case Intrinsic::aarch64_sve_cmpne:
2975 case Intrinsic::aarch64_sve_cmpne_wide:
2976 return instCombineSVECmpNE(IC, II);
2977 case Intrinsic::aarch64_sve_rdffr:
2978 return instCombineRDFFR(IC, II);
2979 case Intrinsic::aarch64_sve_lasta:
2980 case Intrinsic::aarch64_sve_lastb:
2981 return instCombineSVELast(IC, II);
2982 case Intrinsic::aarch64_sve_clasta_n:
2983 case Intrinsic::aarch64_sve_clastb_n:
2984 return instCombineSVECondLast(IC, II);
// cnt[dwhb]: the constant is the per-128-bit element count (d=2 ... b=16).
2985 case Intrinsic::aarch64_sve_cntd:
2986 return instCombineSVECntElts(IC, II, 2);
2987 case Intrinsic::aarch64_sve_cntw:
2988 return instCombineSVECntElts(IC, II, 4);
2989 case Intrinsic::aarch64_sve_cnth:
2990 return instCombineSVECntElts(IC, II, 8);
2991 case Intrinsic::aarch64_sve_cntb:
2992 return instCombineSVECntElts(IC, II, 16);
2993 case Intrinsic::aarch64_sme_cntsd:
2994 return instCombineSMECntsd(IC, II, ST);
2995 case Intrinsic::aarch64_sve_ptest_any:
2996 case Intrinsic::aarch64_sve_ptest_first:
2997 case Intrinsic::aarch64_sve_ptest_last:
2998 return instCombineSVEPTest(IC, II);
2999 case Intrinsic::aarch64_sve_fadd:
3000 return instCombineSVEVectorFAdd(IC, II);
3001 case Intrinsic::aarch64_sve_fadd_u:
3002 return instCombineSVEVectorFAddU(IC, II);
3003 case Intrinsic::aarch64_sve_fmul_u:
3004 return instCombineSVEVectorBinOp(IC, II);
3005 case Intrinsic::aarch64_sve_fsub:
3006 return instCombineSVEVectorFSub(IC, II);
3007 case Intrinsic::aarch64_sve_fsub_u:
3008 return instCombineSVEVectorFSubU(IC, II);
3009 case Intrinsic::aarch64_sve_add:
3010 return instCombineSVEVectorAdd(IC, II);
// add_u/sub_u: try to fuse a feeding mul_u into mla_u/mls_u.
3011 case Intrinsic::aarch64_sve_add_u:
3012 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
3013 Intrinsic::aarch64_sve_mla_u>(
3014 IC, II, true);
3015 case Intrinsic::aarch64_sve_sub:
3016 return instCombineSVEVectorSub(IC, II);
3017 case Intrinsic::aarch64_sve_sub_u:
3018 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
3019 Intrinsic::aarch64_sve_mls_u>(
3020 IC, II, true);
3021 case Intrinsic::aarch64_sve_tbl:
3022 return instCombineSVETBL(IC, II);
3023 case Intrinsic::aarch64_sve_uunpkhi:
3024 case Intrinsic::aarch64_sve_uunpklo:
3025 case Intrinsic::aarch64_sve_sunpkhi:
3026 case Intrinsic::aarch64_sve_sunpklo:
3027 return instCombineSVEUnpack(IC, II);
3028 case Intrinsic::aarch64_sve_uzp1:
3029 return instCombineSVEUzp1(IC, II);
3030 case Intrinsic::aarch64_sve_zip1:
3031 case Intrinsic::aarch64_sve_zip2:
3032 return instCombineSVEZip(IC, II);
3033 case Intrinsic::aarch64_sve_ld1_gather_index:
3034 return instCombineLD1GatherIndex(IC, II);
3035 case Intrinsic::aarch64_sve_st1_scatter_index:
3036 return instCombineST1ScatterIndex(IC, II);
3037 case Intrinsic::aarch64_sve_ld1:
3038 return instCombineSVELD1(IC, II, DL);
3039 case Intrinsic::aarch64_sve_st1:
3040 return instCombineSVEST1(IC, II, DL);
3041 case Intrinsic::aarch64_sve_sdiv:
3042 return instCombineSVESDIV(IC, II);
3043 case Intrinsic::aarch64_sve_sel:
3044 return instCombineSVESel(IC, II);
3045 case Intrinsic::aarch64_sve_srshl:
3046 return instCombineSVESrshl(IC, II);
3047 case Intrinsic::aarch64_sve_dupq_lane:
3048 return instCombineSVEDupqLane(IC, II);
3049 case Intrinsic::aarch64_sve_insr:
3050 return instCombineSVEInsr(IC, II);
3051 case Intrinsic::aarch64_sve_whilelo:
3052 return instCombineWhilelo(IC, II);
3053 case Intrinsic::aarch64_sve_ptrue:
3054 return instCombinePTrue(IC, II);
// uxt[bhw]: the constant is the number of low bits preserved.
3055 case Intrinsic::aarch64_sve_uxtb:
3056 return instCombineSVEUxt(IC, II, 8);
3057 case Intrinsic::aarch64_sve_uxth:
3058 return instCombineSVEUxt(IC, II, 16);
3059 case Intrinsic::aarch64_sve_uxtw:
3060 return instCombineSVEUxt(IC, II, 32);
3061 case Intrinsic::aarch64_sme_in_streaming_mode:
3062 return instCombineInStreamingMode(IC, II);
3063 }
3064
3065 return std::nullopt;
3066 }
3067
// Demanded-elements simplification for AArch64 intrinsics: for the NEON
// narrowing intrinsics listed below, the original demanded-elements mask is
// forwarded to operand 0 so dead lanes of the input can be simplified away.
// Always returns std::nullopt (no full replacement is produced here).
// (The function signature start, line 3068, is elided in this rendering.)
3069 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
3070 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
3071 std::function<void(Instruction *, unsigned, APInt, APInt &)>
3072 SimplifyAndSetOp) const {
3073 switch (II.getIntrinsicID()) {
3074 default:
3075 break;
// Narrowing conversions/shifts: result lane i depends on input lane i only,
// so the demanded mask carries over to the sole vector operand.
3076 case Intrinsic::aarch64_neon_fcvtxn:
3077 case Intrinsic::aarch64_neon_rshrn:
3078 case Intrinsic::aarch64_neon_sqrshrn:
3079 case Intrinsic::aarch64_neon_sqrshrun:
3080 case Intrinsic::aarch64_neon_sqshrn:
3081 case Intrinsic::aarch64_neon_sqshrun:
3082 case Intrinsic::aarch64_neon_sqxtn:
3083 case Intrinsic::aarch64_neon_sqxtun:
3084 case Intrinsic::aarch64_neon_uqrshrn:
3085 case Intrinsic::aarch64_neon_uqshrn:
3086 case Intrinsic::aarch64_neon_uqxtn:
3087 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
3088 break;
3089 }
3090
3091 return std::nullopt;
3092 }
3093
// NOTE(review): the declaration line of this predicate (3094) and the tail
// of its condition (3096) are elided in this rendering. From the visible
// body it returns true when full SVE is available, or when streaming-SVE is
// available and some additional (elided) condition holds — confirm the name
// and the missing operand against the full source.
3095 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3097 }
3098
// Report the register width for the requested register class. The case
// labels (elided lines 3102/3104/3113) cover the scalar, fixed-width vector
// and scalable vector register kinds respectively.
3101 switch (K) {
// Scalar general-purpose registers are 64 bits.
3103 return TypeSize::getFixed(64);
// Fixed-width vectors: prefer the configured minimum SVE width when SVE for
// fixed-length vectors is usable (full SVE, or streaming mode with the
// fixed-width autovec flag), otherwise 128-bit NEON, otherwise none.
3105 if (ST->useSVEForFixedLengthVectors() &&
3106 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
3107 return TypeSize::getFixed(
3108 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
3109 else if (ST->isNeonAvailable())
3110 return TypeSize::getFixed(128);
3111 else
3112 return TypeSize::getFixed(0);
// Scalable vectors: SVE's architectural minimum of 128 bits when usable
// (second operand of the && is on elided line 3115), otherwise none.
3114 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3116 return TypeSize::getScalable(128);
3117 else
3118 return TypeSize::getScalable(0);
3119 }
3120 llvm_unreachable("Unsupported register kind");
3121 }
3122
// Returns true when (Opcode DstTy a, ext(b)) maps onto a single AArch64
// widening instruction where only ONE operand is extended (uaddw/saddw for
// add, usubw/ssubw for sub), making that extend free. SrcOverrideTy, when
// non-null, substitutes for the extend's source vector type.
3123 bool AArch64TTIImpl::isSingleExtWideningInstruction(
3124 unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
3125 Type *SrcOverrideTy) const {
3126 // A helper that returns a vector type from the given type. The number of
3127 // elements in type Ty determines the vector width.
3128 auto toVectorTy = [&](Type *ArgTy) {
3129 return VectorType::get(ArgTy->getScalarType(),
3130 cast<VectorType>(DstTy)->getElementCount());
3131 };
3132
3133 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3134 // i32, i64]. SVE doesn't generally have the same set of instructions to
3135 // perform an extend with the add/sub/mul. There are SMULLB style
3136 // instructions, but they operate on top/bottom, requiring some sort of lane
3137 // interleaving to be used with zext/sext.
3138 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3139 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3140 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3141 return false;
3142
3143 Type *SrcTy = SrcOverrideTy;
3144 switch (Opcode) {
3145 case Instruction::Add: // UADDW(2), SADDW(2).
3146 case Instruction::Sub: { // USUBW(2), SSUBW(2).
3147 // The second operand needs to be an extend
3148 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
3149 if (!SrcTy)
3150 SrcTy =
3151 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
3152 break;
3153 }
3154
// USUBW only widens the second operand, so a sub with the extend on the
// first operand cannot use the widening form.
3155 if (Opcode == Instruction::Sub)
3156 return false;
3157
3158 // UADDW(2), SADDW(2) can be commutted.
3159 if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) {
3160 if (!SrcTy)
3161 SrcTy =
3162 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
3163 break;
3164 }
3165 return false;
3166 }
3167 default:
3168 return false;
3169 }
3170
3171 // Legalize the destination type and ensure it can be used in a widening
3172 // operation.
3173 auto DstTyL = getTypeLegalizationCost(DstTy);
3174 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
3175 return false;
3176
3177 // Legalize the source type and ensure it can be used in a widening
3178 // operation.
3179 assert(SrcTy && "Expected some SrcTy");
3180 auto SrcTyL = getTypeLegalizationCost(SrcTy);
3181 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3182 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
3183 return false;
3184
3185 // Get the total number of vector elements in the legalized types.
3186 InstructionCost NumDstEls =
3187 DstTyL.first * DstTyL.second.getVectorMinNumElements();
3188 InstructionCost NumSrcEls =
3189 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3190
3191 // Return true if the legalized types have the same number of vector elements
3192 // and the destination element type size is twice that of the source type.
3193 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
3194 }
3195
// If (Opcode DstTy ext(a), ext(b)) corresponds to an AArch64 instruction
// that extends BOTH operands (uaddl/saddl, usubl/ssubl, umull/smull
// families), return the extended type the operands would be widened to
// (making the explicit extends free); otherwise return nullptr.
// SrcOverrideTy, when non-null, overrides the extends' source scalar size.
// (The ArrayRef<const Value *> Args parameter line, 3197, is elided in
// this rendering.)
3196 Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
3198 Type *SrcOverrideTy) const {
3199 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
3200 Opcode != Instruction::Mul)
3201 return nullptr;
3202
3203 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3204 // i32, i64]. SVE doesn't generally have the same set of instructions to
3205 // perform an extend with the add/sub/mul. There are SMULLB style
3206 // instructions, but they operate on top/bottom, requiring some sort of lane
3207 // interleaving to be used with zext/sext.
3208 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3209 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3210 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3211 return nullptr;
3212
// Scalar width of an extend's input, unless overridden by SrcOverrideTy.
3213 auto getScalarSizeWithOverride = [&](const Value *V) {
3214 if (SrcOverrideTy)
3215 return SrcOverrideTy->getScalarSizeInBits();
3216 return cast<Instruction>(V)
3217 ->getOperand(0)
3218 ->getType()
3219 ->getScalarSizeInBits();
3220 };
3221
// First case: both operands extended with the SAME signedness.
3222 unsigned MaxEltSize = 0;
3223 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
3224 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
3225 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3226 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3227 MaxEltSize = std::max(EltSize0, EltSize1);
// Second case: mixed sext/zext (only reached when signedness differs,
// since the equal-signedness case was handled above).
3228 } else if (isa<SExtInst, ZExtInst>(Args[0]) &&
3229 isa<SExtInst, ZExtInst>(Args[1])) {
3230 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3231 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3232 // mul(sext, zext) will become smull(sext, zext) if the extends are large
3233 // enough.
3234 if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
3235 return nullptr;
3236 MaxEltSize = DstEltSize / 2;
3237 } else if (Opcode == Instruction::Mul &&
3238 (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) {
3239 // If one of the operands is a Zext and the other has enough zero bits
3240 // to be treated as unsigned, we can still generate a umull, meaning the
3241 // zext is free.
3242 KnownBits Known =
3243 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
3244 if (Args[0]->getType()->getScalarSizeInBits() -
3245 Known.Zero.countLeadingOnes() >
3246 DstTy->getScalarSizeInBits() / 2)
3247 return nullptr;
3248
3249 MaxEltSize =
3250 getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? Args[0] : Args[1]);
3251 } else
3252 return nullptr;
3253
// The widening form doubles the element size exactly once.
3254 if (MaxEltSize * 2 > DstEltSize)
3255 return nullptr;
3256
// Types no wider than 64 bits fit a single register and don't need the
// widening form.
3257 Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2);
3258 if (ExtTy->getPrimitiveSizeInBits() <= 64)
3259 return nullptr;
3260 return ExtTy;
3261 }
3262
3263 // s/urhadd instructions implement the following pattern, making the
3264 // extends free:
3265 // %x = add ((zext i8 -> i16), 1)
3266 // %y = (zext i8 -> i16)
3267 // trunc i16 (lshr (add %x, %y), 1) -> i8
3268 //
// Returns true when ExtUser (an add fed by an extend) is part of the
// rounding-halving-add pattern above, so the extend costs nothing.
// (The signature start, line 3269, is elided in this rendering; from the
// body the parameters include the using instruction `ExtUser` and the
// extend's source type `Src` — confirm against the full source.)
3270 Type *Src) const {
3271 // The source should be a legal vector type.
// Scalable sources additionally require SVE2, which provides the
// s/urhadd forms used for this pattern.
3272 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
3273 (Src->isScalableTy() && !ST->hasSVE2()))
3274 return false;
3275
3276 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
3277 return false;
3278
3279 // Look for trunc/shl/add before trying to match the pattern.
// The +1 may live in a separate add, so step to an add user if present.
3280 const Instruction *Add = ExtUser;
3281 auto *AddUser =
3282 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3283 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3284 Add = AddUser;
3285
3286 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3287 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3288 return false;
3289
// The trunc must narrow back to exactly the extend's source width for the
// round-trip to be a genuine halving-add.
3290 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
3291 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3292 Src->getScalarSizeInBits() !=
3293 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
3294 return false;
3295
3296 // Try to match the whole pattern. Ext could be either the first or second
3297 // m_ZExtOrSExt matched.
3298 Instruction *Ex1, *Ex2;
3299 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
3300 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
3301 return false;
3302
3303 // Ensure both extends are of the same type
3304 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
3305 Ex1->getOpcode() == Ex2->getOpcode())
3306 return true;
3307
3308 return false;
3309 }
3310
3312 Type *Src,
3315 const Instruction *I) const {
3316 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3317 assert(ISD && "Invalid opcode");
3318 // If the cast is observable, and it is used by a widening instruction (e.g.,
3319 // uaddl, saddw, etc.), it may be free.
3320 if (I && I->hasOneUser()) {
3321 auto *SingleUser = cast<Instruction>(*I->user_begin());
3322 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3323 if (Type *ExtTy = isBinExtWideningInstruction(
3324 SingleUser->getOpcode(), Dst, Operands,
3325 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3326 // The cost from Src->Src*2 needs to be added if required, the cost from
3327 // Src*2->ExtTy is free.
3328 if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) {
3329 Type *DoubleSrcTy =
3330 Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2);
3331 return getCastInstrCost(Opcode, DoubleSrcTy, Src,
3333 }
3334
3335 return 0;
3336 }
3337
3338 if (isSingleExtWideningInstruction(
3339 SingleUser->getOpcode(), Dst, Operands,
3340 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3341 // For adds only count the second operand as free if both operands are
3342 // extends but not the same operation. (i.e both operands are not free in
3343 // add(sext, zext)).
3344 if (SingleUser->getOpcode() == Instruction::Add) {
3345 if (I == SingleUser->getOperand(1) ||
3346 (isa<CastInst>(SingleUser->getOperand(1)) &&
3347 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
3348 return 0;
3349 } else {
3350 // Others are free so long as isSingleExtWideningInstruction
3351 // returned true.
3352 return 0;
3353 }
3354 }
3355
3356 // The cast will be free for the s/urhadd instructions
3357 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
3358 isExtPartOfAvgExpr(SingleUser, Dst, Src))
3359 return 0;
3360 }
3361
3362 EVT SrcTy = TLI->getValueType(DL, Src);
3363 EVT DstTy = TLI->getValueType(DL, Dst);
3364
3365 if (!SrcTy.isSimple() || !DstTy.isSimple())
3366 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
3367
3368 // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
3369 // we use fcvtx under SVE2. Give them invalid costs.
3370 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3371 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3372 DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3374
3375 static const TypeConversionCostTblEntry BF16Tbl[] = {
3376 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
3377 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
3378 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
3379 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
3380 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
3381 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
3382 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3383 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt
3384 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
3385 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
3386 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
3387 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1
3388 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
3389 };
3390
3391 if (ST->hasBF16())
3392 if (const auto *Entry = ConvertCostTableLookup(
3393 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3394 return Entry->Cost;
3395
3396 // Symbolic constants for the SVE sitofp/uitofp entries in the table below
3397 // The cost of unpacking twice is artificially increased for now in order
3398 // to avoid regressions against NEON, which will use tbl instructions directly
3399 // instead of multiple layers of [s|u]unpk[lo|hi].
3400 // We use the unpacks in cases where the destination type is illegal and
3401 // requires splitting of the input, even if the input type itself is legal.
3402 const unsigned int SVE_EXT_COST = 1;
3403 const unsigned int SVE_FCVT_COST = 1;
3404 const unsigned int SVE_UNPACK_ONCE = 4;
3405 const unsigned int SVE_UNPACK_TWICE = 16;
3406
3407 static const TypeConversionCostTblEntry ConversionTbl[] = {
3408 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
3409 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
3410 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
3411 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
3412 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
3413 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
3414 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
3415 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
3416 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
3417 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
3418 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
3419 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
3420 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
3421 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
3422 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
3423 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
3424 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
3425 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
3426 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
3427 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
3428
3429 // Truncations on nxvmiN
3430 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
3431 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
3432 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
3433 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
3434 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
3435 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
3436 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
3437 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
3438 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
3439 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
3440 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
3441 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
3442 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
3443 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
3444 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
3445 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
3446 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
3447 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
3448 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
3449 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
3450 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
3451 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
3452 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
3453 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
3454 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
3455 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
3456 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
3457 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
3458 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
3459 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
3460 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
3461 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
3462 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
3463
3464 // The number of shll instructions for the extension.
3465 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3466 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3467 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3468 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3469 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3470 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3471 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3472 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3473 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3474 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3475 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3476 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3477 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3478 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3479 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3480 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3481
3482 // FP Ext and trunc
3483 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
3484 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
3485 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
3486 // FP16
3487 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
3488 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
3489 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
3490 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
3491 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
3492 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
3493 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
3494 // BF16 (uses shift)
3495 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
3496 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
3497 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
3498 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
3499 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
3500 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
3501 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
3502 // FP Ext and trunc
3503 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
3504 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
3505 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
3506 // FP16
3507 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
3508 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
3509 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
3510 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
3511 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
3512 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
3513 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
3514 // BF16 (more complex, with +bf16 is handled above)
3515 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
3516 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
3517 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
3518 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
3519 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
3520 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
3521 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
3522 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
3523
3524 // LowerVectorINT_TO_FP:
3525 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3526 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3527 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3528 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3529 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3530 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3531
3532 // SVE: to nxv2f16
3533 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3534 SVE_EXT_COST + SVE_FCVT_COST},
3535 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3536 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3537 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3538 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3539 SVE_EXT_COST + SVE_FCVT_COST},
3540 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3541 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3542 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3543
3544 // SVE: to nxv4f16
3545 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3546 SVE_EXT_COST + SVE_FCVT_COST},
3547 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3548 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3549 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3550 SVE_EXT_COST + SVE_FCVT_COST},
3551 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3552 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3553
3554 // SVE: to nxv8f16
3555 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3556 SVE_EXT_COST + SVE_FCVT_COST},
3557 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3558 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3559 SVE_EXT_COST + SVE_FCVT_COST},
3560 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3561
3562 // SVE: to nxv16f16
3563 {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3564 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3565 {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3566 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3567
3568 // Complex: to v2f32
3569 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3570 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3571 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3572 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3573
3574 // SVE: to nxv2f32
3575 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3576 SVE_EXT_COST + SVE_FCVT_COST},
3577 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3578 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3579 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3580 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3581 SVE_EXT_COST + SVE_FCVT_COST},
3582 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3583 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3584 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3585
3586 // Complex: to v4f32
3587 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
3588 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3589 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
3590 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3591
3592 // SVE: to nxv4f32
3593 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3594 SVE_EXT_COST + SVE_FCVT_COST},
3595 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3596 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3597 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3598 SVE_EXT_COST + SVE_FCVT_COST},
3599 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3600 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3601
3602 // Complex: to v8f32
3603 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3604 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3605 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3606 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3607
3608 // SVE: to nxv8f32
3609 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3610 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3611 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3612 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3613 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3614 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3615 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3616 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3617
3618 // SVE: to nxv16f32
3619 {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3620 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3621 {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3622 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3623
3624 // Complex: to v16f32
3625 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3626 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3627
3628 // Complex: to v2f64
3629 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3630 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3631 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3632 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3633 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3634 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3635
3636 // SVE: to nxv2f64
3637 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3638 SVE_EXT_COST + SVE_FCVT_COST},
3639 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3640 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3641 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3642 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3643 SVE_EXT_COST + SVE_FCVT_COST},
3644 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3645 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3646 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3647
3648 // Complex: to v4f64
3649 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3650 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3651
3652 // SVE: to nxv4f64
3653 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3654 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3655 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3656 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3657 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3658 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3659 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3660 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3661 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3662 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3663 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3664 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3665
3666 // SVE: to nxv8f64
3667 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3668 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3669 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3670 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3671 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3672 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3673 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3674 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3675
3676 // LowerVectorFP_TO_INT
3677 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
3678 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
3679 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
3680 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
3681 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
3682 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
3683
3684 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3685 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
3686 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
3687 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
3688 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
3689 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
3690 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
3691
3692 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3693 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
3694 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
3695 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
3696 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
3697
3698 // Complex, from nxv2f32.
3699 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3700 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3701 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3702 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3703 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3704 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3705 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3706 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3707
3708 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3709 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
3710 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
3711 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
3712 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
3713 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
3714 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
3715
3716 // Complex, from nxv2f64.
3717 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3718 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3719 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3720 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3721 {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3722 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3723 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3724 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3725 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3726 {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3727
3728 // Complex, from nxv4f32.
3729 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3730 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3731 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3732 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3733 {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3734 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3735 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3736 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3737 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3738 {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3739
3740 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3741 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3742 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3743 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3744 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3745
3746 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3747 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3748 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3749 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3750 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3751 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3752 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3753
3754 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3755 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3756 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3757 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3758 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3759
3760 // Complex, from nxv8f16.
3761 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3762 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3763 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3764 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3765 {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3766 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3767 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3768 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3769 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3770 {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3771
3772 // Complex, from nxv4f16.
3773 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3774 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3775 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3776 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3777 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3778 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3779 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3780 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3781
3782 // Complex, from nxv2f16.
3783 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3784 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3785 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3786 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3787 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3788 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3789 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3790 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3791
3792 // Truncate from nxvmf32 to nxvmf16.
3793 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3794 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3795 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3796
3797 // Truncate from nxvmf32 to nxvmbf16.
3798 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8},
3799 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8},
3800 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17},
3801
3802 // Truncate from nxvmf64 to nxvmf16.
3803 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3804 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3805 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3806
3807 // Truncate from nxvmf64 to nxvmbf16.
3808 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9},
3809 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19},
3810 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39},
3811
3812 // Truncate from nxvmf64 to nxvmf32.
3813 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3814 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3815 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3816
3817 // Extend from nxvmf16 to nxvmf32.
3818 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3819 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3820 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3821
3822 // Extend from nxvmbf16 to nxvmf32.
3823 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
3824 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
3825 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl
3826
3827 // Extend from nxvmf16 to nxvmf64.
3828 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3829 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3830 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3831
3832 // Extend from nxvmbf16 to nxvmf64.
3833 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt
3834 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt
3835 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt
3836
3837 // Extend from nxvmf32 to nxvmf64.
3838 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3839 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3840 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3841
3842 // Bitcasts from float to integer
3843 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3844 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3845 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3846
3847 // Bitcasts from integer to float
3848 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3849 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3850 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3851
3852 // Add cost for extending to illegal -too wide- scalable vectors.
3853 // zero/sign extend are implemented by multiple unpack operations,
3854 // where each operation has a cost of 1.
3855 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3856 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3857 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3858 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3859 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3860 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3861
3862 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3863 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3864 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3865 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3866 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3867 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3868 };
3869
3870 // We have to estimate a cost of fixed length operation upon
3871 // SVE registers(operations) with the number of registers required
3872 // for a fixed type to be represented upon SVE registers.
3873 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3874 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3875 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3876 ST->useSVEForFixedLengthVectors(WiderTy)) {
3877 std::pair<InstructionCost, MVT> LT =
3878 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3879 unsigned NumElements =
3880 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3881 return LT.first *
3883 Opcode,
3884 ScalableVectorType::get(Dst->getScalarType(), NumElements),
3885 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3886 CostKind, I);
3887 }
3888
3889 if (const auto *Entry = ConvertCostTableLookup(
3890 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3891 return Entry->Cost;
3892
3893 static const TypeConversionCostTblEntry FP16Tbl[] = {
3894 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3895 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3896 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3897 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3898 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3899 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3900 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3901 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3902 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3903 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3904 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3905 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3906 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3907 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3908 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3909 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3910 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3911 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3912 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3913 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3914 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3915 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3916 };
3917
3918 if (ST->hasFullFP16())
3919 if (const auto *Entry = ConvertCostTableLookup(
3920 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3921 return Entry->Cost;
3922
3923 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3924 // double-rounding issues.
3925 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3926 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3928 return cast<FixedVectorType>(Dst)->getNumElements() *
3929 getCastInstrCost(Opcode, Dst->getScalarType(),
3930 Src->getScalarType(), CCH, CostKind) +
3932 true, CostKind) +
3934 false, CostKind);
3935
3936 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3938 ST->isSVEorStreamingSVEAvailable() &&
3939 TLI->getTypeAction(Src->getContext(), SrcTy) ==
3941 TLI->getTypeAction(Dst->getContext(), DstTy) ==
3943 // The standard behaviour in the backend for these cases is to split the
3944 // extend up into two parts:
3945 // 1. Perform an extending load or masked load up to the legal type.
3946 // 2. Extend the loaded data to the final type.
3947 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3948 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3950 Opcode, LegalTy, Src, CCH, CostKind, I);
3952 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
3953 return Part1 + Part2;
3954 }
3955
3956 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3957 // but we also want to include the TTI::CastContextHint::Masked case too.
3958 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3960 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
3962
3963 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
3964}
3965
3968 VectorType *VecTy, unsigned Index,
  // NOTE(review): this rendering truncates the signature; upstream this is
  // AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst, ...)
  // returning InstructionCost — confirm against the real source.
  //
  // Models the combined cost of extracting element `Index` from `VecTy` and
  // sign/zero-extending the result to scalar type `Dst`. On AArch64 the
  // extension is often folded into the element move (smov/umov), making the
  // extend itself free.
3970
3971 // Make sure we were given a valid extend opcode.
3972 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3973 "Invalid opcode");
3974
3975 // We are extending an element we extract from a vector, so the source type
3976 // of the extend is the element type of the vector.
3977 auto *Src = VecTy->getElementType();
3978
3979 // Sign- and zero-extends are for integer types only.
3980 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3981
3982 // Get the cost for the extract. We compute the cost (if any) for the extend
3983 // below.
3984 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
3985 CostKind, Index, nullptr, nullptr);
3986
3987 // Legalize the types.
3988 auto VecLT = getTypeLegalizationCost(VecTy);
3989 auto DstVT = TLI->getValueType(DL, Dst);
3990 auto SrcVT = TLI->getValueType(DL, Src);
3991
3992 // If the resulting type is still a vector and the destination type is legal,
3993 // we may get the extension for free. If not, get the default cost for the
3994 // extend.
3995 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
3996 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3997 CostKind);
3998
3999 // The destination type should be larger than the element type. If not, get
4000 // the default cost for the extend.
4001 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
4002 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4003 CostKind);
4004
4005 switch (Opcode) {
4006 default:
4007 llvm_unreachable("Opcode should be either SExt or ZExt");
4008
4009 // For sign-extends, we only need a smov, which performs the extension
4010 // automatically.
4011 case Instruction::SExt:
4012 return Cost;
4013
4014 // For zero-extends, the extend is performed automatically by a umov unless
4015 // the destination type is i64 and the element type is i8 or i16.
4016 case Instruction::ZExt:
  // umov to a 32-bit GPR implicitly zeroes the upper bits, so a 32-bit
  // source widened to i64 is still free; only i8/i16 -> i64 pays extra.
4017 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
4018 return Cost;
4019 }
4020
4021 // If we are unable to perform the extend for free, get the default cost.
4022 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4023 CostKind);
4024 }
4025
4028 const Instruction *I) const {
  // Cost of control-flow instructions (PHI, branch, etc.).
  // NOTE(review): a guard line (doxygen line 4029) is missing from this
  // rendering; upstream it is `if (CostKind != TTI::TCK_RecipThroughput)`,
  // making the next `return` the non-throughput path — confirm before relying
  // on this text. For those cost kinds PHIs are free and other CF costs 1.
4030 return Opcode == Instruction::PHI ? 0 : 1;
4031 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
4032 // Branches are assumed to be predicted.
4033 return 0;
4034 }
4035
4036InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
4037 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4038 const Instruction *I, Value *Scalar,
4039 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
4040 TTI::VectorInstrContext VIC) const {
  // Shared implementation behind the getVectorInstrCost overloads: estimates
  // the cost of an insertelement/extractelement on vector type `Val` at lane
  // `Index` (-1U means unknown lane). Exactly one of `I` (the concrete
  // instruction) or `Scalar` + `ScalarUserAndIdx` (SLP-style use context) may
  // be provided to enable the extract+fmul fusion heuristic below.
4041 assert(Val->isVectorTy() && "This must be a vector type");
4042
4043 if (Index != -1U) {
4044 // Legalize the type.
4045 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4046
4047 // This type is legalized to a scalar type.
4048 if (!LT.second.isVector())
4049 return 0;
4050
4051 // The type may be split. For fixed-width vectors we can normalize the
4052 // index to the new type.
4053 if (LT.second.isFixedLengthVector()) {
4054 unsigned Width = LT.second.getVectorNumElements();
4055 Index = Index % Width;
4056 }
4057
4058 // The element at index zero is already inside the vector.
4059 // - For an insert-element or extract-element
4060 // instruction that extracts integers, an explicit FPR -> GPR move is
4061 // needed. So it has non-zero cost.
4062 if (Index == 0 && !Val->getScalarType()->isIntegerTy())
4063 return 0;
4064
4065 // This is recognizing a LD1 single-element structure to one lane of one
4066 // register instruction. I.e., if this is an `insertelement` instruction,
4067 // and its second operand is a load, then we will generate a LD1, which
4068 // are expensive instructions on some uArchs.
4069 if (VIC == TTI::VectorInstrContext::Load) {
4070 if (ST->hasFastLD1Single())
4071 return 0;
4072 return CostKind == TTI::TCK_CodeSize
4073 ? 0
  // NOTE(review): the non-CodeSize arm (doxygen line 4074) is missing from
  // this rendering; upstream it is the subtarget's insert/extract base cost
  // — confirm against the real source.
4075 }
4076
4077 // i1 inserts and extract will include an extra cset or cmp of the vector
4078 // value. Increase the cost by 1 to account.
4079 if (Val->getScalarSizeInBits() == 1)
4080 return CostKind == TTI::TCK_CodeSize
4081 ? 2
4082 : ST->getVectorInsertExtractBaseCost() + 1;
4083
4084 // FIXME:
4085 // If the extract-element and insert-element instructions could be
4086 // simplified away (e.g., could be combined into users by looking at use-def
4087 // context), they have no cost. This is not done in the first place for
4088 // compile-time considerations.
4089 }
4090
4091 // In case of Neon, if there exists extractelement from lane != 0 such that
4092 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
4093 // 2. extractelement result feeds into fmul.
4094 // 3. Other operand of fmul is an extractelement from lane 0 or lane
4095 // equivalent to 0.
4096 // then the extractelement can be merged with fmul in the backend and it
4097 // incurs no cost.
4098 // e.g.
4099 // define double @foo(<2 x double> %a) {
4100 // %1 = extractelement <2 x double> %a, i32 0
4101 // %2 = extractelement <2 x double> %a, i32 1
4102 // %res = fmul double %1, %2
4103 // ret double %res
4104 // }
4105 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
4106 auto ExtractCanFuseWithFmul = [&]() {
4107 // We bail out if the extract is from lane 0.
4108 if (Index == 0)
4109 return false;
4110
4111 // Check if the scalar element type of the vector operand of ExtractElement
4112 // instruction is one of the allowed types.
4113 auto IsAllowedScalarTy = [&](const Type *T) {
4114 return T->isFloatTy() || T->isDoubleTy() ||
4115 (T->isHalfTy() && ST->hasFullFP16());
4116 };
4117
4118 // Check if the extractelement user is scalar fmul.
4119 auto IsUserFMulScalarTy = [](const Value *EEUser) {
4120 // Check if the user is scalar fmul.
4121 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
4122 return BO && BO->getOpcode() == BinaryOperator::FMul &&
4123 !BO->getType()->isVectorTy();
4124 };
4125
4126 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
4127 // certain scalar type and a certain vector register width.
4128 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
  // A lane whose bit offset is a multiple of the register width starts a
  // new register after legalization, so it behaves like lane 0.
4129 auto RegWidth =
4131 .getFixedValue();
4132 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
4133 };
4134
4135 // Check if the type constraints on input vector type and result scalar type
4136 // of extractelement instruction are satisfied.
4137 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
4138 return false;
4139
4140 if (Scalar) {
4141 DenseMap<User *, unsigned> UserToExtractIdx;
4142 for (auto *U : Scalar->users()) {
4143 if (!IsUserFMulScalarTy(U))
4144 return false;
4145 // Recording entry for the user is important. Index value is not
4146 // important.
4147 UserToExtractIdx[U];
4148 }
4149 if (UserToExtractIdx.empty())
4150 return false;
4151 for (auto &[S, U, L] : ScalarUserAndIdx) {
4152 for (auto *U : S->users()) {
4153 if (UserToExtractIdx.contains(U)) {
4154 auto *FMul = cast<BinaryOperator>(U);
4155 auto *Op0 = FMul->getOperand(0);
4156 auto *Op1 = FMul->getOperand(1);
  // NOTE(review): this condition appears tautological (it is true for
  // every combination of Op0/Op1 equal or not equal to S), so the lane
  // is recorded unconditionally — verify against upstream intent.
4157 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
4158 UserToExtractIdx[U] = L;
4159 break;
4160 }
4161 }
4162 }
4163 }
4164 for (auto &[U, L] : UserToExtractIdx) {
4165 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
4166 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
4167 return false;
4168 }
4169 } else {
4170 const auto *EE = cast<ExtractElementInst>(I);
4171
4172 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
4173 if (!IdxOp)
4174 return false;
4175
4176 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
4177 if (!IsUserFMulScalarTy(U))
4178 return false;
4179
4180 // Check if the other operand of extractelement is also extractelement
4181 // from lane equivalent to 0.
4182 const auto *BO = cast<BinaryOperator>(U);
4183 const auto *OtherEE = dyn_cast<ExtractElementInst>(
4184 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
4185 if (OtherEE) {
4186 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
4187 if (!IdxOp)
4188 return false;
4189 return IsExtractLaneEquivalentToZero(
4190 cast<ConstantInt>(OtherEE->getIndexOperand())
4191 ->getValue()
4192 .getZExtValue(),
4193 OtherEE->getType()->getScalarSizeInBits());
4194 }
4195 return true;
4196 });
4197 }
4198 return true;
4199 };
4200
4201 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
4202 ExtractCanFuseWithFmul())
4203 return 0;
4204
4205 // All other insert/extracts cost this much.
4206 return CostKind == TTI::TCK_CodeSize ? 1
4207 : ST->getVectorInsertExtractBaseCost();
4208 }
4209
4211 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4212 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
  // NOTE(review): the function-name line is missing from this rendering;
  // from the parameter list and body this is the
  // AArch64TTIImpl::getVectorInstrCost overload taking operand Values.
4213 // Treat insert at lane 0 into a poison vector as having zero cost. This
4214 // ensures vector broadcasts via an insert + shuffle (and will be lowered to a
4215 // single dup) are treated as cheap.
4216 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
4217 isa<PoisonValue>(Op0))
4218 return 0;
  // Everything else defers to the shared helper (no instruction/use context).
4219 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr,
4220 nullptr, {}, VIC);
4221 }
4222
4224 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4225 Value *Scalar, ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
4226 TTI::VectorInstrContext VIC) const {
  // Overload that supplies SLP-style use context: `Scalar` is the value being
  // inserted/extracted and `ScalarUserAndIdx` lists (scalar, user, lane)
  // tuples, which the helper uses for the extract+fmul fusion heuristic.
  // NOTE(review): the function-name line is missing from this rendering.
4227 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
4228 ScalarUserAndIdx, VIC);
4229 }
4230
4233 TTI::TargetCostKind CostKind, unsigned Index,
4234 TTI::VectorInstrContext VIC) const {
  // Overload taking the concrete Instruction `I`; passes it through so the
  // helper can inspect the actual extract's users.
  // NOTE(review): the signature's first line is missing from this rendering.
4235 return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I,
4236 nullptr, {}, VIC);
4237 }
4238
4242 unsigned Index) const {
  // NOTE(review): the function name and the fixed-vector return line (doxygen
  // line 4244) are missing from this rendering — confirm against upstream.
  // For fixed-length vectors this delegates to the base implementation below;
  // the remaining path costs extracting an element from a scalable vector.
4243 if (isa<FixedVectorType>(Val))
4245 Index);
4246
4247 // This typically requires both while and lastb instructions in order
4248 // to extract the last element. If this is in a loop the while
4249 // instruction can at least be hoisted out, although it will consume a
4250 // predicate register. The cost should be more expensive than the base
4251 // extract cost, which is 2 for most CPUs.
4252 return CostKind == TTI::TCK_CodeSize
4253 ? 2
4254 : ST->getVectorInsertExtractBaseCost() + 1;
4255 }
4256
4258 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4259 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
4260 TTI::VectorInstrContext VIC) const {
  // Estimates the cost of moving the demanded elements of `Ty` between vector
  // and scalar form: one insert/extract base cost per demanded element per
  // requested direction. Floating-point element types defer to the base
  // implementation (their moves can stay within FP registers).
  // NOTE(review): lines 4261-4262 of the original are missing from this
  // rendering (upstream they guard/normalize the cost-kind path) — confirm.
4263 if (Ty->getElementType()->isFloatingPointTy())
4264 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
4265 CostKind);
4266 unsigned VecInstCost =
4267 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
  // (Insert + Extract) counts the directions requested: 0, 1, or 2.
4268 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4269 }
4270
4271std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
  // Computes the extra cost of emulating an f16/bf16 operation by promoting
  // its operands to f32, running `InstCost` on the promoted type, and (when
  // `IncludeTrunc`) truncating back. Returns std::nullopt when the type needs
  // no promotion (not half/bfloat, native fp16 support, or — when the caller
  // permits via `CanUseSVE` — native SVE bf16 arithmetic is available).
4273 TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
4274 std::function<InstructionCost(Type *)> InstCost) const {
4275 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4276 return std::nullopt;
4277 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4278 return std::nullopt;
4279 if (CanUseSVE && Ty->isScalableTy() && ST->hasSVEB16B16() &&
4280 ST->isNonStreamingSVEorSME2Available())
4281 return std::nullopt;
4282
  // Promote to the same vector shape with f32 elements.
4283 Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
4284 InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
  // NOTE(review): the continuation of this call (doxygen line 4285) is
  // missing from this rendering — confirm the remaining arguments upstream.
  // Both operands need extending unless one is constant (foldable at
  // compile time), hence the doubling below.
4286 if (!Op1Info.isConstant() && !Op2Info.isConstant())
4287 Cost *= 2;
4288 Cost += InstCost(PromotedTy);
4289 if (IncludeTrunc)
4290 Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
  // NOTE(review): continuation line 4291 is likewise missing here.
4292 return Cost;
4293 }
4294
// AArch64 cost model for arithmetic instructions. Handles fp16/bf16
// promotion, widening smull/umull patterns, detailed constant-divisor
// div/rem costing (scalar, NEON and SVE), v2i64 multiplies, and cheap
// legal integer/FP ops; everything else falls back to the base
// implementation.
4296 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
4298 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
4299
4300 // The code-generator is currently not able to handle scalable vectors
4301 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4302 // it. This change will be removed when code-generation for these types is
4303 // sufficiently reliable.
4304 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4305 if (VTy->getElementCount() == ElementCount::getScalable(1))
4307
4308 // TODO: Handle more cost kinds.
4310 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4311 Op2Info, Args, CxtI);
4312
4313 // Legalize the type.
4314 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4315 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4316
4317 // Increase the cost for half and bfloat types if not architecturally
4318 // supported.
4319 if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
4320 ISD == ISD::FDIV || ISD == ISD::FREM)
4321 if (auto PromotedCost = getFP16BF16PromoteCost(
4322 Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
4323 // There is not native support for fdiv/frem even with +sve-b16b16.
4324 /*CanUseSVE=*/ISD != ISD::FDIV && ISD != ISD::FREM,
4325 [&](Type *PromotedTy) {
4326 return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
4327 Op1Info, Op2Info);
4328 }))
4329 return *PromotedCost;
4330
4331 // If the operation is a widening instruction (smull or umull) and both
4332 // operands are extends the cost can be cheaper by considering that the
4333 // operation will operate on the narrowest type size possible (double the
4334 // largest input size) and a further extend.
4335 if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) {
4336 if (ExtTy != Ty)
4337 return getArithmeticInstrCost(Opcode, ExtTy, CostKind) +
4338 getCastInstrCost(Instruction::ZExt, Ty, ExtTy,
4340 return LT.first;
4341 }
4342
4343 switch (ISD) {
4344 default:
4345 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4346 Op2Info);
4347 case ISD::SREM:
4348 case ISD::SDIV:
4349 /*
4350 Notes for sdiv/srem specific costs:
4351 1. This only considers the cases where the divisor is constant, uniform and
4352 (pow-of-2/non-pow-of-2). Other cases are not important since they either
4353 result in some form of (ldr + adrp), corresponding to constant vectors, or
4354 scalarization of the division operation.
4355 2. Constant divisors, either negative in whole or partially, don't result in
4356 significantly different codegen as compared to positive constant divisors.
4357 So, we don't consider negative divisors separately.
4358 3. If the codegen is significantly different with SVE, it has been indicated
4359 using comments at appropriate places.
4360
4361 sdiv specific cases:
4362 -----------------------------------------------------------------------
4363 codegen | pow-of-2 | Type
4364 -----------------------------------------------------------------------
4365 add + cmp + csel + asr | Y | i64
4366 add + cmp + csel + asr | Y | i32
4367 -----------------------------------------------------------------------
4368
4369 srem specific cases:
4370 -----------------------------------------------------------------------
4371 codegen | pow-of-2 | Type
4372 -----------------------------------------------------------------------
4373 negs + and + and + csneg | Y | i64
4374 negs + and + and + csneg | Y | i32
4375 -----------------------------------------------------------------------
4376
4377 other sdiv/srem cases:
4378 -------------------------------------------------------------------------
4379 common codegen | + srem | + sdiv | pow-of-2 | Type
4380 -------------------------------------------------------------------------
4381 smulh + asr + add + add | - | - | N | i64
4382 smull + lsr + add + add | - | - | N | i32
4383 usra | and + sub | sshr | Y | <2 x i64>
4384 2 * (scalar code) | - | - | N | <2 x i64>
4385 usra | bic + sub | sshr + neg | Y | <4 x i32>
4386 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
4387 + sshr + usra | | | |
4388 -------------------------------------------------------------------------
4389 */
4390 if (Op2Info.isConstant() && Op2Info.isUniform()) {
// Building-block costs for the instruction sequences in the tables above.
4391 InstructionCost AddCost =
4392 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4393 Op1Info.getNoProps(), Op2Info.getNoProps());
4394 InstructionCost AsrCost =
4395 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4396 Op1Info.getNoProps(), Op2Info.getNoProps());
4397 InstructionCost MulCost =
4398 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4399 Op1Info.getNoProps(), Op2Info.getNoProps());
4400 // add/cmp/csel/csneg should have similar cost while asr/negs/and should
4401 // have similar cost.
4402 auto VT = TLI->getValueType(DL, Ty);
4403 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
4404 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4405 // Neg can be folded into the asr instruction.
4406 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
4407 : (3 * AsrCost + AddCost);
4408 } else {
4409 return MulCost + AsrCost + 2 * AddCost;
4410 }
4411 } else if (VT.isVector()) {
4412 InstructionCost UsraCost = 2 * AsrCost;
4413 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4414 // Division with scalable types corresponds to native 'asrd'
4415 // instruction when SVE is available.
4416 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4417
4418 // One more for the negation in SDIV
4420 (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
4421 if (Ty->isScalableTy() && ST->hasSVE())
4422 Cost += 2 * AsrCost;
4423 else {
4424 Cost +=
4425 UsraCost +
4426 (ISD == ISD::SDIV
4427 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4428 : 2 * AddCost);
4429 }
4430 return Cost;
4431 } else if (LT.second == MVT::v2i64) {
// Non-pow-2 v2i64 constant divisors are scalarized (see table above).
4432 return VT.getVectorNumElements() *
4433 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
4434 Op1Info.getNoProps(),
4435 Op2Info.getNoProps());
4436 } else {
4437 // When SVE is available, we get:
4438 // smulh + lsr + add/sub + asr + add/sub.
4439 if (Ty->isScalableTy() && ST->hasSVE())
4440 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4441 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4442 }
4443 }
4444 }
4445 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4446 LT.second.isFixedLengthVector()) {
4447 // FIXME: When the constant vector is non-uniform, this may result in
4448 // loading the vector from constant pool or in some cases, may also result
4449 // in scalarization. For now, we are approximating this with the
4450 // scalarization cost.
4451 auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
4452 CostKind, -1, nullptr, nullptr);
4453 auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
4454 CostKind, -1, nullptr, nullptr);
4455 unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
4456 return ExtractCost + InsertCost +
4457 NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
4458 CostKind, Op1Info.getNoProps(),
4459 Op2Info.getNoProps());
4460 }
4461 [[fallthrough]];
4462 case ISD::UDIV:
4463 case ISD::UREM: {
4464 auto VT = TLI->getValueType(DL, Ty);
4465 if (Op2Info.isConstant()) {
4466 // If the operand is a power of 2 we can use the shift or and cost.
4467 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4468 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
4469 Op1Info.getNoProps(),
4470 Op2Info.getNoProps());
4471 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4472 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
4473 Op1Info.getNoProps(),
4474 Op2Info.getNoProps());
4475
4476 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4477 // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4478 // The MULHU will be expanded to UMULL for the types not listed below,
4479 // and will become a pair of UMULL+MULL2 for 128bit vectors.
4480 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4481 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4482 LT.second == MVT::nxv16i8;
4483 bool Is128bit = LT.second.is128BitVector();
4484
4485 InstructionCost MulCost =
4486 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4487 Op1Info.getNoProps(), Op2Info.getNoProps());
4488 InstructionCost AddCost =
4489 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4490 Op1Info.getNoProps(), Op2Info.getNoProps());
4491 InstructionCost ShrCost =
4492 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4493 Op1Info.getNoProps(), Op2Info.getNoProps());
4494 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4495 (HasMULH ? 0 : ShrCost) + // UMULL shift
4496 AddCost * 2 + ShrCost;
// UREM additionally needs the multiply-and-subtract to recover the
// remainder from the computed quotient.
4497 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4498 }
4499 }
4500
4501 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4502 // emitted by the backend even when those functions are not declared in the
4503 // module.
4504 if (!VT.isVector() && VT.getSizeInBits() > 64)
4505 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4506
4508 Opcode, Ty, CostKind, Op1Info, Op2Info);
4509 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
4510 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
4511 // SDIV/UDIV operations are lowered using SVE, then we can have less
4512 // costs.
4513 if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
4514 Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
4515 static const CostTblEntry DivTbl[]{
4516 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
4517 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
4518 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
4519 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
4520 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
4521 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
4522
4523 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
4524 if (nullptr != Entry)
4525 return Entry->Cost;
4526 }
4527 // For 8/16-bit elements, the cost is higher because the type
4528 // requires promotion and possibly splitting:
4529 if (LT.second.getScalarType() == MVT::i8)
4530 Cost *= 8;
4531 else if (LT.second.getScalarType() == MVT::i16)
4532 Cost *= 4;
4533 return Cost;
4534 } else {
4535 // If one of the operands is a uniform constant then the cost for each
4536 // element is Cost for insertion, extraction and division.
4537 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
4538 // operation with scalar type
4539 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4540 (Op2Info.isConstant() && Op2Info.isUniform())) {
4541 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
4543 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
4544 return (4 + DivCost) * VTy->getNumElements();
4545 }
4546 }
4547 // On AArch64, without SVE, vector divisions are expanded
4548 // into scalar divisions of each pair of elements.
4549 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
4550 -1, nullptr, nullptr);
4551 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4552 nullptr, nullptr);
4553 }
4554
4555 // TODO: if one of the arguments is scalar, then it's not necessary to
4556 // double the cost of handling the vector elements.
4557 Cost += Cost;
4558 }
4559 return Cost;
4560 }
4561 case ISD::MUL:
4562 // When SVE is available, then we can lower the v2i64 operation using
4563 // the SVE mul instruction, which has a lower cost.
4564 if (LT.second == MVT::v2i64 && ST->hasSVE())
4565 return LT.first;
4566
4567 // When SVE is not available, there is no MUL.2d instruction,
4568 // which means mul <2 x i64> is expensive as elements are extracted
4569 // from the vectors and the muls scalarized.
4570 // As getScalarizationOverhead is a bit too pessimistic, we
4571 // estimate the cost for a i64 vector directly here, which is:
4572 // - four 2-cost i64 extracts,
4573 // - two 2-cost i64 inserts, and
4574 // - two 1-cost muls.
4575 // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
4576 // LT.first = 2 the cost is 28.
4577 if (LT.second != MVT::v2i64)
4578 return LT.first;
4579 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
4580 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
4581 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
4582 nullptr, nullptr) *
4583 2 +
4584 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4585 nullptr, nullptr));
4586 case ISD::ADD:
4587 case ISD::XOR:
4588 case ISD::OR:
4589 case ISD::AND:
4590 case ISD::SRL:
4591 case ISD::SRA:
4592 case ISD::SHL:
4593 // These nodes are marked as 'custom' for combining purposes only.
4594 // We know that they are legal. See LowerAdd in ISelLowering.
4595 return LT.first;
4596
4597 case ISD::FNEG:
4598 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
4599 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4600 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4601 CxtI &&
4602 ((CxtI->hasOneUse() &&
4603 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
4604 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
4605 return 0;
4606 [[fallthrough]];
4607 case ISD::FADD:
4608 case ISD::FSUB:
4609 if (!Ty->getScalarType()->isFP128Ty())
4610 return LT.first;
4611 [[fallthrough]];
4612 case ISD::FMUL:
4613 case ISD::FDIV:
4614 // These nodes are marked as 'custom' just to lower them to SVE.
4615 // We know said lowering will incur no additional cost.
4616 if (!Ty->getScalarType()->isFP128Ty())
4617 return 2 * LT.first;
4618
4619 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4620 Op2Info);
4621 case ISD::FREM:
4622 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4623 // those functions are not declared in the module.
4624 if (!Ty->isVectorTy())
4625 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4626 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4627 Op2Info);
4628 }
4629}
4630
// Cost of computing an address. Vector pointers with a non-constant or large
// stride are charged NeonNonConstStrideOverhead (default 10) because the
// address arithmetic cannot be folded into the addressing mode; otherwise the
// cost is 1.
4633 const SCEV *Ptr,
4635 // Address computations in vectorized code with non-consecutive addresses will
4636 // likely result in more instructions compared to scalar code where the
4637 // computation can more often be merged into the index mode. The resulting
4638 // extra micro-ops can significantly decrease throughput.
4639 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4640 int MaxMergeDistance = 64;
4641
4642 if (PtrTy->isVectorTy() && SE &&
4643 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
4644 return NumVectorInstToHideOverhead;
4645
4646 // In many cases the address computation is not merged into the instruction
4647 // addressing mode.
4648 return 1;
4649}
4650
/// Check whether Opcode1 has less throughput according to the scheduling
/// model than Opcode2.
/// Returns false conservatively when the subtarget has no per-instruction
/// scheduling model or either scheduling class is invalid.
4654 unsigned Opcode1, unsigned Opcode2) const {
4655 const MCSchedModel &Sched = ST->getSchedModel();
4656 const TargetInstrInfo *TII = ST->getInstrInfo();
4657 if (!Sched.hasInstrSchedModel())
4658 return false;
4659
4660 const MCSchedClassDesc *SCD1 =
4661 Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass());
4662 const MCSchedClassDesc *SCD2 =
4663 Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass());
4664 // We cannot handle variant scheduling classes without an MI. If we need to
4665 // support them for any of the instructions we query the information of we
4666 // might need to add a way to resolve them without a MI or not use the
4667 // scheduling info.
4668 assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4669 "Cannot handle variant scheduling classes without an MI");
4670 if (!SCD1->isValid() || !SCD2->isValid())
4671 return false;
4672
// Higher reciprocal throughput == fewer ops per cycle == lower throughput.
4673 return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) >
4675}
4676
// AArch64 cost model for compare and select instructions. Special-cases:
// fixed-vector selects (min/max-style predicates map to (F)CMxx+BFI pairs,
// wide selects are heavily penalized via a cost table), FCmp with fp16/bf16
// promotion, libcall-modelled unknown FP compares, predicates that expand to
// multiple compares + or, and icmp-against-0/±1 folded into a preceding
// 'and' (costed as free).
4678 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4680 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
4681 // We don't lower some vector selects well that are wider than the register
4682 // width. TODO: Improve this with different cost kinds.
4683 if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) {
4684 // We would need this many instructions to hide the scalarization happening.
4685 const int AmortizationCost = 20;
4686
4687 // If VecPred is not set, check if we can get a predicate from the context
4688 // instruction, if its type matches the requested ValTy.
4689 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
4690 CmpPredicate CurrentPred;
4691 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
4692 m_Value())))
4693 VecPred = CurrentPred;
4694 }
4695 // Check if we have a compare/select chain that can be lowered using
4696 // a (F)CMxx & BFI pair.
4697 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
4698 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4699 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4700 VecPred == CmpInst::FCMP_UNE) {
4701 static const auto ValidMinMaxTys = {
4702 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4703 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4704 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4705
4706 auto LT = getTypeLegalizationCost(ValTy);
4707 if (any_of(ValidMinMaxTys, equal_to(LT.second)) ||
4708 (ST->hasFullFP16() &&
4709 any_of(ValidFP16MinMaxTys, equal_to(LT.second))))
4710 return LT.first;
4711 }
4712
4713 static const TypeConversionCostTblEntry VectorSelectTbl[] = {
4714 {Instruction::Select, MVT::v2i1, MVT::v2f32, 2},
4715 {Instruction::Select, MVT::v2i1, MVT::v2f64, 2},
4716 {Instruction::Select, MVT::v4i1, MVT::v4f32, 2},
4717 {Instruction::Select, MVT::v4i1, MVT::v4f16, 2},
4718 {Instruction::Select, MVT::v8i1, MVT::v8f16, 2},
4719 {Instruction::Select, MVT::v16i1, MVT::v16i16, 16},
4720 {Instruction::Select, MVT::v8i1, MVT::v8i32, 8},
4721 {Instruction::Select, MVT::v16i1, MVT::v16i32, 16},
4722 {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
4723 {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
4724 {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};
4725
4726 EVT SelCondTy = TLI->getValueType(DL, CondTy);
4727 EVT SelValTy = TLI->getValueType(DL, ValTy);
4728 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
4729 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode,
4730 SelCondTy.getSimpleVT(),
4731 SelValTy.getSimpleVT()))
4732 return Entry->Cost;
4733 }
4734 }
4735
4736 if (Opcode == Instruction::FCmp) {
4737 if (auto PromotedCost = getFP16BF16PromoteCost(
4738 ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
4739 // TODO: Consider costing SVE FCMPs.
4740 /*CanUseSVE=*/false, [&](Type *PromotedTy) {
4742 getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
4743 CostKind, Op1Info, Op2Info);
4744 if (isa<VectorType>(PromotedTy))
4746 Instruction::Trunc,
4750 return Cost;
4751 }))
4752 return *PromotedCost;
4753
4754 auto LT = getTypeLegalizationCost(ValTy);
4755 // Model unknown fp compares as a libcall.
4756 if (LT.second.getScalarType() != MVT::f64 &&
4757 LT.second.getScalarType() != MVT::f32 &&
4758 LT.second.getScalarType() != MVT::f16)
4759 return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy,
4760 {ValTy, ValTy}, CostKind);
4761
4762 // Some comparison operators require expanding to multiple compares + or.
4763 unsigned Factor = 1;
4764 if (!CondTy->isVectorTy() &&
4765 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4766 Factor = 2; // fcmp with 2 selects
4767 else if (isa<FixedVectorType>(ValTy) &&
4768 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
4769 VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
4770 Factor = 3; // fcmxx+fcmyy+or
4771 else if (isa<ScalableVectorType>(ValTy) &&
4772 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4773 Factor = 3; // fcmxx+fcmyy+or
4774
// Penalize SVE predicated FP compares on cores where the scheduling model
// says they have lower throughput than the NEON equivalent.
4775 if (isa<ScalableVectorType>(ValTy) &&
4777 hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S,
4778 AArch64::FCMEQv4f32))
4779 Factor *= 2;
4780
4781 return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
4782 }
4783
4784 // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to
4785 // icmp(and, 0) as free, as we can make use of ands, but only if the
4786 // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
4787 // providing it will not cause performance regressions.
4788 if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
4789 Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) &&
4790 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
4791 match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
4792 if (match(I->getOperand(1), m_Zero()))
4793 return 0;
4794
4795 // x >= 1 / x < 1 -> x > 0 / x <= 0
4796 if (match(I->getOperand(1), m_One()) &&
4797 (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
4798 return 0;
4799
4800 // x <= -1 / x > -1 -> x > 0 / x <= 0
4801 if (match(I->getOperand(1), m_AllOnes()) &&
4802 (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
4803 return 0;
4804 }
4805
4806 // The base case handles scalable vectors fine for now, since it treats the
4807 // cost as 1 * legalization cost.
4808 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4809 Op1Info, Op2Info, I);
4810}
4811
// Configure inline memcmp expansion: allowed on AArch64 (with overlapping
// loads and power-of-2 load sizes down to 1 byte) unless strict alignment is
// required, in which case the default (disabled) options are returned.
AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4815 if (ST->requiresStrictAlign()) {
4816 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4817 // a bunch of instructions when strict align is enabled.
4818 return Options;
4819 }
4820 Options.AllowOverlappingLoads = true;
4821 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4822 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4823 // TODO: Though vector loads usually perform well on AArch64, in some targets
4824 // they may wake up the FP unit, which raises the power consumption. Perhaps
4825 // they could be used with no holds barred (-O3).
4826 Options.LoadSizes = {8, 4, 2, 1};
// 3/5/6-byte tails can be handled with a pair of overlapping smaller loads.
4827 Options.AllowedTailExpansions = {3, 5, 6};
4828 return Options;
4829}
4830
// NOTE(review): the signature of this predicate was lost in extraction; it
// simply reports whether the subtarget supports SVE — confirm which TTI hook
// this implements against the header.
4832 return ST->hasSVE();
4833}
4834
// Dispatch masked memory intrinsics to the specialized cost hooks:
// gathers/scatters go to getGatherScatterOpCost, masked loads/stores to
// getMaskedMemoryOpCost; all other IDs fall through to the (extraction-
// dropped) default return below the switch.
4838 switch (MICA.getID()) {
4839 case Intrinsic::masked_scatter:
4840 case Intrinsic::masked_gather:
4841 return getGatherScatterOpCost(MICA, CostKind);
4842 case Intrinsic::masked_load:
4843 case Intrinsic::masked_store:
4844 return getMaskedMemoryOpCost(MICA, CostKind);
4845 }
4847}
4848
// Cost of a masked load/store. NEON fixed vectors and i1-element or
// <vscale x 1 x T> scalable vectors get an invalid/fallback cost (via the
// extraction-dropped return lines); otherwise the legalization count is the
// cost, i.e. masked memory ops on legal SVE types are as cheap as plain ones.
4852 Type *Src = MICA.getDataType();
4853
4854 if (useNeonVector(Src))
4856 auto LT = getTypeLegalizationCost(Src);
4857 if (!LT.first.isValid())
4859
4860 // Return an invalid cost for element types that we are unable to lower.
4861 auto *VT = cast<VectorType>(Src);
4862 if (VT->getElementType()->isIntegerTy(1))
4864
4865 // The code-generator is currently not able to handle scalable vectors
4866 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4867 // it. This change will be removed when code-generation for these types is
4868 // sufficiently reliable.
4869 if (VT->getElementCount() == ElementCount::getScalable(1))
4871
4872 return LT.first;
4873}
4874
4875// This function returns gather/scatter overhead either from
4876// user-provided value or specialized values per-target from \p ST.
4877static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4878 const AArch64Subtarget *ST) {
4879 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4880 "Should be called on only load or stores.");
4881 switch (Opcode) {
4882 case Instruction::Load:
4883 if (SVEGatherOverhead.getNumOccurrences() > 0)
4884 return SVEGatherOverhead;
4885 return ST->getGatherOverhead();
4886 break;
4887 case Instruction::Store:
4888 if (SVEScatterOverhead.getNumOccurrences() > 0)
4889 return SVEScatterOverhead;
4890 return ST->getScatterOverhead();
4891 break;
4892 default:
4893 llvm_unreachable("Shouldn't have reached here");
4894 }
4895}
4896
4900
4901 unsigned Opcode = (MICA.getID() == Intrinsic::masked_gather ||
4902 MICA.getID() == Intrinsic::vp_gather)
4903 ? Instruction::Load
4904 : Instruction::Store;
4905
4906 Type *DataTy = MICA.getDataType();
4907 Align Alignment = MICA.getAlignment();
4908 const Instruction *I = MICA.getInst();
4909
4910 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
4912 auto *VT = cast<VectorType>(DataTy);
4913 auto LT = getTypeLegalizationCost(DataTy);
4914 if (!LT.first.isValid())
4916
4917 // Return an invalid cost for element types that we are unable to lower.
4918 if (!LT.second.isVector() ||
4919 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
4920 VT->getElementType()->isIntegerTy(1))
4922
4923 // The code-generator is currently not able to handle scalable vectors
4924 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4925 // it. This change will be removed when code-generation for these types is
4926 // sufficiently reliable.
4927 if (VT->getElementCount() == ElementCount::getScalable(1))
4929
4930 ElementCount LegalVF = LT.second.getVectorElementCount();
4931 InstructionCost MemOpCost =
4932 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
4933 {TTI::OK_AnyValue, TTI::OP_None}, I);
4934 // Add on an overhead cost for using gathers/scatters.
4935 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
4936 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
4937}
4938
// True when Ty is a fixed-length vector that will be lowered with NEON, i.e.
// the subtarget is not using SVE for fixed-length vectors. (Signature line
// lost in extraction.)
4940 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
4941}
4942
// AArch64 cost model for plain loads and stores. Handles: struct types
// (base fallback), invalid scalable cases, slow misaligned 128-bit stores,
// pointer LDP/STP, NEON truncating/extending ops, and non-power-of-2 NEON
// vectors (decomposed into power-of-2 sub-ops).
4944 Align Alignment,
4945 unsigned AddressSpace,
4947 TTI::OperandValueInfo OpInfo,
4948 const Instruction *I) const {
4949 EVT VT = TLI->getValueType(DL, Ty, true);
4950 // Type legalization can't handle structs
4951 if (VT == MVT::Other)
4952 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
4953 CostKind);
4954
4955 auto LT = getTypeLegalizationCost(Ty);
4956 if (!LT.first.isValid())
4958
4959 // The code-generator is currently not able to handle scalable vectors
4960 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4961 // it. This change will be removed when code-generation for these types is
4962 // sufficiently reliable.
4963 // We also only support full register predicate loads and stores.
4964 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4965 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
4966 (VTy->getElementType()->isIntegerTy(1) &&
4967 !VTy->getElementCount().isKnownMultipleOf(
4970
4971 // TODO: consider latency as well for TCK_SizeAndLatency.
4973 return LT.first;
4975
4976 return 1;
4977
4978 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
4979 LT.second.is128BitVector() && Alignment < Align(16)) {
4980 // Unaligned stores are extremely inefficient. We don't split all
4981 // unaligned 128-bit stores because the negative impact that has shown in
4982 // practice on inlined block copy code.
4983 // We make such stores expensive so that we will only vectorize if there
4984 // are 6 other instructions getting vectorized.
4985 const int AmortizationCost = 6;
4986
4987 return LT.first * 2 * AmortizationCost;
4988 }
4989
4990 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
4991 if (Ty->isPtrOrPtrVectorTy())
4992 return LT.first;
4993
4994 if (useNeonVector(Ty)) {
4995 // Check truncating stores and extending loads.
4996 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
4997 // v4i8 types are lowered to scalar a load/store and sshll/xtn.
4998 if (VT == MVT::v4i8)
4999 return 2;
5000 // Otherwise we need to scalarize.
5001 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
5002 }
5003 EVT EltVT = VT.getVectorElementType();
5004 unsigned EltSize = EltVT.getScalarSizeInBits();
5005 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
5006 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
5007 return LT.first;
5008 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
5009 // widening to v4i8, which produces suboptimal results.
5010 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
5011 return LT.first;
5012
5013 // Check non-power-of-2 loads/stores for legal vector element types with
5014 // NEON. Non-power-of-2 memory ops will get broken down to a set of
5015 // operations on smaller power-of-2 ops, including ld1/st1.
5016 LLVMContext &C = Ty->getContext();
5018 SmallVector<EVT> TypeWorklist;
5019 TypeWorklist.push_back(VT);
// Recursively split a non-power-of-2 element count into the largest
// power-of-2 prefix plus a remainder; each power-of-2 piece costs 1.
5020 while (!TypeWorklist.empty()) {
5021 EVT CurrVT = TypeWorklist.pop_back_val();
5022 unsigned CurrNumElements = CurrVT.getVectorNumElements();
5023 if (isPowerOf2_32(CurrNumElements)) {
5024 Cost += 1;
5025 continue;
5026 }
5027
5028 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
5029 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
5030 TypeWorklist.push_back(
5031 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
5032 }
5033 return Cost;
5034 }
5035
5036 return LT.first;
5037}
5038
// Cost of interleaved (ldN/stN style) memory accesses. Invalid for scalable
// vectors without SVE or with non-power-of-2 factors, and for masked
// fixed-length accesses. Legal ldN/stN-matching accesses cost
// Factor * (number of ldN/stN instructions needed); otherwise the base
// (scalarizing) implementation is used.
5040 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
5041 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
5042 bool UseMaskForCond, bool UseMaskForGaps) const {
5043 assert(Factor >= 2 && "Invalid interleave factor");
5044 auto *VecVTy = cast<VectorType>(VecTy);
5045
5046 if (VecTy->isScalableTy() && !ST->hasSVE())
5048
5049 // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
5050 // only have lowering for power-of-2 factors.
5051 // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
5052 // InterleavedAccessPass for ld3/st3
5053 if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
5055
5056 // Vectorization for masked interleaved accesses is only enabled for scalable
5057 // VF.
5058 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
5060
5061 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
5062 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
5063 auto *SubVecTy =
5064 VectorType::get(VecVTy->getElementType(),
5065 VecVTy->getElementCount().divideCoefficientBy(Factor));
5066
5067 // ldN/stN only support legal vector types of size 64 or 128 in bits.
5068 // Accesses having vector types that are a multiple of 128 bits can be
5069 // matched to more than one ldN/stN instruction.
5070 bool UseScalable;
5071 if (MinElts % Factor == 0 &&
5072 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
5073 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
5074 }
5075
5076 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5077 Alignment, AddressSpace, CostKind,
5078 UseMaskForCond, UseMaskForGaps);
5079}
5080
// Sums a store+load cost for each 128-bit fixed vector in Tys — modelling
// that such values are spilled and refilled around a call. (Signature lines
// lost in extraction — presumably getCostOfKeepingLiveOverCall; confirm
// against the header.)
5085 for (auto *I : Tys) {
5086 if (!I->isVectorTy())
5087 continue;
5088 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
5089 128)
5090 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
5091 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
5092 }
5093 return Cost;
5094}
5095
// Forwards the unroll/interleave factor query to the subtarget. (Signature
// line lost in extraction.)
5097 return ST->getMaxInterleaveFactor();
5098}
5099
5100// For Falkor, we want to avoid having too many strided loads in a loop since
5101// that can exhaust the HW prefetcher resources. We adjust the unroller
5102// MaxCount preference below to attempt to ensure unrolling doesn't create too
5103// many strided loads.
5104 static void
// NOTE(review): the function name/parameter lines (doc lines 5105-5106) were
// lost in extraction; the body reads `L`, `SE` and writes unroll preferences
// `UP`, matching getFalkorUnrollingPreferences(Loop *, ScalarEvolution &,
// TargetTransformInfo::UnrollingPreferences &) — confirm upstream.
5107 enum { MaxStridedLoads = 7 };
// Count loads whose address is a loop-varying affine AddRec, i.e. strided
// accesses that would each consume a HW-prefetcher stream.
5108 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
5109 int StridedLoads = 0;
5110 // FIXME? We could make this more precise by looking at the CFG and
5111 // e.g. not counting loads in each side of an if-then-else diamond.
5112 for (const auto BB : L->blocks()) {
5113 for (auto &I : *BB) {
5114 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
5115 if (!LMemI)
5116 continue;
5117
// Loop-invariant addresses are not strided; skip them.
5118 Value *PtrValue = LMemI->getPointerOperand();
5119 if (L->isLoopInvariant(PtrValue))
5120 continue;
5121
5122 const SCEV *LSCEV = SE.getSCEV(PtrValue);
5123 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
5124 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
5125 continue;
5126
5127 // FIXME? We could take pairing of unrolled load copies into account
5128 // by looking at the AddRec, but we would probably have to limit this
5129 // to loops with no stores or other memory optimization barriers.
5130 ++StridedLoads;
5131 // We've seen enough strided loads that seeing more won't make a
5132 // difference.
5133 if (StridedLoads > MaxStridedLoads / 2)
5134 return StridedLoads;
5135 }
5136 }
5137 return StridedLoads;
5138 };
5139
5140 int StridedLoads = countStridedLoads(L, SE);
5141 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
5142 << " strided loads\n");
5143 // Pick the largest power of 2 unroll count that won't result in too many
5144 // strided loads.
5145 if (StridedLoads) {
5146 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
5147 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
5148 << UP.MaxCount << '\n');
5149 }
5150}
5151
5152// This function returns true if the loop:
5153// 1. Has a valid cost, and
5154// 2. Has a cost within the supplied budget.
5155// Otherwise it returns false.
// NOTE(review): the first signature line (doc line 5156) was elided by
// extraction; the remaining parameters are the cost budget and an optional
// out-parameter receiving the final loop size.
5157 InstructionCost Budget,
5158 unsigned *FinalSize) {
5159 // Estimate the size of the loop.
5160 InstructionCost LoopCost = 0;
5161
5162 for (auto *BB : L->getBlocks()) {
5163 for (auto &I : *BB) {
5164 SmallVector<const Value *, 4> Operands(I.operand_values());
// Size is measured with the code-size cost kind, not throughput.
5165 InstructionCost Cost =
5166 TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
5167 // This can happen with intrinsics that don't currently have a cost model
5168 // or for some operations that require SVE.
5169 if (!Cost.isValid())
5170 return false;
5171
5172 LoopCost += Cost;
// Bail out as soon as the running total exceeds the budget.
5173 if (LoopCost > Budget)
5174 return false;
5175 }
5176 }
5177
// Report the exact size only on success, and only if the caller asked.
5178 if (FinalSize)
5179 *FinalSize = LoopCost.getValue();
5180 return true;
5181}
5182
// NOTE(review): the first signature line was elided by extraction; the body
// uses `L` and `SE`, consistent with shouldUnrollMultiExitLoop(Loop *,
// ScalarEvolution &, const AArch64TTIImpl &) — confirm upstream.
5184 const AArch64TTIImpl &TTI) {
5185 // Only consider loops with unknown trip counts for which we can determine
5186 // a symbolic expression. Multi-exit loops with small known trip counts will
5187 // likely be unrolled anyway.
5188 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
// NOTE(review): the guard condition on doc line 5189 was lost in extraction
// (presumably bailing out when BTC is SCEVCouldNotCompute) — confirm upstream.
5190 return false;
5191
5192 // It might not be worth unrolling loops with low max trip counts. Restrict
5193 // this to max trip counts > 32 for now.
5194 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
5195 if (MaxTC > 0 && MaxTC <= 32)
5196 return false;
5197
5198 // Make sure the loop size is <= 5.
5199 if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
5200 return false;
5201
5202 // Small search loops with multiple exits can be highly beneficial to unroll.
5203 // We only care about loops with exactly two exiting blocks, although each
5204 // block could jump to the same exit block.
5205 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
5206 if (Blocks.size() != 2)
5207 return false;
5208
// Both blocks must terminate in a plain branch so the exits stay analyzable.
5209 if (any_of(Blocks, [](BasicBlock *BB) {
5210 return !isa<BranchInst>(BB->getTerminator());
5211 }))
5212 return false;
5213
5214 return true;
5215}
5216
5217/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
5218/// OOO engine's wide instruction window and various predictors.
5219 static void
// NOTE(review): the function name/parameter lines (doc lines 5220-5221) were
// lost in extraction; the body matches getAppleRuntimeUnrollPreferences(
// Loop *, ScalarEvolution &, TargetTransformInfo::UnrollingPreferences &,
// const AArch64TTIImpl &) — confirm upstream. Several other fully-linked
// lines below (doc lines 5253, 5261, 5269, 5307, 5310, 5340) are likewise
// missing from this listing.
5222 const AArch64TTIImpl &TTI) {
5223 // Limit loops with structure that is highly likely to benefit from runtime
5224 // unrolling; that is we exclude outer loops and loops with many blocks (i.e.
5225 // likely with complex control flow). Note that the heuristics here may be
5226 // overly conservative and we err on the side of avoiding runtime unrolling
5227 // rather than unroll excessively. They are all subject to further refinement.
5228 if (!L->isInnermost() || L->getNumBlocks() > 8)
5229 return;
5230
5231 // Loops with multiple exits are handled by common code.
5232 if (!L->getExitBlock())
5233 return;
5234
5235 // Check if the loop contains any reductions that could be parallelized when
5236 // unrolling. If so, enable partial unrolling, if the trip count is known to be
5237 // a multiple of 2.
5238 bool HasParellelizableReductions =
5239 L->getNumBlocks() == 1 &&
5240 any_of(L->getHeader()->phis(),
5241 [&SE, L](PHINode &Phi) {
5242 return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
5243 }) &&
5244 isLoopSizeWithinBudget(L, TTI, 12, nullptr);
5245 if (HasParellelizableReductions &&
5246 SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
5247 UP.Partial = true;
5248 UP.MaxCount = 4;
5249 UP.AddAdditionalAccumulators = true;
5250 }
5251
5252 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
// NOTE(review): the start of this bail-out condition (doc line 5253) was lost
// in extraction — presumably a check that BTC is computable.
5254 (SE.getSmallConstantMaxTripCount(L) > 0 &&
5255 SE.getSmallConstantMaxTripCount(L) <= 32))
5256 return;
5257
// Already-vectorized loops are left to the common heuristics.
5258 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
5259 return;
5260
// NOTE(review): the condition guarding this return (doc line 5261) was lost
// in extraction.
5262 return;
5263
5264 // Limit to loops with trip counts that are cheap to expand.
5265 UP.SCEVExpansionBudget = 1;
5266
5267 if (HasParellelizableReductions) {
5268 UP.Runtime = true;
// NOTE(review): doc line 5269 (presumably setting an unroll count) was lost
// in extraction.
5270 UP.AddAdditionalAccumulators = true;
5271 }
5272
5273 // Try to unroll small loops, of few-blocks with low budget, if they have
5274 // load/store dependencies, to expose more parallel memory access streams,
5275 // or if they do little work inside a block (i.e. load -> X -> store pattern).
5276 BasicBlock *Header = L->getHeader();
5277 BasicBlock *Latch = L->getLoopLatch();
5278 if (Header == Latch) {
5279 // Estimate the size of the loop.
5280 unsigned Size;
5281 unsigned Width = 10;
5282 if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
5283 return;
5284
5285 // Try to find an unroll count that maximizes the use of the instruction
5286 // window, i.e. trying to fetch as many instructions per cycle as possible.
5287 unsigned MaxInstsPerLine = 16;
5288 unsigned UC = 1;
5289 unsigned BestUC = 1;
5290 unsigned SizeWithBestUC = BestUC * Size;
5291 while (UC <= 8) {
5292 unsigned SizeWithUC = UC * Size;
5293 if (SizeWithUC > 48)
5294 break;
5295 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
5296 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
5297 BestUC = UC;
5298 SizeWithBestUC = BestUC * Size;
5299 }
5300 UC++;
5301 }
5302
5303 if (BestUC == 1)
5304 return;
5305
// Collect loaded values (and their first in-loop users) plus all stores with
// loop-varying addresses, to detect load->...->store dependence chains.
5306 SmallPtrSet<Value *, 8> LoadedValuesPlus;
// NOTE(review): doc line 5307 (presumably declaring the `Stores` vector) was
// lost in extraction.
5308 for (auto *BB : L->blocks()) {
5309 for (auto &I : *BB) {
// NOTE(review): doc line 5310 (presumably extracting the load/store pointer
// operand into `Ptr`) was lost in extraction.
5311 if (!Ptr)
5312 continue;
5313 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
5314 if (SE.isLoopInvariant(PtrSCEV, L))
5315 continue;
5316 if (isa<LoadInst>(&I)) {
5317 LoadedValuesPlus.insert(&I);
5318 // Include in-loop 1st users of loaded values.
5319 for (auto *U : I.users())
5320 if (L->contains(cast<Instruction>(U)))
5321 LoadedValuesPlus.insert(U);
5322 } else
5323 Stores.push_back(cast<StoreInst>(&I));
5324 }
5325 }
5326
// Only unroll if at least one store writes a loaded value (or a first user
// of one), i.e. the loop has the load/store dependence pattern of interest.
5327 if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
5328 return LoadedValuesPlus.contains(SI->getOperand(0));
5329 }))
5330 return;
5331
5332 UP.Runtime = true;
5333 UP.DefaultUnrollRuntimeCount = BestUC;
5334 return;
5335 }
5336
5337 // Try to runtime-unroll loops with early-continues depending on loop-varying
5338 // loads; this helps with branch-prediction for the early-continues.
5339 auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
// NOTE(review): doc line 5340 (presumably collecting the header's
// predecessors into `Preds`) was lost in extraction.
5341 if (!Term || !Term->isConditional() || Preds.size() == 1 ||
5342 !llvm::is_contained(Preds, Header) ||
5343 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
5344 return;
5345
// Depth-limited backwards walk: does this instruction (transitively) depend
// on a load that varies within the loop?
5346 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
5347 [&](Instruction *I, unsigned Depth) -> bool {
5348 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
5349 return false;
5350
5351 if (isa<LoadInst>(I))
5352 return true;
5353
5354 return any_of(I->operands(), [&](Value *V) {
5355 auto *I = dyn_cast<Instruction>(V);
5356 return I && DependsOnLoopLoad(I, Depth + 1);
5357 });
5358 };
5359 CmpPredicate Pred;
5360 Instruction *I;
// Enable runtime unrolling when the header's conditional branch compares a
// value that depends on an in-loop load.
5361 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
5362 m_Value())) &&
5363 DependsOnLoopLoad(I, 0)) {
5364 UP.Runtime = true;
5365 }
5366}
5367
// NOTE(review): the first signature lines (doc lines 5368-5369) were lost in
// extraction; this is AArch64TTIImpl::getUnrollingPreferences(Loop *,
// ScalarEvolution &, TTI::UnrollingPreferences &, ORE *) — confirm upstream.
// Further fully-linked lines below (doc lines 5383, 5389, 5398-5399, 5407,
// 5415-5416, 5424, 5440, 5443, 5448) are likewise missing from this listing.
5370 OptimizationRemarkEmitter *ORE) const {
5371 // Enable partial unrolling and runtime unrolling.
5372 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
5373
5374 UP.UpperBound = true;
5375
5376 // For inner loop, it is more likely to be a hot one, and the runtime check
5377 // can be promoted out from LICM pass, so the overhead is less, let's try
5378 // a larger threshold to unroll more loops.
5379 if (L->getLoopDepth() > 1)
5380 UP.PartialThreshold *= 2;
5381
5382 // Disable partial & runtime unrolling on -Os.
// NOTE(review): the statement implementing the -Os disable (doc line 5383)
// was lost in extraction.
5384
5385 // Scan the loop: don't unroll loops with calls as this could prevent
5386 // inlining. Don't unroll auto-vectorized loops either, though do allow
5387 // unrolling of the scalar remainder.
5388 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
// NOTE(review): doc line 5389 (presumably declaring the running `Cost`) was
// lost in extraction.
5390 for (auto *BB : L->getBlocks()) {
5391 for (auto &I : *BB) {
5392 // Both auto-vectorized loops and the scalar remainder have the
5393 // isvectorized attribute, so differentiate between them by the presence
5394 // of vector instructions.
5395 if (IsVectorized && I.getType()->isVectorTy())
5396 return;
5397 if (isa<CallBase>(I)) {
// NOTE(review): doc lines 5398-5399 (presumably fetching the called function
// `F`) were lost in extraction.
5400 if (!isLoweredToCall(F))
5401 continue;
5402 return;
5403 }
5404
5405 SmallVector<const Value *, 4> Operands(I.operand_values());
5406 Cost += getInstructionCost(&I, Operands,
5408 }
5409 }
5410
5411 // Apply subtarget-specific unrolling preferences.
5412 if (ST->isAppleMLike())
5413 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
5414 else if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
// NOTE(review): doc lines 5415-5416 (the Falkor HWPF option check and the
// call to getFalkorUnrollingPreferences) were lost in extraction.
5417
5418 // If this is a small, multi-exit loop similar to something like std::find,
5419 // then there is typically a performance improvement achieved by unrolling.
5420 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
5421 UP.RuntimeUnrollMultiExit = true;
5422 UP.Runtime = true;
5423 // Limit unroll count.
// NOTE(review): doc line 5424 (the statement limiting the unroll count) was
// lost in extraction.
5425 // Allow slightly more costly trip-count expansion to catch search loops
5426 // with pointer inductions.
5427 UP.SCEVExpansionBudget = 5;
5428 return;
5429 }
5430
5431 // Enable runtime unrolling for in-order models
5432 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
5433 // checking for that case, we can ensure that the default behaviour is
5434 // unchanged
5435 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
5436 !ST->getSchedModel().isOutOfOrder()) {
5437 UP.Runtime = true;
5438 UP.Partial = true;
5439 UP.UnrollRemainder = true;
// NOTE(review): doc line 5440 (presumably setting DefaultUnrollRuntimeCount)
// was lost in extraction.
5441
5442 UP.UnrollAndJam = true;
// NOTE(review): doc line 5443 (presumably setting the unroll-and-jam count)
// was lost in extraction.
5444 }
5445
5446 // Force unrolling small loops can be very useful because of the branch
5447 // taken cost of the backedge.
// NOTE(review): the guard on doc line 5448 (a loop-size condition for forcing
// unroll) was lost in extraction.
5449 UP.Force = true;
5450}
5451
5456
// NOTE(review): the leading signature line (doc line 5457) was lost in
// extraction; this is getOrCreateResultFromMemIntrinsic(IntrinsicInst *,
// Type *, bool) — confirm upstream. For NEON stN it rebuilds the stored
// struct from the intrinsic's value operands; for NEON ldN it reuses the
// intrinsic result directly when the type already matches.
5458 Type *ExpectedType,
5459 bool CanCreate) const {
5460 switch (Inst->getIntrinsicID()) {
5461 default:
5462 return nullptr;
5463 case Intrinsic::aarch64_neon_st2:
5464 case Intrinsic::aarch64_neon_st3:
5465 case Intrinsic::aarch64_neon_st4: {
5466 // Create a struct type
5467 StructType *ST = dyn_cast<StructType>(ExpectedType);
5468 if (!CanCreate || !ST)
5469 return nullptr;
// The last argument is the pointer; the preceding ones are the stored values.
5470 unsigned NumElts = Inst->arg_size() - 1;
5471 if (ST->getNumElements() != NumElts)
5472 return nullptr;
// Every stored value must match the corresponding struct element type.
5473 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5474 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
5475 return nullptr;
5476 }
// Assemble the struct by inserting each stored value in order.
5477 Value *Res = PoisonValue::get(ExpectedType);
5478 IRBuilder<> Builder(Inst);
5479 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5480 Value *L = Inst->getArgOperand(i);
5481 Res = Builder.CreateInsertValue(Res, L, i);
5482 }
5483 return Res;
5484 }
5485 case Intrinsic::aarch64_neon_ld2:
5486 case Intrinsic::aarch64_neon_ld3:
5487 case Intrinsic::aarch64_neon_ld4:
5488 if (Inst->getType() == ExpectedType)
5489 return Inst;
5490 return nullptr;
5491 }
5492}
5493
// NOTE(review): the first signature line (doc line 5494) was lost in
// extraction; this is getTgtMemIntrinsic(IntrinsicInst *, MemIntrinsicInfo &)
// — confirm upstream. Fills in read/write flags and the pointer operand for
// NEON ldN/stN, then reports the matching structure width.
5495 MemIntrinsicInfo &Info) const {
5496 switch (Inst->getIntrinsicID()) {
5497 default:
5498 break;
5499 case Intrinsic::aarch64_neon_ld2:
5500 case Intrinsic::aarch64_neon_ld3:
5501 case Intrinsic::aarch64_neon_ld4:
// ldN reads memory through its first argument.
5502 Info.ReadMem = true;
5503 Info.WriteMem = false;
5504 Info.PtrVal = Inst->getArgOperand(0);
5505 break;
5506 case Intrinsic::aarch64_neon_st2:
5507 case Intrinsic::aarch64_neon_st3:
5508 case Intrinsic::aarch64_neon_st4:
// stN writes memory through its last argument (values come first).
5509 Info.ReadMem = false;
5510 Info.WriteMem = true;
5511 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
5512 break;
5513 }
5514
// Second switch: pair ld2/st2, ld3/st3, ld4/st4 under a shared MatchingId so
// EarlyCSE can match corresponding loads and stores.
5515 switch (Inst->getIntrinsicID()) {
5516 default:
5517 return false;
5518 case Intrinsic::aarch64_neon_ld2:
5519 case Intrinsic::aarch64_neon_st2:
5520 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
5521 break;
5522 case Intrinsic::aarch64_neon_ld3:
5523 case Intrinsic::aarch64_neon_st3:
5524 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
5525 break;
5526 case Intrinsic::aarch64_neon_ld4:
5527 case Intrinsic::aarch64_neon_st4:
5528 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
5529 break;
5530 }
5531 return true;
5532}
5533
5534/// See if \p I should be considered for address type promotion. We check if \p
5535/// I is a sext with the right type and used in memory accesses. If it is used in a
5536/// "complex" getelementptr, we allow it to be promoted without finding other
5537/// sext instructions that sign extended the same initial value. A getelementptr
5538/// is considered as "complex" if it has more than 2 operands.
// NOTE(review): the leading signature line (doc line 5539) was lost in
// extraction; the doc comment above describes the contract.
5540 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5541 bool Considerable = false;
5542 AllowPromotionWithoutCommonHeader = false;
// Only sign-extensions are candidates.
5543 if (!isa<SExtInst>(&I))
5544 return false;
// Only sexts producing i64 matter for address computation on AArch64.
5545 Type *ConsideredSExtType =
5546 Type::getInt64Ty(I.getParent()->getParent()->getContext());
5547 if (I.getType() != ConsideredSExtType)
5548 return false;
5549 // See if the sext is the one with the right type and used in at least one
5550 // GetElementPtrInst.
5551 for (const User *U : I.users()) {
5552 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
5553 Considerable = true;
5554 // A getelementptr is considered as "complex" if it has more than 2
5555 // operands. We will promote a SExt used in such complex GEP as we
5556 // expect some computation to be merged if they are done on 64 bits.
5557 if (GEPInst->getNumOperands() > 2) {
5558 AllowPromotionWithoutCommonHeader = true;
5559 break;
5560 }
5561 }
5562 }
5563 return Considerable;
5564}
5565
// NOTE(review): the first signature line (doc line 5566) was lost in
// extraction; this is isLegalToVectorizeReduction(const RecurrenceDescriptor&,
// ElementCount) — confirm upstream.
5567 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
// Fixed-width reductions are always allowed; the checks below apply to
// scalable VFs only.
5568 if (!VF.isScalable())
5569 return true;
5570
// bf16 and element types illegal for scalable vectors cannot be reduced.
5571 Type *Ty = RdxDesc.getRecurrenceType();
5572 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5573 return false;
5574
5575 switch (RdxDesc.getRecurrenceKind()) {
5576 case RecurKind::Sub:
// NOTE(review): a case label on doc line 5577 was lost in extraction.
5578 case RecurKind::Add:
5579 case RecurKind::FAdd:
5580 case RecurKind::And:
5581 case RecurKind::Or:
5582 case RecurKind::Xor:
5583 case RecurKind::SMin:
5584 case RecurKind::SMax:
5585 case RecurKind::UMin:
5586 case RecurKind::UMax:
5587 case RecurKind::FMin:
5588 case RecurKind::FMax:
5589 case RecurKind::FMulAdd:
5590 case RecurKind::AnyOf:
// NOTE(review): a case label on doc line 5591 was lost in extraction.
5592 return true;
5593 default:
5594 return false;
5595 }
5596}
5597
// NOTE(review): the leading signature lines (doc lines 5598-5599, 5601) were
// lost in extraction; this is getMinMaxReductionCost(Intrinsic::ID,
// VectorType *, FastMathFlags, TTI::TargetCostKind) — confirm upstream.
5600 FastMathFlags FMF,
5602 // The code-generator is currently not able to handle scalable vectors
5603 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5604 // it. This change will be removed when code-generation for these types is
5605 // sufficiently reliable.
5606 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5607 if (VTy->getElementCount() == ElementCount::getScalable(1))
// NOTE(review): the invalid-cost return on doc line 5608 was lost in
// extraction.
5609
5610 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5611
// Without full fp16, f16 min/max reductions fall back to the generic model.
5612 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5613 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5614
// Each extra legalization step costs one min/max intrinsic on the legal type.
5615 InstructionCost LegalizationCost = 0;
5616 if (LT.first > 1) {
5617 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
5618 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5619 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
5620 }
5621
5622 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5623}
5624
// NOTE(review): the first signature line (doc line 5625) was lost in
// extraction; this is getArithmeticReductionCostSVE(unsigned, VectorType *,
// TTI::TargetCostKind) — confirm upstream.
5626 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
5627 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
// Splitting a too-wide type costs one extra arithmetic op per split step.
5628 InstructionCost LegalizationCost = 0;
5629 if (LT.first > 1) {
5630 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
5631 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
5632 LegalizationCost *= LT.first - 1;
5633 }
5634
5635 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5636 assert(ISD && "Invalid opcode");
5637 // Add the final reduction cost for the legal horizontal reduction
5638 switch (ISD) {
5639 case ISD::ADD:
5640 case ISD::AND:
5641 case ISD::OR:
5642 case ISD::XOR:
5643 case ISD::FADD:
5644 return LegalizationCost + 2;
5645 default:
// NOTE(review): the default-case return on doc line 5646 (presumably an
// invalid cost) was lost in extraction.
5647 }
5648}
5649
// NOTE(review): the leading signature lines (doc lines 5650-5651, 5653) were
// lost in extraction; this is getArithmeticReductionCost(unsigned,
// VectorType *, std::optional<FastMathFlags>, TTI::TargetCostKind) — confirm
// upstream. Additional fully-linked lines below (doc lines 5660, 5662, 5672,
// 5675) are likewise missing from this listing.
5652 std::optional<FastMathFlags> FMF,
5654 // The code-generator is currently not able to handle scalable vectors
5655 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5656 // it. This change will be removed when code-generation for these types is
5657 // sufficiently reliable.
5658 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
5659 if (VTy->getElementCount() == ElementCount::getScalable(1))
// NOTE(review): the invalid-cost return (doc line 5660) and the condition
// guarding the fixed-vector branch below (doc line 5662, presumably an
// ordered-FP-reduction check) were lost in extraction.
5661
5663 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
5664 InstructionCost BaseCost =
5665 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5666 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5667 // end up vectorizing for more computationally intensive loops.
5668 return BaseCost + FixedVTy->getNumElements();
5669 }
5670
5671 if (Opcode != Instruction::FAdd)
// NOTE(review): the return on doc line 5672 (presumably an invalid cost) was
// lost in extraction.
5673
5674 auto *VTy = cast<ScalableVectorType>(ValTy);
// NOTE(review): doc line 5675 (presumably `InstructionCost Cost =`) was lost
// in extraction.
5676 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
5677 Cost *= getMaxNumElements(VTy->getElementCount());
5678 return Cost;
5679 }
5680
5681 if (isa<ScalableVectorType>(ValTy))
5682 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
5683
5684 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5685 MVT MTy = LT.second;
5686 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5687 assert(ISD && "Invalid opcode");
5688
5689 // Horizontal adds can use the 'addv' instruction. We model the cost of these
5690 // instructions as twice a normal vector add, plus 1 for each legalization
5691 // step (LT.first). This is the only arithmetic vector reduction operation for
5692 // which we have an instruction.
5693 // OR, XOR and AND costs should match the codegen from:
5694 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5695 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5696 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
5697 static const CostTblEntry CostTblNoPairwise[]{
5698 {ISD::ADD, MVT::v8i8, 2},
5699 {ISD::ADD, MVT::v16i8, 2},
5700 {ISD::ADD, MVT::v4i16, 2},
5701 {ISD::ADD, MVT::v8i16, 2},
5702 {ISD::ADD, MVT::v2i32, 2},
5703 {ISD::ADD, MVT::v4i32, 2},
5704 {ISD::ADD, MVT::v2i64, 2},
5705 {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
5706 {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8
5707 {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr
5708 {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16
5709 {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr
5710 {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32
5711 {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov
5712 {ISD::XOR, MVT::v8i8, 5}, // Same as above for or...
5713 {ISD::XOR, MVT::v16i8, 7},
5714 {ISD::XOR, MVT::v4i16, 4},
5715 {ISD::XOR, MVT::v8i16, 6},
5716 {ISD::XOR, MVT::v2i32, 3},
5717 {ISD::XOR, MVT::v4i32, 5},
5718 {ISD::XOR, MVT::v2i64, 3},
5719 {ISD::AND, MVT::v8i8, 5}, // Same as above for or...
5720 {ISD::AND, MVT::v16i8, 7},
5721 {ISD::AND, MVT::v4i16, 4},
5722 {ISD::AND, MVT::v8i16, 6},
5723 {ISD::AND, MVT::v2i32, 3},
5724 {ISD::AND, MVT::v4i32, 5},
5725 {ISD::AND, MVT::v2i64, 3},
5726 };
5727 switch (ISD) {
5728 default:
5729 break;
5730 case ISD::FADD:
5731 if (Type *EltTy = ValTy->getScalarType();
5732 // FIXME: For half types without fullfp16 support, this could extend and
5733 // use a fp32 faddp reduction but current codegen unrolls.
5734 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5735 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5736 const unsigned NElts = MTy.getVectorNumElements();
5737 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5738 isPowerOf2_32(NElts))
5739 // Reduction corresponding to series of fadd instructions is lowered to
5740 // series of faddp instructions. faddp has latency/throughput that
5741 // matches fadd instruction and hence, every faddp instruction can be
5742 // considered to have a relative cost = 1 with
5743 // CostKind = TCK_RecipThroughput.
5744 // An faddp will pairwise add vector elements, so the size of input
5745 // vector reduces by half every time, requiring
5746 // #(faddp instructions) = log2_32(NElts).
5747 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
5748 }
5749 break;
5750 case ISD::ADD:
5751 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
5752 return (LT.first - 1) + Entry->Cost;
5753 break;
5754 case ISD::XOR:
5755 case ISD::AND:
5756 case ISD::OR:
5757 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
5758 if (!Entry)
5759 break;
5760 auto *ValVTy = cast<FixedVectorType>(ValTy);
5761 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
5762 isPowerOf2_32(ValVTy->getNumElements())) {
5763 InstructionCost ExtraCost = 0;
5764 if (LT.first != 1) {
5765 // Type needs to be split, so there is an extra cost of LT.first - 1
5766 // arithmetic ops.
5767 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
5768 MTy.getVectorNumElements());
5769 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5770 ExtraCost *= LT.first - 1;
5771 }
5772 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5773 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
5774 return Cost + ExtraCost;
5775 }
5776 break;
5777 }
5778 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5779}
5780
// NOTE(review): the first signature line (doc line 5781) was lost in
// extraction; this is getExtendedReductionCost — confirm upstream. Models
// extending add-reductions that map to UADDLV/UADDLP-style instructions.
5782 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
5783 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
5784 EVT VecVT = TLI->getValueType(DL, VecTy);
5785 EVT ResVT = TLI->getValueType(DL, ResTy);
5786
5787 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5788 VecVT.getSizeInBits() >= 64) {
5789 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5790
5791 // The legal cases are:
5792 // UADDLV 8/16/32->32
5793 // UADDLP 32->64
5794 unsigned RevVTSize = ResVT.getSizeInBits();
5795 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5796 RevVTSize <= 32) ||
5797 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
5798 RevVTSize <= 32) ||
5799 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
5800 RevVTSize <= 64))
// Two units per extra legalization step, plus two for the reduction itself.
5801 return (LT.first - 1) * 2 + 2;
5802 }
5803
5804 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
5805 CostKind);
5806}
5807
// NOTE(review): the return-type line (doc line 5808) was lost in extraction.
// Models multiply-accumulate reductions that can use UDOT/SDOT when the
// subtarget has the dot-product feature.
5809 AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
5810 Type *ResTy, VectorType *VecTy,
// NOTE(review): the CostKind parameter line (doc line 5811) was lost in
// extraction.
5812 EVT VecVT = TLI->getValueType(DL, VecTy);
5813 EVT ResVT = TLI->getValueType(DL, ResTy);
5814
5815 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
5816 RedOpcode == Instruction::Add) {
5817 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5818
5819 // The legal cases with dotprod are
5820 // UDOT 8->32
5821 // Which requires an additional uaddv to sum the i32 values.
5822 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5823 ResVT == MVT::i32)
5824 return LT.first + 2;
5825 }
5826
5827 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
5828 CostKind);
5829}
5830
// NOTE(review): the function signature (doc lines 5830-5833) was lost in
// extraction; the body uses `Tp` and `Index`, matching getSpliceCost(
// VectorType *, int, TTI::TargetCostKind) — confirm upstream.
// Each legal scalable type costs one splice instruction.
5834 static const CostTblEntry ShuffleTbl[] = {
5835 { TTI::SK_Splice, MVT::nxv16i8, 1 },
5836 { TTI::SK_Splice, MVT::nxv8i16, 1 },
5837 { TTI::SK_Splice, MVT::nxv4i32, 1 },
5838 { TTI::SK_Splice, MVT::nxv2i64, 1 },
5839 { TTI::SK_Splice, MVT::nxv2f16, 1 },
5840 { TTI::SK_Splice, MVT::nxv4f16, 1 },
5841 { TTI::SK_Splice, MVT::nxv8f16, 1 },
5842 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
5843 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
5844 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
5845 { TTI::SK_Splice, MVT::nxv2f32, 1 },
5846 { TTI::SK_Splice, MVT::nxv4f32, 1 },
5847 { TTI::SK_Splice, MVT::nxv2f64, 1 },
5848 };
5849
5850 // The code-generator is currently not able to handle scalable vectors
5851 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5852 // it. This change will be removed when code-generation for these types is
5853 // sufficiently reliable.
// NOTE(review): the <vscale x 1> check and its invalid-cost return (doc lines
// 5854-5855) were lost in extraction.
5856
5857 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
5858 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
// i1 predicate splices are costed on the promoted integer type.
5859 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
5860 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
5861 : LT.second;
5862 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
5863 InstructionCost LegalizationCost = 0;
// A negative index requires a compare+select sequence to build the mask.
5864 if (Index < 0) {
5865 LegalizationCost =
5866 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
// NOTE(review): the continuation lines with predicate/cost-kind arguments
// (doc lines 5867, 5869) were lost in extraction.
5868 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
5870 }
5871
5872 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
5873 // Cost performed on a promoted type.
5874 if (LT.second.getScalarType() == MVT::i1) {
5875 LegalizationCost +=
5876 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
// NOTE(review): the cast-kind/cost-kind continuation lines (doc lines 5877,
// 5879) were lost in extraction.
5878 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
5880 }
5881 const auto *Entry =
5882 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
5883 assert(Entry && "Illegal Type for Splice");
5884 LegalizationCost += Entry->Cost;
5885 return LegalizationCost * LT.first;
5887
// NOTE(review): the leading signature lines (doc lines 5887-5888, 5890) were
// lost in extraction; the parameters below match getPartialReductionCost —
// confirm upstream. Additional fully-linked lines (doc lines 5893, 5895,
// 5946-5948, 5951, 5970) are likewise missing from this listing.
5889 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
5891 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
5892 TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
// NOTE(review): doc line 5893 (presumably declaring `Invalid`) and the guard
// on doc line 5895 were lost in extraction.
5894
5896 return Invalid;
5897
// Fixed VFs need either SVE/streaming SVE or NEON with dot-product support.
5898 if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
5899 (!ST->isNeonAvailable() || !ST->hasDotProd()))
5900 return Invalid;
5901
5902 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub &&
5903 Opcode != Instruction::FAdd) ||
5904 OpAExtend == TTI::PR_None)
5905 return Invalid;
5906
5907 // Floating-point partial reductions are invalid if `reassoc` and `contract`
5908 // are not allowed.
5909 if (AccumType->isFloatingPointTy()) {
5910 assert(FMF && "Missing FastMathFlags for floating-point partial reduction");
5911 if (!FMF->allowReassoc() || !FMF->allowContract())
5912 return Invalid;
5913 } else {
5914 assert(!FMF &&
5915 "FastMathFlags only apply to floating-point partial reductions");
5916 }
5917
5918 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
5919 (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) &&
5920 "Unexpected values for OpBExtend or InputTypeB");
5921
5922 // We only support multiply binary operations for now, and for muls we
5923 // require the types being extended to be the same.
5924 if (BinOp && ((*BinOp != Instruction::Mul && *BinOp != Instruction::FMul) ||
5925 InputTypeA != InputTypeB))
5926 return Invalid;
5927
// Mixed-sign extends correspond to USDOT, which needs the i8mm feature.
5928 bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
5929 if (IsUSDot && !ST->hasMatMulInt8())
5930 return Invalid;
5931
// The VF must provide more input elements than the accumulator ratio, or no
// actual "partial" reduction takes place.
5932 unsigned Ratio =
5933 AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
5934 if (VF.getKnownMinValue() <= Ratio)
5935 return Invalid;
5936
5937 VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
5938 VectorType *AccumVectorType =
5939 VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
5940 // We don't yet support all kinds of legalization.
5941 auto TC = TLI->getTypeConversion(AccumVectorType->getContext(),
5942 EVT::getEVT(AccumVectorType));
5943 switch (TC.first) {
5944 default:
5945 return Invalid;
// NOTE(review): the accepted legalization-kind case labels (doc lines
// 5946-5948) and part of the TypeLegal comparison (doc line 5951) were lost
// in extraction.
5949 // The legalised type (e.g. after splitting) must be legal too.
5950 if (TLI->getTypeAction(AccumVectorType->getContext(), TC.second) !=
5952 return Invalid;
5953 break;
5954 }
5955
5956 std::pair<InstructionCost, MVT> AccumLT =
5957 getTypeLegalizationCost(AccumVectorType);
5958 std::pair<InstructionCost, MVT> InputLT =
5959 getTypeLegalizationCost(InputVectorType);
5960
// Base cost: one basic op per legalized input vector.
5961 InstructionCost Cost = InputLT.first * TTI::TCC_Basic;
5962
5963 // The sub/negation cannot be folded into the operands of
5964 // ISD::PARTIAL_REDUCE_*MLA, so make the cost more expensive.
5965 if (Opcode == Instruction::Sub)
5966 Cost += 8;
5967
5968 // Prefer using full types by costing half-full input types as more expensive.
5969 if (TypeSize::isKnownLT(InputVectorType->getPrimitiveSizeInBits(),
// NOTE(review): the right-hand side of this comparison (doc line 5970) was
// lost in extraction.
5971 // FIXME: This can be removed after the cost of the extends are folded into
5972 // the dot-product expression in VPlan, after landing:
5973 // https://github.com/llvm/llvm-project/pull/147302
5974 Cost *= 2;
5975
5976 if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
5977 // i16 -> i64 is natively supported for udot/sdot
5978 if (AccumLT.second.getScalarType() == MVT::i64 &&
5979 InputLT.second.getScalarType() == MVT::i16)
5980 return Cost;
5981 // i16 -> i32 is natively supported with SVE2p1
5982 if (AccumLT.second.getScalarType() == MVT::i32 &&
5983 InputLT.second.getScalarType() == MVT::i16 &&
5984 (ST->hasSVE2p1() || ST->hasSME2()))
5985 return Cost;
5986 // i8 -> i64 is supported with an extra level of extends
5987 if (AccumLT.second.getScalarType() == MVT::i64 &&
5988 InputLT.second.getScalarType() == MVT::i8)
5989 // FIXME: This cost should probably be a little higher, e.g. Cost + 2
5990 // because it requires two extra extends on the inputs. But if we'd change
5991 // that now, a regular reduction would be cheaper because the costs of
5992 // the extends in the IR are still counted. This can be fixed
5993 // after https://github.com/llvm/llvm-project/pull/147302 has landed.
5994 return Cost;
5995 }
5996
5997 // i8 -> i32 is natively supported for udot/sdot/usdot, both for NEON and SVE.
5998 if (ST->isSVEorStreamingSVEAvailable() ||
5999 (AccumLT.second.isFixedLengthVector() && ST->isNeonAvailable() &&
6000 ST->hasDotProd())) {
6001 if (AccumLT.second.getScalarType() == MVT::i32 &&
6002 InputLT.second.getScalarType() == MVT::i8)
6003 return Cost;
6004 }
6005
6006 // f16 -> f32 is natively supported for fdot
6007 if (Opcode == Instruction::FAdd && (ST->hasSME2() || ST->hasSVE2p1())) {
6008 if (AccumLT.second.getScalarType() == MVT::f32 &&
6009 InputLT.second.getScalarType() == MVT::f16 &&
6010 AccumLT.second.getVectorMinNumElements() == 4 &&
6011 InputLT.second.getVectorMinNumElements() == 8)
6012 return Cost;
6013 // Floating-point types aren't promoted, so expanding the partial reduction
6014 // is more expensive.
6015 return Cost + 20;
6016 }
6017
6018 // Add additional cost for the extends that would need to be inserted.
6019 return Cost + 2;
6020}
6021
6024 VectorType *SrcTy, ArrayRef<int> Mask,
6025 TTI::TargetCostKind CostKind, int Index,
6027 const Instruction *CxtI) const {
6028 assert((Mask.empty() || DstTy->isScalableTy() ||
6029 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
6030 "Expected the Mask to match the return size if given");
6031 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
6032 "Expected the same scalar types");
6033 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
6034
6035 // If we have a Mask, and the LT is being legalized somehow, split the Mask
6036 // into smaller vectors and sum the cost of each shuffle.
6037 if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
6038 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
6039 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
6040 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
6041 // Check for LD3/LD4 instructions, which are represented in llvm IR as
6042 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
6043 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
6044 // cost than just the load.
6045 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
6048 return std::max<InstructionCost>(1, LT.first / 4);
6049
6050 // Check for ST3/ST4 instructions, which are represented in llvm IR as
6051 // store(interleaving-shuffle). The shuffle cost could potentially be free,
6052 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
6053 // cost than just the store.
6054 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
6056 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
6058 Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
6059 return LT.first;
6060
6061 unsigned TpNumElts = Mask.size();
6062 unsigned LTNumElts = LT.second.getVectorNumElements();
6063 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
6064 VectorType *NTp = VectorType::get(SrcTy->getScalarType(),
6065 LT.second.getVectorElementCount());
6067 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
6068 PreviousCosts;
6069 for (unsigned N = 0; N < NumVecs; N++) {
6070 SmallVector<int> NMask;
6071 // Split the existing mask into chunks of size LTNumElts. Track the source
6072 // sub-vectors to ensure the result has at most 2 inputs.
6073 unsigned Source1 = -1U, Source2 = -1U;
6074 unsigned NumSources = 0;
6075 for (unsigned E = 0; E < LTNumElts; E++) {
6076 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
6078 if (MaskElt < 0) {
6080 continue;
6081 }
6082
6083 // Calculate which source from the input this comes from and whether it
6084 // is new to us.
6085 unsigned Source = MaskElt / LTNumElts;
6086 if (NumSources == 0) {
6087 Source1 = Source;
6088 NumSources = 1;
6089 } else if (NumSources == 1 && Source != Source1) {
6090 Source2 = Source;
6091 NumSources = 2;
6092 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
6093 NumSources++;
6094 }
6095
6096 // Add to the new mask. For the NumSources>2 case these are not correct,
6097 // but are only used for the modular lane number.
6098 if (Source == Source1)
6099 NMask.push_back(MaskElt % LTNumElts);
6100 else if (Source == Source2)
6101 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
6102 else
6103 NMask.push_back(MaskElt % LTNumElts);
6104 }
6105 // Check if we have already generated this sub-shuffle, which means we
6106 // will have already generated the output. For example a <16 x i32> splat
6107 // will be the same sub-splat 4 times, which only needs to be generated
6108 // once and reused.
6109 auto Result =
6110 PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0});
6111 // Check if it was already in the map (already costed).
6112 if (!Result.second)
6113 continue;
6114 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
6115 // getShuffleCost. If not then cost it using the worst case as the number
6116 // of element moves into a new vector.
6117 InstructionCost NCost =
6118 NumSources <= 2
6119 ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
6121 NTp, NTp, NMask, CostKind, 0, nullptr, Args,
6122 CxtI)
6123 : LTNumElts;
6124 Result.first->second = NCost;
6125 Cost += NCost;
6126 }
6127 return Cost;
6128 }
6129
6130 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
6131 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
6132 // A subvector extract can be implemented with a NEON/SVE ext (or trivial
6133 // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
6134 // This currently only handles low or high extracts to prevent SLP vectorizer
6135 // regressions.
6136 // Note that SVE's ext instruction is destructive, but it can be fused with
6137 // a movprfx to act like a constructive instruction.
6138 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
6139 if (LT.second.getFixedSizeInBits() >= 128 &&
6140 cast<FixedVectorType>(SubTp)->getNumElements() ==
6141 LT.second.getVectorNumElements() / 2) {
6142 if (Index == 0)
6143 return 0;
6144 if (Index == (int)LT.second.getVectorNumElements() / 2)
6145 return 1;
6146 }
6148 }
6149 // FIXME: This was added to keep the costs equal when adding DstTys. Update
6150 // the code to handle length-changing shuffles.
6151 if (Kind == TTI::SK_InsertSubvector) {
6152 LT = getTypeLegalizationCost(DstTy);
6153 SrcTy = DstTy;
6154 }
6155
6156 // Check for identity masks, which we can treat as free for both fixed and
6157 // scalable vector paths.
6158 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
6159 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6160 all_of(enumerate(Mask), [](const auto &M) {
6161 return M.value() < 0 || M.value() == (int)M.index();
6162 }))
6163 return 0;
6164
6165 // Segmented shuffle matching.
6166 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
6167 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
6168 SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
6170
6172 unsigned Segments =
6174 unsigned SegmentElts = VTy->getNumElements() / Segments;
6175
6176 // dupq zd.t, zn.t[idx]
6177 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
6178 ST->isSVEorStreamingSVEAvailable() &&
6179 isDUPQMask(Mask, Segments, SegmentElts))
6180 return LT.first;
6181
6182 // mov zd.q, vn
6183 if (ST->isSVEorStreamingSVEAvailable() &&
6184 isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
6185 return LT.first;
6186 }
6187
6188 // Check for broadcast loads, which are supported by the LD1R instruction.
6189 // In terms of code-size, the shuffle vector is free when a load + dup get
6190 // folded into a LD1R. That's what we check and return here. For performance
6191 // and reciprocal throughput, a LD1R is not completely free. In this case, we
6192 // return the cost for the broadcast below (i.e. 1 for most/all types), so
6193 // that we model the load + dup sequence slightly higher because LD1R is a
6194 // high latency instruction.
6195 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
6196 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
6197 if (IsLoad && LT.second.isVector() &&
6198 isLegalBroadcastLoad(SrcTy->getElementType(),
6199 LT.second.getVectorElementCount()))
6200 return 0;
6201 }
6202
6203 // If we have 4 elements for the shuffle and a Mask, get the cost straight
6204 // from the perfect shuffle tables.
6205 if (Mask.size() == 4 &&
6206 SrcTy->getElementCount() == ElementCount::getFixed(4) &&
6207 (SrcTy->getScalarSizeInBits() == 16 ||
6208 SrcTy->getScalarSizeInBits() == 32) &&
6209 all_of(Mask, [](int E) { return E < 8; }))
6210 return getPerfectShuffleCost(Mask);
6211
6212 // Check for other shuffles that are not SK_ kinds but we have native
6213 // instructions for, for example ZIP and UZP.
6214 unsigned Unused;
6215 if (LT.second.isFixedLengthVector() &&
6216 LT.second.getVectorNumElements() == Mask.size() &&
6217 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc ||
6218 // Discrepancies between isTRNMask and ShuffleVectorInst::isTransposeMask
6219 // mean that we can end up with shuffles that satisfy isTRNMask, but end
6220 // up labelled as TTI::SK_InsertSubvector. (e.g. {2, 0}).
6221 Kind == TTI::SK_InsertSubvector) &&
6222 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6223 isTRNMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6224 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
6225 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6226 LT.second.getVectorNumElements(), 16) ||
6227 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6228 LT.second.getVectorNumElements(), 32) ||
6229 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6230 LT.second.getVectorNumElements(), 64) ||
6231 // Check for non-zero lane splats
6232 all_of(drop_begin(Mask),
6233 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
6234 return 1;
6235
6236 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
6237 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
6238 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
6239 static const CostTblEntry ShuffleTbl[] = {
6240 // Broadcast shuffle kinds can be performed with 'dup'.
6241 {TTI::SK_Broadcast, MVT::v8i8, 1},
6242 {TTI::SK_Broadcast, MVT::v16i8, 1},
6243 {TTI::SK_Broadcast, MVT::v4i16, 1},
6244 {TTI::SK_Broadcast, MVT::v8i16, 1},
6245 {TTI::SK_Broadcast, MVT::v2i32, 1},
6246 {TTI::SK_Broadcast, MVT::v4i32, 1},
6247 {TTI::SK_Broadcast, MVT::v2i64, 1},
6248 {TTI::SK_Broadcast, MVT::v4f16, 1},
6249 {TTI::SK_Broadcast, MVT::v8f16, 1},
6250 {TTI::SK_Broadcast, MVT::v4bf16, 1},
6251 {TTI::SK_Broadcast, MVT::v8bf16, 1},
6252 {TTI::SK_Broadcast, MVT::v2f32, 1},
6253 {TTI::SK_Broadcast, MVT::v4f32, 1},
6254 {TTI::SK_Broadcast, MVT::v2f64, 1},
6255 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
6256 // 'zip1/zip2' instructions.
6257 {TTI::SK_Transpose, MVT::v8i8, 1},
6258 {TTI::SK_Transpose, MVT::v16i8, 1},
6259 {TTI::SK_Transpose, MVT::v4i16, 1},
6260 {TTI::SK_Transpose, MVT::v8i16, 1},
6261 {TTI::SK_Transpose, MVT::v2i32, 1},
6262 {TTI::SK_Transpose, MVT::v4i32, 1},
6263 {TTI::SK_Transpose, MVT::v2i64, 1},
6264 {TTI::SK_Transpose, MVT::v4f16, 1},
6265 {TTI::SK_Transpose, MVT::v8f16, 1},
6266 {TTI::SK_Transpose, MVT::v4bf16, 1},
6267 {TTI::SK_Transpose, MVT::v8bf16, 1},
6268 {TTI::SK_Transpose, MVT::v2f32, 1},
6269 {TTI::SK_Transpose, MVT::v4f32, 1},
6270 {TTI::SK_Transpose, MVT::v2f64, 1},
6271 // Select shuffle kinds.
6272 // TODO: handle vXi8/vXi16.
6273 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
6274 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
6275 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
6276 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
6277 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
6278 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
6279 // PermuteSingleSrc shuffle kinds.
6280 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
6281 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
6282 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
6283 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
6284 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
6285 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
6286 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
6287 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
6288 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
6289 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
6290 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
6291 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
6292 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
6293 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
6294 // Reverse can be lowered with `rev`.
6295 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
6296 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
6297 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
6298 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
6299 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
6300 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
6301 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
6302 {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
6303 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
6304 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
6305 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
6306 {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
6307 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
6308 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
6309 // Splice can all be lowered as `ext`.
6310 {TTI::SK_Splice, MVT::v2i32, 1},
6311 {TTI::SK_Splice, MVT::v4i32, 1},
6312 {TTI::SK_Splice, MVT::v2i64, 1},
6313 {TTI::SK_Splice, MVT::v2f32, 1},
6314 {TTI::SK_Splice, MVT::v4f32, 1},
6315 {TTI::SK_Splice, MVT::v2f64, 1},
6316 {TTI::SK_Splice, MVT::v8f16, 1},
6317 {TTI::SK_Splice, MVT::v8bf16, 1},
6318 {TTI::SK_Splice, MVT::v8i16, 1},
6319 {TTI::SK_Splice, MVT::v16i8, 1},
6320 {TTI::SK_Splice, MVT::v4f16, 1},
6321 {TTI::SK_Splice, MVT::v4bf16, 1},
6322 {TTI::SK_Splice, MVT::v4i16, 1},
6323 {TTI::SK_Splice, MVT::v8i8, 1},
6324 // Broadcast shuffle kinds for scalable vectors
6325 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
6326 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
6327 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
6328 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
6329 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
6330 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
6331 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
6332 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
6333 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
6334 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
6335 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
6336 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
6337 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
6338 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
6339 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
6340 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
6341 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
6342 // Handle the cases for vector.reverse with scalable vectors
6343 {TTI::SK_Reverse, MVT::nxv16i8, 1},
6344 {TTI::SK_Reverse, MVT::nxv8i16, 1},
6345 {TTI::SK_Reverse, MVT::nxv4i32, 1},
6346 {TTI::SK_Reverse, MVT::nxv2i64, 1},
6347 {TTI::SK_Reverse, MVT::nxv2f16, 1},
6348 {TTI::SK_Reverse, MVT::nxv4f16, 1},
6349 {TTI::SK_Reverse, MVT::nxv8f16, 1},
6350 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
6351 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
6352 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
6353 {TTI::SK_Reverse, MVT::nxv2f32, 1},
6354 {TTI::SK_Reverse, MVT::nxv4f32, 1},
6355 {TTI::SK_Reverse, MVT::nxv2f64, 1},
6356 {TTI::SK_Reverse, MVT::nxv16i1, 1},
6357 {TTI::SK_Reverse, MVT::nxv8i1, 1},
6358 {TTI::SK_Reverse, MVT::nxv4i1, 1},
6359 {TTI::SK_Reverse, MVT::nxv2i1, 1},
6360 };
6361 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
6362 return LT.first * Entry->Cost;
6363 }
6364
6365 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
6366 return getSpliceCost(SrcTy, Index, CostKind);
6367
6368 // Inserting a subvector can often be done with either a D, S or H register
6369 // move, so long as the inserted vector is "aligned".
6370 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
6371 LT.second.getSizeInBits() <= 128 && SubTp) {
6372 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
6373 if (SubLT.second.isVector()) {
6374 int NumElts = LT.second.getVectorNumElements();
6375 int NumSubElts = SubLT.second.getVectorNumElements();
6376 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
6377 return SubLT.first;
6378 }
6379 }
6380
6381 // Restore optimal kind.
6382 if (IsExtractSubvector)
6384 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
6385 Args, CxtI);
6386}
6387
6390 const DominatorTree &DT) {
6391 const auto &Strides = DenseMap<Value *, const SCEV *>();
6392 for (BasicBlock *BB : TheLoop->blocks()) {
6393 // Scan the instructions in the block and look for addresses that are
6394 // consecutive and decreasing.
6395 for (Instruction &I : *BB) {
6396 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
6398 Type *AccessTy = getLoadStoreType(&I);
6399 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
6400 /*Assume=*/true, /*ShouldCheckWrap=*/false)
6401 .value_or(0) < 0)
6402 return true;
6403 }
6404 }
6405 }
6406 return false;
6407}
6408
6410 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
6412 // For cases like post-LTO vectorization, when we eventually know the trip
6413 // count, epilogue with fixed-width vectorization can be deleted if the trip
6414 // count is less than the epilogue iterations. That's why we prefer
6415 // fixed-width vectorization in epilogue in case of equal costs.
6416 if (IsEpilogue)
6417 return true;
6418 return ST->useFixedOverScalableIfEqualCost();
6419}
6420
6422 return ST->getEpilogueVectorizationMinVF();
6423}
6424
6426 if (!ST->hasSVE())
6427 return false;
6428
6429 // We don't currently support vectorisation with interleaving for SVE - with
6430 // such loops we're better off not using tail-folding. This gives us a chance
6431 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
6432 if (TFI->IAI->hasGroups())
6433 return false;
6434
6436 if (TFI->LVL->getReductionVars().size())
6438 if (TFI->LVL->getFixedOrderRecurrences().size())
6440
6441 // We call this to discover whether any load/store pointers in the loop have
6442 // negative strides. This will require extra work to reverse the loop
6443 // predicate, which may be expensive.
6446 *TFI->LVL->getDominatorTree()))
6450
6451 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
6452 Required))
6453 return false;
6454
6455 // Don't tail-fold for tight loops where we would be better off interleaving
6456 // with an unpredicated loop.
6457 unsigned NumInsns = 0;
6458 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
6459 NumInsns += BB->sizeWithoutDebug();
6460 }
6461
6462 // We expect 4 of these to be a IV PHI, IV add, IV compare and branch.
6463 return NumInsns >= SVETailFoldInsnThreshold;
6464}
6465
6468 StackOffset BaseOffset, bool HasBaseReg,
6469 int64_t Scale, unsigned AddrSpace) const {
6470 // Scaling factors are not free at all.
6471 // Operands | Rt Latency
6472 // -------------------------------------------
6473 // Rt, [Xn, Xm] | 4
6474 // -------------------------------------------
6475 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
6476 // Rt, [Xn, Wm, <extend> #imm] |
6478 AM.BaseGV = BaseGV;
6479 AM.BaseOffs = BaseOffset.getFixed();
6480 AM.HasBaseReg = HasBaseReg;
6481 AM.Scale = Scale;
6482 AM.ScalableOffset = BaseOffset.getScalable();
6483 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6484 // Scale represents reg2 * scale, thus account for 1 if
6485 // it is not equal to 0 or 1.
6486 return AM.Scale != 0 && AM.Scale != 1;
6488}
6489
6491 const Instruction *I) const {
6493 // For the binary operators (e.g. or) we need to be more careful than
6494 // selects, here we only transform them if they are already at a natural
6495 // break point in the code - the end of a block with an unconditional
6496 // terminator.
6497 if (I->getOpcode() == Instruction::Or &&
6498 isa<BranchInst>(I->getNextNode()) &&
6499 cast<BranchInst>(I->getNextNode())->isUnconditional())
6500 return true;
6501
6502 if (I->getOpcode() == Instruction::Add ||
6503 I->getOpcode() == Instruction::Sub)
6504 return true;
6505 }
6507}
6508
6511 const TargetTransformInfo::LSRCost &C2) const {
6512 // AArch64 specific here is adding the number of instructions to the
6513 // comparison (though not as the first consideration, as some targets do)
6514 // along with changing the priority of the base additions.
6515 // TODO: Maybe a more nuanced tradeoff between instruction count
6516 // and number of registers? To be investigated at a later date.
6517 if (EnableLSRCostOpt)
6518 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
6519 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6520 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
6521 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6522
6524}
6525
6526static bool isSplatShuffle(Value *V) {
6527 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
6528 return all_equal(Shuf->getShuffleMask());
6529 return false;
6530}
6531
6532/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
6533/// or upper half of the vector elements.
6534static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
6535 bool AllowSplat = false) {
6536 // Scalable types can't be extract shuffle vectors.
6537 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
6538 return false;
6539
6540 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
6541 auto *FullTy = FullV->getType();
6542 auto *HalfTy = HalfV->getType();
6543 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
6544 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
6545 };
6546
6547 auto extractHalf = [](Value *FullV, Value *HalfV) {
6548 auto *FullVT = cast<FixedVectorType>(FullV->getType());
6549 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
6550 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
6551 };
6552
6553 ArrayRef<int> M1, M2;
6554 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
6555 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
6556 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
6557 return false;
6558
6559 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
6560 // it is not checked as an extract below.
6561 if (AllowSplat && isSplatShuffle(Op1))
6562 S1Op1 = nullptr;
6563 if (AllowSplat && isSplatShuffle(Op2))
6564 S2Op1 = nullptr;
6565
6566 // Check that the operands are half as wide as the result and we extract
6567 // half of the elements of the input vectors.
6568 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6569 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6570 return false;
6571
6572 // Check the mask extracts either the lower or upper half of vector
6573 // elements.
6574 int M1Start = 0;
6575 int M2Start = 0;
6576 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
6577 if ((S1Op1 &&
6578 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
6579 (S2Op1 &&
6580 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
6581 return false;
6582
6583 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6584 (M2Start != 0 && M2Start != (NumElements / 2)))
6585 return false;
6586 if (S1Op1 && S2Op1 && M1Start != M2Start)
6587 return false;
6588
6589 return true;
6590}
6591
6592/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6593/// of the vector elements.
6594static bool areExtractExts(Value *Ext1, Value *Ext2) {
6595 auto areExtDoubled = [](Instruction *Ext) {
6596 return Ext->getType()->getScalarSizeInBits() ==
6597 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
6598 };
6599
6600 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
6601 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
6602 !areExtDoubled(cast<Instruction>(Ext1)) ||
6603 !areExtDoubled(cast<Instruction>(Ext2)))
6604 return false;
6605
6606 return true;
6607}
6608
6609/// Check if Op could be used with vmull_high_p64 intrinsic.
6611 Value *VectorOperand = nullptr;
6612 ConstantInt *ElementIndex = nullptr;
6613 return match(Op, m_ExtractElt(m_Value(VectorOperand),
6614 m_ConstantInt(ElementIndex))) &&
6615 ElementIndex->getValue() == 1 &&
6616 isa<FixedVectorType>(VectorOperand->getType()) &&
6617 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
6618}
6619
6620/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6621static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6623}
6624
6626 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6627 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
6628 if (!GEP || GEP->getNumOperands() != 2)
6629 return false;
6630
6631 Value *Base = GEP->getOperand(0);
6632 Value *Offsets = GEP->getOperand(1);
6633
6634 // We only care about scalar_base+vector_offsets.
6635 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6636 return false;
6637
6638 // Sink extends that would allow us to use 32-bit offset vectors.
6639 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
6640 auto *OffsetsInst = cast<Instruction>(Offsets);
6641 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6642 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
6643 Ops.push_back(&GEP->getOperandUse(1));
6644 }
6645
6646 // Sink the GEP.
6647 return true;
6648}
6649
6650/// We want to sink following cases:
6651/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
6652/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
6654 if (match(Op, m_VScale()))
6655 return true;
6656 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
6658 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6659 return true;
6660 }
6661 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
6663 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
6664 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
6665 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6666 return true;
6667 }
6668 return false;
6669}
6670
6671static bool isFNeg(Value *Op) { return match(Op, m_FNeg(m_Value())); }
6672
6673/// Check if sinking \p I's operands to I's basic block is profitable, because
6674/// the operands can be folded into a target instruction, e.g.
6675/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
6679 switch (II->getIntrinsicID()) {
6680 case Intrinsic::aarch64_neon_smull:
6681 case Intrinsic::aarch64_neon_umull:
6682 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
6683 /*AllowSplat=*/true)) {
6684 Ops.push_back(&II->getOperandUse(0));
6685 Ops.push_back(&II->getOperandUse(1));
6686 return true;
6687 }
6688 [[fallthrough]];
6689
6690 case Intrinsic::fma:
6691 case Intrinsic::fmuladd:
6692 if (isa<VectorType>(I->getType()) &&
6693 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6694 !ST->hasFullFP16())
6695 return false;
6696
6697 if (isFNeg(II->getOperand(0)))
6698 Ops.push_back(&II->getOperandUse(0));
6699 if (isFNeg(II->getOperand(1)))
6700 Ops.push_back(&II->getOperandUse(1));
6701
6702 [[fallthrough]];
6703 case Intrinsic::aarch64_neon_sqdmull:
6704 case Intrinsic::aarch64_neon_sqdmulh:
6705 case Intrinsic::aarch64_neon_sqrdmulh:
6706 // Sink splats for index lane variants
6707 if (isSplatShuffle(II->getOperand(0)))
6708 Ops.push_back(&II->getOperandUse(0));
6709 if (isSplatShuffle(II->getOperand(1)))
6710 Ops.push_back(&II->getOperandUse(1));
6711 return !Ops.empty();
6712 case Intrinsic::aarch64_neon_fmlal:
6713 case Intrinsic::aarch64_neon_fmlal2:
6714 case Intrinsic::aarch64_neon_fmlsl:
6715 case Intrinsic::aarch64_neon_fmlsl2:
6716 // Sink splats for index lane variants
6717 if (isSplatShuffle(II->getOperand(1)))
6718 Ops.push_back(&II->getOperandUse(1));
6719 if (isSplatShuffle(II->getOperand(2)))
6720 Ops.push_back(&II->getOperandUse(2));
6721 return !Ops.empty();
6722 case Intrinsic::aarch64_sve_ptest_first:
6723 case Intrinsic::aarch64_sve_ptest_last:
6724 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
6725 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6726 Ops.push_back(&II->getOperandUse(0));
6727 return !Ops.empty();
6728 case Intrinsic::aarch64_sme_write_horiz:
6729 case Intrinsic::aarch64_sme_write_vert:
6730 case Intrinsic::aarch64_sme_writeq_horiz:
6731 case Intrinsic::aarch64_sme_writeq_vert: {
6732 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
6733 if (!Idx || Idx->getOpcode() != Instruction::Add)
6734 return false;
6735 Ops.push_back(&II->getOperandUse(1));
6736 return true;
6737 }
6738 case Intrinsic::aarch64_sme_read_horiz:
6739 case Intrinsic::aarch64_sme_read_vert:
6740 case Intrinsic::aarch64_sme_readq_horiz:
6741 case Intrinsic::aarch64_sme_readq_vert:
6742 case Intrinsic::aarch64_sme_ld1b_vert:
6743 case Intrinsic::aarch64_sme_ld1h_vert:
6744 case Intrinsic::aarch64_sme_ld1w_vert:
6745 case Intrinsic::aarch64_sme_ld1d_vert:
6746 case Intrinsic::aarch64_sme_ld1q_vert:
6747 case Intrinsic::aarch64_sme_st1b_vert:
6748 case Intrinsic::aarch64_sme_st1h_vert:
6749 case Intrinsic::aarch64_sme_st1w_vert:
6750 case Intrinsic::aarch64_sme_st1d_vert:
6751 case Intrinsic::aarch64_sme_st1q_vert:
6752 case Intrinsic::aarch64_sme_ld1b_horiz:
6753 case Intrinsic::aarch64_sme_ld1h_horiz:
6754 case Intrinsic::aarch64_sme_ld1w_horiz:
6755 case Intrinsic::aarch64_sme_ld1d_horiz:
6756 case Intrinsic::aarch64_sme_ld1q_horiz:
6757 case Intrinsic::aarch64_sme_st1b_horiz:
6758 case Intrinsic::aarch64_sme_st1h_horiz:
6759 case Intrinsic::aarch64_sme_st1w_horiz:
6760 case Intrinsic::aarch64_sme_st1d_horiz:
6761 case Intrinsic::aarch64_sme_st1q_horiz: {
6762 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
6763 if (!Idx || Idx->getOpcode() != Instruction::Add)
6764 return false;
6765 Ops.push_back(&II->getOperandUse(3));
6766 return true;
6767 }
6768 case Intrinsic::aarch64_neon_pmull:
6769 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
6770 return false;
6771 Ops.push_back(&II->getOperandUse(0));
6772 Ops.push_back(&II->getOperandUse(1));
6773 return true;
6774 case Intrinsic::aarch64_neon_pmull64:
6775 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
6776 II->getArgOperand(1)))
6777 return false;
6778 Ops.push_back(&II->getArgOperandUse(0));
6779 Ops.push_back(&II->getArgOperandUse(1));
6780 return true;
6781 case Intrinsic::masked_gather:
6782 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
6783 return false;
6784 Ops.push_back(&II->getArgOperandUse(0));
6785 return true;
6786 case Intrinsic::masked_scatter:
6787 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
6788 return false;
6789 Ops.push_back(&II->getArgOperandUse(1));
6790 return true;
6791 default:
6792 return false;
6793 }
6794 }
6795
6796 auto ShouldSinkCondition = [](Value *Cond,
6797 SmallVectorImpl<Use *> &Ops) -> bool {
6799 return false;
6801 if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
6802 !isa<ScalableVectorType>(II->getOperand(0)->getType()))
6803 return false;
6804 if (isa<CmpInst>(II->getOperand(0)))
6805 Ops.push_back(&II->getOperandUse(0));
6806 return true;
6807 };
6808
6809 switch (I->getOpcode()) {
6810 case Instruction::GetElementPtr:
6811 case Instruction::Add:
6812 case Instruction::Sub:
6813 // Sink vscales closer to uses for better isel
6814 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
6815 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
6816 Ops.push_back(&I->getOperandUse(Op));
6817 return true;
6818 }
6819 }
6820 break;
6821 case Instruction::Select: {
6822 if (!ShouldSinkCondition(I->getOperand(0), Ops))
6823 return false;
6824
6825 Ops.push_back(&I->getOperandUse(0));
6826 return true;
6827 }
6828 case Instruction::Br: {
6829 if (cast<BranchInst>(I)->isUnconditional())
6830 return false;
6831
6832 if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition(), Ops))
6833 return false;
6834
6835 Ops.push_back(&I->getOperandUse(0));
6836 return true;
6837 }
6838 case Instruction::FMul:
6839 // fmul with contract flag can be combined with fadd into fma.
6840 // Sinking fneg into this block enables fmls pattern.
6841 if (cast<FPMathOperator>(I)->hasAllowContract()) {
6842 if (isFNeg(I->getOperand(0)))
6843 Ops.push_back(&I->getOperandUse(0));
6844 if (isFNeg(I->getOperand(1)))
6845 Ops.push_back(&I->getOperandUse(1));
6846 }
6847 break;
6848
6849 default:
6850 break;
6851 }
6852
6853 if (!I->getType()->isVectorTy())
6854 return !Ops.empty();
6855
6856 switch (I->getOpcode()) {
6857 case Instruction::Sub:
6858 case Instruction::Add: {
6859 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
6860 return false;
6861
6862 // If the exts' operands extract either the lower or upper elements, we
6863 // can sink them too.
6864 auto Ext1 = cast<Instruction>(I->getOperand(0));
6865 auto Ext2 = cast<Instruction>(I->getOperand(1));
6866 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
6867 Ops.push_back(&Ext1->getOperandUse(0));
6868 Ops.push_back(&Ext2->getOperandUse(0));
6869 }
6870
6871 Ops.push_back(&I->getOperandUse(0));
6872 Ops.push_back(&I->getOperandUse(1));
6873
6874 return true;
6875 }
6876 case Instruction::Or: {
6877 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
6878 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
6879 if (ST->hasNEON()) {
6880 Instruction *OtherAnd, *IA, *IB;
6881 Value *MaskValue;
6882 // MainAnd refers to And instruction that has 'Not' as one of its operands
6883 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
6884 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
6885 m_Instruction(IA)))))) {
6886 if (match(OtherAnd,
6887 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
6888 Instruction *MainAnd = I->getOperand(0) == OtherAnd
6889 ? cast<Instruction>(I->getOperand(1))
6890 : cast<Instruction>(I->getOperand(0));
6891
6892 // Both Ands should be in same basic block as Or
6893 if (I->getParent() != MainAnd->getParent() ||
6894 I->getParent() != OtherAnd->getParent())
6895 return false;
6896
6897 // Non-mask operands of both Ands should also be in same basic block
6898 if (I->getParent() != IA->getParent() ||
6899 I->getParent() != IB->getParent())
6900 return false;
6901
6902 Ops.push_back(
6903 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
6904 Ops.push_back(&I->getOperandUse(0));
6905 Ops.push_back(&I->getOperandUse(1));
6906
6907 return true;
6908 }
6909 }
6910 }
6911
6912 return false;
6913 }
6914 case Instruction::Mul: {
6915 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
6916 auto *Ty = cast<VectorType>(V->getType());
6917 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6918 if (Ty->isScalableTy())
6919 return false;
6920
6921 // Indexed variants of Mul exist for i16 and i32 element types only.
6922 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
6923 };
6924
6925 int NumZExts = 0, NumSExts = 0;
6926 for (auto &Op : I->operands()) {
6927 // Make sure we are not already sinking this operand
6928 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
6929 continue;
6930
6931 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
6932 auto *Ext = cast<Instruction>(Op);
6933 auto *ExtOp = Ext->getOperand(0);
6934 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
6935 Ops.push_back(&Ext->getOperandUse(0));
6936 Ops.push_back(&Op);
6937
6938 if (isa<SExtInst>(Ext)) {
6939 NumSExts++;
6940 } else {
6941 NumZExts++;
6942 // A zext(a) is also a sext(zext(a)), if we take more than 2 steps.
6943 if (Ext->getOperand(0)->getType()->getScalarSizeInBits() * 2 <
6944 I->getType()->getScalarSizeInBits())
6945 NumSExts++;
6946 }
6947
6948 continue;
6949 }
6950
6952 if (!Shuffle)
6953 continue;
6954
6955 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
6956 // operand and the s/zext can help create indexed s/umull. This is
6957 // especially useful to prevent i64 mul being scalarized.
6958 if (isSplatShuffle(Shuffle) &&
6959 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
6960 Ops.push_back(&Shuffle->getOperandUse(0));
6961 Ops.push_back(&Op);
6962 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
6963 NumSExts++;
6964 else
6965 NumZExts++;
6966 continue;
6967 }
6968
6969 Value *ShuffleOperand = Shuffle->getOperand(0);
6970 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
6971 if (!Insert)
6972 continue;
6973
6974 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
6975 if (!OperandInstr)
6976 continue;
6977
6978 ConstantInt *ElementConstant =
6979 dyn_cast<ConstantInt>(Insert->getOperand(2));
6980 // Check that the insertelement is inserting into element 0
6981 if (!ElementConstant || !ElementConstant->isZero())
6982 continue;
6983
6984 unsigned Opcode = OperandInstr->getOpcode();
6985 if (Opcode == Instruction::SExt)
6986 NumSExts++;
6987 else if (Opcode == Instruction::ZExt)
6988 NumZExts++;
6989 else {
6990 // If we find that the top bits are known 0, then we can sink and allow
6991 // the backend to generate a umull.
6992 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
6993 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
6994 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
6995 continue;
6996 NumZExts++;
6997 }
6998
6999 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
7000 // the And, just to hoist it again back to the load.
7001 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
7002 Ops.push_back(&Insert->getOperandUse(1));
7003 Ops.push_back(&Shuffle->getOperandUse(0));
7004 Ops.push_back(&Op);
7005 }
7006
7007 // It is profitable to sink if we found two of the same type of extends.
7008 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
7009 return true;
7010
7011 // Otherwise, see if we should sink splats for indexed variants.
7012 if (!ShouldSinkSplatForIndexedVariant(I))
7013 return false;
7014
7015 Ops.clear();
7016 if (isSplatShuffle(I->getOperand(0)))
7017 Ops.push_back(&I->getOperandUse(0));
7018 if (isSplatShuffle(I->getOperand(1)))
7019 Ops.push_back(&I->getOperandUse(1));
7020
7021 return !Ops.empty();
7022 }
7023 case Instruction::FMul: {
7024 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
7025 if (I->getType()->isScalableTy())
7026 return !Ops.empty();
7027
7028 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
7029 !ST->hasFullFP16())
7030 return !Ops.empty();
7031
7032 // Sink splats for index lane variants
7033 if (isSplatShuffle(I->getOperand(0)))
7034 Ops.push_back(&I->getOperandUse(0));
7035 if (isSplatShuffle(I->getOperand(1)))
7036 Ops.push_back(&I->getOperandUse(1));
7037 return !Ops.empty();
7038 }
7039 default:
7040 return false;
7041 }
7042 return false;
7043}
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static std::optional< Instruction * > instCombinePTrue(InstCombiner &IC, IntrinsicInst &II)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static bool hasPossibleIncompatibleOps(const Function *F, const AArch64TargetLowering &TLI)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static InstructionCost getHistogramCost(const AArch64Subtarget *ST, const IntrinsicCostAttributes &ICA)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static void getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP, const AArch64TTIImpl &TTI)
For Apple CPUs, we want to runtime-unroll loops to make better use if the OOO engine's wide instructi...
static std::optional< Instruction * > instCombineWhilelo(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE, const AArch64TTIImpl &TTI)
static std::optional< Instruction * > simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI, InstructionCost Budget, unsigned *FinalSize)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineSVEInsr(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II, const AArch64Subtarget *ST)
static void extractAttrFeatures(const Function &F, const AArch64TTIImpl *TTI, SmallVectorImpl< StringRef > &Features)
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI, const AArch64TargetLowering &TLI)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static Value * stripInactiveLanes(Value *V, const Value *Pg)
static cl::opt< bool > SVEPreferFixedOverScalableIfEqualCost("sve-prefer-fixed-over-scalable-if-equal", cl::Hidden)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineDMB(InstCombiner &IC, IntrinsicInst &II)
static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< int > Aarch64ForceUnrollThreshold("aarch64-force-unroll-threshold", cl::init(0), cl::Hidden, cl::desc("Threshold for forced unrolling of small loops in AArch64"))
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction"))
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static bool isFNeg(Value *Op)
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE, const DominatorTree &DT)
static bool isSplatShuffle(Value *V)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > DMBLookaheadThreshold("dmb-lookahead-threshold", cl::init(10), cl::Hidden, cl::desc("The number of instructions to search for a redundant dmb"))
static std::optional< Instruction * > simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget *ST)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static std::optional< Instruction * > instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static std::optional< Instruction * > instCombineSVEUxt(InstCombiner &IC, IntrinsicInst &II, unsigned NumBits)
static cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file a TargetTransformInfoImplBase conforming object specific to the AArch64 target machine.
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
This file defines the DenseMap class.
@ Default
static Value * getCondition(Instruction *I)
Hexagon Common GEP
const HexagonInstrInfo * TII
#define _
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static uint64_t getBits(uint64_t Val, int Start, int End)
#define LLVM_DEBUG(...)
Definition Debug.h:114
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
BinaryOperator * Mul
unsigned getVectorInsertExtractBaseCost() const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntImmCost(int64_t Val) const
Calculate the cost of materializing a 64-bit value.
std::optional< InstructionCost > getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE, std::function< InstructionCost(Type *)> InstCost) const
FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the architecture features are not...
bool prefersVectorizedAddressing() const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isElementTypeLegalForScalableVector(Type *Ty) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
APInt getPriorityMask(const Function &F) const override
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
bool useNeonVector(const Type *Ty) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const override
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
bool isMultiversionedFunction(const Function &F) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedGatherScatter(Type *DataType) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
APInt getFeatureMask(const Function &F) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const override
bool enableScalableVectorization() const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType, bool CanCreate=true) const override
bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1, unsigned Opcode2) const
Check whether Opcode1 has less throughput according to the scheduling model than Opcode2.
unsigned getEpilogueVectorizationMinVF() const override
InstructionCost getSpliceCost(VectorType *Tp, int Index, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override
Class for arbitrary precision integers.
Definition APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:450
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1685
unsigned countLeadingOnes() const
Definition APInt.h:1639
void negate()
Negate this APInt in place.
Definition APInt.h:1483
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1052
unsigned logBase2() const
Definition APInt.h:1776
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:834
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1577
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isTypeLegal(Type *Ty) const override
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition InstrTypes.h:219
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
bool isUnsigned() const
Definition InstrTypes.h:936
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:219
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
static LLVM_ABI ConstantInt * getBool(LLVMContext &Context, bool V)
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:771
bool empty() const
Definition DenseMap.h:109
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition IRBuilder.h:93
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool allowContract() const
Definition FMF.h:72
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2561
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition IRBuilder.h:1110
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2549
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:574
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition IRBuilder.h:594
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:561
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition IRBuilder.h:579
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:1944
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2258
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2473
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1717
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2176
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1854
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2583
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1867
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:589
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2249
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
LLVM_ABI Value * CreateElementCount(Type *Ty, ElementCount EC)
Create an expression which evaluates to the number of elements in EC at runtime.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
This instruction inserts a single (scalar) element into a VectorType value.
The core instruction combiner logic.
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
bool hasGroups() const
Returns true if we have any interleave groups.
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
const FeatureBitset & getFeatureBits() const
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition MapVector.h:56
Information for memory intrinsic cost model.
const Instruction * getInst() const
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNonStreamingInterfaceAndBody() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool isSMEABIRoutine() const
bool hasStreamingBody() const
void set(unsigned M, bool Enable=true)
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresPreservingZT0() const
bool requiresPreservingAllZAState() const
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:824
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getSymbolicMaxBackedgeTakenCount(const Loop *L)
When successful, this returns a SCEV that is greater than or equal to (i.e.
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition StringRef.h:730
Class to represent struct types.
TargetInstrInfo - Interface to description of machine instruction set.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
const RTLIB::RuntimeLibcallsInfo & getRuntimeLibcallsInfo() const
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual const DataLayout & getDataLayout() const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual bool isLoweredToCall(const Function *F) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
@ Load
The value being inserted comes from a load (InsertElement only).
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:284
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:403
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:967
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:403
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
LLVM_ABI APInt getCpuSupportsMask(ArrayRef< StringRef > Features)
static constexpr unsigned SVEBitsPerBlock
LLVM_ABI APInt getFMVPriority(ArrayRef< StringRef > Features)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:993
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
LLVM_ABI Libcall getPOW(EVT RetVT)
getPOW - Return the POW_* value for the given types, or UNKNOWN_LIBCALL if there is none.
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> (WhichResultOut = 0,...
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
bool isDUPFirstSegmentMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPFirstSegmentMask - matches a splat of the first 128b segment.
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Uninitialized
Definition Threading.h:60
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:296
LLVM_ABI std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
TargetTransformInfo TTI
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FindLast
FindLast reduction with select(cmp(),x,y) where x and y.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2166
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> (WhichResultOut = 0,...
#define N
static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp()
static SVEIntrinsicInfo defaultZeroingOp()
SVEIntrinsicInfo & setOperandIdxInactiveLanesTakenFrom(unsigned Index)
static SVEIntrinsicInfo defaultMergingOp(Intrinsic::ID IID=Intrinsic::not_intrinsic)
SVEIntrinsicInfo & setOperandIdxWithNoActiveLanes(unsigned Index)
unsigned getOperandIdxWithNoActiveLanes() const
SVEIntrinsicInfo & setInactiveLanesAreUnused()
SVEIntrinsicInfo & setInactiveLanesAreNotDefined()
SVEIntrinsicInfo & setGoverningPredicateOperandIdx(unsigned Index)
static SVEIntrinsicInfo defaultUndefOp()
Intrinsic::ID getMatchingUndefIntrinsic() const
SVEIntrinsicInfo & setResultIsZeroInitialized()
static SVEIntrinsicInfo defaultMergingUnaryOp()
SVEIntrinsicInfo & setMatchingUndefIntrinsic(Intrinsic::ID IID)
unsigned getGoverningPredicateOperandIdx() const
SVEIntrinsicInfo & setMatchingIROpcode(unsigned Opcode)
unsigned getOperandIdxInactiveLanesTakenFrom() const
static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:292
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:381
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:367
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:393
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:324
bool isFixedLengthVector() const
Definition ValueTypes.h:189
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:331
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:182
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:336
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:344
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
bool isVariant() const
Definition MCSchedule.h:144
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
static LLVM_ABI double getReciprocalThroughput(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)
Matching combinators.
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if the expansion is for a comparison of the result with zero.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
bool RuntimeUnrollMultiExit
Allow runtime unrolling multi-exit loops.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
bool AddAdditionalAccumulators
Allow unrolling to add parallel reduction phis.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...