LLVM 23.0.0git
AArch64InstrInfo.cpp
Go to the documentation of this file.
1//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the AArch64 implementation of the TargetInstrInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64InstrInfo.h"
14#include "AArch64ExpandImm.h"
16#include "AArch64PointerAuth.h"
17#include "AArch64Subtarget.h"
21#include "llvm/ADT/ArrayRef.h"
22#include "llvm/ADT/STLExtras.h"
23#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/DebugLoc.h"
45#include "llvm/IR/GlobalValue.h"
46#include "llvm/IR/Module.h"
47#include "llvm/MC/MCAsmInfo.h"
48#include "llvm/MC/MCInst.h"
50#include "llvm/MC/MCInstrDesc.h"
55#include "llvm/Support/LEB128.h"
59#include <cassert>
60#include <cstdint>
61#include <iterator>
62#include <utility>
63
64using namespace llvm;
65
66#define GET_INSTRINFO_CTOR_DTOR
67#include "AArch64GenInstrInfo.inc"
68
69#define DEBUG_TYPE "AArch64InstrInfo"
70
71STATISTIC(NumCopyInstrs, "Number of COPY instructions expanded");
72STATISTIC(NumZCRegMoveInstrsGPR, "Number of zero-cycle GPR register move "
73 "instructions expanded from canonical COPY");
74STATISTIC(NumZCRegMoveInstrsFPR, "Number of zero-cycle FPR register move "
75 "instructions expanded from canonical COPY");
76STATISTIC(NumZCZeroingInstrsGPR, "Number of zero-cycle GPR zeroing "
77 "instructions expanded from canonical COPY");
78// NumZCZeroingInstrsFPR is counted at AArch64AsmPrinter
79
// Hidden command-line knobs. Per their descriptions, the "*-offset-bits"
// options restrict the range of each branch family (CB, TB[N]Z, CB[N]Z, Bcc,
// B) and are marked DEBUG; "aarch64-search-limit" restricts the range of
// instructions searched for the machine-combiner gather pattern optimization.
// NOTE(review): several declarator lines are absent from this listing, so the
// variable names of the unnamed entries cannot be confirmed from here.
81 CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9),
82 cl::desc("Restrict range of CB instructions (DEBUG)"));
83
85 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
86 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
87
89 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
90 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
91
93 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
94 cl::desc("Restrict range of Bcc instructions (DEBUG)"));
95
97 BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
98 cl::desc("Restrict range of B instructions (DEBUG)"));
99
101 "aarch64-search-limit", cl::Hidden, cl::init(2048),
102 cl::desc("Restrict range of instructions to search for the "
103 "machine-combiner gather pattern optimization"));
104
// Constructor initializer list: forwards STI, RI and the ADJCALLSTACKDOWN,
// ADJCALLSTACKUP and CATCHRET opcodes to the generated base class, builds RI
// from the subtarget's target triple and HW mode, and stores the subtarget.
// NOTE(review): the constructor's declarator line is absent from this listing.
106 : AArch64GenInstrInfo(STI, RI, AArch64::ADJCALLSTACKDOWN,
107 AArch64::ADJCALLSTACKUP, AArch64::CATCHRET),
108 RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}
109
110/// Return the maximum number of bytes of code the specified instruction may be
111/// after LFI rewriting. If the instruction is not rewritten, std::nullopt is
112/// returned (use default sizing).
113///
114/// NOTE: the size estimates here must be kept in sync with the rewrites in
115/// AArch64MCLFIRewriter.cpp. Sizes may be overestimates of the rewritten
116/// instruction sequences.
117static std::optional<unsigned> getLFIInstSizeInBytes(const MachineInstr &MI) {
118 switch (MI.getOpcode()) {
119 case AArch64::SVC:
120 // SVC expands to 4 instructions.
121 return 16;
122 case AArch64::BR:
123 case AArch64::BLR:
124 // Indirect branches/calls expand to 2 instructions (guard + br/blr).
125 return 8;
126 case AArch64::RET:
127 // RET through LR is not rewritten, but RET through another register
128 // expands to 2 instructions (guard + ret).
129 if (MI.getOperand(0).getReg() != AArch64::LR)
130 return 8;
131 return 4;
132 default:
133 break;
134 }
135
136 // Instructions that explicitly modify LR expand to 2 instructions.
137 for (const MachineOperand &MO : MI.explicit_operands())
138 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::LR)
139 return 8;
140
141 // Default case: instructions that don't cause expansion.
142 // - TP accesses in LFI are a single load/store, so no expansion.
143 // - All remaining instructions are not rewritten.
144 return std::nullopt;
145}
146
147/// GetInstSize - Return the number of bytes of code the specified
148/// instruction may be. This returns the maximum number of bytes.
///
/// Order of the checks below: inline asm is sized with getInlineAsmLength,
/// meta-instructions are 0 bytes, LFI subtargets defer to
/// getLFIInstSizeInBytes when it returns a value, tail-call returns add the
/// authenticated-LR checker size when the return address is signed, and
/// everything else goes through the opcode switch.
// NOTE(review): the signature line and a few interior declarations are absent
// from this listing.
150 const MachineBasicBlock &MBB = *MI.getParent();
151 const MachineFunction *MF = MBB.getParent();
152 const Function &F = MF->getFunction();
153 const MCAsmInfo &MAI = MF->getTarget().getMCAsmInfo();
154
155 {
156 auto Op = MI.getOpcode();
157 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
158 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), MAI);
159 }
160
161 // Meta-instructions emit no code.
162 if (MI.isMetaInstruction())
163 return 0;
164
165 // FIXME: We currently only handle pseudoinstructions that don't get expanded
166 // before the assembly printer.
167 unsigned NumBytes = 0;
168 const MCInstrDesc &Desc = MI.getDesc();
169
170 // LFI rewriter expansions that supersede normal sizing.
171 const auto &STI = MF->getSubtarget<AArch64Subtarget>();
172 if (STI.isLFI())
173 if (auto Size = getLFIInstSizeInBytes(MI))
174 return *Size;
175
 // Non-bundled tail-call returns: start from the descriptor size (4 when the
 // descriptor has none) and, if the function signs its return address, add the
 // size of the authenticated-LR check sequence.
176 if (!MI.isBundle() && isTailCallReturnInst(MI)) {
177 NumBytes = Desc.getSize() ? Desc.getSize() : 4;
178
179 const auto *MFI = MF->getInfo<AArch64FunctionInfo>();
180 if (!MFI->shouldSignReturnAddress(*MF))
181 return NumBytes;
182
183 auto Method = STI.getAuthenticatedLRCheckMethod(*MF);
184 NumBytes += AArch64PAuth::getCheckerSizeInBytes(Method);
185 return NumBytes;
186 }
187
188 // Size should be preferably set in
189 // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
190 // Specific cases handle instructions of variable sizes
191 switch (Desc.getOpcode()) {
192 default:
193 if (Desc.getSize())
194 return Desc.getSize();
195
196 // Anything not explicitly designated otherwise (i.e. pseudo-instructions
197 // with fixed constant size but not specified in .td file) is a normal
198 // 4-byte insn.
199 NumBytes = 4;
200 break;
201 case TargetOpcode::STACKMAP:
202 // The upper bound for a stackmap intrinsic is the full length of its shadow
203 NumBytes = StackMapOpers(&MI).getNumPatchBytes();
204 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
205 break;
206 case TargetOpcode::PATCHPOINT:
207 // The size of the patchpoint intrinsic is the number of bytes requested
208 NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
209 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
210 break;
211 case TargetOpcode::STATEPOINT:
212 NumBytes = StatepointOpers(&MI).getNumPatchBytes();
213 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
214 // No patch bytes means a normal call inst is emitted
215 if (NumBytes == 0)
216 NumBytes = 4;
217 break;
218 case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
219 // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
220 // instructions are expanded to the specified number of NOPs. Otherwise,
221 // they are expanded to 36-byte XRay sleds.
 // The second argument (9) is presumably the value used when the attribute is
 // absent: 9 * 4 = 36 bytes, matching the sled size above -- TODO confirm.
222 NumBytes =
223 F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
224 break;
225 case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
226 case TargetOpcode::PATCHABLE_TAIL_CALL:
227 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
228 // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
229 NumBytes = 36;
230 break;
231 case TargetOpcode::PATCHABLE_EVENT_CALL:
232 // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
233 NumBytes = 24;
234 break;
235
236 case AArch64::SPACE:
 // The size is taken directly from the immediate in operand 1.
237 NumBytes = MI.getOperand(1).getImm();
238 break;
239 case AArch64::MOVaddr:
240 case AArch64::MOVaddrJT:
241 case AArch64::MOVaddrCP:
242 case AArch64::MOVaddrBA:
243 case AArch64::MOVaddrTLS:
244 case AArch64::MOVaddrEXT: {
245 // Use the same logic as the pseudo expansion to count instructions.
 // NOTE(review): the declaration of Insn and the start of the expansion call
 // are absent from this listing.
248 MI.getOperand(1).getTargetFlags(),
249 Subtarget.isTargetMachO(), Insn);
250 NumBytes = Insn.size() * 4;
251 break;
252 }
253
254 case AArch64::MOVi32imm:
255 case AArch64::MOVi64imm: {
256 // Use the same logic as the pseudo expansion to count instructions.
257 unsigned BitSize = Desc.getOpcode() == AArch64::MOVi32imm ? 32 : 64;
 // NOTE(review): the declaration of Insn is absent from this listing.
259 AArch64_IMM::expandMOVImm(MI.getOperand(1).getImm(), BitSize, Insn);
260 NumBytes = Insn.size() * 4;
261 break;
262 }
263
264 case TargetOpcode::BUNDLE:
265 NumBytes = getInstBundleSize(MI);
266 break;
267 }
268
269 return NumBytes;
270}
271
// Decode the conditional branch LastInst into its target block (Target) and a
// list of operands appended to Cond. Operand lists built below:
//   Bcc:               { Op0 }                         target = operand 1
//   CB(N)Z:            { -1, Opcode, Op0 }             target = operand 1
//   TB(N)Z:            { -1, Opcode, Op0, Op1 }        target = operand 2
//   CB{W,X}P{ri,rr}:   { -1, Opcode, Op0, Op1, Op2 }   target = operand 3
//   CB{B,H}AssertExt:  { -1, Opcode, Op0, Op1, Op2, Op4, Op5 }
//                                                      target = operand 3
// A leading -1 immediate marks the folded compare-and-branch forms. Any other
// opcode reaches llvm_unreachable.
// NOTE(review): the signature lines are absent from this listing.
274 // Block ends with fall-through condbranch.
275 switch (LastInst->getOpcode()) {
276 default:
277 llvm_unreachable("Unknown branch instruction?");
278 case AArch64::Bcc:
279 Target = LastInst->getOperand(1).getMBB();
280 Cond.push_back(LastInst->getOperand(0));
281 break;
282 case AArch64::CBZW:
283 case AArch64::CBZX:
284 case AArch64::CBNZW:
285 case AArch64::CBNZX:
286 Target = LastInst->getOperand(1).getMBB();
287 Cond.push_back(MachineOperand::CreateImm(-1));
288 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
289 Cond.push_back(LastInst->getOperand(0));
290 break;
291 case AArch64::TBZW:
292 case AArch64::TBZX:
293 case AArch64::TBNZW:
294 case AArch64::TBNZX:
295 Target = LastInst->getOperand(2).getMBB();
296 Cond.push_back(MachineOperand::CreateImm(-1));
297 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
298 Cond.push_back(LastInst->getOperand(0));
299 Cond.push_back(LastInst->getOperand(1));
300 break;
301 case AArch64::CBWPri:
302 case AArch64::CBXPri:
303 case AArch64::CBWPrr:
304 case AArch64::CBXPrr:
305 Target = LastInst->getOperand(3).getMBB();
306 Cond.push_back(MachineOperand::CreateImm(-1));
307 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
308 Cond.push_back(LastInst->getOperand(0));
309 Cond.push_back(LastInst->getOperand(1));
310 Cond.push_back(LastInst->getOperand(2));
311 break;
312 case AArch64::CBBAssertExt:
313 case AArch64::CBHAssertExt:
314 Target = LastInst->getOperand(3).getMBB();
315 Cond.push_back(MachineOperand::CreateImm(-1)); // -1
316 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); // Opc
317 Cond.push_back(LastInst->getOperand(0)); // Cond
318 Cond.push_back(LastInst->getOperand(1)); // Op0
319 Cond.push_back(LastInst->getOperand(2)); // Op1
320 Cond.push_back(LastInst->getOperand(4)); // Ext0
321 Cond.push_back(LastInst->getOperand(5)); // Ext1
322 break;
323 }
324}
325
326static unsigned getBranchDisplacementBits(unsigned Opc) {
327 switch (Opc) {
328 default:
329 llvm_unreachable("unexpected opcode!");
330 case AArch64::B:
331 return BDisplacementBits;
332 case AArch64::TBNZW:
333 case AArch64::TBZW:
334 case AArch64::TBNZX:
335 case AArch64::TBZX:
336 return TBZDisplacementBits;
337 case AArch64::CBNZW:
338 case AArch64::CBZW:
339 case AArch64::CBNZX:
340 case AArch64::CBZX:
341 return CBZDisplacementBits;
342 case AArch64::Bcc:
343 return BCCDisplacementBits;
344 case AArch64::CBWPri:
345 case AArch64::CBXPri:
346 case AArch64::CBBAssertExt:
347 case AArch64::CBHAssertExt:
348 case AArch64::CBWPrr:
349 case AArch64::CBXPrr:
350 return CBDisplacementBits;
351 }
352}
353
// Report whether the byte offset BrOffset fits the displacement range that
// getBranchDisplacementBits configures for the branch opcode BranchOp.
// NOTE(review): the first signature line is absent from this listing.
355 int64_t BrOffset) const {
356 unsigned Bits = getBranchDisplacementBits(BranchOp);
 // The two literals are concatenated; the trailing space keeps the message
 // from reading "jumpover".
357 assert(Bits >= 3 && "max branch displacement must be enough to jump "
358 "over conditional branch expansion");
 // The range check is applied to BrOffset / 4, i.e. in units of 4 bytes.
359 return isIntN(Bits, BrOffset / 4);
360}
361
// Return the destination basic block of the branch MI. The block operand is
// operand 0 for B, operand 2 for TB(N)Z, operand 1 for CB(N)Z and Bcc, and
// operand 3 for the CB pseudos; any other opcode reaches llvm_unreachable.
// NOTE(review): the signature lines are absent from this listing.
364 switch (MI.getOpcode()) {
365 default:
366 llvm_unreachable("unexpected opcode!");
367 case AArch64::B:
368 return MI.getOperand(0).getMBB();
369 case AArch64::TBZW:
370 case AArch64::TBNZW:
371 case AArch64::TBZX:
372 case AArch64::TBNZX:
373 return MI.getOperand(2).getMBB();
374 case AArch64::CBZW:
375 case AArch64::CBNZW:
376 case AArch64::CBZX:
377 case AArch64::CBNZX:
378 case AArch64::Bcc:
379 return MI.getOperand(1).getMBB();
380 case AArch64::CBWPri:
381 case AArch64::CBXPri:
382 case AArch64::CBBAssertExt:
383 case AArch64::CBHAssertExt:
384 case AArch64::CBWPrr:
385 case AArch64::CBXPrr:
386 return MI.getOperand(3).getMBB();
387 }
388}
389
// Fill the (empty) block MBB with a long branch to NewDestBB. Three strategies
// are tried in order:
//   1. If X16 is unused, emit a plain unconditional branch and mark X16 used
//      (the linker may insert a range-extension thunk).
//   2. In a cold-section block without BTI, scavenge a free GPR64 and emit
//      ADRP + ADD + BR through it.
//   3. Otherwise spill X16 to the stack, branch to RestoreBB, and reload it
//      there; this path is rejected for functions that have a red zone.
// NOTE(review): the start of the signature and a few interior lines are absent
// from this listing.
391 MachineBasicBlock &NewDestBB,
392 MachineBasicBlock &RestoreBB,
393 const DebugLoc &DL,
394 int64_t BrOffset,
395 RegScavenger *RS) const {
396 assert(RS && "RegScavenger required for long branching");
397 assert(MBB.empty() &&
398 "new block should be inserted for expanding unconditional branch");
399 assert(MBB.pred_size() == 1);
400 assert(RestoreBB.empty() &&
401 "restore block should be inserted for restoring clobbered registers");
402
 // Helper: materialize DestBB's address in Reg (ADRP + ADDXri) and branch
 // through it with BR.
403 auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
404 // Offsets outside of the signed 33-bit range are not supported for ADRP +
405 // ADD.
406 if (!isInt<33>(BrOffset))
408 "Branch offsets outside of the signed 33-bit range not supported");
409
410 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
411 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
412 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
413 .addReg(Reg)
414 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
415 .addImm(0);
416 BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
417 };
418
419 RS->enterBasicBlockEnd(MBB);
420 // If X16 is unused, we can rely on the linker to insert a range extension
421 // thunk if NewDestBB is out of range of a single B instruction.
422 constexpr Register Reg = AArch64::X16;
423 if (!RS->isRegUsed(Reg)) {
424 insertUnconditionalBranch(MBB, &NewDestBB, DL);
425 RS->setRegUsed(Reg);
426 return;
427 }
428
429 // In a cold block without BTI, insert the indirect branch if a register is
430 // free. Skip this if BTI is enabled to avoid inserting a BTI at the target,
431 // prioritizing a dynamic cost in cold code over a static cost in hot code.
432 AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
433 bool HasBTI = AFI && AFI->branchTargetEnforcement();
434 if (MBB.getSectionID() == MBBSectionID::ColdSectionID && !HasBTI) {
435 Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
436 if (Scavenged != AArch64::NoRegister) {
437 buildIndirectBranch(Scavenged, NewDestBB);
438 RS->setRegUsed(Scavenged);
439 return;
440 }
441 }
442
443 // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
444 // with red zones.
 // A missing AFI or an unknown red-zone state (value_or(true)) is treated the
 // same as having a red zone.
445 if (!AFI || AFI->hasRedZone().value_or(true))
447 "Unable to insert indirect branch inside function that has red zone");
448
449 // Otherwise, spill X16 and defer range extension to the linker.
450 BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
451 .addReg(AArch64::SP, RegState::Define)
452 .addReg(Reg)
453 .addReg(AArch64::SP)
454 .addImm(-16);
455
456 BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);
457
 // RestoreBB undoes the spill: post-indexed load with +16 mirrors the
 // pre-indexed store with -16 above.
458 BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
459 .addReg(AArch64::SP, RegState::Define)
461 .addReg(AArch64::SP)
462 .addImm(16);
463}
464
465// Branch analysis.
// Inspects the terminators at the end of MBB. Returns false when they were
// understood (TBB, FBB and Cond are filled in as applicable) and true when
// they could not be analyzed. With AllowModify set, unconditional branches
// that are redundant or unreachable may be erased along the way.
// NOTE(review): parts of the signature are absent from this listing.
468 MachineBasicBlock *&FBB,
470 bool AllowModify) const {
471 // If the block has no terminators, it just falls into the block after it.
472 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
473 if (I == MBB.end())
474 return false;
475
476 // Skip over SpeculationBarrierEndBB terminators
477 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
478 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
 // NOTE(review): no MBB.begin() check guards this decrement -- presumably a
 // barrier is never the first instruction of a block; confirm.
479 --I;
480 }
481
482 if (!isUnpredicatedTerminator(*I))
483 return false;
484
485 // Get the last instruction in the block.
486 MachineInstr *LastInst = &*I;
487
488 // If there is only one terminator instruction, process it.
489 unsigned LastOpc = LastInst->getOpcode();
490 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
491 if (isUncondBranchOpcode(LastOpc)) {
492 TBB = LastInst->getOperand(0).getMBB();
493 return false;
494 }
495 if (isCondBranchOpcode(LastOpc)) {
496 // Block ends with fall-through condbranch.
497 parseCondBranch(LastInst, TBB, Cond);
498 return false;
499 }
500 return true; // Can't handle indirect branch.
501 }
502
503 // Get the instruction before it if it is a terminator.
504 MachineInstr *SecondLastInst = &*I;
505 unsigned SecondLastOpc = SecondLastInst->getOpcode();
506
507 // If AllowModify is true and the block ends with two or more unconditional
508 // branches, delete all but the first unconditional branch.
509 if (AllowModify && isUncondBranchOpcode(LastOpc)) {
510 while (isUncondBranchOpcode(SecondLastOpc)) {
511 LastInst->eraseFromParent();
512 LastInst = SecondLastInst;
513 LastOpc = LastInst->getOpcode();
514 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
515 // Return now the only terminator is an unconditional branch.
516 TBB = LastInst->getOperand(0).getMBB();
517 return false;
518 }
519 SecondLastInst = &*I;
520 SecondLastOpc = SecondLastInst->getOpcode();
521 }
522 }
523
524 // If we're allowed to modify and the block ends in an unconditional branch
525 // which could simply fallthrough, remove the branch. (Note: This case only
526 // matters when we can't understand the whole sequence, otherwise it's also
527 // handled by BranchFolding.cpp.)
528 if (AllowModify && isUncondBranchOpcode(LastOpc) &&
529 MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
530 LastInst->eraseFromParent();
531 LastInst = SecondLastInst;
532 LastOpc = LastInst->getOpcode();
533 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
534 assert(!isUncondBranchOpcode(LastOpc) &&
535 "unreachable unconditional branches removed above");
536
537 if (isCondBranchOpcode(LastOpc)) {
538 // Block ends with fall-through condbranch.
539 parseCondBranch(LastInst, TBB, Cond);
540 return false;
541 }
542 return true; // Can't handle indirect branch.
543 }
544 SecondLastInst = &*I;
545 SecondLastOpc = SecondLastInst->getOpcode();
546 }
547
548 // If there are three terminators, we don't know what sort of block this is.
549 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
550 return true;
551
552 // If the block ends with a B and a Bcc, handle it.
553 if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
554 parseCondBranch(SecondLastInst, TBB, Cond);
555 FBB = LastInst->getOperand(0).getMBB();
556 return false;
557 }
558
559 // If the block ends with two unconditional branches, handle it. The second
560 // one is not executed, so remove it.
561 if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
562 TBB = SecondLastInst->getOperand(0).getMBB();
563 I = LastInst;
564 if (AllowModify)
565 I->eraseFromParent();
566 return false;
567 }
568
569 // ...likewise if it ends with an indirect branch followed by an unconditional
570 // branch.
571 if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
572 I = LastInst;
573 if (AllowModify)
574 I->eraseFromParent();
575 return true;
576 }
577
578 // Otherwise, can't handle this.
579 return true;
580}
581
// Describe the conditional branch that ends MBB in MBP. Returns true when the
// block cannot be described this way (analyzeBranch fails, the branch is
// unconditional, or the conditional branch opcode is not handled below) and
// false once MBP has been filled in.
// NOTE(review): the first signature line, the declaration of Cond and the loop
// header of the Bcc case are absent from this listing.
583 MachineBranchPredicate &MBP,
584 bool AllowModify) const {
585 // Use analyzeBranch to validate the branch pattern.
586 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
588 if (analyzeBranch(MBB, TBB, FBB, Cond, AllowModify))
589 return true;
590
591 // analyzeBranch returns success with empty Cond for unconditional branches.
592 if (Cond.empty())
593 return true;
594
595 MBP.TrueDest = TBB;
596 assert(MBP.TrueDest && "expected!");
 // Without an explicit false block the branch falls through to the next block
 // in layout.
597 MBP.FalseDest = FBB ? FBB : MBB.getNextNode();
598
599 MBP.ConditionDef = nullptr;
600 MBP.SingleUseCondition = false;
601
602 // Find the conditional branch. After analyzeBranch succeeds with non-empty
603 // Cond, there's exactly one conditional branch - either last (fallthrough)
604 // or second-to-last (followed by unconditional B).
605 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
606 if (I == MBB.end())
607 return true;
608
609 if (isUncondBranchOpcode(I->getOpcode())) {
610 if (I == MBB.begin())
611 return true;
612 --I;
613 }
614
615 MachineInstr *CondBranch = &*I;
616 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
617
618 switch (CondBranch->getOpcode()) {
619 default:
620 return true;
621
622 case AArch64::Bcc:
623 // Bcc takes the NZCV flag as the operand to branch on, walk up the
624 // instruction stream to find the last instruction to define NZCV.
626 if (MI.modifiesRegister(AArch64::NZCV, /*TRI=*/nullptr)) {
627 MBP.ConditionDef = &MI;
628 break;
629 }
630 }
631 return false;
632
633 case AArch64::CBZW:
634 case AArch64::CBZX:
635 case AArch64::CBNZW:
636 case AArch64::CBNZX: {
 // Compare-with-zero: LHS is the tested register, RHS is the immediate 0, and
 // the predicate is NE for CBNZ and EQ for CBZ.
637 MBP.LHS = CondBranch->getOperand(0);
638 MBP.RHS = MachineOperand::CreateImm(0);
639 unsigned Opc = CondBranch->getOpcode();
640 MBP.Predicate = (Opc == AArch64::CBNZX || Opc == AArch64::CBNZW)
641 ? MachineBranchPredicate::PRED_NE
642 : MachineBranchPredicate::PRED_EQ;
643 Register CondReg = MBP.LHS.getReg();
644 if (CondReg.isVirtual())
645 MBP.ConditionDef = MRI.getVRegDef(CondReg);
646 return false;
647 }
648
649 case AArch64::TBZW:
650 case AArch64::TBZX:
651 case AArch64::TBNZW:
652 case AArch64::TBNZX: {
 // Only ConditionDef is filled in for TB(N)Z; LHS, RHS and Predicate are not
 // set in this case.
653 Register CondReg = CondBranch->getOperand(0).getReg();
654 if (CondReg.isVirtual())
655 MBP.ConditionDef = MRI.getVRegDef(CondReg);
656 return false;
657 }
658 }
659}
660
// Invert, in place, a condition held in Cond (the operand list whose layout is
// described at parseCondBranch). Folded compare-and-branch forms (Cond[0] is
// -1) are inverted by swapping the opcode in Cond[1] with its opposite
// (CBZ <-> CBNZ, TBZ <-> TBNZ); the CB pseudos carry a condition code in
// Cond[2] instead. Always returns false.
// NOTE(review): the signature lines and the statements that store the inverted
// condition code (Bcc and CB pseudo cases) are absent from this listing.
663 if (Cond[0].getImm() != -1) {
664 // Regular Bcc
665 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
667 } else {
668 // Folded compare-and-branch
669 switch (Cond[1].getImm()) {
670 default:
671 llvm_unreachable("Unknown conditional branch!");
672 case AArch64::CBZW:
673 Cond[1].setImm(AArch64::CBNZW);
674 break;
675 case AArch64::CBNZW:
676 Cond[1].setImm(AArch64::CBZW);
677 break;
678 case AArch64::CBZX:
679 Cond[1].setImm(AArch64::CBNZX);
680 break;
681 case AArch64::CBNZX:
682 Cond[1].setImm(AArch64::CBZX);
683 break;
684 case AArch64::TBZW:
685 Cond[1].setImm(AArch64::TBNZW);
686 break;
687 case AArch64::TBNZW:
688 Cond[1].setImm(AArch64::TBZW);
689 break;
690 case AArch64::TBZX:
691 Cond[1].setImm(AArch64::TBNZX);
692 break;
693 case AArch64::TBNZX:
694 Cond[1].setImm(AArch64::TBZX);
695 break;
696
697 // Cond is { -1, Opcode, CC, Op0, Op1, ... }
698 case AArch64::CBWPri:
699 case AArch64::CBXPri:
700 case AArch64::CBBAssertExt:
701 case AArch64::CBHAssertExt:
702 case AArch64::CBWPrr:
703 case AArch64::CBXPrr: {
704 // Pseudos using standard 4bit Arm condition codes
706 static_cast<AArch64CC::CondCode>(Cond[2].getImm());
708 }
709 }
710 }
711
712 return false;
713}
714
// Erase the branch instructions at the end of MBB and return how many were
// removed (0, 1 or 2). The last instruction is removed if it is a conditional
// or unconditional branch; the instruction before it is removed too if it is
// a conditional branch. When BytesRemoved is non-null it receives 4 or 8; it
// is left unwritten when nothing is removed.
// NOTE(review): the first signature line is absent from this listing.
716 int *BytesRemoved) const {
717 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
718 if (I == MBB.end())
719 return 0;
720
721 if (!isUncondBranchOpcode(I->getOpcode()) &&
722 !isCondBranchOpcode(I->getOpcode()))
723 return 0;
724
725 // Remove the branch.
726 I->eraseFromParent();
727
 // Re-derive the iterator: the erased instruction invalidated the old one.
728 I = MBB.end();
729
730 if (I == MBB.begin()) {
731 if (BytesRemoved)
732 *BytesRemoved = 4;
733 return 1;
734 }
735 --I;
736 if (!isCondBranchOpcode(I->getOpcode())) {
737 if (BytesRemoved)
738 *BytesRemoved = 4;
739 return 1;
740 }
741
742 // Remove the branch.
743 I->eraseFromParent();
744 if (BytesRemoved)
745 *BytesRemoved = 8;
746
747 return 2;
748}
749
// Append to MBB the conditional branch to TBB described by Cond. A Cond whose
// first immediate is not -1 becomes a Bcc with that immediate; otherwise
// Cond[1] is the opcode to build and the remaining entries are re-added as
// operands in their original positions.
// NOTE(review): the parameter lines are absent from this listing.
750void AArch64InstrInfo::instantiateCondBranch(
753 if (Cond[0].getImm() != -1) {
754 // Regular Bcc
755 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
756 } else {
757 // Folded compare-and-branch
758 // Note that we copy whole operands with add() instead of addReg to keep the
 // flags.
759
760 // cbz, cbnz
761 const MachineInstrBuilder MIB =
762 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
763
764 // tbz/tbnz
765 if (Cond.size() > 3)
766 MIB.add(Cond[3]);
767
768 // cb
769 if (Cond.size() > 4)
770 MIB.add(Cond[4]);
771
772 MIB.addMBB(TBB);
773
774 // cb[b,h]
 // These two immediates follow the block operand. A Cond longer than 5
 // entries is indexed up to Cond[6] here.
775 if (Cond.size() > 5) {
776 MIB.addImm(Cond[5].getImm());
777 MIB.addImm(Cond[6].getImm());
778 }
779 }
780}
781
// Append branch instructions to the end of MBB and return how many were added.
// With no FBB a single branch to TBB is emitted (unconditional when Cond is
// empty, conditional otherwise) and 1 is returned; with an FBB a conditional
// branch to TBB is followed by an unconditional branch to FBB and 2 is
// returned. When BytesAdded is non-null it receives 4 or 8 respectively.
// NOTE(review): the start of the signature is absent from this listing.
784 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
785 // Shouldn't be a fall through.
786 assert(TBB && "insertBranch must not be told to insert a fallthrough");
787
788 if (!FBB) {
789 if (Cond.empty()) // Unconditional branch?
790 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
791 else
792 instantiateCondBranch(MBB, DL, TBB, Cond);
793
794 if (BytesAdded)
795 *BytesAdded = 4;
796
797 return 1;
798 }
799
800 // Two-way conditional branch.
801 instantiateCondBranch(MBB, DL, TBB, Cond);
802 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
803
804 if (BytesAdded)
805 *BytesAdded = 8;
806
807 return 2;
808}
809
// Simplify terminators of MBB whose tested register is WZR/XZR. CBZ/TBZ on a
// zero register is always taken and is replaced by an unconditional B (other
// successors and any later instructions in the block are removed); CBNZ/TBNZ
// on a zero register is never taken and is deleted together with its
// successor edge. Returns true after the first such change, false if none of
// the terminators matched.
// NOTE(review): the start of the signature is absent from this listing.
811 const TargetInstrInfo &TII) {
812 for (MachineInstr &MI : MBB->terminators()) {
813 unsigned Opc = MI.getOpcode();
814 switch (Opc) {
815 case AArch64::CBZW:
816 case AArch64::CBZX:
817 case AArch64::TBZW:
818 case AArch64::TBZX:
819 // CBZ/TBZ with WZR/XZR -> unconditional B
820 if (MI.getOperand(0).getReg() == AArch64::WZR ||
821 MI.getOperand(0).getReg() == AArch64::XZR) {
822 DEBUG_WITH_TYPE("optimizeTerminators",
823 dbgs() << "Removing always taken branch: " << MI);
824 MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
 // Iterate over a copy: removeSuccessor mutates the successor list.
825 SmallVector<MachineBasicBlock *> Succs(MBB->successors());
826 for (auto *S : Succs)
827 if (S != Target)
828 MBB->removeSuccessor(S);
829 DebugLoc DL = MI.getDebugLoc();
 // Drop everything after MI, then MI itself, before appending the new B.
830 while (MBB->rbegin() != &MI)
831 MBB->rbegin()->eraseFromParent();
832 MI.eraseFromParent();
833 BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
834 return true;
835 }
836 break;
837 case AArch64::CBNZW:
838 case AArch64::CBNZX:
839 case AArch64::TBNZW:
840 case AArch64::TBNZX:
841 // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
842 if (MI.getOperand(0).getReg() == AArch64::WZR ||
843 MI.getOperand(0).getReg() == AArch64::XZR) {
844 DEBUG_WITH_TYPE("optimizeTerminators",
845 dbgs() << "Removing never taken branch: " << MI);
846 MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
847 MI.getParent()->removeSuccessor(Target);
848 MI.eraseFromParent();
849 return true;
850 }
851 break;
852 }
853 }
854 return false;
855}
856
857// Find the original register that VReg is copied from.
858static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
859 while (Register::isVirtualRegister(VReg)) {
860 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
861 if (!DefMI->isFullCopy())
862 return VReg;
863 VReg = DefMI->getOperand(1).getReg();
864 }
865 return VReg;
866}
867
868// Determine if VReg is defined by an instruction that can be folded into a
869// csel instruction. If so, return the folded opcode, and the replacement
870// register.
// Returns 0 when no fold applies. On success the result is one of
// CSINC/CSINV/CSNEG (W or X form) and, if NewReg is non-null, *NewReg receives
// the register the folded instruction should read.
871static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
872 unsigned *NewReg = nullptr) {
873 VReg = removeCopies(MRI, VReg);
 // NOTE(review): the guard preceding this return (listing line 874) is absent
 // from this listing.
875 return 0;
876
877 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
878 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
879 unsigned Opc = 0;
880 unsigned SrcReg = 0;
881 switch (DefMI->getOpcode()) {
882 case AArch64::SUBREG_TO_REG:
883 // Check for the following way to define a 64-bit immediate:
884 // %0:gpr32 = MOVi32imm 1
885 // %1:gpr64 = SUBREG_TO_REG %0:gpr32, %subreg.sub_32
886 if (!DefMI->getOperand(1).isReg())
887 return 0;
888 if (!DefMI->getOperand(2).isImm() ||
889 DefMI->getOperand(2).getImm() != AArch64::sub_32)
890 return 0;
891 DefMI = MRI.getVRegDef(DefMI->getOperand(1).getReg());
892 if (DefMI->getOpcode() != AArch64::MOVi32imm)
893 return 0;
894 if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
895 return 0;
896 assert(Is64Bit);
897 SrcReg = AArch64::XZR;
898 Opc = AArch64::CSINCXr;
899 break;
900
901 case AArch64::MOVi32imm:
902 case AArch64::MOVi64imm:
 // The constant 1 is expressed as csinc of the zero register.
903 if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
904 return 0;
905 SrcReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
906 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
907 break;
908
909 case AArch64::ADDSXri:
910 case AArch64::ADDSWri:
911 // if NZCV is used, do not fold.
 // I.e. bail out unless the NZCV definition is marked dead.
912 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
913 true) == -1)
914 return 0;
915 // fall-through to ADDXri and ADDWri.
916 [[fallthrough]];
917 case AArch64::ADDXri:
918 case AArch64::ADDWri:
919 // add x, 1 -> csinc.
 // Operand 2 must be the immediate 1 and operand 3 must be 0.
920 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
921 DefMI->getOperand(3).getImm() != 0)
922 return 0;
923 SrcReg = DefMI->getOperand(1).getReg();
924 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
925 break;
926
927 case AArch64::ORNXrr:
928 case AArch64::ORNWrr: {
929 // not x -> csinv, represented as orn dst, xzr, src.
930 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
931 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
932 return 0;
933 SrcReg = DefMI->getOperand(2).getReg();
934 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
935 break;
936 }
937
938 case AArch64::SUBSXrr:
939 case AArch64::SUBSWrr:
940 // if NZCV is used, do not fold.
 // I.e. bail out unless the NZCV definition is marked dead.
941 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
942 true) == -1)
943 return 0;
944 // fall-through to SUBXrr and SUBWrr.
945 [[fallthrough]];
946 case AArch64::SUBXrr:
947 case AArch64::SUBWrr: {
948 // neg x -> csneg, represented as sub dst, xzr, src.
949 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
950 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
951 return 0;
952 SrcReg = DefMI->getOperand(2).getReg();
953 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
954 break;
955 }
956 default:
957 return 0;
958 }
959 assert(Opc && SrcReg && "Missing parameters");
960
961 if (NewReg)
962 *NewReg = SrcReg;
963 return Opc;
964}
965
// Decide whether a select between TrueReg and FalseReg into DstReg can be
// emitted, and report cycle estimates through CondCycles, TrueCycles and
// FalseCycles. Returns true for GPR32/GPR64 classes (csel family) and for
// FPR32/FPR64 classes (fcsel); returns false when the registers share no
// common subclass or for any other class. The out-parameters are only written
// when true is returned.
// NOTE(review): the start of the signature is absent from this listing.
968 Register DstReg, Register TrueReg,
969 Register FalseReg, int &CondCycles,
970 int &TrueCycles,
971 int &FalseCycles) const {
972 // Check register classes.
973 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
974 const TargetRegisterClass *RC =
975 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
976 if (!RC)
977 return false;
978
979 // Also need to check the dest regclass, in case we're trying to optimize
980 // something like:
981 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
982 if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
983 return false;
984
985 // Expanding cbz/tbz requires an extra cycle of latency on the condition.
 // A Cond of size 1 is the plain Bcc form; every other size counts as 1 here.
986 unsigned ExtraCondLat = Cond.size() != 1;
987
988 // GPRs are handled by csel.
989 // FIXME: Fold in x+1, -x, and ~x when applicable.
990 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
991 AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
992 // Single-cycle csel, csinc, csinv, and csneg.
993 CondCycles = 1 + ExtraCondLat;
994 TrueCycles = FalseCycles = 1;
 // At most one side is credited as foldable: the false side is only checked
 // when the true side cannot be folded.
995 if (canFoldIntoCSel(MRI, TrueReg))
996 TrueCycles = 0;
997 else if (canFoldIntoCSel(MRI, FalseReg))
998 FalseCycles = 0;
999 return true;
1000 }
1001
1002 // Scalar floating point is handled by fcsel.
1003 // FIXME: Form fabs, fmin, and fmax when applicable.
1004 if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
1005 AArch64::FPR32RegClass.hasSubClassEq(RC)) {
1006 CondCycles = 5 + ExtraCondLat;
1007 TrueCycles = FalseCycles = 2;
1008 return true;
1009 }
1010
1011 // Can't do vectors.
1012 return false;
1013}
1014
// Emits, at position I of MBB, the instructions that select between TrueReg
// and FalseReg into DstReg according to the branch condition encoded in Cond.
//
// The visible code proceeds in three steps:
//  1. Decode Cond by its size. A single operand is a plain condition code;
//     the larger encodings describe compare-and-branch forms, for which an
//     explicit flag-setting instruction (SUBS/ANDS, writing the zero
//     register) is inserted before I.
//  2. Choose a CSEL/FCSEL opcode from the first register class DstReg can be
//     constrained to and, for the GPR forms only, try to fold via
//     canFoldIntoCSel.
//  3. Constrain the operand register classes and insert the select.
//
// NOTE(review): this listing omits several lines of the definition (the start
// of the signature, the declaration of CC and a few statements); presumably
// this is AArch64InstrInfo::insertSelect -- confirm against the full file.
1017 const DebugLoc &DL, Register DstReg,
1019 Register TrueReg, Register FalseReg) const {
1020 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1021
1022 // Parse the condition code, see parseCondBranch() above.
1024 switch (Cond.size()) {
1025 default:
1026 llvm_unreachable("Unknown condition opcode in Cond");
1027 case 1: // b.cc
1028 CC = AArch64CC::CondCode(Cond[0].getImm());
1029 break;
1030 case 3: { // cbz/cbnz
1031 // We must insert a compare against 0.
// Cond[1] holds the branch opcode and Cond[2] the register being tested.
1032 bool Is64Bit;
1033 switch (Cond[1].getImm()) {
1034 default:
1035 llvm_unreachable("Unknown branch opcode in Cond");
1036 case AArch64::CBZW:
1037 Is64Bit = false;
1038 CC = AArch64CC::EQ;
1039 break;
1040 case AArch64::CBZX:
1041 Is64Bit = true;
1042 CC = AArch64CC::EQ;
1043 break;
1044 case AArch64::CBNZW:
1045 Is64Bit = false;
1046 CC = AArch64CC::NE;
1047 break;
1048 case AArch64::CBNZX:
1049 Is64Bit = true;
1050 CC = AArch64CC::NE;
1051 break;
1052 }
1053 Register SrcReg = Cond[2].getReg();
1054 if (Is64Bit) {
1055 // cmp reg, #0 is actually subs xzr, reg, #0.
1056 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
1057 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
1058 .addReg(SrcReg)
1059 .addImm(0)
1060 .addImm(0);
1061 } else {
// Same as above for the 32-bit form: subs wzr, reg, #0.
1062 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
1063 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
1064 .addReg(SrcReg)
1065 .addImm(0)
1066 .addImm(0);
1067 }
1068 break;
1069 }
1070 case 4: { // tbz/tbnz
1071 // We must insert a tst instruction.
1072 switch (Cond[1].getImm()) {
1073 default:
1074 llvm_unreachable("Unknown branch opcode in Cond");
1075 case AArch64::TBZW:
1076 case AArch64::TBZX:
1077 CC = AArch64CC::EQ;
1078 break;
1079 case AArch64::TBNZW:
1080 case AArch64::TBNZX:
1081 CC = AArch64CC::NE;
1082 break;
1083 }
1084 // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
// NOTE(review): the immediate arguments of both addImm() calls below are not
// visible in this listing.
1085 if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
1086 BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
1087 .addReg(Cond[2].getReg())
1088 .addImm(
1090 else
1091 BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
1092 .addReg(Cond[2].getReg())
1093 .addImm(
1095 break;
1096 }
1097 case 5: { // cb
1098 // We must insert a cmp, that is a subs
1099 // 0 1 2 3 4
1100 // Cond is { -1, Opcode, CC, Op0, Op1 }
1101
1102 unsigned SubsOpc, SubsDestReg;
1103 bool IsImm = false;
1104 CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
1105 switch (Cond[1].getImm()) {
1106 default:
1107 llvm_unreachable("Unknown branch opcode in Cond");
1108 case AArch64::CBWPri:
1109 SubsOpc = AArch64::SUBSWri;
1110 SubsDestReg = AArch64::WZR;
1111 IsImm = true;
1112 break;
1113 case AArch64::CBXPri:
1114 SubsOpc = AArch64::SUBSXri;
1115 SubsDestReg = AArch64::XZR;
1116 IsImm = true;
1117 break;
1118 case AArch64::CBWPrr:
1119 SubsOpc = AArch64::SUBSWrr;
1120 SubsDestReg = AArch64::WZR;
1121 IsImm = false;
1122 break;
1123 case AArch64::CBXPrr:
1124 SubsOpc = AArch64::SUBSXrr;
1125 SubsDestReg = AArch64::XZR;
1126 IsImm = false;
1127 break;
1128 }
1129
// The immediate forms take Op1 as an immediate followed by a literal 0; the
// register forms take Op1 as a second register operand.
1130 if (IsImm)
1131 BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
1132 .addReg(Cond[3].getReg())
1133 .addImm(Cond[4].getImm())
1134 .addImm(0);
1135 else
1136 BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
1137 .addReg(Cond[3].getReg())
1138 .addReg(Cond[4].getReg());
1139 } break;
1140 case 7: { // cb[b,h]
1141 // We must insert a cmp, that is a subs, but also zero- or sign-extensions
1142 // that have been folded. For the first operand we codegen an explicit
1143 // extension, for the second operand we fold the extension into cmp.
1144 // 0 1 2 3 4 5 6
1145 // Cond is { -1, Opcode, CC, Op0, Op1, Ext0, Ext1 }
1146
// NOTE(review): some lines of this case (the conditions guarding the two
// extension paths and the initializers of ExtendType) are not visible in this
// listing.
1147 // We need a new register for the now explicitly extended register
1148 Register Reg = Cond[4].getReg();
1150 unsigned ExtOpc;
1151 unsigned ExtBits;
1152 AArch64_AM::ShiftExtendType ExtendType =
1154 switch (ExtendType) {
1155 default:
1156 llvm_unreachable("Unknown shift-extend for CB instruction");
1157 case AArch64_AM::SXTB:
1158 assert(
1159 Cond[1].getImm() == AArch64::CBBAssertExt &&
1160 "Unexpected compare-and-branch instruction for SXTB shift-extend");
1161 ExtOpc = AArch64::SBFMWri;
1162 ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
1163 break;
1164 case AArch64_AM::SXTH:
1165 assert(
1166 Cond[1].getImm() == AArch64::CBHAssertExt &&
1167 "Unexpected compare-and-branch instruction for SXTH shift-extend");
1168 ExtOpc = AArch64::SBFMWri;
1169 ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
1170 break;
1171 case AArch64_AM::UXTB:
1172 assert(
1173 Cond[1].getImm() == AArch64::CBBAssertExt &&
1174 "Unexpected compare-and-branch instruction for UXTB shift-extend");
1175 ExtOpc = AArch64::ANDWri;
1176 ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
1177 break;
1178 case AArch64_AM::UXTH:
1179 assert(
1180 Cond[1].getImm() == AArch64::CBHAssertExt &&
1181 "Unexpected compare-and-branch instruction for UXTH shift-extend");
1182 ExtOpc = AArch64::ANDWri;
1183 ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
1184 break;
1185 }
1186
1187 // Build the explicit extension of the first operand
1188 Reg = MRI.createVirtualRegister(&AArch64::GPR32spRegClass);
1190 BuildMI(MBB, I, DL, get(ExtOpc), Reg).addReg(Cond[4].getReg());
// Every opcode other than ANDWri gets an extra leading immediate of 0 before
// ExtBits.
1191 if (ExtOpc != AArch64::ANDWri)
1192 MBBI.addImm(0);
1193 MBBI.addImm(ExtBits);
1194 }
1195
1196 // Now, subs with an extended second operand
1198 AArch64_AM::ShiftExtendType ExtendType =
1200 MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
1201 MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
1202 BuildMI(MBB, I, DL, get(AArch64::SUBSWrx), AArch64::WZR)
1203 .addReg(Cond[3].getReg())
1204 .addReg(Reg)
1205 .addImm(AArch64_AM::getArithExtendImm(ExtendType, 0));
1206 } // If no extension is needed, just a regular subs
1207 else {
1208 MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
1209 MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
1210 BuildMI(MBB, I, DL, get(AArch64::SUBSWrr), AArch64::WZR)
1211 .addReg(Cond[3].getReg())
1212 .addReg(Reg);
1213 }
1214
1215 CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
1216 } break;
1217 }
1218
// Select the opcode from the first register class DstReg can be constrained
// to. Only the two GPR forms set TryFold; the FCSEL forms are never folded.
1219 unsigned Opc = 0;
1220 const TargetRegisterClass *RC = nullptr;
1221 bool TryFold = false;
1222 if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
1223 RC = &AArch64::GPR64RegClass;
1224 Opc = AArch64::CSELXr;
1225 TryFold = true;
1226 } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
1227 RC = &AArch64::GPR32RegClass;
1228 Opc = AArch64::CSELWr;
1229 TryFold = true;
1230 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
1231 RC = &AArch64::FPR64RegClass;
1232 Opc = AArch64::FCSELDrrr;
1233 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
1234 RC = &AArch64::FPR32RegClass;
1235 Opc = AArch64::FCSELSrrr;
1236 }
1237 assert(RC && "Unsupported regclass");
1238
1239 // Try folding simple instructions into the csel.
1240 if (TryFold) {
// TrueReg is tried first; FalseReg is only tried when TrueReg cannot be
// folded. On success NewReg becomes the new FalseReg operand below.
1241 unsigned NewReg = 0;
1242 unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewReg);
1243 if (FoldedOpc) {
1244 // The folded opcodes csinc, csinv and csneg apply the operation to
1245 // FalseReg, so we need to invert the condition.
// NOTE(review): the statement that inverts CC is not visible in this listing.
1247 TrueReg = FalseReg;
1248 } else
1249 FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewReg);
1250
1251 // Fold the operation. Leave any dead instructions for DCE to clean up.
1252 if (FoldedOpc) {
1253 FalseReg = NewReg;
1254 Opc = FoldedOpc;
1255 // Extend the live range of NewReg.
1256 MRI.clearKillFlags(NewReg);
1257 }
1258 }
1259
1260 // Pull all virtual registers into the appropriate class.
1261 MRI.constrainRegClass(TrueReg, RC);
1262 // FalseReg might be WZR or XZR if the folded operand is a literal 1.
1263 assert(
1264 (FalseReg.isVirtual() || FalseReg == AArch64::WZR ||
1265 FalseReg == AArch64::XZR) &&
1266 "FalseReg was folded into a non-virtual register other than WZR or XZR");
1267 if (FalseReg.isVirtual())
1268 MRI.constrainRegClass(FalseReg, RC);
1269
1270 // Insert the csel.
1271 BuildMI(MBB, I, DL, get(Opc), DstReg)
1272 .addReg(TrueReg)
1273 .addReg(FalseReg)
1274 .addImm(CC);
1275}
1276
1277// Return true if Imm can be loaded into a register by a "cheap" sequence of
1278// instructions. For now, "cheap" means at most two instructions.
// The immediate is read from operand 1 of MI; BitSize must be 32 or 64.
1279static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
// Every 32-bit immediate is treated as cheap without expanding it.
1280 if (BitSize == 32)
1281 return true;
1282
1283 assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
1284 uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
// NOTE(review): the declaration of Is, the container filled by expandMOVImm,
// is not visible in this listing.
1286 AArch64_IMM::expandMOVImm(Imm, BitSize, Is);
1287
// Cheap when the expansion produced at most two entries.
1288 return Is.size() <= 2;
1289}
1290
1291// Check if a COPY instruction is cheap.
1292static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) {
1293 assert(MI.isCopy() && "Expected COPY instruction");
1294 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
1295
1296 // Cross-bank copies (e.g., between GPR and FPR) are expensive on AArch64,
1297 // typically requiring an FMOV instruction with a 2-6 cycle latency.
1298 auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * {
1299 if (Reg.isVirtual())
1300 return MRI.getRegClass(Reg);
1301 if (Reg.isPhysical())
1302 return RI.getMinimalPhysRegClass(Reg);
1303 return nullptr;
1304 };
1305 const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(0).getReg());
1306 const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(1).getReg());
1307 if (DstRC && SrcRC && !RI.getCommonSubClass(DstRC, SrcRC))
1308 return false;
1309
1310 return MI.isAsCheapAsAMove();
1311}
1312
1313// FIXME: this implementation should be micro-architecture dependent, so a
1314// micro-architecture target hook should be introduced here in future.
// Reports whether MI should be considered as cheap as a move.
// NOTE(review): the signature line of this definition is not visible in this
// listing.
// Subtargets with Exynos handling consult isExynosCheapAsMove() first and
// otherwise fall back to the generic MachineInstr query.
1316 if (Subtarget.hasExynosCheapAsMoveHandling()) {
1317 if (isExynosCheapAsMove(MI))
1318 return true;
1319 return MI.isAsCheapAsAMove();
1320 }
1321
1322 switch (MI.getOpcode()) {
1323 default:
1324 return MI.isAsCheapAsAMove();
1325
1326 case TargetOpcode::COPY:
1327 return isCheapCopy(MI, RI);
1328
// Shifted-register ADD/SUB: cheap only when the subtarget reports a fast ALU
// LSL and the immediate in operand 3 (presumably the encoded shift -- confirm)
// is at most 4.
1329 case AArch64::ADDWrs:
1330 case AArch64::ADDXrs:
1331 case AArch64::SUBWrs:
1332 case AArch64::SUBXrs:
1333 return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;
1334
1335 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
1336 // ORRXri, it is as cheap as MOV.
1337 // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
1338 case AArch64::MOVi32imm:
1339 return isCheapImmediate(MI, 32);
1340 case AArch64::MOVi64imm:
1341 return isCheapImmediate(MI, 64);
1342 }
1343}
1344
1345bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
1346 switch (MI.getOpcode()) {
1347 default:
1348 return false;
1349
1350 case AArch64::ADDWrs:
1351 case AArch64::ADDXrs:
1352 case AArch64::ADDSWrs:
1353 case AArch64::ADDSXrs: {
1354 unsigned Imm = MI.getOperand(3).getImm();
1355 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
1356 if (ShiftVal == 0)
1357 return true;
1358 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
1359 }
1360
1361 case AArch64::ADDWrx:
1362 case AArch64::ADDXrx:
1363 case AArch64::ADDXrx64:
1364 case AArch64::ADDSWrx:
1365 case AArch64::ADDSXrx:
1366 case AArch64::ADDSXrx64: {
1367 unsigned Imm = MI.getOperand(3).getImm();
1368 switch (AArch64_AM::getArithExtendType(Imm)) {
1369 default:
1370 return false;
1371 case AArch64_AM::UXTB:
1372 case AArch64_AM::UXTH:
1373 case AArch64_AM::UXTW:
1374 case AArch64_AM::UXTX:
1375 return AArch64_AM::getArithShiftValue(Imm) <= 4;
1376 }
1377 }
1378
1379 case AArch64::SUBWrs:
1380 case AArch64::SUBSWrs: {
1381 unsigned Imm = MI.getOperand(3).getImm();
1382 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
1383 return ShiftVal == 0 ||
1384 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
1385 }
1386
1387 case AArch64::SUBXrs:
1388 case AArch64::SUBSXrs: {
1389 unsigned Imm = MI.getOperand(3).getImm();
1390 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
1391 return ShiftVal == 0 ||
1392 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
1393 }
1394
1395 case AArch64::SUBWrx:
1396 case AArch64::SUBXrx:
1397 case AArch64::SUBXrx64:
1398 case AArch64::SUBSWrx:
1399 case AArch64::SUBSXrx:
1400 case AArch64::SUBSXrx64: {
1401 unsigned Imm = MI.getOperand(3).getImm();
1402 switch (AArch64_AM::getArithExtendType(Imm)) {
1403 default:
1404 return false;
1405 case AArch64_AM::UXTB:
1406 case AArch64_AM::UXTH:
1407 case AArch64_AM::UXTW:
1408 case AArch64_AM::UXTX:
1409 return AArch64_AM::getArithShiftValue(Imm) == 0;
1410 }
1411 }
1412
1413 case AArch64::LDRBBroW:
1414 case AArch64::LDRBBroX:
1415 case AArch64::LDRBroW:
1416 case AArch64::LDRBroX:
1417 case AArch64::LDRDroW:
1418 case AArch64::LDRDroX:
1419 case AArch64::LDRHHroW:
1420 case AArch64::LDRHHroX:
1421 case AArch64::LDRHroW:
1422 case AArch64::LDRHroX:
1423 case AArch64::LDRQroW:
1424 case AArch64::LDRQroX:
1425 case AArch64::LDRSBWroW:
1426 case AArch64::LDRSBWroX:
1427 case AArch64::LDRSBXroW:
1428 case AArch64::LDRSBXroX:
1429 case AArch64::LDRSHWroW:
1430 case AArch64::LDRSHWroX:
1431 case AArch64::LDRSHXroW:
1432 case AArch64::LDRSHXroX:
1433 case AArch64::LDRSWroW:
1434 case AArch64::LDRSWroX:
1435 case AArch64::LDRSroW:
1436 case AArch64::LDRSroX:
1437 case AArch64::LDRWroW:
1438 case AArch64::LDRWroX:
1439 case AArch64::LDRXroW:
1440 case AArch64::LDRXroX:
1441 case AArch64::PRFMroW:
1442 case AArch64::PRFMroX:
1443 case AArch64::STRBBroW:
1444 case AArch64::STRBBroX:
1445 case AArch64::STRBroW:
1446 case AArch64::STRBroX:
1447 case AArch64::STRDroW:
1448 case AArch64::STRDroX:
1449 case AArch64::STRHHroW:
1450 case AArch64::STRHHroX:
1451 case AArch64::STRHroW:
1452 case AArch64::STRHroX:
1453 case AArch64::STRQroW:
1454 case AArch64::STRQroX:
1455 case AArch64::STRSroW:
1456 case AArch64::STRSroX:
1457 case AArch64::STRWroW:
1458 case AArch64::STRWroX:
1459 case AArch64::STRXroW:
1460 case AArch64::STRXroX: {
1461 unsigned IsSigned = MI.getOperand(3).getImm();
1462 return !IsSigned;
1463 }
1464 }
1465}
1466
1467bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1468 unsigned Opc = MI.getOpcode();
1469 switch (Opc) {
1470 default:
1471 return false;
1472 case AArch64::SEH_StackAlloc:
1473 case AArch64::SEH_SaveFPLR:
1474 case AArch64::SEH_SaveFPLR_X:
1475 case AArch64::SEH_SaveReg:
1476 case AArch64::SEH_SaveReg_X:
1477 case AArch64::SEH_SaveRegP:
1478 case AArch64::SEH_SaveRegP_X:
1479 case AArch64::SEH_SaveFReg:
1480 case AArch64::SEH_SaveFReg_X:
1481 case AArch64::SEH_SaveFRegP:
1482 case AArch64::SEH_SaveFRegP_X:
1483 case AArch64::SEH_SetFP:
1484 case AArch64::SEH_AddFP:
1485 case AArch64::SEH_Nop:
1486 case AArch64::SEH_PrologEnd:
1487 case AArch64::SEH_EpilogStart:
1488 case AArch64::SEH_EpilogEnd:
1489 case AArch64::SEH_PACSignLR:
1490 case AArch64::SEH_SaveAnyRegI:
1491 case AArch64::SEH_SaveAnyRegIP:
1492 case AArch64::SEH_SaveAnyRegQP:
1493 case AArch64::SEH_SaveAnyRegQPX:
1494 case AArch64::SEH_AllocZ:
1495 case AArch64::SEH_SaveZReg:
1496 case AArch64::SEH_SavePReg:
1497 return true;
1498 }
1499}
1500
// Recognizes a 32 -> 64 bit sign or zero extension. On success SrcReg and
// DstReg receive operands 1 and 0 of MI, SubIdx is set to AArch64::sub_32 and
// true is returned; otherwise false is returned and the outputs are untouched.
// NOTE(review): the first line of the signature is not visible in this
// listing; presumably AArch64InstrInfo::isCoalescableExtInstr -- confirm.
1502 Register &SrcReg, Register &DstReg,
1503 unsigned &SubIdx) const {
1504 switch (MI.getOpcode()) {
1505 default:
1506 return false;
1507 case AArch64::SBFMXri: // aka sxtw
1508 case AArch64::UBFMXri: // aka uxtw
1509 // Check for the 32 -> 64 bit extension case, these instructions can do
1510 // much more.
// Only the form with operand 2 == 0 and operand 3 == 31 qualifies.
1511 if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
1512 return false;
1513 // This is a signed or unsigned 32 -> 64 bit extension.
1514 SrcReg = MI.getOperand(1).getReg();
1515 DstReg = MI.getOperand(0).getReg();
1516 SubIdx = AArch64::sub_32;
1517 return true;
1518 }
1519}
1520
// Returns true only when the two memory instructions MIa and MIb can be shown
// not to overlap: both must decompose into base + offset + width, the base
// operands must be identical, and the lower access must end at or before the
// start of the higher one. Any other outcome returns false.
// NOTE(review): the first line of the signature, the declaration of TRI and
// the condition guarding the early "return false" are not visible in this
// listing.
1522 const MachineInstr &MIa, const MachineInstr &MIb) const {
1524 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1525 int64_t OffsetA = 0, OffsetB = 0;
1526 TypeSize WidthA(0, false), WidthB(0, false);
1527 bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1528
1529 assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1530 assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1531
1534 return false;
1535
1536 // Retrieve the base, offset from the base and width. Width
1537 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
1538 // base are identical, and the offset of a lower memory access +
1539 // the width doesn't overlap the offset of a higher memory access,
1540 // then the memory accesses are different.
1541 // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1542 // are assumed to have the same scale (vscale).
1543 if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
1544 WidthA, TRI) &&
1545 getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
1546 WidthB, TRI)) {
1547 if (BaseOpA->isIdenticalTo(*BaseOpB) &&
1548 OffsetAIsScalable == OffsetBIsScalable) {
// NOTE(review): OffsetA/OffsetB are int64_t but are narrowed to int here, and
// the width is cast to int below -- confirm the values always fit.
1549 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1550 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1551 TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
// The width of the lower access must have the same scalability as the offsets
// for the comparison below to be used.
1552 if (LowWidth.isScalable() == OffsetAIsScalable &&
1553 LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
1554 return true;
1555 }
1556 }
1557 return false;
1558}
1559
// Returns true when MI must be treated as a scheduling boundary. The visible
// checks cover instructions with BTI semantics, CSDB hints, DSB/ISB,
// MSRpstatesvcrImm1, SEH instructions, and any instruction immediately
// followed by a CFI instruction in MBB.
// NOTE(review): the first line of the signature and the condition guarding
// the first "return true" are not visible in this listing.
1561 const MachineBasicBlock *MBB,
1562 const MachineFunction &MF) const {
1564 return true;
1565
1566 // Do not move an instruction that can be recognized as a branch target.
1567 if (hasBTISemantics(MI))
1568 return true;
1569
1570 switch (MI.getOpcode()) {
1571 case AArch64::HINT:
1572 // CSDB hints are scheduling barriers.
// Other HINT immediates fall through to the checks after the switch.
1573 if (MI.getOperand(0).getImm() == 0x14)
1574 return true;
1575 break;
1576 case AArch64::DSB:
1577 case AArch64::ISB:
1578 // DSB and ISB also are scheduling barriers.
1579 return true;
1580 case AArch64::MSRpstatesvcrImm1:
1581 // SMSTART and SMSTOP are also scheduling barriers.
1582 return true;
1583 default:;
1584 }
1585 if (isSEHInstruction(MI))
1586 return true;
// Finally, MI is a boundary when the next instruction in MBB exists and is a
// CFI instruction.
1587 auto Next = std::next(MI.getIterator());
1588 return Next != MBB->end() && Next->isCFIInstruction();
1589}
1590
1591/// analyzeCompare - For a comparison instruction, return the source registers
1592/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1593/// Return true if the comparison instruction can be analyzed.
///
/// CmpMask is set to ~0 on every successful path. SrcReg2 is set to 0 for the
/// immediate forms. Opcodes not listed in the switch return false.
// NOTE(review): the first line of the signature is not visible in this
// listing.
1595 Register &SrcReg2, int64_t &CmpMask,
1596 int64_t &CmpValue) const {
1597 // The first operand can be a frame index where we'd normally expect a
1598 // register.
1599 // FIXME: Pass subregisters out of analyzeCompare
1600 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1601 if (!MI.getOperand(1).isReg() || MI.getOperand(1).getSubReg())
1602 return false;
1603
1604 switch (MI.getOpcode()) {
1605 default:
1606 break;
1607 case AArch64::PTEST_PP:
1608 case AArch64::PTEST_PP_ANY:
1609 case AArch64::PTEST_PP_FIRST:
// For the PTEST forms the two sources are operands 0 and 1.
// NOTE(review): the sub-register check below inspects operand 2 although the
// sources were read from operands 0 and 1 -- confirm this is intended.
1610 SrcReg = MI.getOperand(0).getReg();
1611 SrcReg2 = MI.getOperand(1).getReg();
1612 if (MI.getOperand(2).getSubReg())
1613 return false;
1614
1615 // Not sure about the mask and value for now...
1616 CmpMask = ~0;
1617 CmpValue = 0;
1618 return true;
1619 case AArch64::SUBSWrr:
1620 case AArch64::SUBSWrs:
1621 case AArch64::SUBSWrx:
1622 case AArch64::SUBSXrr:
1623 case AArch64::SUBSXrs:
1624 case AArch64::SUBSXrx:
1625 case AArch64::ADDSWrr:
1626 case AArch64::ADDSWrs:
1627 case AArch64::ADDSWrx:
1628 case AArch64::ADDSXrr:
1629 case AArch64::ADDSXrs:
1630 case AArch64::ADDSXrx:
1631 // Replace SUBSWrr with SUBWrr if NZCV is not used.
// Register forms: the two sources are operands 1 and 2; CmpValue is 0.
1632 SrcReg = MI.getOperand(1).getReg();
1633 SrcReg2 = MI.getOperand(2).getReg();
1634
1635 // FIXME: Pass subregisters out of analyzeCompare
1636 if (MI.getOperand(2).getSubReg())
1637 return false;
1638
1639 CmpMask = ~0;
1640 CmpValue = 0;
1641 return true;
1642 case AArch64::SUBSWri:
1643 case AArch64::ADDSWri:
1644 case AArch64::SUBSXri:
1645 case AArch64::ADDSXri:
// Immediate forms: CmpValue is the raw immediate held in operand 2.
1646 SrcReg = MI.getOperand(1).getReg();
1647 SrcReg2 = 0;
1648 CmpMask = ~0;
1649 CmpValue = MI.getOperand(2).getImm();
1650 return true;
1651 case AArch64::ANDSWri:
1652 case AArch64::ANDSXri:
1653 // ANDS does not use the same encoding scheme as the others xxxS
1654 // instructions.
1655 SrcReg = MI.getOperand(1).getReg();
1656 SrcReg2 = 0;
1657 CmpMask = ~0;
// NOTE(review): the start of the statement computing CmpValue is not visible
// in this listing; only its arguments (operand 2's immediate and a register
// size of 32 or 64) remain.
1659 MI.getOperand(2).getImm(),
1660 MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
1661 return true;
1662 }
1663
1664 return false;
1665}
1666
// Checks every operand of Instr against the register class its operand slot
// requires, constraining virtual registers where needed. Returns false as
// soon as an operand cannot satisfy its constraint, true otherwise.
// NOTE(review): the signature and the declarations of TII and TRI are not
// visible in this listing.
1668 MachineBasicBlock *MBB = Instr.getParent();
1669 assert(MBB && "Can't get MachineBasicBlock here");
1670 MachineFunction *MF = MBB->getParent();
1671 assert(MF && "Can't get MachineFunction here");
1674 MachineRegisterInfo *MRI = &MF->getRegInfo();
1675
1676 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1677 ++OpIdx) {
1678 MachineOperand &MO = Instr.getOperand(OpIdx);
1679 const TargetRegisterClass *OpRegCstraints =
1680 Instr.getRegClassConstraint(OpIdx, TII, TRI);
1681
1682 // If there's no constraint, there's nothing to do.
1683 if (!OpRegCstraints)
1684 continue;
1685 // If the operand is a frame index, there's nothing to do here.
1686 // A frame index operand will resolve correctly during PEI.
1687 if (MO.isFI())
1688 continue;
1689
1690 assert(MO.isReg() &&
1691 "Operand has register constraints without being a register!");
1692
// A physical register must already be a member of the required class. A
// virtual register is accepted if its current class is a sub-class of (or
// equal to) the required one, or if it can be constrained to it.
1693 Register Reg = MO.getReg();
1694 if (Reg.isPhysical()) {
1695 if (!OpRegCstraints->contains(Reg))
1696 return false;
1697 } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1698 !MRI->constrainRegClass(Reg, OpRegCstraints))
1699 return false;
1700 }
1701
1702 return true;
1703}
1704
1705/// Return the opcode that does not set flags when possible - otherwise
1706/// return the original opcode. The caller is responsible to do the actual
1707/// substitution and legality checking.
///
/// The rr and rx forms are always converted. The ri and rs forms keep their
/// flag-setting opcode when MI defines WZR or XZR. Opcodes outside the
/// ADDS/SUBS families are returned unchanged.
// NOTE(review): the signature line is not visible in this listing.
1709 // Don't convert all compare instructions, because for some the zero register
1710 // encoding becomes the sp register.
1711 bool MIDefinesZeroReg = false;
1712 if (MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1713 MI.definesRegister(AArch64::XZR, /*TRI=*/nullptr))
1714 MIDefinesZeroReg = true;
1715
1716 switch (MI.getOpcode()) {
1717 default:
1718 return MI.getOpcode();
1719 case AArch64::ADDSWrr:
1720 return AArch64::ADDWrr;
1721 case AArch64::ADDSWri:
1722 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1723 case AArch64::ADDSWrs:
1724 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1725 case AArch64::ADDSWrx:
1726 return AArch64::ADDWrx;
1727 case AArch64::ADDSXrr:
1728 return AArch64::ADDXrr;
1729 case AArch64::ADDSXri:
1730 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1731 case AArch64::ADDSXrs:
1732 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1733 case AArch64::ADDSXrx:
1734 return AArch64::ADDXrx;
1735 case AArch64::SUBSWrr:
1736 return AArch64::SUBWrr;
1737 case AArch64::SUBSWri:
1738 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1739 case AArch64::SUBSWrs:
1740 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1741 case AArch64::SUBSWrx:
1742 return AArch64::SUBWrx;
1743 case AArch64::SUBSXrr:
1744 return AArch64::SUBXrr;
1745 case AArch64::SUBSXri:
1746 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1747 case AArch64::SUBSXrs:
1748 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1749 case AArch64::SUBSXrx:
1750 return AArch64::SUBXrx;
1751 }
1752}
1753
/// Bit mask selecting which kinds of condition-flag accesses a query should
/// look for. The two single-bit values can be tested with bitwise AND;
/// AK_All is their union.
enum AccessKind {
  AK_Write = 0x01, ///< Accesses that modify the flags.
  AK_Read = 0x10,  ///< Accesses that read the flags.
  AK_All = 0x11    ///< Both of the above.
};
1755
1756/// True when condition flags are accessed (either by writing or reading)
1757/// on the instruction trace starting at From and ending at To.
1758///
1759/// Note: If From and To are from different blocks it's assumed CC are accessed
1760/// on the path.
///
/// AccessToCheck selects whether writes (AK_Write), reads (AK_Read) or both
/// (AK_All, the default) of NZCV count as an access. The function also
/// answers true, conservatively, when To is the first instruction of its
/// block.
// NOTE(review): the first lines of the signature and the range expression of
// the loop below are not visible in this listing.
1763 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1764 // Early exit if To is at the beginning of the BB.
1765 if (To == To->getParent()->begin())
1766 return true;
1767
1768 // Check whether the instructions are in the same basic block
1769 // If not, assume the condition flags might get modified somewhere.
1770 if (To->getParent() != From->getParent())
1771 return true;
1772
1773 // From must be above To.
1774 assert(std::any_of(
1775 ++To.getReverse(), To->getParent()->rend(),
1776 [From](MachineInstr &MI) { return MI.getIterator() == From; }));
1777
1778 // We iterate backward starting at \p To until we hit \p From.
1779 for (const MachineInstr &Instr :
// An instruction counts when it modifies NZCV and writes are being checked,
// or reads NZCV and reads are being checked.
1781 if (((AccessToCheck & AK_Write) &&
1782 Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1783 ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1784 return true;
1785 }
1786 return false;
1787}
1788
1789std::optional<unsigned>
1790AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
1791 MachineInstr *Pred,
1792 const MachineRegisterInfo *MRI) const {
1793 unsigned MaskOpcode = Mask->getOpcode();
1794 unsigned PredOpcode = Pred->getOpcode();
1795 bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
1796 bool PredIsWhileLike = isWhileOpcode(PredOpcode);
1797
1798 if (PredIsWhileLike) {
1799 // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
1800 // instruction and the condition is "any" since WHILcc does an implicit
1801 // PTEST(ALL, PG) check and PG is always a subset of ALL.
1802 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1803 return PredOpcode;
1804
1805 // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
1806 // redundant since WHILE performs an implicit PTEST with an all active
1807 // mask.
1808 if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1809 getElementSizeForOpcode(MaskOpcode) ==
1810 getElementSizeForOpcode(PredOpcode))
1811 return PredOpcode;
1812
1813 // For PTEST_FIRST(PTRUE_ALL, WHILE), the PTEST_FIRST is redundant since
1814 // WHILEcc performs an implicit PTEST with an all active mask, setting
1815 // the N flag as the PTEST_FIRST would.
1816 if (PTest->getOpcode() == AArch64::PTEST_PP_FIRST &&
1817 isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31)
1818 return PredOpcode;
1819
1820 return {};
1821 }
1822
1823 if (PredIsPTestLike) {
1824 // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1825 // instruction that sets the flags as PTEST would and the condition is
1826 // "any" since PG is always a subset of the governing predicate of the
1827 // ptest-like instruction.
1828 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1829 return PredOpcode;
1830
1831 auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1832
1833 // If the PTEST like instruction's general predicate is not `Mask`, attempt
1834 // to look through a copy and try again. This is because some instructions
1835 // take a predicate whose register class is a subset of its result class.
1836 if (Mask != PTestLikeMask && PTestLikeMask->isFullCopy() &&
1837 PTestLikeMask->getOperand(1).getReg().isVirtual())
1838 PTestLikeMask =
1839 MRI->getUniqueVRegDef(PTestLikeMask->getOperand(1).getReg());
1840
1841 // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
1842 // the element size matches and either the PTEST_LIKE instruction uses
1843 // the same all active mask or the condition is "any".
1844 if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1845 getElementSizeForOpcode(MaskOpcode) ==
1846 getElementSizeForOpcode(PredOpcode)) {
1847 if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1848 return PredOpcode;
1849 }
1850
1851 // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
1852 // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
1853 // on 8-bit predicates like the PTEST. Otherwise, for instructions like
1854 // compare that also support 16/32/64-bit predicates, the implicit PTEST
1855 // performed by the compare could consider fewer lanes for these element
1856 // sizes.
1857 //
1858 // For example, consider
1859 //
1860 // ptrue p0.b ; P0=1111-1111-1111-1111
1861 // index z0.s, #0, #1 ; Z0=<0,1,2,3>
1862 // index z1.s, #1, #1 ; Z1=<1,2,3,4>
1863 // cmphi p1.s, p0/z, z1.s, z0.s ; P1=0001-0001-0001-0001
1864 // ; ^ last active
1865 // ptest p0, p1.b ; P1=0001-0001-0001-0001
1866 // ; ^ last active
1867 //
1868 // where the compare generates a canonical all active 32-bit predicate
1869 // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
1870 // active flag, whereas the PTEST instruction with the same mask doesn't.
1871 // For PTEST_ANY this doesn't apply as the flags in this case would be
1872 // identical regardless of element size.
1873 uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
1874 if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
1875 PTest->getOpcode() == AArch64::PTEST_PP_ANY))
1876 return PredOpcode;
1877
1878 return {};
1879 }
1880
1881 // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
1882 // opcode so the PTEST becomes redundant.
1883 switch (PredOpcode) {
1884 case AArch64::AND_PPzPP:
1885 case AArch64::BIC_PPzPP:
1886 case AArch64::EOR_PPzPP:
1887 case AArch64::NAND_PPzPP:
1888 case AArch64::NOR_PPzPP:
1889 case AArch64::ORN_PPzPP:
1890 case AArch64::ORR_PPzPP:
1891 case AArch64::BRKA_PPzP:
1892 case AArch64::BRKPA_PPzPP:
1893 case AArch64::BRKB_PPzP:
1894 case AArch64::BRKPB_PPzPP:
1895 case AArch64::RDFFR_PPz: {
1896 // Check to see if our mask is the same. If not the resulting flag bits
1897 // may be different and we can't remove the ptest.
1898 auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1899 if (Mask != PredMask)
1900 return {};
1901 break;
1902 }
1903 case AArch64::BRKN_PPzP: {
1904 // BRKN uses an all active implicit mask to set flags unlike the other
1905 // flag-setting instructions.
1906 // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
1907 if ((MaskOpcode != AArch64::PTRUE_B) ||
1908 (Mask->getOperand(1).getImm() != 31))
1909 return {};
1910 break;
1911 }
1912 case AArch64::PTRUE_B:
1913 // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
1914 break;
1915 default:
1916 // Bail out if we don't recognize the input
1917 return {};
1918 }
1919
1920 return convertToFlagSettingOpc(PredOpcode);
1921}
1922
1923/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1924/// operation which could set the flags in an identical manner
1925bool AArch64InstrInfo::optimizePTestInstr(
1926 MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1927 const MachineRegisterInfo *MRI) const {
1928 auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1929 auto *Pred = MRI->getUniqueVRegDef(PredReg);
1930
1931 if (Pred->isCopy() && PTest->getOpcode() == AArch64::PTEST_PP_FIRST) {
1932 // Instructions which return a multi-vector (e.g. WHILECC_x2) require copies
1933 // before the branch to extract each subregister.
1934 auto Op = Pred->getOperand(1);
1935 if (Op.isReg() && Op.getReg().isVirtual() &&
1936 Op.getSubReg() == AArch64::psub0)
1937 Pred = MRI->getUniqueVRegDef(Op.getReg());
1938 }
1939
1940 unsigned PredOpcode = Pred->getOpcode();
1941 auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
1942 if (!NewOp)
1943 return false;
1944
1945 const TargetRegisterInfo *TRI = &getRegisterInfo();
1946
1947 // If another instruction between Pred and PTest accesses flags, don't remove
1948 // the ptest or update the earlier instruction to modify them.
1949 if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
1950 return false;
1951
1952 // If we pass all the checks, it's safe to remove the PTEST and use the flags
1953 // as they are prior to PTEST. Sometimes this requires the tested PTEST
1954 // operand to be replaced with an equivalent instruction that also sets the
1955 // flags.
1956 PTest->eraseFromParent();
1957 if (*NewOp != PredOpcode) {
1958 Pred->setDesc(get(*NewOp));
1959 bool succeeded = UpdateOperandRegClass(*Pred);
1960 (void)succeeded;
1961 assert(succeeded && "Operands have incompatible register classes!");
1962 Pred->addRegisterDefined(AArch64::NZCV, TRI);
1963 }
1964
1965 // Ensure that the flags def is live.
1966 if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1967 unsigned i = 0, e = Pred->getNumOperands();
1968 for (; i != e; ++i) {
1969 MachineOperand &MO = Pred->getOperand(i);
1970 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1971 MO.setIsDead(false);
1972 break;
1973 }
1974 }
1975 }
1976 return true;
1977}
1978
1979/// Try to optimize a compare instruction. A compare instruction is an
1980/// instruction which produces AArch64::NZCV. It can be truly compare
1981/// instruction
1982/// when there are no uses of its destination register.
1983///
1984/// The following steps are tried in order:
1985/// 1. Convert CmpInstr into an unconditional version.
1986/// 2. Remove CmpInstr if above there is an instruction producing a needed
1987/// condition code or an instruction which can be converted into such an
1988/// instruction.
1989/// Only comparisons with zero or, for the csinc-cmp case, one are supported.
1991 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1992 int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1993 assert(CmpInstr.getParent());
1994 assert(MRI);
1995
1996 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1997 int DeadNZCVIdx =
1998 CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
1999 if (DeadNZCVIdx != -1) {
2000 if (CmpInstr.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
2001 CmpInstr.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) {
2002 CmpInstr.eraseFromParent();
2003 return true;
2004 }
2005 unsigned Opc = CmpInstr.getOpcode();
2006 unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
2007 if (NewOpc == Opc)
2008 return false;
2009 const MCInstrDesc &MCID = get(NewOpc);
2010 CmpInstr.setDesc(MCID);
2011 CmpInstr.removeOperand(DeadNZCVIdx);
2012 bool succeeded = UpdateOperandRegClass(CmpInstr);
2013 (void)succeeded;
2014 assert(succeeded && "Some operands reg class are incompatible!");
2015 return true;
2016 }
2017
2018 if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
2019 CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY ||
2020 CmpInstr.getOpcode() == AArch64::PTEST_PP_FIRST)
2021 return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
2022
2023 if (SrcReg2 != 0)
2024 return false;
2025
2026 // CmpInstr is a Compare instruction if destination register is not used.
2027 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
2028 return false;
2029
2030 if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
2031 return true;
2032 return (CmpValue == 0 || CmpValue == 1) &&
2033 removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
2034}
2035
2036/// Get opcode of S version of Instr.
2037/// If Instr is S version its opcode is returned.
2038/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
2039/// or we are not interested in it.
2040static unsigned sForm(MachineInstr &Instr) {
2041 switch (Instr.getOpcode()) {
2042 default:
2043 return AArch64::INSTRUCTION_LIST_END;
2044
2045 case AArch64::ADDSWrr:
2046 case AArch64::ADDSWri:
2047 case AArch64::ADDSXrr:
2048 case AArch64::ADDSXri:
2049 case AArch64::ADDSWrx:
2050 case AArch64::ADDSXrx:
2051 case AArch64::SUBSWrr:
2052 case AArch64::SUBSWri:
2053 case AArch64::SUBSWrx:
2054 case AArch64::SUBSXrr:
2055 case AArch64::SUBSXri:
2056 case AArch64::SUBSXrx:
2057 case AArch64::ANDSWri:
2058 case AArch64::ANDSWrr:
2059 case AArch64::ANDSWrs:
2060 case AArch64::ANDSXri:
2061 case AArch64::ANDSXrr:
2062 case AArch64::ANDSXrs:
2063 case AArch64::BICSWrr:
2064 case AArch64::BICSXrr:
2065 case AArch64::BICSWrs:
2066 case AArch64::BICSXrs:
2067 return Instr.getOpcode();
2068
2069 case AArch64::ADDWrr:
2070 return AArch64::ADDSWrr;
2071 case AArch64::ADDWri:
2072 return AArch64::ADDSWri;
2073 case AArch64::ADDXrr:
2074 return AArch64::ADDSXrr;
2075 case AArch64::ADDXri:
2076 return AArch64::ADDSXri;
2077 case AArch64::ADDWrx:
2078 return AArch64::ADDSWrx;
2079 case AArch64::ADDXrx:
2080 return AArch64::ADDSXrx;
2081 case AArch64::ADCWr:
2082 return AArch64::ADCSWr;
2083 case AArch64::ADCXr:
2084 return AArch64::ADCSXr;
2085 case AArch64::SUBWrr:
2086 return AArch64::SUBSWrr;
2087 case AArch64::SUBWri:
2088 return AArch64::SUBSWri;
2089 case AArch64::SUBXrr:
2090 return AArch64::SUBSXrr;
2091 case AArch64::SUBXri:
2092 return AArch64::SUBSXri;
2093 case AArch64::SUBWrx:
2094 return AArch64::SUBSWrx;
2095 case AArch64::SUBXrx:
2096 return AArch64::SUBSXrx;
2097 case AArch64::SBCWr:
2098 return AArch64::SBCSWr;
2099 case AArch64::SBCXr:
2100 return AArch64::SBCSXr;
2101 case AArch64::ANDWri:
2102 return AArch64::ANDSWri;
2103 case AArch64::ANDXri:
2104 return AArch64::ANDSXri;
2105 case AArch64::ANDWrr:
2106 return AArch64::ANDSWrr;
2107 case AArch64::ANDWrs:
2108 return AArch64::ANDSWrs;
2109 case AArch64::ANDXrr:
2110 return AArch64::ANDSXrr;
2111 case AArch64::ANDXrs:
2112 return AArch64::ANDSXrs;
2113 case AArch64::BICWrr:
2114 return AArch64::BICSWrr;
2115 case AArch64::BICXrr:
2116 return AArch64::BICSXrr;
2117 case AArch64::BICWrs:
2118 return AArch64::BICSWrs;
2119 case AArch64::BICXrs:
2120 return AArch64::BICSXrs;
2121 }
2122}
2123
2124/// Check if AArch64::NZCV should be alive in successors of MBB.
2126 for (auto *BB : MBB->successors())
2127 if (BB->isLiveIn(AArch64::NZCV))
2128 return true;
2129 return false;
2130}
2131
2132/// \returns The condition code operand index for \p Instr if it is a branch
2133/// or select and -1 otherwise.
2134int AArch64InstrInfo::findCondCodeUseOperandIdxForBranchOrSelect(
2135 const MachineInstr &Instr) {
2136 switch (Instr.getOpcode()) {
2137 default:
2138 return -1;
2139
2140 case AArch64::Bcc: {
2141 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
2142 assert(Idx >= 2);
2143 return Idx - 2;
2144 }
2145
2146 case AArch64::CSINVWr:
2147 case AArch64::CSINVXr:
2148 case AArch64::CSINCWr:
2149 case AArch64::CSINCXr:
2150 case AArch64::CSELWr:
2151 case AArch64::CSELXr:
2152 case AArch64::CSNEGWr:
2153 case AArch64::CSNEGXr:
2154 case AArch64::FCSELSrrr:
2155 case AArch64::FCSELDrrr: {
2156 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
2157 assert(Idx >= 1);
2158 return Idx - 1;
2159 }
2160 }
2161}
2162
2163/// Find a condition code used by the instruction.
2164/// Returns AArch64CC::Invalid if either the instruction does not use condition
2165/// codes or we don't optimize CmpInstr in the presence of such instructions.
2167 int CCIdx =
2168 AArch64InstrInfo::findCondCodeUseOperandIdxForBranchOrSelect(Instr);
2169 return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
2170 Instr.getOperand(CCIdx).getImm())
2172}
2173
2176 UsedNZCV UsedFlags;
2177 switch (CC) {
2178 default:
2179 break;
2180
2181 case AArch64CC::EQ: // Z set
2182 case AArch64CC::NE: // Z clear
2183 UsedFlags.Z = true;
2184 break;
2185
2186 case AArch64CC::HI: // Z clear and C set
2187 case AArch64CC::LS: // Z set or C clear
2188 UsedFlags.Z = true;
2189 [[fallthrough]];
2190 case AArch64CC::HS: // C set
2191 case AArch64CC::LO: // C clear
2192 UsedFlags.C = true;
2193 break;
2194
2195 case AArch64CC::MI: // N set
2196 case AArch64CC::PL: // N clear
2197 UsedFlags.N = true;
2198 break;
2199
2200 case AArch64CC::VS: // V set
2201 case AArch64CC::VC: // V clear
2202 UsedFlags.V = true;
2203 break;
2204
2205 case AArch64CC::GT: // Z clear, N and V the same
2206 case AArch64CC::LE: // Z set, N and V differ
2207 UsedFlags.Z = true;
2208 [[fallthrough]];
2209 case AArch64CC::GE: // N and V the same
2210 case AArch64CC::LT: // N and V differ
2211 UsedFlags.N = true;
2212 UsedFlags.V = true;
2213 break;
2214 }
2215 return UsedFlags;
2216}
2217
2218/// \returns Condition flags used after \p CmpInstr in its MachineBB if NZCV
2219/// is not live into successors of the block shared by \p CmpInstr and \p MI.
2220/// \returns std::nullopt otherwise.
2221///
2222/// Collects the instructions using those flags in \p CCUseInstrs if provided.
2223std::optional<UsedNZCV>
2225 const TargetRegisterInfo &TRI,
2226 SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
2227 MachineBasicBlock *CmpParent = CmpInstr.getParent();
2228 if (MI.getParent() != CmpParent)
2229 return std::nullopt;
2230
2231 if (areCFlagsAliveInSuccessors(CmpParent))
2232 return std::nullopt;
2233
2234 UsedNZCV NZCVUsedAfterCmp;
2236 std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
2237 if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
2239 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
2240 return std::nullopt;
2241 NZCVUsedAfterCmp |= getUsedNZCV(CC);
2242 if (CCUseInstrs)
2243 CCUseInstrs->push_back(&Instr);
2244 }
2245 if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
2246 break;
2247 }
2248 return NZCVUsedAfterCmp;
2249}
2250
2251static bool isADDSRegImm(unsigned Opcode) {
2252 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
2253}
2254
2255static bool isSUBSRegImm(unsigned Opcode) {
2256 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
2257}
2258
2260 unsigned Opc = sForm(MI);
2261 switch (Opc) {
2262 case AArch64::ANDSWri:
2263 case AArch64::ANDSWrr:
2264 case AArch64::ANDSWrs:
2265 case AArch64::ANDSXri:
2266 case AArch64::ANDSXrr:
2267 case AArch64::ANDSXrs:
2268 case AArch64::BICSWrr:
2269 case AArch64::BICSXrr:
2270 case AArch64::BICSWrs:
2271 case AArch64::BICSXrs:
2272 return true;
2273 default:
2274 return false;
2275 }
2276}
2277
2278/// Check if CmpInstr can be substituted by MI.
2279///
2280/// CmpInstr can be substituted:
2281/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2282/// - and, MI and CmpInstr are from the same MachineBB
2283/// - and, condition flags are not alive in successors of the CmpInstr parent
2284/// - and, if MI opcode is the S form there must be no defs of flags between
2285/// MI and CmpInstr
2286/// or if MI opcode is not the S form there must be neither defs of flags
2287/// nor uses of flags between MI and CmpInstr.
2288/// - and, if C/V flags are not used after CmpInstr
2289/// or if N flag is used but MI produces poison value if signed overflow
2290/// occurs.
2292 const TargetRegisterInfo &TRI) {
2293 // NOTE this assertion guarantees that MI.getOpcode() is an instruction
2294 // sForm() knows (add/sub/adc/sbc/and/bic) that may or may not set flags.
2295 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
2296
2297 const unsigned CmpOpcode = CmpInstr.getOpcode();
2298 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
2299 return false;
2300
2301 assert((CmpInstr.getOperand(2).isImm() &&
2302 CmpInstr.getOperand(2).getImm() == 0) &&
2303 "Caller guarantees that CmpInstr compares with constant 0");
2304
2305 std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
2306 if (!NZVCUsed || NZVCUsed->C)
2307 return false;
2308
2309 // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
2310 // '%vreg = add ...' or '%vreg = sub ...'.
2311 // Condition flag V is used to indicate signed overflow.
2312 // 1) MI and CmpInstr set N and V to the same value.
2313 // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
2314 // signed overflow occurs, so CmpInstr could still be simplified away.
2315 // Note that Ands and Bics instructions always clear the V flag.
2316 if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap) && !isANDOpcode(MI))
2317 return false;
2318
2319 AccessKind AccessToCheck = AK_Write;
2320 if (sForm(MI) != MI.getOpcode())
2321 AccessToCheck = AK_All;
2322 return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
2323}
2324
2325/// Substitute an instruction comparing to zero with another instruction
2326/// which produces needed condition flags.
2327///
/// The unique definition of \p SrcReg is switched to its flag-setting opcode
/// (as reported by sForm) and \p CmpInstr is erased.
///
/// \param CmpInstr comparison instruction to eliminate
/// \param SrcReg register compared by \p CmpInstr
/// \param MRI register info used to look up the definition of \p SrcReg
2328/// Return true on success.
2329bool AArch64InstrInfo::substituteCmpToZero(
2330 MachineInstr &CmpInstr, unsigned SrcReg,
2331 const MachineRegisterInfo &MRI) const {
2332 // Get the unique definition of SrcReg.
2333 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
2334 if (!MI)
2335 return false;
2336
2337 const TargetRegisterInfo &TRI = getRegisterInfo();
2338
 // Bail out when the defining instruction has no flag-setting form that
 // sForm handles.
2339 unsigned NewOpc = sForm(*MI);
2340 if (NewOpc == AArch64::INSTRUCTION_LIST_END)
2341 return false;
2342
 // All legality checks live in canInstrSubstituteCmpInstr; nothing is
 // modified before it succeeds.
2343 if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
2344 return false;
2345
2346 // Update the instruction to set NZCV.
2347 MI->setDesc(get(NewOpc));
2348 CmpInstr.eraseFromParent();
 // NOTE(review): the statement that defines `succeeded` (listing line 2349)
 // is not visible in this listing; presumably it re-constrains MI's operand
 // register classes for the new opcode -- confirm against the original file.
2350 (void)succeeded;
2351 assert(succeeded && "Some operands reg class are incompatible!");
 // The new opcode writes the flags, so record the NZCV def on MI.
2352 MI->addRegisterDefined(AArch64::NZCV, &TRI);
2353 return true;
2354}
2355
2356/// \returns True if \p CmpInstr can be removed.
2357///
2358/// \p IsInvertCC is true if, after removing \p CmpInstr, condition
2359/// codes used in \p CCUseInstrs must be inverted.
2361 int CmpValue, const TargetRegisterInfo &TRI,
2363 bool &IsInvertCC) {
2364 assert((CmpValue == 0 || CmpValue == 1) &&
2365 "Only comparisons to 0 or 1 considered for removal!");
2366
2367 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
2368 unsigned MIOpc = MI.getOpcode();
2369 if (MIOpc == AArch64::CSINCWr) {
2370 if (MI.getOperand(1).getReg() != AArch64::WZR ||
2371 MI.getOperand(2).getReg() != AArch64::WZR)
2372 return false;
2373 } else if (MIOpc == AArch64::CSINCXr) {
2374 if (MI.getOperand(1).getReg() != AArch64::XZR ||
2375 MI.getOperand(2).getReg() != AArch64::XZR)
2376 return false;
2377 } else {
2378 return false;
2379 }
2381 if (MICC == AArch64CC::Invalid)
2382 return false;
2383
2384 // NZCV needs to be defined
2385 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) != -1)
2386 return false;
2387
2388 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
2389 const unsigned CmpOpcode = CmpInstr.getOpcode();
2390 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
2391 if (CmpValue && !IsSubsRegImm)
2392 return false;
2393 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
2394 return false;
2395
2396 // MI conditions allowed: eq, ne, mi, pl
2397 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
2398 if (MIUsedNZCV.C || MIUsedNZCV.V)
2399 return false;
2400
2401 std::optional<UsedNZCV> NZCVUsedAfterCmp =
2402 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
2403 // Condition flags are not used in CmpInstr basic block successors and only
2404 // Z or N flags allowed to be used after CmpInstr within its basic block
2405 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
2406 return false;
2407 // Z or N flag used after CmpInstr must correspond to the flag used in MI
2408 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
2409 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
2410 return false;
2411 // If CmpInstr is comparison to zero MI conditions are limited to eq, ne
2412 if (MIUsedNZCV.N && !CmpValue)
2413 return false;
2414
2415 // There must be no defs of flags between MI and CmpInstr
2416 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
2417 return false;
2418
2419 // Condition code is inverted in the following cases:
2420 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2421 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
2422 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
2423 (!CmpValue && MICC == AArch64CC::NE);
2424 return true;
2425}
2426
2427/// Remove comparison in csinc-cmp sequence
2428///
2429/// Examples:
2430/// 1. \code
2431/// csinc w9, wzr, wzr, ne
2432/// cmp w9, #0
2433/// b.eq
2434/// \endcode
2435/// to
2436/// \code
2437/// csinc w9, wzr, wzr, ne
2438/// b.ne
2439/// \endcode
2440///
2441/// 2. \code
2442/// csinc x2, xzr, xzr, mi
2443/// cmp x2, #1
2444/// b.pl
2445/// \endcode
2446/// to
2447/// \code
2448/// csinc x2, xzr, xzr, mi
2449/// b.pl
2450/// \endcode
2451///
2452/// \param CmpInstr comparison instruction
/// \param SrcReg register compared by \p CmpInstr; its unique definition is
/// the candidate csinc
/// \param CmpValue constant \p CmpInstr compares against (forwarded to
/// canCmpInstrBeRemoved)
/// \param MRI register info used to look up the definition of \p SrcReg
2453/// \return True when comparison removed
2454bool AArch64InstrInfo::removeCmpToZeroOrOne(
2455 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
2456 const MachineRegisterInfo &MRI) const {
2457 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
2458 if (!MI)
2459 return false;
2460 const TargetRegisterInfo &TRI = getRegisterInfo();
 // Filled by canCmpInstrBeRemoved: the flag users after CmpInstr and whether
 // their condition codes must be inverted once the compare is gone.
2461 SmallVector<MachineInstr *, 4> CCUseInstrs;
2462 bool IsInvertCC = false;
2463 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
2464 IsInvertCC))
2465 return false;
2466 // Make transformation
2467 CmpInstr.eraseFromParent();
2468 if (IsInvertCC) {
2469 // Invert condition codes in CmpInstr CC users
2470 for (MachineInstr *CCUseInstr : CCUseInstrs) {
2471 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
2472 assert(Idx >= 0 && "Unexpected instruction using CC.");
2473 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
 // NOTE(review): the line that declares `CCUse` (listing line 2474) is not
 // visible in this listing; presumably it computes the inverted condition
 // code of the operand's current value -- confirm against the original file.
2475 static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
2476 CCOperand.setImm(CCUse);
2477 }
2478 }
2479 return true;
2480}
2481
2482bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2483 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
2484 MI.getOpcode() != AArch64::CATCHRET)
2485 return false;
2486
2487 MachineBasicBlock &MBB = *MI.getParent();
2488 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
2489 auto TRI = Subtarget.getRegisterInfo();
2490 DebugLoc DL = MI.getDebugLoc();
2491
2492 if (MI.getOpcode() == AArch64::CATCHRET) {
2493 // Skip to the first instruction before the epilog.
2494 const TargetInstrInfo *TII =
2496 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
2498 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
2499 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
2500 FirstEpilogSEH != MBB.begin())
2501 FirstEpilogSEH = std::prev(FirstEpilogSEH);
2502 if (FirstEpilogSEH != MBB.begin())
2503 FirstEpilogSEH = std::next(FirstEpilogSEH);
2504 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
2505 .addReg(AArch64::X0, RegState::Define)
2506 .addMBB(TargetMBB);
2507 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
2508 .addReg(AArch64::X0, RegState::Define)
2509 .addReg(AArch64::X0)
2510 .addMBB(TargetMBB)
2511 .addImm(0);
2512 TargetMBB->setMachineBlockAddressTaken();
2513 return true;
2514 }
2515
2516 Register Reg = MI.getOperand(0).getReg();
2518 if (M.getStackProtectorGuard() == "sysreg") {
2519 const AArch64SysReg::SysReg *SrcReg =
2520 AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
2521 if (!SrcReg)
2522 report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
2523
2524 // mrs xN, sysreg
2525 BuildMI(MBB, MI, DL, get(AArch64::MRS))
2527 .addImm(SrcReg->Encoding);
2528 int Offset = M.getStackProtectorGuardOffset();
2529 if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
2530 // ldr xN, [xN, #offset]
2531 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2532 .addDef(Reg)
2534 .addImm(Offset / 8);
2535 } else if (Offset >= -256 && Offset <= 255) {
2536 // ldur xN, [xN, #offset]
2537 BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
2538 .addDef(Reg)
2540 .addImm(Offset);
2541 } else if (Offset >= -4095 && Offset <= 4095) {
2542 if (Offset > 0) {
2543 // add xN, xN, #offset
2544 BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
2545 .addDef(Reg)
2547 .addImm(Offset)
2548 .addImm(0);
2549 } else {
2550 // sub xN, xN, #offset
2551 BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
2552 .addDef(Reg)
2554 .addImm(-Offset)
2555 .addImm(0);
2556 }
2557 // ldr xN, [xN]
2558 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2559 .addDef(Reg)
2561 .addImm(0);
2562 } else {
2563 // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
2564 // than 32760.
2565 // It might be nice to use AArch64::MOVi32imm here, which would get
2566 // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2567 // contains the MRS result. findScratchNonCalleeSaveRegister() in
2568 // AArch64FrameLowering might help us find such a scratch register
2569 // though. If we failed to find a scratch register, we could emit a
2570 // stream of add instructions to build up the immediate. Or, we could try
2571 // to insert a AArch64::MOVi32imm before register allocation so that we
2572 // didn't need to scavenge for a scratch register.
2573 report_fatal_error("Unable to encode Stack Protector Guard Offset");
2574 }
2575 MBB.erase(MI);
2576 return true;
2577 }
2578
2579 const GlobalValue *GV =
2580 cast<GlobalValue>((*MI.memoperands_begin())->getValue());
2581 const TargetMachine &TM = MBB.getParent()->getTarget();
2582 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2583 const unsigned char MO_NC = AArch64II::MO_NC;
2584
2585 unsigned GuardWidth = M.getStackProtectorGuardValueWidth().value_or(
2586 Subtarget.isTargetILP32() ? 4 : 8);
2587 if (GuardWidth != 4 && GuardWidth != 8)
2588 report_fatal_error("Unsupported stack protector value width");
2589 if ((OpFlags & AArch64II::MO_GOT) != 0) {
2590 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
2591 .addGlobalAddress(GV, 0, OpFlags);
2592 if (GuardWidth == 4) {
2593 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2594 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2595 .addDef(Reg32, RegState::Dead)
2597 .addImm(0)
2598 .addMemOperand(*MI.memoperands_begin())
2600 } else {
2601 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2603 .addImm(0)
2604 .addMemOperand(*MI.memoperands_begin());
2605 }
2606 } else if (TM.getCodeModel() == CodeModel::Large) {
2607 if (GuardWidth == 4)
2608 report_fatal_error("Large code model with 4-byte stack protector not yet "
2609 "supported");
2610 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
2611 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
2612 .addImm(0);
2613 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2615 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
2616 .addImm(16);
2617 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2619 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
2620 .addImm(32);
2621 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2624 .addImm(48);
2625 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2627 .addImm(0)
2628 .addMemOperand(*MI.memoperands_begin());
2629 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2630 // FIXME: This is computing the stack protector value as a constant
2631 // pc-relative offset, not loading it from memory. Which is maybe
2632 // an interesting compromise in some environments, but it looks like it
2633 // was done accidentally. And it probably shouldn't be tied to the
2634 // code model.
2635 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
2636 .addGlobalAddress(GV, 0, OpFlags);
2637 } else {
2638 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
2639 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2640 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2641 if (GuardWidth == 4) {
2642 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2643 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2644 .addDef(Reg32, RegState::Dead)
2646 .addGlobalAddress(GV, 0, LoFlags)
2647 .addMemOperand(*MI.memoperands_begin())
2649 } else {
2650 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2652 .addGlobalAddress(GV, 0, LoFlags)
2653 .addMemOperand(*MI.memoperands_begin());
2654 }
2655 }
2656
2657 MBB.erase(MI);
2658
2659 return true;
2660}
2661
2662// Return true if this instruction simply sets its single destination register
2663// to zero. This is equivalent to a register rename of the zero-register.
2665 switch (MI.getOpcode()) {
2666 default:
2667 break;
2668 case AArch64::MOVZWi:
2669 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2670 if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2671 assert(MI.getDesc().getNumOperands() == 3 &&
2672 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2673 return true;
2674 }
2675 break;
2676 case AArch64::ANDWri: // and Rd, Rzr, #imm
2677 return MI.getOperand(1).getReg() == AArch64::WZR;
2678 case AArch64::ANDXri:
2679 return MI.getOperand(1).getReg() == AArch64::XZR;
2680 case TargetOpcode::COPY:
2681 return MI.getOperand(1).getReg() == AArch64::WZR;
2682 }
2683 return false;
2684}
2685
2686// Return true if this instruction simply renames a general register without
2687// modifying bits.
2689 switch (MI.getOpcode()) {
2690 default:
2691 break;
2692 case TargetOpcode::COPY: {
2693 // GPR32 copies will be lowered to ORRXrs
2694 Register DstReg = MI.getOperand(0).getReg();
2695 return (AArch64::GPR32RegClass.contains(DstReg) ||
2696 AArch64::GPR64RegClass.contains(DstReg));
2697 }
2698 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2699 if (MI.getOperand(1).getReg() == AArch64::XZR) {
2700 assert(MI.getDesc().getNumOperands() == 4 &&
2701 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2702 return true;
2703 }
2704 break;
2705 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2706 if (MI.getOperand(2).getImm() == 0) {
2707 assert(MI.getDesc().getNumOperands() == 4 &&
2708 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2709 return true;
2710 }
2711 break;
2712 }
2713 return false;
2714}
2715
2716// Return true if this instruction simply renames an FPR128 register without
2717// modifying bits.
2719 switch (MI.getOpcode()) {
2720 default:
2721 break;
2722 case TargetOpcode::COPY: {
2723 Register DstReg = MI.getOperand(0).getReg();
2724 return AArch64::FPR128RegClass.contains(DstReg);
2725 }
2726 case AArch64::ORRv16i8:
2727 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2728 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2729 "invalid ORRv16i8 operands");
2730 return true;
2731 }
2732 break;
2733 }
2734 return false;
2735}
2736
2737static bool isFrameLoadOpcode(int Opcode) {
2738 switch (Opcode) {
2739 default:
2740 return false;
2741 case AArch64::LDRWui:
2742 case AArch64::LDRXui:
2743 case AArch64::LDRBui:
2744 case AArch64::LDRHui:
2745 case AArch64::LDRSui:
2746 case AArch64::LDRDui:
2747 case AArch64::LDRQui:
2748 case AArch64::LDR_PXI:
2749 return true;
2750 }
2751}
2752
2754 int &FrameIndex) const {
2755 if (!isFrameLoadOpcode(MI.getOpcode()))
2756 return Register();
2757
2758 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2759 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2760 FrameIndex = MI.getOperand(1).getIndex();
2761 return MI.getOperand(0).getReg();
2762 }
2763 return Register();
2764}
2765
2766static bool isFrameStoreOpcode(int Opcode) {
2767 switch (Opcode) {
2768 default:
2769 return false;
2770 case AArch64::STRWui:
2771 case AArch64::STRXui:
2772 case AArch64::STRBui:
2773 case AArch64::STRHui:
2774 case AArch64::STRSui:
2775 case AArch64::STRDui:
2776 case AArch64::STRQui:
2777 case AArch64::STR_PXI:
2778 return true;
2779 }
2780}
2781
2783 int &FrameIndex) const {
2784 if (!isFrameStoreOpcode(MI.getOpcode()))
2785 return Register();
2786
2787 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2788 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2789 FrameIndex = MI.getOperand(1).getIndex();
2790 return MI.getOperand(0).getReg();
2791 }
2792 return Register();
2793}
2794
2796 int &FrameIndex) const {
2797 if (!isFrameStoreOpcode(MI.getOpcode()))
2798 return Register();
2799
2800 if (Register Reg = isStoreToStackSlot(MI, FrameIndex))
2801 return Reg;
2802
2804 if (hasStoreToStackSlot(MI, Accesses)) {
2805 if (Accesses.size() > 1)
2806 return Register();
2807
2808 FrameIndex =
2809 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
2810 ->getFrameIndex();
2811 return MI.getOperand(0).getReg();
2812 }
2813 return Register();
2814}
2815
2817 int &FrameIndex) const {
2818 if (!isFrameLoadOpcode(MI.getOpcode()))
2819 return Register();
2820
2821 if (Register Reg = isLoadFromStackSlot(MI, FrameIndex))
2822 return Reg;
2823
2825 if (hasLoadFromStackSlot(MI, Accesses)) {
2826 if (Accesses.size() > 1)
2827 return Register();
2828
2829 FrameIndex =
2830 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
2831 ->getFrameIndex();
2832 return MI.getOperand(0).getReg();
2833 }
2834 return Register();
2835}
2836
2837/// Check all MachineMemOperands for a hint to suppress pairing.
2839 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2840 return MMO->getFlags() & MOSuppressPair;
2841 });
2842}
2843
2844/// Set a flag on the first MachineMemOperand to suppress pairing.
2846 if (MI.memoperands_empty())
2847 return;
2848 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2849}
2850
2851/// Check all MachineMemOperands for a hint that the load/store is strided.
2853 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2854 return MMO->getFlags() & MOStridedAccess;
2855 });
2856}
2857
2859 switch (Opc) {
2860 default:
2861 return false;
2862 case AArch64::STURSi:
2863 case AArch64::STRSpre:
2864 case AArch64::STURDi:
2865 case AArch64::STRDpre:
2866 case AArch64::STURQi:
2867 case AArch64::STRQpre:
2868 case AArch64::STURBBi:
2869 case AArch64::STURHHi:
2870 case AArch64::STURWi:
2871 case AArch64::STRWpre:
2872 case AArch64::STURXi:
2873 case AArch64::STRXpre:
2874 case AArch64::LDURSi:
2875 case AArch64::LDRSpre:
2876 case AArch64::LDURDi:
2877 case AArch64::LDRDpre:
2878 case AArch64::LDURQi:
2879 case AArch64::LDRQpre:
2880 case AArch64::LDURWi:
2881 case AArch64::LDRWpre:
2882 case AArch64::LDURXi:
2883 case AArch64::LDRXpre:
2884 case AArch64::LDRSWpre:
2885 case AArch64::LDURSWi:
2886 case AArch64::LDURHHi:
2887 case AArch64::LDURBBi:
2888 case AArch64::LDURSBWi:
2889 case AArch64::LDURSHWi:
2890 return true;
2891 }
2892}
2893
2894std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2895 switch (Opc) {
2896 default: return {};
2897 case AArch64::PRFMui: return AArch64::PRFUMi;
2898 case AArch64::LDRXui: return AArch64::LDURXi;
2899 case AArch64::LDRWui: return AArch64::LDURWi;
2900 case AArch64::LDRBui: return AArch64::LDURBi;
2901 case AArch64::LDRHui: return AArch64::LDURHi;
2902 case AArch64::LDRSui: return AArch64::LDURSi;
2903 case AArch64::LDRDui: return AArch64::LDURDi;
2904 case AArch64::LDRQui: return AArch64::LDURQi;
2905 case AArch64::LDRBBui: return AArch64::LDURBBi;
2906 case AArch64::LDRHHui: return AArch64::LDURHHi;
2907 case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2908 case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2909 case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2910 case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2911 case AArch64::LDRSWui: return AArch64::LDURSWi;
2912 case AArch64::STRXui: return AArch64::STURXi;
2913 case AArch64::STRWui: return AArch64::STURWi;
2914 case AArch64::STRBui: return AArch64::STURBi;
2915 case AArch64::STRHui: return AArch64::STURHi;
2916 case AArch64::STRSui: return AArch64::STURSi;
2917 case AArch64::STRDui: return AArch64::STURDi;
2918 case AArch64::STRQui: return AArch64::STURQi;
2919 case AArch64::STRBBui: return AArch64::STURBBi;
2920 case AArch64::STRHHui: return AArch64::STURHHi;
2921 }
2922}
2923
2925 switch (Opc) {
2926 default:
2927 llvm_unreachable("Unhandled Opcode in getLoadStoreImmIdx");
2928 case AArch64::ADDG:
2929 case AArch64::LDAPURBi:
2930 case AArch64::LDAPURHi:
2931 case AArch64::LDAPURi:
2932 case AArch64::LDAPURSBWi:
2933 case AArch64::LDAPURSBXi:
2934 case AArch64::LDAPURSHWi:
2935 case AArch64::LDAPURSHXi:
2936 case AArch64::LDAPURSWi:
2937 case AArch64::LDAPURXi:
2938 case AArch64::LDR_PPXI:
2939 case AArch64::LDR_PXI:
2940 case AArch64::LDR_ZXI:
2941 case AArch64::LDR_ZZXI:
2942 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
2943 case AArch64::LDR_ZZZXI:
2944 case AArch64::LDR_ZZZZXI:
2945 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
2946 case AArch64::LDRBBui:
2947 case AArch64::LDRBui:
2948 case AArch64::LDRDui:
2949 case AArch64::LDRHHui:
2950 case AArch64::LDRHui:
2951 case AArch64::LDRQui:
2952 case AArch64::LDRSBWui:
2953 case AArch64::LDRSBXui:
2954 case AArch64::LDRSHWui:
2955 case AArch64::LDRSHXui:
2956 case AArch64::LDRSui:
2957 case AArch64::LDRSWui:
2958 case AArch64::LDRWui:
2959 case AArch64::LDRXui:
2960 case AArch64::LDURBBi:
2961 case AArch64::LDURBi:
2962 case AArch64::LDURDi:
2963 case AArch64::LDURHHi:
2964 case AArch64::LDURHi:
2965 case AArch64::LDURQi:
2966 case AArch64::LDURSBWi:
2967 case AArch64::LDURSBXi:
2968 case AArch64::LDURSHWi:
2969 case AArch64::LDURSHXi:
2970 case AArch64::LDURSi:
2971 case AArch64::LDURSWi:
2972 case AArch64::LDURWi:
2973 case AArch64::LDURXi:
2974 case AArch64::PRFMui:
2975 case AArch64::PRFUMi:
2976 case AArch64::ST2Gi:
2977 case AArch64::STGi:
2978 case AArch64::STLURBi:
2979 case AArch64::STLURHi:
2980 case AArch64::STLURWi:
2981 case AArch64::STLURXi:
2982 case AArch64::StoreSwiftAsyncContext:
2983 case AArch64::STR_PPXI:
2984 case AArch64::STR_PXI:
2985 case AArch64::STR_ZXI:
2986 case AArch64::STR_ZZXI:
2987 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
2988 case AArch64::STR_ZZZXI:
2989 case AArch64::STR_ZZZZXI:
2990 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
2991 case AArch64::STRBBui:
2992 case AArch64::STRBui:
2993 case AArch64::STRDui:
2994 case AArch64::STRHHui:
2995 case AArch64::STRHui:
2996 case AArch64::STRQui:
2997 case AArch64::STRSui:
2998 case AArch64::STRWui:
2999 case AArch64::STRXui:
3000 case AArch64::STURBBi:
3001 case AArch64::STURBi:
3002 case AArch64::STURDi:
3003 case AArch64::STURHHi:
3004 case AArch64::STURHi:
3005 case AArch64::STURQi:
3006 case AArch64::STURSi:
3007 case AArch64::STURWi:
3008 case AArch64::STURXi:
3009 case AArch64::STZ2Gi:
3010 case AArch64::STZGi:
3011 case AArch64::TAGPstack:
3012 return 2;
3013 case AArch64::LD1B_D_IMM:
3014 case AArch64::LD1B_H_IMM:
3015 case AArch64::LD1B_IMM:
3016 case AArch64::LD1B_S_IMM:
3017 case AArch64::LD1D_IMM:
3018 case AArch64::LD1H_D_IMM:
3019 case AArch64::LD1H_IMM:
3020 case AArch64::LD1H_S_IMM:
3021 case AArch64::LD1RB_D_IMM:
3022 case AArch64::LD1RB_H_IMM:
3023 case AArch64::LD1RB_IMM:
3024 case AArch64::LD1RB_S_IMM:
3025 case AArch64::LD1RD_IMM:
3026 case AArch64::LD1RH_D_IMM:
3027 case AArch64::LD1RH_IMM:
3028 case AArch64::LD1RH_S_IMM:
3029 case AArch64::LD1RSB_D_IMM:
3030 case AArch64::LD1RSB_H_IMM:
3031 case AArch64::LD1RSB_S_IMM:
3032 case AArch64::LD1RSH_D_IMM:
3033 case AArch64::LD1RSH_S_IMM:
3034 case AArch64::LD1RSW_IMM:
3035 case AArch64::LD1RW_D_IMM:
3036 case AArch64::LD1RW_IMM:
3037 case AArch64::LD1SB_D_IMM:
3038 case AArch64::LD1SB_H_IMM:
3039 case AArch64::LD1SB_S_IMM:
3040 case AArch64::LD1SH_D_IMM:
3041 case AArch64::LD1SH_S_IMM:
3042 case AArch64::LD1SW_D_IMM:
3043 case AArch64::LD1W_D_IMM:
3044 case AArch64::LD1W_IMM:
3045 case AArch64::LD2B_IMM:
3046 case AArch64::LD2D_IMM:
3047 case AArch64::LD2H_IMM:
3048 case AArch64::LD2W_IMM:
3049 case AArch64::LD3B_IMM:
3050 case AArch64::LD3D_IMM:
3051 case AArch64::LD3H_IMM:
3052 case AArch64::LD3W_IMM:
3053 case AArch64::LD4B_IMM:
3054 case AArch64::LD4D_IMM:
3055 case AArch64::LD4H_IMM:
3056 case AArch64::LD4W_IMM:
3057 case AArch64::LDG:
3058 case AArch64::LDNF1B_D_IMM:
3059 case AArch64::LDNF1B_H_IMM:
3060 case AArch64::LDNF1B_IMM:
3061 case AArch64::LDNF1B_S_IMM:
3062 case AArch64::LDNF1D_IMM:
3063 case AArch64::LDNF1H_D_IMM:
3064 case AArch64::LDNF1H_IMM:
3065 case AArch64::LDNF1H_S_IMM:
3066 case AArch64::LDNF1SB_D_IMM:
3067 case AArch64::LDNF1SB_H_IMM:
3068 case AArch64::LDNF1SB_S_IMM:
3069 case AArch64::LDNF1SH_D_IMM:
3070 case AArch64::LDNF1SH_S_IMM:
3071 case AArch64::LDNF1SW_D_IMM:
3072 case AArch64::LDNF1W_D_IMM:
3073 case AArch64::LDNF1W_IMM:
3074 case AArch64::LDNPDi:
3075 case AArch64::LDNPQi:
3076 case AArch64::LDNPSi:
3077 case AArch64::LDNPWi:
3078 case AArch64::LDNPXi:
3079 case AArch64::LDNT1B_ZRI:
3080 case AArch64::LDNT1D_ZRI:
3081 case AArch64::LDNT1H_ZRI:
3082 case AArch64::LDNT1W_ZRI:
3083 case AArch64::LDPDi:
3084 case AArch64::LDPQi:
3085 case AArch64::LDPSi:
3086 case AArch64::LDPWi:
3087 case AArch64::LDPXi:
3088 case AArch64::LDRBBpost:
3089 case AArch64::LDRBBpre:
3090 case AArch64::LDRBpost:
3091 case AArch64::LDRBpre:
3092 case AArch64::LDRDpost:
3093 case AArch64::LDRDpre:
3094 case AArch64::LDRHHpost:
3095 case AArch64::LDRHHpre:
3096 case AArch64::LDRHpost:
3097 case AArch64::LDRHpre:
3098 case AArch64::LDRQpost:
3099 case AArch64::LDRQpre:
3100 case AArch64::LDRSpost:
3101 case AArch64::LDRSpre:
3102 case AArch64::LDRWpost:
3103 case AArch64::LDRWpre:
3104 case AArch64::LDRXpost:
3105 case AArch64::LDRXpre:
3106 case AArch64::ST1B_D_IMM:
3107 case AArch64::ST1B_H_IMM:
3108 case AArch64::ST1B_IMM:
3109 case AArch64::ST1B_S_IMM:
3110 case AArch64::ST1D_IMM:
3111 case AArch64::ST1H_D_IMM:
3112 case AArch64::ST1H_IMM:
3113 case AArch64::ST1H_S_IMM:
3114 case AArch64::ST1W_D_IMM:
3115 case AArch64::ST1W_IMM:
3116 case AArch64::ST2B_IMM:
3117 case AArch64::ST2D_IMM:
3118 case AArch64::ST2H_IMM:
3119 case AArch64::ST2W_IMM:
3120 case AArch64::ST3B_IMM:
3121 case AArch64::ST3D_IMM:
3122 case AArch64::ST3H_IMM:
3123 case AArch64::ST3W_IMM:
3124 case AArch64::ST4B_IMM:
3125 case AArch64::ST4D_IMM:
3126 case AArch64::ST4H_IMM:
3127 case AArch64::ST4W_IMM:
3128 case AArch64::STGPi:
3129 case AArch64::STGPreIndex:
3130 case AArch64::STZGPreIndex:
3131 case AArch64::ST2GPreIndex:
3132 case AArch64::STZ2GPreIndex:
3133 case AArch64::STGPostIndex:
3134 case AArch64::STZGPostIndex:
3135 case AArch64::ST2GPostIndex:
3136 case AArch64::STZ2GPostIndex:
3137 case AArch64::STNPDi:
3138 case AArch64::STNPQi:
3139 case AArch64::STNPSi:
3140 case AArch64::STNPWi:
3141 case AArch64::STNPXi:
3142 case AArch64::STNT1B_ZRI:
3143 case AArch64::STNT1D_ZRI:
3144 case AArch64::STNT1H_ZRI:
3145 case AArch64::STNT1W_ZRI:
3146 case AArch64::STPDi:
3147 case AArch64::STPQi:
3148 case AArch64::STPSi:
3149 case AArch64::STPWi:
3150 case AArch64::STPXi:
3151 case AArch64::STRBBpost:
3152 case AArch64::STRBBpre:
3153 case AArch64::STRBpost:
3154 case AArch64::STRBpre:
3155 case AArch64::STRDpost:
3156 case AArch64::STRDpre:
3157 case AArch64::STRHHpost:
3158 case AArch64::STRHHpre:
3159 case AArch64::STRHpost:
3160 case AArch64::STRHpre:
3161 case AArch64::STRQpost:
3162 case AArch64::STRQpre:
3163 case AArch64::STRSpost:
3164 case AArch64::STRSpre:
3165 case AArch64::STRWpost:
3166 case AArch64::STRWpre:
3167 case AArch64::STRXpost:
3168 case AArch64::STRXpre:
3169 case AArch64::LD1B_2Z_IMM:
3170 case AArch64::LD1B_2Z_STRIDED_IMM:
3171 case AArch64::LD1H_2Z_IMM:
3172 case AArch64::LD1H_2Z_STRIDED_IMM:
3173 case AArch64::LD1W_2Z_IMM:
3174 case AArch64::LD1W_2Z_STRIDED_IMM:
3175 case AArch64::LD1D_2Z_IMM:
3176 case AArch64::LD1D_2Z_STRIDED_IMM:
3177 case AArch64::LD1B_4Z_IMM:
3178 case AArch64::LD1B_4Z_STRIDED_IMM:
3179 case AArch64::LD1H_4Z_IMM:
3180 case AArch64::LD1H_4Z_STRIDED_IMM:
3181 case AArch64::LD1W_4Z_IMM:
3182 case AArch64::LD1W_4Z_STRIDED_IMM:
3183 case AArch64::LD1D_4Z_IMM:
3184 case AArch64::LD1D_4Z_STRIDED_IMM:
3185 case AArch64::LD1B_2Z_IMM_PSEUDO:
3186 case AArch64::LD1H_2Z_IMM_PSEUDO:
3187 case AArch64::LD1W_2Z_IMM_PSEUDO:
3188 case AArch64::LD1D_2Z_IMM_PSEUDO:
3189 case AArch64::LD1B_4Z_IMM_PSEUDO:
3190 case AArch64::LD1H_4Z_IMM_PSEUDO:
3191 case AArch64::LD1W_4Z_IMM_PSEUDO:
3192 case AArch64::LD1D_4Z_IMM_PSEUDO:
3193 case AArch64::ST1B_2Z_IMM:
3194 case AArch64::ST1B_2Z_STRIDED_IMM:
3195 case AArch64::ST1H_2Z_IMM:
3196 case AArch64::ST1H_2Z_STRIDED_IMM:
3197 case AArch64::ST1W_2Z_IMM:
3198 case AArch64::ST1W_2Z_STRIDED_IMM:
3199 case AArch64::ST1D_2Z_IMM:
3200 case AArch64::ST1D_2Z_STRIDED_IMM:
3201 case AArch64::LDNT1B_2Z_IMM_PSEUDO:
3202 case AArch64::LDNT1B_2Z_IMM:
3203 case AArch64::LDNT1B_2Z_STRIDED_IMM:
3204 case AArch64::LDNT1H_2Z_IMM_PSEUDO:
3205 case AArch64::LDNT1H_2Z_IMM:
3206 case AArch64::LDNT1H_2Z_STRIDED_IMM:
3207 case AArch64::LDNT1W_2Z_IMM_PSEUDO:
3208 case AArch64::LDNT1W_2Z_IMM:
3209 case AArch64::LDNT1W_2Z_STRIDED_IMM:
3210 case AArch64::LDNT1D_2Z_IMM_PSEUDO:
3211 case AArch64::LDNT1D_2Z_IMM:
3212 case AArch64::LDNT1D_2Z_STRIDED_IMM:
3213 case AArch64::STNT1B_2Z_IMM:
3214 case AArch64::STNT1B_2Z_STRIDED_IMM:
3215 case AArch64::STNT1H_2Z_IMM:
3216 case AArch64::STNT1H_2Z_STRIDED_IMM:
3217 case AArch64::STNT1W_2Z_IMM:
3218 case AArch64::STNT1W_2Z_STRIDED_IMM:
3219 case AArch64::STNT1D_2Z_IMM:
3220 case AArch64::STNT1D_2Z_STRIDED_IMM:
3221 case AArch64::ST1B_4Z_IMM:
3222 case AArch64::ST1B_4Z_STRIDED_IMM:
3223 case AArch64::ST1H_4Z_IMM:
3224 case AArch64::ST1H_4Z_STRIDED_IMM:
3225 case AArch64::ST1W_4Z_IMM:
3226 case AArch64::ST1W_4Z_STRIDED_IMM:
3227 case AArch64::ST1D_4Z_IMM:
3228 case AArch64::ST1D_4Z_STRIDED_IMM:
3229 case AArch64::LDNT1B_4Z_IMM_PSEUDO:
3230 case AArch64::LDNT1B_4Z_IMM:
3231 case AArch64::LDNT1B_4Z_STRIDED_IMM:
3232 case AArch64::LDNT1H_4Z_IMM_PSEUDO:
3233 case AArch64::LDNT1H_4Z_IMM:
3234 case AArch64::LDNT1H_4Z_STRIDED_IMM:
3235 case AArch64::LDNT1W_4Z_IMM_PSEUDO:
3236 case AArch64::LDNT1W_4Z_IMM:
3237 case AArch64::LDNT1W_4Z_STRIDED_IMM:
3238 case AArch64::LDNT1D_4Z_IMM_PSEUDO:
3239 case AArch64::LDNT1D_4Z_IMM:
3240 case AArch64::LDNT1D_4Z_STRIDED_IMM:
3241 case AArch64::STNT1B_4Z_IMM:
3242 case AArch64::STNT1B_4Z_STRIDED_IMM:
3243 case AArch64::STNT1H_4Z_IMM:
3244 case AArch64::STNT1H_4Z_STRIDED_IMM:
3245 case AArch64::STNT1W_4Z_IMM:
3246 case AArch64::STNT1W_4Z_STRIDED_IMM:
3247 case AArch64::STNT1D_4Z_IMM:
3248 case AArch64::STNT1D_4Z_STRIDED_IMM:
3249 return 3;
3250 case AArch64::LDPDpost:
3251 case AArch64::LDPDpre:
3252 case AArch64::LDPQpost:
3253 case AArch64::LDPQpre:
3254 case AArch64::LDPSpost:
3255 case AArch64::LDPSpre:
3256 case AArch64::LDPWpost:
3257 case AArch64::LDPWpre:
3258 case AArch64::LDPXpost:
3259 case AArch64::LDPXpre:
3260 case AArch64::STGPpre:
3261 case AArch64::STGPpost:
3262 case AArch64::STPDpost:
3263 case AArch64::STPDpre:
3264 case AArch64::STPQpost:
3265 case AArch64::STPQpre:
3266 case AArch64::STPSpost:
3267 case AArch64::STPSpre:
3268 case AArch64::STPWpost:
3269 case AArch64::STPWpre:
3270 case AArch64::STPXpost:
3271 case AArch64::STPXpre:
3272 return 4;
3273 }
3274}
3275
3277 switch (MI.getOpcode()) {
3278 default:
3279 return false;
3280 // Scaled instructions.
3281 case AArch64::STRSui:
3282 case AArch64::STRDui:
3283 case AArch64::STRQui:
3284 case AArch64::STRXui:
3285 case AArch64::STRWui:
3286 case AArch64::LDRSui:
3287 case AArch64::LDRDui:
3288 case AArch64::LDRQui:
3289 case AArch64::LDRXui:
3290 case AArch64::LDRWui:
3291 case AArch64::LDRSWui:
3292 // Unscaled instructions.
3293 case AArch64::STURSi:
3294 case AArch64::STRSpre:
3295 case AArch64::STURDi:
3296 case AArch64::STRDpre:
3297 case AArch64::STURQi:
3298 case AArch64::STRQpre:
3299 case AArch64::STURWi:
3300 case AArch64::STRWpre:
3301 case AArch64::STURXi:
3302 case AArch64::STRXpre:
3303 case AArch64::LDURSi:
3304 case AArch64::LDRSpre:
3305 case AArch64::LDURDi:
3306 case AArch64::LDRDpre:
3307 case AArch64::LDURQi:
3308 case AArch64::LDRQpre:
3309 case AArch64::LDURWi:
3310 case AArch64::LDRWpre:
3311 case AArch64::LDURXi:
3312 case AArch64::LDRXpre:
3313 case AArch64::LDURSWi:
3314 case AArch64::LDRSWpre:
3315 // SVE instructions.
3316 case AArch64::LDR_ZXI:
3317 case AArch64::STR_ZXI:
3318 return true;
3319 }
3320}
3321
3323 switch (MI.getOpcode()) {
3324 default:
3325 assert((!MI.isCall() || !MI.isReturn()) &&
3326 "Unexpected instruction - was a new tail call opcode introduced?");
3327 return false;
3328 case AArch64::TCRETURNdi:
3329 case AArch64::TCRETURNri:
3330 case AArch64::TCRETURNrix16x17:
3331 case AArch64::TCRETURNrix17:
3332 case AArch64::TCRETURNrinotx16:
3333 case AArch64::TCRETURNriALL:
3334 case AArch64::AUTH_TCRETURN:
3335 case AArch64::AUTH_TCRETURN_BTI:
3336 return true;
3337 }
3338}
3339
3341 switch (Opc) {
3342 default:
3343 llvm_unreachable("Opcode has no flag setting equivalent!");
3344 // 32-bit cases:
3345 case AArch64::ADDWri:
3346 return AArch64::ADDSWri;
3347 case AArch64::ADDWrr:
3348 return AArch64::ADDSWrr;
3349 case AArch64::ADDWrs:
3350 return AArch64::ADDSWrs;
3351 case AArch64::ADDWrx:
3352 return AArch64::ADDSWrx;
3353 case AArch64::ANDWri:
3354 return AArch64::ANDSWri;
3355 case AArch64::ANDWrr:
3356 return AArch64::ANDSWrr;
3357 case AArch64::ANDWrs:
3358 return AArch64::ANDSWrs;
3359 case AArch64::BICWrr:
3360 return AArch64::BICSWrr;
3361 case AArch64::BICWrs:
3362 return AArch64::BICSWrs;
3363 case AArch64::SUBWri:
3364 return AArch64::SUBSWri;
3365 case AArch64::SUBWrr:
3366 return AArch64::SUBSWrr;
3367 case AArch64::SUBWrs:
3368 return AArch64::SUBSWrs;
3369 case AArch64::SUBWrx:
3370 return AArch64::SUBSWrx;
3371 // 64-bit cases:
3372 case AArch64::ADDXri:
3373 return AArch64::ADDSXri;
3374 case AArch64::ADDXrr:
3375 return AArch64::ADDSXrr;
3376 case AArch64::ADDXrs:
3377 return AArch64::ADDSXrs;
3378 case AArch64::ADDXrx:
3379 return AArch64::ADDSXrx;
3380 case AArch64::ANDXri:
3381 return AArch64::ANDSXri;
3382 case AArch64::ANDXrr:
3383 return AArch64::ANDSXrr;
3384 case AArch64::ANDXrs:
3385 return AArch64::ANDSXrs;
3386 case AArch64::BICXrr:
3387 return AArch64::BICSXrr;
3388 case AArch64::BICXrs:
3389 return AArch64::BICSXrs;
3390 case AArch64::SUBXri:
3391 return AArch64::SUBSXri;
3392 case AArch64::SUBXrr:
3393 return AArch64::SUBSXrr;
3394 case AArch64::SUBXrs:
3395 return AArch64::SUBSXrs;
3396 case AArch64::SUBXrx:
3397 return AArch64::SUBSXrx;
3398 // SVE instructions:
3399 case AArch64::AND_PPzPP:
3400 return AArch64::ANDS_PPzPP;
3401 case AArch64::BIC_PPzPP:
3402 return AArch64::BICS_PPzPP;
3403 case AArch64::EOR_PPzPP:
3404 return AArch64::EORS_PPzPP;
3405 case AArch64::NAND_PPzPP:
3406 return AArch64::NANDS_PPzPP;
3407 case AArch64::NOR_PPzPP:
3408 return AArch64::NORS_PPzPP;
3409 case AArch64::ORN_PPzPP:
3410 return AArch64::ORNS_PPzPP;
3411 case AArch64::ORR_PPzPP:
3412 return AArch64::ORRS_PPzPP;
3413 case AArch64::BRKA_PPzP:
3414 return AArch64::BRKAS_PPzP;
3415 case AArch64::BRKPA_PPzPP:
3416 return AArch64::BRKPAS_PPzPP;
3417 case AArch64::BRKB_PPzP:
3418 return AArch64::BRKBS_PPzP;
3419 case AArch64::BRKPB_PPzPP:
3420 return AArch64::BRKPBS_PPzPP;
3421 case AArch64::BRKN_PPzP:
3422 return AArch64::BRKNS_PPzP;
3423 case AArch64::RDFFR_PPz:
3424 return AArch64::RDFFRS_PPz;
3425 case AArch64::PTRUE_B:
3426 return AArch64::PTRUES_B;
3427 }
3428}
3429
3430// Is this a candidate for ld/st merging or pairing? For example, we don't
3431// touch volatiles or load/stores that have a hint to avoid pair formation.
3433
3434 bool IsPreLdSt = isPreLdSt(MI);
3435
3436 // If this is a volatile load/store, don't mess with it.
3437 if (MI.hasOrderedMemoryRef())
3438 return false;
3439
3440 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
3441 // For Pre-inc LD/ST, the operand is shifted by one.
3442 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
3443 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
3444 "Expected a reg or frame index operand.");
3445
3446 // For Pre-indexed addressing quadword instructions, the third operand is the
3447 // immediate value.
3448 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
3449
3450 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
3451 return false;
3452
3453 // Can't merge/pair if the instruction modifies the base register.
3454 // e.g., ldr x0, [x0]
3455 // This case will never occur with an FI base.
3456 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
3457 // STR<S,D,Q,W,X>pre, it can be merged.
3458 // For example:
3459 // ldr q0, [x11, #32]!
3460 // ldr q1, [x11, #16]
3461 // to
3462 // ldp q0, q1, [x11, #32]!
3463 if (MI.getOperand(1).isReg() && !IsPreLdSt) {
3464 Register BaseReg = MI.getOperand(1).getReg();
3466 if (MI.modifiesRegister(BaseReg, TRI))
3467 return false;
3468 }
3469
3470 // Pairing SVE fills/spills is only valid for little-endian targets that
3471 // implement VLS 128.
3472 switch (MI.getOpcode()) {
3473 default:
3474 break;
3475 case AArch64::LDR_ZXI:
3476 case AArch64::STR_ZXI:
3477 if (!Subtarget.isLittleEndian() ||
3478 Subtarget.getSVEVectorSizeInBits() != 128)
3479 return false;
3480 }
3481
3482 // Check if this load/store has a hint to avoid pair formation.
3483 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
3485 return false;
3486
3487 // Do not pair any callee-save store/reload instructions in the
3488 // prologue/epilogue if the CFI information encoded the operations as separate
3489 // instructions, as that will cause the size of the actual prologue to mismatch
3490 // with the prologue size recorded in the Windows CFI.
3491 const MCAsmInfo &MAI = MI.getMF()->getTarget().getMCAsmInfo();
3492 bool NeedsWinCFI =
3493 MAI.usesWindowsCFI() && MI.getMF()->getFunction().needsUnwindTableEntry();
3494 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
3496 return false;
3497
3498 // On some CPUs quad load/store pairs are slower than two single load/stores.
3499 if (Subtarget.isPaired128Slow()) {
3500 switch (MI.getOpcode()) {
3501 default:
3502 break;
3503 case AArch64::LDURQi:
3504 case AArch64::STURQi:
3505 case AArch64::LDRQui:
3506 case AArch64::STRQui:
3507 return false;
3508 }
3509 }
3510
3511 return true;
3512}
3513
3516 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
3517 const TargetRegisterInfo *TRI) const {
3518 if (!LdSt.mayLoadOrStore())
3519 return false;
3520
3521 const MachineOperand *BaseOp;
3522 TypeSize WidthN(0, false);
3523 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
3524 WidthN, TRI))
3525 return false;
3526 // The maximum vscale is 16 under AArch64, return the maximal extent for the
3527 // vector.
3528 Width = LocationSize::precise(WidthN);
3529 BaseOps.push_back(BaseOp);
3530 return true;
3531}
3532
3533std::optional<ExtAddrMode>
3535 const TargetRegisterInfo *TRI) const {
3536 const MachineOperand *Base; // Filled with the base operand of MI.
3537 int64_t Offset; // Filled with the offset of MI.
3538 bool OffsetIsScalable;
3539 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
3540 return std::nullopt;
3541
3542 if (!Base->isReg())
3543 return std::nullopt;
3544 ExtAddrMode AM;
3545 AM.BaseReg = Base->getReg();
3546 AM.Displacement = Offset;
3547 AM.ScaledReg = 0;
3548 AM.Scale = 0;
3549 return AM;
3550}
3551
3553 Register Reg,
3554 const MachineInstr &AddrI,
3555 ExtAddrMode &AM) const {
3556 // Filter out instructions into which we cannot fold.
3557 unsigned NumBytes;
3558 int64_t OffsetScale = 1;
3559 switch (MemI.getOpcode()) {
3560 default:
3561 return false;
3562
3563 case AArch64::LDURQi:
3564 case AArch64::STURQi:
3565 NumBytes = 16;
3566 break;
3567
3568 case AArch64::LDURDi:
3569 case AArch64::STURDi:
3570 case AArch64::LDURXi:
3571 case AArch64::STURXi:
3572 NumBytes = 8;
3573 break;
3574
3575 case AArch64::LDURWi:
3576 case AArch64::LDURSWi:
3577 case AArch64::STURWi:
3578 NumBytes = 4;
3579 break;
3580
3581 case AArch64::LDURHi:
3582 case AArch64::STURHi:
3583 case AArch64::LDURHHi:
3584 case AArch64::STURHHi:
3585 case AArch64::LDURSHXi:
3586 case AArch64::LDURSHWi:
3587 NumBytes = 2;
3588 break;
3589
3590 case AArch64::LDRBroX:
3591 case AArch64::LDRBBroX:
3592 case AArch64::LDRSBXroX:
3593 case AArch64::LDRSBWroX:
3594 case AArch64::STRBroX:
3595 case AArch64::STRBBroX:
3596 case AArch64::LDURBi:
3597 case AArch64::LDURBBi:
3598 case AArch64::LDURSBXi:
3599 case AArch64::LDURSBWi:
3600 case AArch64::STURBi:
3601 case AArch64::STURBBi:
3602 case AArch64::LDRBui:
3603 case AArch64::LDRBBui:
3604 case AArch64::LDRSBXui:
3605 case AArch64::LDRSBWui:
3606 case AArch64::STRBui:
3607 case AArch64::STRBBui:
3608 NumBytes = 1;
3609 break;
3610
3611 case AArch64::LDRQroX:
3612 case AArch64::STRQroX:
3613 case AArch64::LDRQui:
3614 case AArch64::STRQui:
3615 NumBytes = 16;
3616 OffsetScale = 16;
3617 break;
3618
3619 case AArch64::LDRDroX:
3620 case AArch64::STRDroX:
3621 case AArch64::LDRXroX:
3622 case AArch64::STRXroX:
3623 case AArch64::LDRDui:
3624 case AArch64::STRDui:
3625 case AArch64::LDRXui:
3626 case AArch64::STRXui:
3627 NumBytes = 8;
3628 OffsetScale = 8;
3629 break;
3630
3631 case AArch64::LDRWroX:
3632 case AArch64::LDRSWroX:
3633 case AArch64::STRWroX:
3634 case AArch64::LDRWui:
3635 case AArch64::LDRSWui:
3636 case AArch64::STRWui:
3637 NumBytes = 4;
3638 OffsetScale = 4;
3639 break;
3640
3641 case AArch64::LDRHroX:
3642 case AArch64::STRHroX:
3643 case AArch64::LDRHHroX:
3644 case AArch64::STRHHroX:
3645 case AArch64::LDRSHXroX:
3646 case AArch64::LDRSHWroX:
3647 case AArch64::LDRHui:
3648 case AArch64::STRHui:
3649 case AArch64::LDRHHui:
3650 case AArch64::STRHHui:
3651 case AArch64::LDRSHXui:
3652 case AArch64::LDRSHWui:
3653 NumBytes = 2;
3654 OffsetScale = 2;
3655 break;
3656 }
3657
3658 // Check the fold operand is not the loaded/stored value.
3659 const MachineOperand &BaseRegOp = MemI.getOperand(0);
3660 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
3661 return false;
3662
3663 // Handle memory instructions with a [Reg, Reg] addressing mode.
3664 if (MemI.getOperand(2).isReg()) {
3665 // Bail if the addressing mode already includes extension of the offset
3666 // register.
3667 if (MemI.getOperand(3).getImm())
3668 return false;
3669
3670 // Check if we actually have a scaled offset.
3671 if (MemI.getOperand(4).getImm() == 0)
3672 OffsetScale = 1;
3673
3674 // If the address instruction is folded into the base register, then the
3675 // addressing mode must not have a scale, so that we can swap the base and
3676 // the scaled registers.
3677 if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
3678 return false;
3679
3680 switch (AddrI.getOpcode()) {
3681 default:
3682 return false;
3683
3684 case AArch64::SBFMXri:
3685 // sxtw Xa, Wm
3686 // ldr Xd, [Xn, Xa, lsl #N]
3687 // ->
3688 // ldr Xd, [Xn, Wm, sxtw #N]
3689 if (AddrI.getOperand(2).getImm() != 0 ||
3690 AddrI.getOperand(3).getImm() != 31)
3691 return false;
3692
3693 AM.BaseReg = MemI.getOperand(1).getReg();
3694 if (AM.BaseReg == Reg)
3695 AM.BaseReg = MemI.getOperand(2).getReg();
3696 AM.ScaledReg = AddrI.getOperand(1).getReg();
3697 AM.Scale = OffsetScale;
3698 AM.Displacement = 0;
3700 return true;
3701
3702 case TargetOpcode::SUBREG_TO_REG: {
3703 // mov Wa, Wm
3704 // ldr Xd, [Xn, Xa, lsl #N]
3705 // ->
3706 // ldr Xd, [Xn, Wm, uxtw #N]
3707
3708 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
3709 if (AddrI.getOperand(2).getImm() != AArch64::sub_32)
3710 return false;
3711
3712 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
3713 Register OffsetReg = AddrI.getOperand(1).getReg();
3714 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))
3715 return false;
3716
3717 const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);
3718 if (DefMI.getOpcode() != AArch64::ORRWrs ||
3719 DefMI.getOperand(1).getReg() != AArch64::WZR ||
3720 DefMI.getOperand(3).getImm() != 0)
3721 return false;
3722
3723 AM.BaseReg = MemI.getOperand(1).getReg();
3724 if (AM.BaseReg == Reg)
3725 AM.BaseReg = MemI.getOperand(2).getReg();
3726 AM.ScaledReg = DefMI.getOperand(2).getReg();
3727 AM.Scale = OffsetScale;
3728 AM.Displacement = 0;
3730 return true;
3731 }
3732 }
3733 }
3734
3735 // Handle memory instructions with a [Reg, #Imm] addressing mode.
3736
3737 // Check we are not breaking a potential conversion to an LDP.
3738 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
3739 int64_t NewOffset) -> bool {
3740 int64_t MinOffset, MaxOffset;
3741 switch (NumBytes) {
3742 default:
3743 return true;
3744 case 4:
3745 MinOffset = -256;
3746 MaxOffset = 252;
3747 break;
3748 case 8:
3749 MinOffset = -512;
3750 MaxOffset = 504;
3751 break;
3752 case 16:
3753 MinOffset = -1024;
3754 MaxOffset = 1008;
3755 break;
3756 }
3757 return OldOffset < MinOffset || OldOffset > MaxOffset ||
3758 (NewOffset >= MinOffset && NewOffset <= MaxOffset);
3759 };
3760 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
3761 int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
3762 int64_t NewOffset = OldOffset + Disp;
3763 if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
3764 return false;
3765 // If the old offset would fit into an LDP, but the new offset wouldn't,
3766 // bail out.
3767 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
3768 return false;
3769 AM.BaseReg = AddrI.getOperand(1).getReg();
3770 AM.ScaledReg = 0;
3771 AM.Scale = 0;
3772 AM.Displacement = NewOffset;
3774 return true;
3775 };
3776
3777 auto canFoldAddRegIntoAddrMode =
3778 [&](int64_t Scale,
3780 if (MemI.getOperand(2).getImm() != 0)
3781 return false;
3782 if ((unsigned)Scale != Scale)
3783 return false;
3784 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
3785 return false;
3786 AM.BaseReg = AddrI.getOperand(1).getReg();
3787 AM.ScaledReg = AddrI.getOperand(2).getReg();
3788 AM.Scale = Scale;
3789 AM.Displacement = 0;
3790 AM.Form = Form;
3791 return true;
3792 };
3793
3794 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
3795 unsigned Opcode = MemI.getOpcode();
3796 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
3797 Subtarget.isSTRQroSlow();
3798 };
3799
3800 int64_t Disp = 0;
3801 const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
3802 switch (AddrI.getOpcode()) {
3803 default:
3804 return false;
3805
3806 case AArch64::ADDXri:
3807 // add Xa, Xn, #N
3808 // ldr Xd, [Xa, #M]
3809 // ->
3810 // ldr Xd, [Xn, #N'+M]
3811 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3812 return canFoldAddSubImmIntoAddrMode(Disp);
3813
3814 case AArch64::SUBXri:
3815 // sub Xa, Xn, #N
3816 // ldr Xd, [Xa, #M]
3817 // ->
3818 // ldr Xd, [Xn, #N'+M]
3819 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3820 return canFoldAddSubImmIntoAddrMode(-Disp);
3821
3822 case AArch64::ADDXrs: {
3823 // add Xa, Xn, Xm, lsl #N
3824 // ldr Xd, [Xa]
3825 // ->
3826 // ldr Xd, [Xn, Xm, lsl #N]
3827
3828 // Don't fold the add if the result would be slower, unless optimising for
3829 // size.
3830 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3832 return false;
3833 Shift = AArch64_AM::getShiftValue(Shift);
3834 if (!OptSize) {
3835 if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3836 return false;
3837 if (avoidSlowSTRQ(MemI))
3838 return false;
3839 }
3840 return canFoldAddRegIntoAddrMode(1ULL << Shift);
3841 }
3842
3843 case AArch64::ADDXrr:
3844 // add Xa, Xn, Xm
3845 // ldr Xd, [Xa]
3846 // ->
3847 // ldr Xd, [Xn, Xm, lsl #0]
3848
3849 // Don't fold the add if the result would be slower, unless optimising for
3850 // size.
3851 if (!OptSize && avoidSlowSTRQ(MemI))
3852 return false;
3853 return canFoldAddRegIntoAddrMode(1);
3854
3855 case AArch64::ADDXrx:
3856 // add Xa, Xn, Wm, {s,u}xtw #N
3857 // ldr Xd, [Xa]
3858 // ->
3859 // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3860
3861 // Don't fold the add if the result would be slower, unless optimising for
3862 // size.
3863 if (!OptSize && avoidSlowSTRQ(MemI))
3864 return false;
3865
3866 // Can fold only sign-/zero-extend of a word.
3867 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3869 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3870 return false;
3871
3872 return canFoldAddRegIntoAddrMode(
3873 1ULL << AArch64_AM::getArithShiftValue(Imm),
3876 }
3877}
3878
3879// Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3880// return the opcode of an instruction performing the same operation, but using
3881// the [Reg, Reg] addressing mode.
3882static unsigned regOffsetOpcode(unsigned Opcode) {
3883 switch (Opcode) {
3884 default:
3885 llvm_unreachable("Address folding not implemented for instruction");
3886
3887 case AArch64::LDURQi:
3888 case AArch64::LDRQui:
3889 return AArch64::LDRQroX;
3890 case AArch64::STURQi:
3891 case AArch64::STRQui:
3892 return AArch64::STRQroX;
3893 case AArch64::LDURDi:
3894 case AArch64::LDRDui:
3895 return AArch64::LDRDroX;
3896 case AArch64::STURDi:
3897 case AArch64::STRDui:
3898 return AArch64::STRDroX;
3899 case AArch64::LDURXi:
3900 case AArch64::LDRXui:
3901 return AArch64::LDRXroX;
3902 case AArch64::STURXi:
3903 case AArch64::STRXui:
3904 return AArch64::STRXroX;
3905 case AArch64::LDURWi:
3906 case AArch64::LDRWui:
3907 return AArch64::LDRWroX;
3908 case AArch64::LDURSWi:
3909 case AArch64::LDRSWui:
3910 return AArch64::LDRSWroX;
3911 case AArch64::STURWi:
3912 case AArch64::STRWui:
3913 return AArch64::STRWroX;
3914 case AArch64::LDURHi:
3915 case AArch64::LDRHui:
3916 return AArch64::LDRHroX;
3917 case AArch64::STURHi:
3918 case AArch64::STRHui:
3919 return AArch64::STRHroX;
3920 case AArch64::LDURHHi:
3921 case AArch64::LDRHHui:
3922 return AArch64::LDRHHroX;
3923 case AArch64::STURHHi:
3924 case AArch64::STRHHui:
3925 return AArch64::STRHHroX;
3926 case AArch64::LDURSHXi:
3927 case AArch64::LDRSHXui:
3928 return AArch64::LDRSHXroX;
3929 case AArch64::LDURSHWi:
3930 case AArch64::LDRSHWui:
3931 return AArch64::LDRSHWroX;
3932 case AArch64::LDURBi:
3933 case AArch64::LDRBui:
3934 return AArch64::LDRBroX;
3935 case AArch64::LDURBBi:
3936 case AArch64::LDRBBui:
3937 return AArch64::LDRBBroX;
3938 case AArch64::LDURSBXi:
3939 case AArch64::LDRSBXui:
3940 return AArch64::LDRSBXroX;
3941 case AArch64::LDURSBWi:
3942 case AArch64::LDRSBWui:
3943 return AArch64::LDRSBWroX;
3944 case AArch64::STURBi:
3945 case AArch64::STRBui:
3946 return AArch64::STRBroX;
3947 case AArch64::STURBBi:
3948 case AArch64::STRBBui:
3949 return AArch64::STRBBroX;
3950 }
3951}
3952
3953// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3954// the opcode of an instruction performing the same operation, but using the
3955// [Reg, #Imm] addressing mode with scaled offset.
3956unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3957 switch (Opcode) {
3958 default:
3959 llvm_unreachable("Address folding not implemented for instruction");
3960
3961 case AArch64::LDURQi:
3962 Scale = 16;
3963 return AArch64::LDRQui;
3964 case AArch64::STURQi:
3965 Scale = 16;
3966 return AArch64::STRQui;
3967 case AArch64::LDURDi:
3968 Scale = 8;
3969 return AArch64::LDRDui;
3970 case AArch64::STURDi:
3971 Scale = 8;
3972 return AArch64::STRDui;
3973 case AArch64::LDURXi:
3974 Scale = 8;
3975 return AArch64::LDRXui;
3976 case AArch64::STURXi:
3977 Scale = 8;
3978 return AArch64::STRXui;
3979 case AArch64::LDURWi:
3980 Scale = 4;
3981 return AArch64::LDRWui;
3982 case AArch64::LDURSWi:
3983 Scale = 4;
3984 return AArch64::LDRSWui;
3985 case AArch64::STURWi:
3986 Scale = 4;
3987 return AArch64::STRWui;
3988 case AArch64::LDURHi:
3989 Scale = 2;
3990 return AArch64::LDRHui;
3991 case AArch64::STURHi:
3992 Scale = 2;
3993 return AArch64::STRHui;
3994 case AArch64::LDURHHi:
3995 Scale = 2;
3996 return AArch64::LDRHHui;
3997 case AArch64::STURHHi:
3998 Scale = 2;
3999 return AArch64::STRHHui;
4000 case AArch64::LDURSHXi:
4001 Scale = 2;
4002 return AArch64::LDRSHXui;
4003 case AArch64::LDURSHWi:
4004 Scale = 2;
4005 return AArch64::LDRSHWui;
4006 case AArch64::LDURBi:
4007 Scale = 1;
4008 return AArch64::LDRBui;
4009 case AArch64::LDURBBi:
4010 Scale = 1;
4011 return AArch64::LDRBBui;
4012 case AArch64::LDURSBXi:
4013 Scale = 1;
4014 return AArch64::LDRSBXui;
4015 case AArch64::LDURSBWi:
4016 Scale = 1;
4017 return AArch64::LDRSBWui;
4018 case AArch64::STURBi:
4019 Scale = 1;
4020 return AArch64::STRBui;
4021 case AArch64::STURBBi:
4022 Scale = 1;
4023 return AArch64::STRBBui;
4024 case AArch64::LDRQui:
4025 case AArch64::STRQui:
4026 Scale = 16;
4027 return Opcode;
4028 case AArch64::LDRDui:
4029 case AArch64::STRDui:
4030 case AArch64::LDRXui:
4031 case AArch64::STRXui:
4032 Scale = 8;
4033 return Opcode;
4034 case AArch64::LDRWui:
4035 case AArch64::LDRSWui:
4036 case AArch64::STRWui:
4037 Scale = 4;
4038 return Opcode;
4039 case AArch64::LDRHui:
4040 case AArch64::STRHui:
4041 case AArch64::LDRHHui:
4042 case AArch64::STRHHui:
4043 case AArch64::LDRSHXui:
4044 case AArch64::LDRSHWui:
4045 Scale = 2;
4046 return Opcode;
4047 case AArch64::LDRBui:
4048 case AArch64::LDRBBui:
4049 case AArch64::LDRSBXui:
4050 case AArch64::LDRSBWui:
4051 case AArch64::STRBui:
4052 case AArch64::STRBBui:
4053 Scale = 1;
4054 return Opcode;
4055 }
4056}
4057
4058// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
4059// the opcode of an instruction performing the same operation, but using the
4060// [Reg, #Imm] addressing mode with unscaled offset.
4061unsigned unscaledOffsetOpcode(unsigned Opcode) {
4062 switch (Opcode) {
4063 default:
4064 llvm_unreachable("Address folding not implemented for instruction");
4065
4066 case AArch64::LDURQi:
4067 case AArch64::STURQi:
4068 case AArch64::LDURDi:
4069 case AArch64::STURDi:
4070 case AArch64::LDURXi:
4071 case AArch64::STURXi:
4072 case AArch64::LDURWi:
4073 case AArch64::LDURSWi:
4074 case AArch64::STURWi:
4075 case AArch64::LDURHi:
4076 case AArch64::STURHi:
4077 case AArch64::LDURHHi:
4078 case AArch64::STURHHi:
4079 case AArch64::LDURSHXi:
4080 case AArch64::LDURSHWi:
4081 case AArch64::LDURBi:
4082 case AArch64::STURBi:
4083 case AArch64::LDURBBi:
4084 case AArch64::STURBBi:
4085 case AArch64::LDURSBWi:
4086 case AArch64::LDURSBXi:
4087 return Opcode;
4088 case AArch64::LDRQui:
4089 return AArch64::LDURQi;
4090 case AArch64::STRQui:
4091 return AArch64::STURQi;
4092 case AArch64::LDRDui:
4093 return AArch64::LDURDi;
4094 case AArch64::STRDui:
4095 return AArch64::STURDi;
4096 case AArch64::LDRXui:
4097 return AArch64::LDURXi;
4098 case AArch64::STRXui:
4099 return AArch64::STURXi;
4100 case AArch64::LDRWui:
4101 return AArch64::LDURWi;
4102 case AArch64::LDRSWui:
4103 return AArch64::LDURSWi;
4104 case AArch64::STRWui:
4105 return AArch64::STURWi;
4106 case AArch64::LDRHui:
4107 return AArch64::LDURHi;
4108 case AArch64::STRHui:
4109 return AArch64::STURHi;
4110 case AArch64::LDRHHui:
4111 return AArch64::LDURHHi;
4112 case AArch64::STRHHui:
4113 return AArch64::STURHHi;
4114 case AArch64::LDRSHXui:
4115 return AArch64::LDURSHXi;
4116 case AArch64::LDRSHWui:
4117 return AArch64::LDURSHWi;
4118 case AArch64::LDRBBui:
4119 return AArch64::LDURBBi;
4120 case AArch64::LDRBui:
4121 return AArch64::LDURBi;
4122 case AArch64::STRBBui:
4123 return AArch64::STURBBi;
4124 case AArch64::STRBui:
4125 return AArch64::STURBi;
4126 case AArch64::LDRSBWui:
4127 return AArch64::LDURSBWi;
4128 case AArch64::LDRSBXui:
4129 return AArch64::LDURSBXi;
4130 }
4131}
4132
4133// Given the opcode of a memory load/store instruction, return the opcode of an
4134// instruction performing the same operation, but using
4135// the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
4136// offset register.
4137static unsigned offsetExtendOpcode(unsigned Opcode) {
4138 switch (Opcode) {
4139 default:
4140 llvm_unreachable("Address folding not implemented for instruction");
4141
4142 case AArch64::LDRQroX:
4143 case AArch64::LDURQi:
4144 case AArch64::LDRQui:
4145 return AArch64::LDRQroW;
4146 case AArch64::STRQroX:
4147 case AArch64::STURQi:
4148 case AArch64::STRQui:
4149 return AArch64::STRQroW;
4150 case AArch64::LDRDroX:
4151 case AArch64::LDURDi:
4152 case AArch64::LDRDui:
4153 return AArch64::LDRDroW;
4154 case AArch64::STRDroX:
4155 case AArch64::STURDi:
4156 case AArch64::STRDui:
4157 return AArch64::STRDroW;
4158 case AArch64::LDRXroX:
4159 case AArch64::LDURXi:
4160 case AArch64::LDRXui:
4161 return AArch64::LDRXroW;
4162 case AArch64::STRXroX:
4163 case AArch64::STURXi:
4164 case AArch64::STRXui:
4165 return AArch64::STRXroW;
4166 case AArch64::LDRWroX:
4167 case AArch64::LDURWi:
4168 case AArch64::LDRWui:
4169 return AArch64::LDRWroW;
4170 case AArch64::LDRSWroX:
4171 case AArch64::LDURSWi:
4172 case AArch64::LDRSWui:
4173 return AArch64::LDRSWroW;
4174 case AArch64::STRWroX:
4175 case AArch64::STURWi:
4176 case AArch64::STRWui:
4177 return AArch64::STRWroW;
4178 case AArch64::LDRHroX:
4179 case AArch64::LDURHi:
4180 case AArch64::LDRHui:
4181 return AArch64::LDRHroW;
4182 case AArch64::STRHroX:
4183 case AArch64::STURHi:
4184 case AArch64::STRHui:
4185 return AArch64::STRHroW;
4186 case AArch64::LDRHHroX:
4187 case AArch64::LDURHHi:
4188 case AArch64::LDRHHui:
4189 return AArch64::LDRHHroW;
4190 case AArch64::STRHHroX:
4191 case AArch64::STURHHi:
4192 case AArch64::STRHHui:
4193 return AArch64::STRHHroW;
4194 case AArch64::LDRSHXroX:
4195 case AArch64::LDURSHXi:
4196 case AArch64::LDRSHXui:
4197 return AArch64::LDRSHXroW;
4198 case AArch64::LDRSHWroX:
4199 case AArch64::LDURSHWi:
4200 case AArch64::LDRSHWui:
4201 return AArch64::LDRSHWroW;
4202 case AArch64::LDRBroX:
4203 case AArch64::LDURBi:
4204 case AArch64::LDRBui:
4205 return AArch64::LDRBroW;
4206 case AArch64::LDRBBroX:
4207 case AArch64::LDURBBi:
4208 case AArch64::LDRBBui:
4209 return AArch64::LDRBBroW;
4210 case AArch64::LDRSBXroX:
4211 case AArch64::LDURSBXi:
4212 case AArch64::LDRSBXui:
4213 return AArch64::LDRSBXroW;
4214 case AArch64::LDRSBWroX:
4215 case AArch64::LDURSBWi:
4216 case AArch64::LDRSBWui:
4217 return AArch64::LDRSBWroW;
4218 case AArch64::STRBroX:
4219 case AArch64::STURBi:
4220 case AArch64::STRBui:
4221 return AArch64::STRBroW;
4222 case AArch64::STRBBroX:
4223 case AArch64::STURBBi:
4224 case AArch64::STRBBui:
4225 return AArch64::STRBBroW;
4226 }
4227}
4228
4230 const ExtAddrMode &AM) const {
// NOTE(review): this listing is missing the line that opens the definition
// (return type, function name and the declaration of `MemI`) as well as a few
// interior lines: the openers matching the two otherwise unbalanced `}` below,
// one builder call between `.addReg(OffsetReg)` and `.addImm(AM.Scale != 1)`,
// and the call that the trailing string literal belongs to. Restore them from
// the pristine file before building; the notes below describe only what is
// visible here.
//
// Visible behaviour: builds a replacement load/store for MemI from the
// addressing-mode description AM, using BuildMI at MemI's position in MemI's
// parent block, and returns the newly built instruction. The replacement
// reuses MemI's operand 0, memory operands and MI flags.
4231
4232 const DebugLoc &DL = MemI.getDebugLoc();
4233 MachineBasicBlock &MBB = *MemI.getParent();
4234 MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
4235
// Register-offset case: AM names an offset register.
4237 if (AM.ScaledReg) {
4238 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
4239 unsigned Opcode = regOffsetOpcode(MemI.getOpcode());
4240 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
// Operand 0 of MemI is carried over and flagged as a def only when MemI may
// load.
4241 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
4242 .addReg(MemI.getOperand(0).getReg(),
4243 getDefRegState(MemI.mayLoad()))
4244 .addReg(AM.BaseReg)
4245 .addReg(AM.ScaledReg)
// NOTE(review): presumably the two immediates are the "extend is signed" and
// "apply shift" flags of the register-offset form -- confirm against the
// instruction definition.
4246 .addImm(0)
4247 .addImm(AM.Scale > 1)
4248 .setMemRefs(MemI.memoperands())
4249 .setMIFlags(MemI.getFlags());
4250 return B.getInstr();
4251 }
4252
// Immediate-offset case: no offset register and no scale are permitted here.
4253 assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
4254 "Addressing mode not supported for folding");
4255
4256 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
// A displacement that fits a signed 9-bit value selects the unscaled opcode
// and keeps Scale at 1; otherwise the scaled opcode is selected and
// scaledOffsetOpcode overwrites Scale, by which the displacement is divided
// below.
4257 unsigned Scale = 1;
4258 unsigned Opcode = MemI.getOpcode();
4259 if (isInt<9>(AM.Displacement))
4260 Opcode = unscaledOffsetOpcode(Opcode);
4261 else
4262 Opcode = scaledOffsetOpcode(Opcode, Scale);
4263
4264 auto B =
4265 BuildMI(MBB, MemI, DL, get(Opcode))
4266 .addReg(MemI.getOperand(0).getReg(), getDefRegState(MemI.mayLoad()))
4267 .addReg(AM.BaseReg)
4268 .addImm(AM.Displacement / Scale)
4269 .setMemRefs(MemI.memoperands())
4270 .setMIFlags(MemI.getFlags());
4271 return B.getInstr();
4272 }
4273
// Extended-register-offset case (the guarding condition is one of the lines
// missing from this listing).
4276 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
4277 assert(AM.ScaledReg && !AM.Displacement &&
4278 "Address offset can be a register or an immediate, but not both");
4279 unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
4280 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
4281 // Make sure the offset register is in the correct register class.
// When the offset register's class has GPR64 as a super-class (or is GPR64),
// a COPY of its sub_32 sub-register into a fresh GPR32 virtual register is
// emitted and that copy is used as the offset operand instead.
4282 Register OffsetReg = AM.ScaledReg;
4283 const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);
4284 if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
4285 OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
4286 BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
4287 .addReg(AM.ScaledReg, {}, AArch64::sub_32);
4288 }
4289 auto B =
4290 BuildMI(MBB, MemI, DL, get(Opcode))
4291 .addReg(MemI.getOperand(0).getReg(), getDefRegState(MemI.mayLoad()))
4292 .addReg(AM.BaseReg)
4293 .addReg(OffsetReg)
// NOTE(review): one builder call is missing from the listing at this point.
4295 .addImm(AM.Scale != 1)
4296 .setMemRefs(MemI.memoperands())
4297 .setMIFlags(MemI.getFlags());
4298
4299 return B.getInstr();
4300 }
4301
// NOTE(review): the call that takes the message below is missing from the
// listing; only its string argument and closing `);` remain.
4303 "Function must not be called with an addressing mode it can't handle");
4304}
4305
4306/// Return true if the opcode is a post-index ld/st instruction, which really
4307/// loads from base+0.
4308static bool isPostIndexLdStOpcode(unsigned Opcode) {
4309 switch (Opcode) {
4310 default:
4311 return false;
4312 case AArch64::LD1Fourv16b_POST:
4313 case AArch64::LD1Fourv1d_POST:
4314 case AArch64::LD1Fourv2d_POST:
4315 case AArch64::LD1Fourv2s_POST:
4316 case AArch64::LD1Fourv4h_POST:
4317 case AArch64::LD1Fourv4s_POST:
4318 case AArch64::LD1Fourv8b_POST:
4319 case AArch64::LD1Fourv8h_POST:
4320 case AArch64::LD1Onev16b_POST:
4321 case AArch64::LD1Onev1d_POST:
4322 case AArch64::LD1Onev2d_POST:
4323 case AArch64::LD1Onev2s_POST:
4324 case AArch64::LD1Onev4h_POST:
4325 case AArch64::LD1Onev4s_POST:
4326 case AArch64::LD1Onev8b_POST:
4327 case AArch64::LD1Onev8h_POST:
4328 case AArch64::LD1Rv16b_POST:
4329 case AArch64::LD1Rv1d_POST:
4330 case AArch64::LD1Rv2d_POST:
4331 case AArch64::LD1Rv2s_POST:
4332 case AArch64::LD1Rv4h_POST:
4333 case AArch64::LD1Rv4s_POST:
4334 case AArch64::LD1Rv8b_POST:
4335 case AArch64::LD1Rv8h_POST:
4336 case AArch64::LD1Threev16b_POST:
4337 case AArch64::LD1Threev1d_POST:
4338 case AArch64::LD1Threev2d_POST:
4339 case AArch64::LD1Threev2s_POST:
4340 case AArch64::LD1Threev4h_POST:
4341 case AArch64::LD1Threev4s_POST:
4342 case AArch64::LD1Threev8b_POST:
4343 case AArch64::LD1Threev8h_POST:
4344 case AArch64::LD1Twov16b_POST:
4345 case AArch64::LD1Twov1d_POST:
4346 case AArch64::LD1Twov2d_POST:
4347 case AArch64::LD1Twov2s_POST:
4348 case AArch64::LD1Twov4h_POST:
4349 case AArch64::LD1Twov4s_POST:
4350 case AArch64::LD1Twov8b_POST:
4351 case AArch64::LD1Twov8h_POST:
4352 case AArch64::LD1i16_POST:
4353 case AArch64::LD1i32_POST:
4354 case AArch64::LD1i64_POST:
4355 case AArch64::LD1i8_POST:
4356 case AArch64::LD2Rv16b_POST:
4357 case AArch64::LD2Rv1d_POST:
4358 case AArch64::LD2Rv2d_POST:
4359 case AArch64::LD2Rv2s_POST:
4360 case AArch64::LD2Rv4h_POST:
4361 case AArch64::LD2Rv4s_POST:
4362 case AArch64::LD2Rv8b_POST:
4363 case AArch64::LD2Rv8h_POST:
4364 case AArch64::LD2Twov16b_POST:
4365 case AArch64::LD2Twov2d_POST:
4366 case AArch64::LD2Twov2s_POST:
4367 case AArch64::LD2Twov4h_POST:
4368 case AArch64::LD2Twov4s_POST:
4369 case AArch64::LD2Twov8b_POST:
4370 case AArch64::LD2Twov8h_POST:
4371 case AArch64::LD2i16_POST:
4372 case AArch64::LD2i32_POST:
4373 case AArch64::LD2i64_POST:
4374 case AArch64::LD2i8_POST:
4375 case AArch64::LD3Rv16b_POST:
4376 case AArch64::LD3Rv1d_POST:
4377 case AArch64::LD3Rv2d_POST:
4378 case AArch64::LD3Rv2s_POST:
4379 case AArch64::LD3Rv4h_POST:
4380 case AArch64::LD3Rv4s_POST:
4381 case AArch64::LD3Rv8b_POST:
4382 case AArch64::LD3Rv8h_POST:
4383 case AArch64::LD3Threev16b_POST:
4384 case AArch64::LD3Threev2d_POST:
4385 case AArch64::LD3Threev2s_POST:
4386 case AArch64::LD3Threev4h_POST:
4387 case AArch64::LD3Threev4s_POST:
4388 case AArch64::LD3Threev8b_POST:
4389 case AArch64::LD3Threev8h_POST:
4390 case AArch64::LD3i16_POST:
4391 case AArch64::LD3i32_POST:
4392 case AArch64::LD3i64_POST:
4393 case AArch64::LD3i8_POST:
4394 case AArch64::LD4Fourv16b_POST:
4395 case AArch64::LD4Fourv2d_POST:
4396 case AArch64::LD4Fourv2s_POST:
4397 case AArch64::LD4Fourv4h_POST:
4398 case AArch64::LD4Fourv4s_POST:
4399 case AArch64::LD4Fourv8b_POST:
4400 case AArch64::LD4Fourv8h_POST:
4401 case AArch64::LD4Rv16b_POST:
4402 case AArch64::LD4Rv1d_POST:
4403 case AArch64::LD4Rv2d_POST:
4404 case AArch64::LD4Rv2s_POST:
4405 case AArch64::LD4Rv4h_POST:
4406 case AArch64::LD4Rv4s_POST:
4407 case AArch64::LD4Rv8b_POST:
4408 case AArch64::LD4Rv8h_POST:
4409 case AArch64::LD4i16_POST:
4410 case AArch64::LD4i32_POST:
4411 case AArch64::LD4i64_POST:
4412 case AArch64::LD4i8_POST:
4413 case AArch64::LDAPRWpost:
4414 case AArch64::LDAPRXpost:
4415 case AArch64::LDIAPPWpost:
4416 case AArch64::LDIAPPXpost:
4417 case AArch64::LDPDpost:
4418 case AArch64::LDPQpost:
4419 case AArch64::LDPSWpost:
4420 case AArch64::LDPSpost:
4421 case AArch64::LDPWpost:
4422 case AArch64::LDPXpost:
4423 case AArch64::LDRBBpost:
4424 case AArch64::LDRBpost:
4425 case AArch64::LDRDpost:
4426 case AArch64::LDRHHpost:
4427 case AArch64::LDRHpost:
4428 case AArch64::LDRQpost:
4429 case AArch64::LDRSBWpost:
4430 case AArch64::LDRSBXpost:
4431 case AArch64::LDRSHWpost:
4432 case AArch64::LDRSHXpost:
4433 case AArch64::LDRSWpost:
4434 case AArch64::LDRSpost:
4435 case AArch64::LDRWpost:
4436 case AArch64::LDRXpost:
4437 case AArch64::ST1Fourv16b_POST:
4438 case AArch64::ST1Fourv1d_POST:
4439 case AArch64::ST1Fourv2d_POST:
4440 case AArch64::ST1Fourv2s_POST:
4441 case AArch64::ST1Fourv4h_POST:
4442 case AArch64::ST1Fourv4s_POST:
4443 case AArch64::ST1Fourv8b_POST:
4444 case AArch64::ST1Fourv8h_POST:
4445 case AArch64::ST1Onev16b_POST:
4446 case AArch64::ST1Onev1d_POST:
4447 case AArch64::ST1Onev2d_POST:
4448 case AArch64::ST1Onev2s_POST:
4449 case AArch64::ST1Onev4h_POST:
4450 case AArch64::ST1Onev4s_POST:
4451 case AArch64::ST1Onev8b_POST:
4452 case AArch64::ST1Onev8h_POST:
4453 case AArch64::ST1Threev16b_POST:
4454 case AArch64::ST1Threev1d_POST:
4455 case AArch64::ST1Threev2d_POST:
4456 case AArch64::ST1Threev2s_POST:
4457 case AArch64::ST1Threev4h_POST:
4458 case AArch64::ST1Threev4s_POST:
4459 case AArch64::ST1Threev8b_POST:
4460 case AArch64::ST1Threev8h_POST:
4461 case AArch64::ST1Twov16b_POST:
4462 case AArch64::ST1Twov1d_POST:
4463 case AArch64::ST1Twov2d_POST:
4464 case AArch64::ST1Twov2s_POST:
4465 case AArch64::ST1Twov4h_POST:
4466 case AArch64::ST1Twov4s_POST:
4467 case AArch64::ST1Twov8b_POST:
4468 case AArch64::ST1Twov8h_POST:
4469 case AArch64::ST1i16_POST:
4470 case AArch64::ST1i32_POST:
4471 case AArch64::ST1i64_POST:
4472 case AArch64::ST1i8_POST:
4473 case AArch64::ST2GPostIndex:
4474 case AArch64::ST2Twov16b_POST:
4475 case AArch64::ST2Twov2d_POST:
4476 case AArch64::ST2Twov2s_POST:
4477 case AArch64::ST2Twov4h_POST:
4478 case AArch64::ST2Twov4s_POST:
4479 case AArch64::ST2Twov8b_POST:
4480 case AArch64::ST2Twov8h_POST:
4481 case AArch64::ST2i16_POST:
4482 case AArch64::ST2i32_POST:
4483 case AArch64::ST2i64_POST:
4484 case AArch64::ST2i8_POST:
4485 case AArch64::ST3Threev16b_POST:
4486 case AArch64::ST3Threev2d_POST:
4487 case AArch64::ST3Threev2s_POST:
4488 case AArch64::ST3Threev4h_POST:
4489 case AArch64::ST3Threev4s_POST:
4490 case AArch64::ST3Threev8b_POST:
4491 case AArch64::ST3Threev8h_POST:
4492 case AArch64::ST3i16_POST:
4493 case AArch64::ST3i32_POST:
4494 case AArch64::ST3i64_POST:
4495 case AArch64::ST3i8_POST:
4496 case AArch64::ST4Fourv16b_POST:
4497 case AArch64::ST4Fourv2d_POST:
4498 case AArch64::ST4Fourv2s_POST:
4499 case AArch64::ST4Fourv4h_POST:
4500 case AArch64::ST4Fourv4s_POST:
4501 case AArch64::ST4Fourv8b_POST:
4502 case AArch64::ST4Fourv8h_POST:
4503 case AArch64::ST4i16_POST:
4504 case AArch64::ST4i32_POST:
4505 case AArch64::ST4i64_POST:
4506 case AArch64::ST4i8_POST:
4507 case AArch64::STGPostIndex:
4508 case AArch64::STGPpost:
4509 case AArch64::STPDpost:
4510 case AArch64::STPQpost:
4511 case AArch64::STPSpost:
4512 case AArch64::STPWpost:
4513 case AArch64::STPXpost:
4514 case AArch64::STRBBpost:
4515 case AArch64::STRBpost:
4516 case AArch64::STRDpost:
4517 case AArch64::STRHHpost:
4518 case AArch64::STRHpost:
4519 case AArch64::STRQpost:
4520 case AArch64::STRSpost:
4521 case AArch64::STRWpost:
4522 case AArch64::STRXpost:
4523 case AArch64::STZ2GPostIndex:
4524 case AArch64::STZGPostIndex:
4525 return true;
4526 }
4527}
4528
4530 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
4531 bool &OffsetIsScalable, TypeSize &Width,
4532 const TargetRegisterInfo *TRI) const {
// NOTE(review): the first line of this definition (return type and function
// name) is missing from this listing; restore it from the pristine file.
//
// Visible behaviour: decomposes the load/store LdSt into a base operand and an
// immediate offset.
//   - Returns false unless LdSt has exactly 3 or exactly 4 explicit operands
//     of the kinds checked below, and getMemOpInfo recognises its opcode.
//   - On the success path BaseOp points at the base operand, Offset holds the
//     immediate multiplied by the opcode's scale (or 0 for post-index
//     opcodes), OffsetIsScalable mirrors Scale.isScalable(), and Width is
//     whatever getMemOpInfo wrote.
//   - The final result is true only if BaseOp is a register or a frame index.
// TRI is not referenced anywhere in the visible body.
4533 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4534 // Handle only loads/stores with base register followed by immediate offset.
4535 if (LdSt.getNumExplicitOperands() == 3) {
4536 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
4537 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
4538 !LdSt.getOperand(2).isImm())
4539 return false;
4540 } else if (LdSt.getNumExplicitOperands() == 4) {
4541 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
4542 if (!LdSt.getOperand(1).isReg() ||
4543 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
4544 !LdSt.getOperand(3).isImm())
4545 return false;
4546 } else
4547 return false;
4548
4549 // Get the scaling factor for the instruction and set the width for the
4550 // instruction.
// The offset bounds reported by getMemOpInfo are not needed here, so they are
// received into throwaway variables.
4551 TypeSize Scale(0U, false);
4552 int64_t Dummy1, Dummy2;
4553
4554 // If this returns false, then it's an instruction we don't want to handle.
4555 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
4556 return false;
4557
4558 // Compute the offset. Offset is calculated as the immediate operand
4559 // multiplied by the scaling factor. Unscaled instructions have scaling factor
4560 // set to 1. Postindex are a special case which have an offset of 0.
4561 if (isPostIndexLdStOpcode(LdSt.getOpcode())) {
// Post-index opcodes take their base from operand 2, whichever of the two
// operand counts was accepted above.
4562 BaseOp = &LdSt.getOperand(2);
4563 Offset = 0;
4564 } else if (LdSt.getNumExplicitOperands() == 3) {
4565 BaseOp = &LdSt.getOperand(1);
4566 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();
4567 } else {
4568 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
4569 BaseOp = &LdSt.getOperand(2);
4570 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();
4571 }
// Offset was computed from the known-minimum scale; this flag tells the caller
// whether that scale is a scalable quantity.
4572 OffsetIsScalable = Scale.isScalable();
4573
4574 return BaseOp->isReg() || BaseOp->isFI();
4575}
4576
// NOTE(review): the signature of this function is missing from this listing;
// only its body survives. Restore the signature from the pristine file.
//
// Visible behaviour: returns a reference to the last explicit operand of LdSt,
// after asserting that LdSt is a memory operation and that this operand is an
// immediate.
4579 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4580 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
4581 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
4582 return OfsOp;
4583}
4584
4585bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
4586 TypeSize &Width, int64_t &MinOffset,
4587 int64_t &MaxOffset) {
4588 switch (Opcode) {
4589 // Not a memory operation or something we want to handle.
4590 default:
4591 Scale = Width = TypeSize::getFixed(0);
4592 MinOffset = MaxOffset = 0;
4593 return false;
4594 // LDR / STR
4595 case AArch64::LDRQui:
4596 case AArch64::STRQui:
4597 Scale = Width = TypeSize::getFixed(16);
4598 MinOffset = 0;
4599 MaxOffset = 4095;
4600 break;
4601 case AArch64::LDRXui:
4602 case AArch64::LDRDui:
4603 case AArch64::STRXui:
4604 case AArch64::STRDui:
4605 case AArch64::PRFMui:
4606 Scale = Width = TypeSize::getFixed(8);
4607 MinOffset = 0;
4608 MaxOffset = 4095;
4609 break;
4610 case AArch64::LDRWui:
4611 case AArch64::LDRSui:
4612 case AArch64::LDRSWui:
4613 case AArch64::STRWui:
4614 case AArch64::STRSui:
4615 Scale = Width = TypeSize::getFixed(4);
4616 MinOffset = 0;
4617 MaxOffset = 4095;
4618 break;
4619 case AArch64::LDRHui:
4620 case AArch64::LDRHHui:
4621 case AArch64::LDRSHWui:
4622 case AArch64::LDRSHXui:
4623 case AArch64::STRHui:
4624 case AArch64::STRHHui:
4625 Scale = Width = TypeSize::getFixed(2);
4626 MinOffset = 0;
4627 MaxOffset = 4095;
4628 break;
4629 case AArch64::LDRBui:
4630 case AArch64::LDRBBui:
4631 case AArch64::LDRSBWui:
4632 case AArch64::LDRSBXui:
4633 case AArch64::STRBui:
4634 case AArch64::STRBBui:
4635 Scale = Width = TypeSize::getFixed(1);
4636 MinOffset = 0;
4637 MaxOffset = 4095;
4638 break;
4639 // post/pre inc
4640 case AArch64::STRQpre:
4641 case AArch64::LDRQpost:
4642 Scale = TypeSize::getFixed(1);
4643 Width = TypeSize::getFixed(16);
4644 MinOffset = -256;
4645 MaxOffset = 255;
4646 break;
4647 case AArch64::LDRDpost:
4648 case AArch64::LDRDpre:
4649 case AArch64::LDRXpost:
4650 case AArch64::LDRXpre:
4651 case AArch64::STRDpost:
4652 case AArch64::STRDpre:
4653 case AArch64::STRXpost:
4654 case AArch64::STRXpre:
4655 Scale = TypeSize::getFixed(1);
4656 Width = TypeSize::getFixed(8);
4657 MinOffset = -256;
4658 MaxOffset = 255;
4659 break;
4660 case AArch64::STRWpost:
4661 case AArch64::STRWpre:
4662 case AArch64::LDRWpost:
4663 case AArch64::LDRWpre:
4664 case AArch64::STRSpost:
4665 case AArch64::STRSpre:
4666 case AArch64::LDRSpost:
4667 case AArch64::LDRSpre:
4668 Scale = TypeSize::getFixed(1);
4669 Width = TypeSize::getFixed(4);
4670 MinOffset = -256;
4671 MaxOffset = 255;
4672 break;
4673 case AArch64::LDRHpost:
4674 case AArch64::LDRHpre:
4675 case AArch64::STRHpost:
4676 case AArch64::STRHpre:
4677 case AArch64::LDRHHpost:
4678 case AArch64::LDRHHpre:
4679 case AArch64::STRHHpost:
4680 case AArch64::STRHHpre:
4681 Scale = TypeSize::getFixed(1);
4682 Width = TypeSize::getFixed(2);
4683 MinOffset = -256;
4684 MaxOffset = 255;
4685 break;
4686 case AArch64::LDRBpost:
4687 case AArch64::LDRBpre:
4688 case AArch64::STRBpost:
4689 case AArch64::STRBpre:
4690 case AArch64::LDRBBpost:
4691 case AArch64::LDRBBpre:
4692 case AArch64::STRBBpost:
4693 case AArch64::STRBBpre:
4694 Scale = Width = TypeSize::getFixed(1);
4695 MinOffset = -256;
4696 MaxOffset = 255;
4697 break;
4698 // Unscaled
4699 case AArch64::LDURQi:
4700 case AArch64::STURQi:
4701 Scale = TypeSize::getFixed(1);
4702 Width = TypeSize::getFixed(16);
4703 MinOffset = -256;
4704 MaxOffset = 255;
4705 break;
4706 case AArch64::LDURXi:
4707 case AArch64::LDURDi:
4708 case AArch64::LDAPURXi:
4709 case AArch64::STURXi:
4710 case AArch64::STURDi:
4711 case AArch64::STLURXi:
4712 case AArch64::PRFUMi:
4713 Scale = TypeSize::getFixed(1);
4714 Width = TypeSize::getFixed(8);
4715 MinOffset = -256;
4716 MaxOffset = 255;
4717 break;
4718 case AArch64::LDURWi:
4719 case AArch64::LDURSi:
4720 case AArch64::LDURSWi:
4721 case AArch64::LDAPURi:
4722 case AArch64::LDAPURSWi:
4723 case AArch64::STURWi:
4724 case AArch64::STURSi:
4725 case AArch64::STLURWi:
4726 Scale = TypeSize::getFixed(1);
4727 Width = TypeSize::getFixed(4);
4728 MinOffset = -256;
4729 MaxOffset = 255;
4730 break;
4731 case AArch64::LDURHi:
4732 case AArch64::LDURHHi:
4733 case AArch64::LDURSHXi:
4734 case AArch64::LDURSHWi:
4735 case AArch64::LDAPURHi:
4736 case AArch64::LDAPURSHWi:
4737 case AArch64::LDAPURSHXi:
4738 case AArch64::STURHi:
4739 case AArch64::STURHHi:
4740 case AArch64::STLURHi:
4741 Scale = TypeSize::getFixed(1);
4742 Width = TypeSize::getFixed(2);
4743 MinOffset = -256;
4744 MaxOffset = 255;
4745 break;
4746 case AArch64::LDURBi:
4747 case AArch64::LDURBBi:
4748 case AArch64::LDURSBXi:
4749 case AArch64::LDURSBWi:
4750 case AArch64::LDAPURBi:
4751 case AArch64::LDAPURSBWi:
4752 case AArch64::LDAPURSBXi:
4753 case AArch64::STURBi:
4754 case AArch64::STURBBi:
4755 case AArch64::STLURBi:
4756 Scale = Width = TypeSize::getFixed(1);
4757 MinOffset = -256;
4758 MaxOffset = 255;
4759 break;
4760 // LDP / STP (including pre/post inc)
4761 case AArch64::LDPQi:
4762 case AArch64::LDNPQi:
4763 case AArch64::STPQi:
4764 case AArch64::STNPQi:
4765 case AArch64::LDPQpost:
4766 case AArch64::LDPQpre:
4767 case AArch64::STPQpost:
4768 case AArch64::STPQpre:
4769 Scale = TypeSize::getFixed(16);
4770 Width = TypeSize::getFixed(16 * 2);
4771 MinOffset = -64;
4772 MaxOffset = 63;
4773 break;
4774 case AArch64::LDPXi:
4775 case AArch64::LDPDi:
4776 case AArch64::LDNPXi:
4777 case AArch64::LDNPDi:
4778 case AArch64::STPXi:
4779 case AArch64::STPDi:
4780 case AArch64::STNPXi:
4781 case AArch64::STNPDi:
4782 case AArch64::LDPDpost:
4783 case AArch64::LDPDpre:
4784 case AArch64::LDPXpost:
4785 case AArch64::LDPXpre:
4786 case AArch64::STPDpost:
4787 case AArch64::STPDpre:
4788 case AArch64::STPXpost:
4789 case AArch64::STPXpre:
4790 Scale = TypeSize::getFixed(8);
4791 Width = TypeSize::getFixed(8 * 2);
4792 MinOffset = -64;
4793 MaxOffset = 63;
4794 break;
4795 case AArch64::LDPWi:
4796 case AArch64::LDPSi:
4797 case AArch64::LDNPWi:
4798 case AArch64::LDNPSi:
4799 case AArch64::STPWi:
4800 case AArch64::STPSi:
4801 case AArch64::STNPWi:
4802 case AArch64::STNPSi:
4803 case AArch64::LDPSpost:
4804 case AArch64::LDPSpre:
4805 case AArch64::LDPWpost:
4806 case AArch64::LDPWpre:
4807 case AArch64::STPSpost:
4808 case AArch64::STPSpre:
4809 case AArch64::STPWpost:
4810 case AArch64::STPWpre:
4811 Scale = TypeSize::getFixed(4);
4812 Width = TypeSize::getFixed(4 * 2);
4813 MinOffset = -64;
4814 MaxOffset = 63;
4815 break;
4816 case AArch64::StoreSwiftAsyncContext:
4817 // Store is an STRXui, but there might be an ADDXri in the expansion too.
4818 Scale = TypeSize::getFixed(1);
4819 Width = TypeSize::getFixed(8);
4820 MinOffset = 0;
4821 MaxOffset = 4095;
4822 break;
4823 case AArch64::ADDG:
4824 Scale = TypeSize::getFixed(16);
4825 Width = TypeSize::getFixed(0);
4826 MinOffset = 0;
4827 MaxOffset = 63;
4828 break;
4829 case AArch64::TAGPstack:
4830 Scale = TypeSize::getFixed(16);
4831 Width = TypeSize::getFixed(0);
4832 // TAGP with a negative offset turns into SUBP, which has a maximum offset
4833 // of 63 (not 64!).
4834 MinOffset = -63;
4835 MaxOffset = 63;
4836 break;
4837 case AArch64::LDG:
4838 case AArch64::STGi:
4839 case AArch64::STGPreIndex:
4840 case AArch64::STGPostIndex:
4841 case AArch64::STZGi:
4842 case AArch64::STZGPreIndex:
4843 case AArch64::STZGPostIndex:
4844 Scale = Width = TypeSize::getFixed(16);
4845 MinOffset = -256;
4846 MaxOffset = 255;
4847 break;
4848 // SVE
4849 case AArch64::STR_ZZZZXI:
4850 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
4851 case AArch64::LDR_ZZZZXI:
4852 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
4853 Scale = TypeSize::getScalable(16);
4854 Width = TypeSize::getScalable(16 * 4);
4855 MinOffset = -256;
4856 MaxOffset = 252;
4857 break;
4858 case AArch64::STR_ZZZXI:
4859 case AArch64::LDR_ZZZXI:
4860 Scale = TypeSize::getScalable(16);
4861 Width = TypeSize::getScalable(16 * 3);
4862 MinOffset = -256;
4863 MaxOffset = 253;
4864 break;
4865 case AArch64::STR_ZZXI:
4866 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
4867 case AArch64::LDR_ZZXI:
4868 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
4869 Scale = TypeSize::getScalable(16);
4870 Width = TypeSize::getScalable(16 * 2);
4871 MinOffset = -256;
4872 MaxOffset = 254;
4873 break;
4874 case AArch64::LDR_PXI:
4875 case AArch64::STR_PXI:
4876 Scale = Width = TypeSize::getScalable(2);
4877 MinOffset = -256;
4878 MaxOffset = 255;
4879 break;
4880 case AArch64::LDR_PPXI:
4881 case AArch64::STR_PPXI:
4882 Scale = TypeSize::getScalable(2);
4883 Width = TypeSize::getScalable(2 * 2);
4884 MinOffset = -256;
4885 MaxOffset = 254;
4886 break;
4887 case AArch64::LDR_ZXI:
4888 case AArch64::STR_ZXI:
4889 Scale = Width = TypeSize::getScalable(16);
4890 MinOffset = -256;
4891 MaxOffset = 255;
4892 break;
4893 case AArch64::LD1B_IMM:
4894 case AArch64::LD1H_IMM:
4895 case AArch64::LD1W_IMM:
4896 case AArch64::LD1D_IMM:
4897 case AArch64::LDNT1B_ZRI:
4898 case AArch64::LDNT1H_ZRI:
4899 case AArch64::LDNT1W_ZRI:
4900 case AArch64::LDNT1D_ZRI:
4901 case AArch64::ST1B_IMM:
4902 case AArch64::ST1H_IMM:
4903 case AArch64::ST1W_IMM:
4904 case AArch64::ST1D_IMM:
4905 case AArch64::STNT1B_ZRI:
4906 case AArch64::STNT1H_ZRI:
4907 case AArch64::STNT1W_ZRI:
4908 case AArch64::STNT1D_ZRI:
4909 case AArch64::LDNF1B_IMM:
4910 case AArch64::LDNF1H_IMM:
4911 case AArch64::LDNF1W_IMM:
4912 case AArch64::LDNF1D_IMM:
4913 // A full vectors worth of data
4914 // Width = mbytes * elements
4915 Scale = Width = TypeSize::getScalable(16);
4916 MinOffset = -8;
4917 MaxOffset = 7;
4918 break;
4919 case AArch64::LD2B_IMM:
4920 case AArch64::LD2H_IMM:
4921 case AArch64::LD2W_IMM:
4922 case AArch64::LD2D_IMM:
4923 case AArch64::ST2B_IMM:
4924 case AArch64::ST2H_IMM:
4925 case AArch64::ST2W_IMM:
4926 case AArch64::ST2D_IMM:
4927 case AArch64::LD1B_2Z_IMM:
4928 case AArch64::LD1B_2Z_STRIDED_IMM:
4929 case AArch64::LD1H_2Z_IMM:
4930 case AArch64::LD1H_2Z_STRIDED_IMM:
4931 case AArch64::LD1W_2Z_IMM:
4932 case AArch64::LD1W_2Z_STRIDED_IMM:
4933 case AArch64::LD1D_2Z_IMM:
4934 case AArch64::LD1D_2Z_STRIDED_IMM:
4935 case AArch64::LD1B_2Z_IMM_PSEUDO:
4936 case AArch64::LD1H_2Z_IMM_PSEUDO:
4937 case AArch64::LD1W_2Z_IMM_PSEUDO:
4938 case AArch64::LD1D_2Z_IMM_PSEUDO:
4939 case AArch64::ST1B_2Z_IMM:
4940 case AArch64::ST1B_2Z_STRIDED_IMM:
4941 case AArch64::ST1H_2Z_IMM:
4942 case AArch64::ST1H_2Z_STRIDED_IMM:
4943 case AArch64::ST1W_2Z_IMM:
4944 case AArch64::ST1W_2Z_STRIDED_IMM:
4945 case AArch64::ST1D_2Z_IMM:
4946 case AArch64::ST1D_2Z_STRIDED_IMM:
4947 case AArch64::LDNT1B_2Z_IMM_PSEUDO:
4948 case AArch64::LDNT1B_2Z_IMM:
4949 case AArch64::LDNT1B_2Z_STRIDED_IMM:
4950 case AArch64::LDNT1H_2Z_IMM_PSEUDO:
4951 case AArch64::LDNT1H_2Z_IMM:
4952 case AArch64::LDNT1H_2Z_STRIDED_IMM:
4953 case AArch64::LDNT1W_2Z_IMM_PSEUDO:
4954 case AArch64::LDNT1W_2Z_IMM:
4955 case AArch64::LDNT1W_2Z_STRIDED_IMM:
4956 case AArch64::LDNT1D_2Z_IMM_PSEUDO:
4957 case AArch64::LDNT1D_2Z_IMM:
4958 case AArch64::LDNT1D_2Z_STRIDED_IMM:
4959 case AArch64::STNT1B_2Z_IMM:
4960 case AArch64::STNT1B_2Z_STRIDED_IMM:
4961 case AArch64::STNT1H_2Z_IMM:
4962 case AArch64::STNT1H_2Z_STRIDED_IMM:
4963 case AArch64::STNT1W_2Z_IMM:
4964 case AArch64::STNT1W_2Z_STRIDED_IMM:
4965 case AArch64::STNT1D_2Z_IMM:
4966 case AArch64::STNT1D_2Z_STRIDED_IMM:
4967 Scale = Width = TypeSize::getScalable(16 * 2);
4968 MinOffset = -8;
4969 MaxOffset = 7;
4970 break;
4971 case AArch64::LD3B_IMM:
4972 case AArch64::LD3H_IMM:
4973 case AArch64::LD3W_IMM:
4974 case AArch64::LD3D_IMM:
4975 case AArch64::ST3B_IMM:
4976 case AArch64::ST3H_IMM:
4977 case AArch64::ST3W_IMM:
4978 case AArch64::ST3D_IMM:
4979 Scale = Width = TypeSize::getScalable(16 * 3);
4980 MinOffset = -8;
4981 MaxOffset = 7;
4982 break;
4983 case AArch64::LD4B_IMM:
4984 case AArch64::LD4H_IMM:
4985 case AArch64::LD4W_IMM:
4986 case AArch64::LD4D_IMM:
4987 case AArch64::ST4B_IMM:
4988 case AArch64::ST4H_IMM:
4989 case AArch64::ST4W_IMM:
4990 case AArch64::ST4D_IMM:
4991 case AArch64::LD1B_4Z_IMM:
4992 case AArch64::LD1B_4Z_STRIDED_IMM:
4993 case AArch64::LD1H_4Z_IMM:
4994 case AArch64::LD1H_4Z_STRIDED_IMM:
4995 case AArch64::LD1W_4Z_IMM:
4996 case AArch64::LD1W_4Z_STRIDED_IMM:
4997 case AArch64::LD1D_4Z_IMM:
4998 case AArch64::LD1D_4Z_STRIDED_IMM:
4999 case AArch64::LD1B_4Z_IMM_PSEUDO:
5000 case AArch64::LD1H_4Z_IMM_PSEUDO:
5001 case AArch64::LD1W_4Z_IMM_PSEUDO:
5002 case AArch64::LD1D_4Z_IMM_PSEUDO:
5003 case AArch64::ST1B_4Z_IMM:
5004 case AArch64::ST1B_4Z_STRIDED_IMM:
5005 case AArch64::ST1H_4Z_IMM:
5006 case AArch64::ST1H_4Z_STRIDED_IMM:
5007 case AArch64::ST1W_4Z_IMM:
5008 case AArch64::ST1W_4Z_STRIDED_IMM:
5009 case AArch64::ST1D_4Z_IMM:
5010 case AArch64::ST1D_4Z_STRIDED_IMM:
5011 case AArch64::LDNT1B_4Z_IMM_PSEUDO:
5012 case AArch64::LDNT1B_4Z_IMM:
5013 case AArch64::LDNT1B_4Z_STRIDED_IMM:
5014 case AArch64::LDNT1H_4Z_IMM_PSEUDO:
5015 case AArch64::LDNT1H_4Z_IMM:
5016 case AArch64::LDNT1H_4Z_STRIDED_IMM:
5017 case AArch64::LDNT1W_4Z_IMM_PSEUDO:
5018 case AArch64::LDNT1W_4Z_IMM:
5019 case AArch64::LDNT1W_4Z_STRIDED_IMM:
5020 case AArch64::LDNT1D_4Z_IMM_PSEUDO:
5021 case AArch64::LDNT1D_4Z_IMM:
5022 case AArch64::LDNT1D_4Z_STRIDED_IMM:
5023 case AArch64::STNT1B_4Z_IMM:
5024 case AArch64::STNT1B_4Z_STRIDED_IMM:
5025 case AArch64::STNT1H_4Z_IMM:
5026 case AArch64::STNT1H_4Z_STRIDED_IMM:
5027 case AArch64::STNT1W_4Z_IMM:
5028 case AArch64::STNT1W_4Z_STRIDED_IMM:
5029 case AArch64::STNT1D_4Z_IMM:
5030 case AArch64::STNT1D_4Z_STRIDED_IMM:
5031 Scale = Width = TypeSize::getScalable(16 * 4);
5032 MinOffset = -8;
5033 MaxOffset = 7;
5034 break;
5035 case AArch64::LD1B_H_IMM:
5036 case AArch64::LD1SB_H_IMM:
5037 case AArch64::LD1H_S_IMM:
5038 case AArch64::LD1SH_S_IMM:
5039 case AArch64::LD1W_D_IMM:
5040 case AArch64::LD1SW_D_IMM:
5041 case AArch64::ST1B_H_IMM:
5042 case AArch64::ST1H_S_IMM:
5043 case AArch64::ST1W_D_IMM:
5044 case AArch64::LDNF1B_H_IMM:
5045 case AArch64::LDNF1SB_H_IMM:
5046 case AArch64::LDNF1H_S_IMM:
5047 case AArch64::LDNF1SH_S_IMM:
5048 case AArch64::LDNF1W_D_IMM:
5049 case AArch64::LDNF1SW_D_IMM:
5050 // A half vector worth of data
5051 // Width = mbytes * elements
5052 Scale = Width = TypeSize::getScalable(8);
5053 MinOffset = -8;
5054 MaxOffset = 7;
5055 break;
5056 case AArch64::LD1B_S_IMM:
5057 case AArch64::LD1SB_S_IMM:
5058 case AArch64::LD1H_D_IMM:
5059 case AArch64::LD1SH_D_IMM:
5060 case AArch64::ST1B_S_IMM:
5061 case AArch64::ST1H_D_IMM:
5062 case AArch64::LDNF1B_S_IMM:
5063 case AArch64::LDNF1SB_S_IMM:
5064 case AArch64::LDNF1H_D_IMM:
5065 case AArch64::LDNF1SH_D_IMM:
5066 // A quarter vector worth of data
5067 // Width = mbytes * elements
5068 Scale = Width = TypeSize::getScalable(4);
5069 MinOffset = -8;
5070 MaxOffset = 7;
5071 break;
5072 case AArch64::LD1B_D_IMM:
5073 case AArch64::LD1SB_D_IMM:
5074 case AArch64::ST1B_D_IMM:
5075 case AArch64::LDNF1B_D_IMM:
5076 case AArch64::LDNF1SB_D_IMM:
5077 // A eighth vector worth of data
5078 // Width = mbytes * elements
5079 Scale = Width = TypeSize::getScalable(2);
5080 MinOffset = -8;
5081 MaxOffset = 7;
5082 break;
5083 case AArch64::ST2Gi:
5084 case AArch64::ST2GPreIndex:
5085 case AArch64::ST2GPostIndex:
5086 case AArch64::STZ2Gi:
5087 case AArch64::STZ2GPreIndex:
5088 case AArch64::STZ2GPostIndex:
5089 Scale = TypeSize::getFixed(16);
5090 Width = TypeSize::getFixed(32);
5091 MinOffset = -256;
5092 MaxOffset = 255;
5093 break;
5094 case AArch64::STGPi:
5095 case AArch64::STGPpost:
5096 case AArch64::STGPpre:
5097 Scale = Width = TypeSize::getFixed(16);
5098 MinOffset = -64;
5099 MaxOffset = 63;
5100 break;
5101 case AArch64::LD1RB_IMM:
5102 case AArch64::LD1RB_H_IMM:
5103 case AArch64::LD1RB_S_IMM:
5104 case AArch64::LD1RB_D_IMM:
5105 case AArch64::LD1RSB_H_IMM:
5106 case AArch64::LD1RSB_S_IMM:
5107 case AArch64::LD1RSB_D_IMM:
5108 Scale = Width = TypeSize::getFixed(1);
5109 MinOffset = 0;
5110 MaxOffset = 63;
5111 break;
5112 case AArch64::LD1RH_IMM:
5113 case AArch64::LD1RH_S_IMM:
5114 case AArch64::LD1RH_D_IMM:
5115 case AArch64::LD1RSH_S_IMM:
5116 case AArch64::LD1RSH_D_IMM:
5117 Scale = Width = TypeSize::getFixed(2);
5118 MinOffset = 0;
5119 MaxOffset = 63;
5120 break;
5121 case AArch64::LD1RW_IMM:
5122 case AArch64::LD1RW_D_IMM:
5123 case AArch64::LD1RSW_IMM:
5124 Scale = Width = TypeSize::getFixed(4);
5125 MinOffset = 0;
5126 MaxOffset = 63;
5127 break;
5128 case AArch64::LD1RD_IMM:
5129 Scale = Width = TypeSize::getFixed(8);
5130 MinOffset = 0;
5131 MaxOffset = 63;
5132 break;
5133 }
5134
5135 return true;
5136}
5137
5138// Scaling factor for unscaled load or store.
// Maps the opcode in Opc to a factor of 1, 2, 4, 8 or 16 according to the case
// group it appears in below. Opcodes that are not listed fall into the default
// case, which is marked llvm_unreachable.
5140 switch (Opc) {
5141 default:
5142 llvm_unreachable("Opcode has unknown scale!");
// B, BB and SBW opcodes: factor 1.
5143 case AArch64::LDRBui:
5144 case AArch64::LDRBBui:
5145 case AArch64::LDURBBi:
5146 case AArch64::LDRSBWui:
5147 case AArch64::LDURSBWi:
5148 case AArch64::STRBui:
5149 case AArch64::STRBBui:
5150 case AArch64::STURBBi:
5151 return 1;
// H, HH and SHW opcodes: factor 2.
5152 case AArch64::LDRHui:
5153 case AArch64::LDRHHui:
5154 case AArch64::LDURHHi:
5155 case AArch64::LDRSHWui:
5156 case AArch64::LDURSHWi:
5157 case AArch64::STRHui:
5158 case AArch64::STRHHui:
5159 case AArch64::STURHHi:
5160 return 2;
// S, SW and W opcodes, including the LDP/STP forms: factor 4.
5161 case AArch64::LDRSui:
5162 case AArch64::LDURSi:
5163 case AArch64::LDRSpre:
5164 case AArch64::LDRSWui:
5165 case AArch64::LDURSWi:
5166 case AArch64::LDRSWpre:
5167 case AArch64::LDRWpre:
5168 case AArch64::LDRWui:
5169 case AArch64::LDURWi:
5170 case AArch64::STRSui:
5171 case AArch64::STURSi:
5172 case AArch64::STRSpre:
5173 case AArch64::STRWui:
5174 case AArch64::STURWi:
5175 case AArch64::STRWpre:
5176 case AArch64::LDPSi:
5177 case AArch64::LDPSWi:
5178 case AArch64::LDPWi:
5179 case AArch64::STPSi:
5180 case AArch64::STPWi:
5181 return 4;
// D and X opcodes, including the LDP/STP forms: factor 8.
5182 case AArch64::LDRDui:
5183 case AArch64::LDURDi:
5184 case AArch64::LDRDpre:
5185 case AArch64::LDRXui:
5186 case AArch64::LDURXi:
5187 case AArch64::LDRXpre:
5188 case AArch64::STRDui:
5189 case AArch64::STURDi:
5190 case AArch64::STRDpre:
5191 case AArch64::STRXui:
5192 case AArch64::STURXi:
5193 case AArch64::STRXpre:
5194 case AArch64::LDPDi:
5195 case AArch64::LDPXi:
5196 case AArch64::STPDi:
5197 case AArch64::STPXi:
5198 return 8;
// Q opcodes, Q pairs, and the STGi/STZGi/ST2Gi/STZ2Gi/STGPi opcodes: factor 16.
5199 case AArch64::LDRQui:
5200 case AArch64::LDURQi:
5201 case AArch64::STRQui:
5202 case AArch64::STURQi:
5203 case AArch64::STRQpre:
5204 case AArch64::LDPQi:
5205 case AArch64::LDRQpre:
5206 case AArch64::STPQi:
5207 case AArch64::STGi:
5208 case AArch64::STZGi:
5209 case AArch64::ST2Gi:
5210 case AArch64::STZ2Gi:
5211 case AArch64::STGPi:
5212 return 16;
5213 }
5214}
5215
// Returns true when MI's opcode is one of the `LDR*pre` opcodes listed below
// (W, X, SW, S, D and Q forms); every other opcode yields false.
5217 switch (MI.getOpcode()) {
5218 default:
5219 return false;
5220 case AArch64::LDRWpre:
5221 case AArch64::LDRXpre:
5222 case AArch64::LDRSWpre:
5223 case AArch64::LDRSpre:
5224 case AArch64::LDRDpre:
5225 case AArch64::LDRQpre:
5226 return true;
5227 }
5228}
5229
// Returns true when MI's opcode is one of the `STR*pre` opcodes listed below
// (W, X, S, D and Q forms); every other opcode yields false.
5231 switch (MI.getOpcode()) {
5232 default:
5233 return false;
5234 case AArch64::STRWpre:
5235 case AArch64::STRXpre:
5236 case AArch64::STRSpre:
5237 case AArch64::STRDpre:
5238 case AArch64::STRQpre:
5239 return true;
5240 }
5241}
5242
// True when MI is accepted by either isPreLd or isPreSt.
5244 return isPreLd(MI) || isPreSt(MI);
5245}
5246
// Returns true when MI's opcode is one of the BB, HH or W load opcodes listed
// below, in their LDUR*i, LDR*ui, LDR*roX and LDR*roW variants; every other
// opcode yields false.
// NOTE(review): presumably these are the zero-extending integer loads, in
// contrast to the LDRS*/LDURS* opcodes — confirm against the declaration.
5248 switch (MI.getOpcode()) {
5249 default:
5250 return false;
5251 case AArch64::LDURBBi:
5252 case AArch64::LDURHHi:
5253 case AArch64::LDURWi:
5254 case AArch64::LDRBBui:
5255 case AArch64::LDRHHui:
5256 case AArch64::LDRWui:
5257 case AArch64::LDRBBroX:
5258 case AArch64::LDRHHroX:
5259 case AArch64::LDRWroX:
5260 case AArch64::LDRBBroW:
5261 case AArch64::LDRHHroW:
5262 case AArch64::LDRWroW:
5263 return true;
5264 }
5265}
5266
// Returns true when MI's opcode is one of the LDRS*/LDURS* load opcodes listed
// below (SBW, SHW, SBX, SHX and SW), in their LDURS*i, LDRS*ui, LDRS*roX and
// LDRS*roW variants; every other opcode yields false.
5268 switch (MI.getOpcode()) {
5269 default:
5270 return false;
5271 case AArch64::LDURSBWi:
5272 case AArch64::LDURSHWi:
5273 case AArch64::LDURSBXi:
5274 case AArch64::LDURSHXi:
5275 case AArch64::LDURSWi:
5276 case AArch64::LDRSBWui:
5277 case AArch64::LDRSHWui:
5278 case AArch64::LDRSBXui:
5279 case AArch64::LDRSHXui:
5280 case AArch64::LDRSWui:
5281 case AArch64::LDRSBWroX:
5282 case AArch64::LDRSHWroX:
5283 case AArch64::LDRSBXroX:
5284 case AArch64::LDRSHXroX:
5285 case AArch64::LDRSWroX:
5286 case AArch64::LDRSBWroW:
5287 case AArch64::LDRSHWroW:
5288 case AArch64::LDRSBXroW:
5289 case AArch64::LDRSHXroW:
5290 case AArch64::LDRSWroW:
5291 return true;
5292 }
5293}
5294
// Returns true when MI's opcode is one of the LDP*i / STP*i opcodes listed
// below (S, SW, D, Q, W and X forms) or STGPi; every other opcode yields
// false.
5296 switch (MI.getOpcode()) {
5297 default:
5298 return false;
5299 case AArch64::LDPSi:
5300 case AArch64::LDPSWi:
5301 case AArch64::LDPDi:
5302 case AArch64::LDPQi:
5303 case AArch64::LDPWi:
5304 case AArch64::LDPXi:
5305 case AArch64::STPSi:
5306 case AArch64::STPDi:
5307 case AArch64::STPQi:
5308 case AArch64::STPWi:
5309 case AArch64::STPXi:
5310 case AArch64::STGPi:
5311 return true;
5312 }
5313}
5314
// Returns the operand of MI found at index Idx. MI must be a load or store
// (checked by the assert).
5316 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
// Idx is chosen by a conditional expression whose fallback value is 1.
// NOTE(review): the condition and the first alternative of that expression are
// not visible here — confirm which instructions use a different index.
5317 unsigned Idx =
5319 : 1;
5320 return MI.getOperand(Idx);
5321}
5322
// Returns the operand of MI found at index Idx. MI must be a load or store
// (checked by the assert).
5323const MachineOperand &
5325 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
// Idx is chosen by a conditional expression whose fallback value is 2.
// NOTE(review): the condition and the first alternative of that expression are
// not visible here — confirm which instructions use a different index.
5326 unsigned Idx =
5328 : 2;
5329 return MI.getOperand(Idx);
5330}
5331
// Returns operand 4 of MI for the LDR*roX opcodes listed below. Any other
// opcode reaches the default case, which is marked llvm_unreachable.
5332const MachineOperand &
5334 switch (MI.getOpcode()) {
5335 default:
5336 llvm_unreachable("Unexpected opcode");
5337 case AArch64::LDRBroX:
5338 case AArch64::LDRBBroX:
5339 case AArch64::LDRSBXroX:
5340 case AArch64::LDRSBWroX:
5341 case AArch64::LDRHroX:
5342 case AArch64::LDRHHroX:
5343 case AArch64::LDRSHXroX:
5344 case AArch64::LDRSHWroX:
5345 case AArch64::LDRWroX:
5346 case AArch64::LDRSroX:
5347 case AArch64::LDRSWroX:
5348 case AArch64::LDRDroX:
5349 case AArch64::LDRXroX:
5350 case AArch64::LDRQroX:
5351 return MI.getOperand(4);
5352 }
5353}
5354
// Looks up the register class recorded for Reg in the register info of the
// function that contains MI. Returns nullptr when MI is not inside a basic
// block, or when that block is not inside a function; otherwise returns
// whatever getRegClassOrNull(Reg) reports (which may itself be null).
5356 Register Reg) {
5357 if (MI.getParent() == nullptr)
5358 return nullptr;
5359 const MachineFunction *MF = MI.getParent()->getParent();
5360 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
5361}
5362
// Returns true if at least one operand of MI is a register in the FPR16
// family.
5364 auto IsHFPR = [&](const MachineOperand &Op) {
// Non-register operands never match.
5365 if (!Op.isReg())
5366 return false;
5367 auto Reg = Op.getReg();
// Physical registers are tested for membership in FPR16 directly.
5368 if (Reg.isPhysical())
5369 return AArch64::FPR16RegClass.contains(Reg);
// Other registers match only when their recorded class is exactly FPR16 or
// FPR16_lo; a null class (nothing recorded) does not match.
5370 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5371 return TRC == &AArch64::FPR16RegClass ||
5372 TRC == &AArch64::FPR16_loRegClass;
5373 };
5374 return llvm::any_of(MI.operands(), IsHFPR);
5375}
5376
// Returns true if at least one operand of MI is a register in the FPR128
// family.
5378 auto IsQFPR = [&](const MachineOperand &Op) {
// Non-register operands never match.
5379 if (!Op.isReg())
5380 return false;
5381 auto Reg = Op.getReg();
// Physical registers are tested for membership in FPR128 directly.
5382 if (Reg.isPhysical())
5383 return AArch64::FPR128RegClass.contains(Reg);
// Other registers match only when their recorded class is exactly FPR128 or
// FPR128_lo; a null class (nothing recorded) does not match.
5384 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5385 return TRC == &AArch64::FPR128RegClass ||
5386 TRC == &AArch64::FPR128_loRegClass;
5387 };
5388 return llvm::any_of(MI.operands(), IsQFPR);
5389}
5390
// Returns true when MI's opcode is one this function treats as having BTI
// behaviour: BRK, HLT, PACIASP, PACIBSP, PAUTH_PROLOGUE, or a HINT whose
// immediate is one of 32/34/36/38 or 25/27. All other opcodes, and HINT with
// any other immediate, yield false.
5392 switch (MI.getOpcode()) {
5393 case AArch64::BRK:
5394 case AArch64::HLT:
5395 case AArch64::PACIASP:
5396 case AArch64::PACIBSP:
5397 // Implicit BTI behavior.
5398 return true;
5399 case AArch64::PAUTH_PROLOGUE:
5400 // PAUTH_PROLOGUE expands to PACI(A|B)SP.
5401 return true;
5402 case AArch64::HINT: {
// The hint number is carried as the immediate in operand 0.
5403 unsigned Imm = MI.getOperand(0).getImm();
5404 // Explicit BTI instruction.
5405 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
5406 return true;
5407 // PACI(A|B)SP instructions.
5408 if (Imm == 25 || Imm == 27)
5409 return true;
5410 return false;
5411 }
5412 default:
5413 return false;
5414 }
5415}
5416
// Returns true when the physical register Reg belongs to any of the FPR128,
// FPR64, FPR32, FPR16 or FPR8 register classes. Register 0 yields false; any
// other non-physical register trips the assert.
5418 if (Reg == 0)
5419 return false;
5420 assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
5421 return AArch64::FPR128RegClass.contains(Reg) ||
5422 AArch64::FPR64RegClass.contains(Reg) ||
5423 AArch64::FPR32RegClass.contains(Reg) ||
5424 AArch64::FPR16RegClass.contains(Reg) ||
5425 AArch64::FPR8RegClass.contains(Reg);
5426}
5427
// Returns true if at least one operand of MI is a register in one of the FPR
// register classes.
5429 auto IsFPR = [&](const MachineOperand &Op) {
// Non-register operands never match.
5430 if (!Op.isReg())
5431 return false;
5432 auto Reg = Op.getReg();
// Physical registers defer to the Register overload of isFpOrNEON.
5433 if (Reg.isPhysical())
5434 return isFpOrNEON(Reg);
5435
// Other registers match only when their recorded class is exactly one of the
// classes compared below; a null class (nothing recorded) does not match.
5436 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5437 return TRC == &AArch64::FPR128RegClass ||
5438 TRC == &AArch64::FPR128_loRegClass ||
5439 TRC == &AArch64::FPR64RegClass ||
5440 TRC == &AArch64::FPR64_loRegClass ||
5441 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
5442 TRC == &AArch64::FPR8RegClass;
5443 };
5444 return llvm::any_of(MI.operands(), IsFPR);
5445}
5446
5447// Scale the unscaled offsets. Returns false if the unscaled offset can't be
5448// scaled.
// Offset is an in/out parameter: on success it is replaced by Offset / Scale;
// on failure it is left untouched.
// NOTE(review): the statement that defines Scale is not visible here;
// presumably it is derived from Opc — confirm.
5449static bool scaleOffset(unsigned Opc, int64_t &Offset) {
5451
5452 // If the byte-offset isn't a multiple of the stride, we can't scale this
5453 // offset.
5454 if (Offset % Scale != 0)
5455 return false;
5456
5457 // Convert the byte-offset used by unscaled into an "element" offset used
5458 // by the scaled pair load/store instructions.
5459 Offset /= Scale;
5460 return true;
5461}
5462
5463static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
5464 if (FirstOpc == SecondOpc)
5465 return true;
5466 // We can also pair sign-ext and zero-ext instructions.
5467 switch (FirstOpc) {
5468 default:
5469 return false;
5470 case AArch64::STRSui:
5471 case AArch64::STURSi:
5472 return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
5473 case AArch64::STRDui:
5474 case AArch64::STURDi:
5475 return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
5476 case AArch64::STRQui:
5477 case AArch64::STURQi:
5478 return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
5479 case AArch64::STRWui:
5480 case AArch64::STURWi:
5481 return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
5482 case AArch64::STRXui:
5483 case AArch64::STURXi:
5484 return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
5485 case AArch64::LDRSui:
5486 case AArch64::LDURSi:
5487 return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
5488 case AArch64::LDRDui:
5489 case AArch64::LDURDi:
5490 return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
5491 case AArch64::LDRQui:
5492 case AArch64::LDURQi:
5493 return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
5494 case AArch64::LDRWui:
5495 case AArch64::LDURWi:
5496 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
5497 case AArch64::LDRSWui:
5498 case AArch64::LDURSWi:
5499 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
5500 case AArch64::LDRXui:
5501 case AArch64::LDURXi:
5502 return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
5503 }
5504 // These instructions can't be paired based on their opcodes.
5505 return false;
5506}
5507
5508static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
5509 int64_t Offset1, unsigned Opcode1, int FI2,
5510 int64_t Offset2, unsigned Opcode2) {
5511 // Accesses through fixed stack object frame indices may access a different
5512 // fixed stack slot. Check that the object offsets + offsets match.
5513 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
5514 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
5515 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
5516 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
5517 // Convert to scaled object offsets.
5518 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
5519 if (ObjectOffset1 % Scale1 != 0)
5520 return false;
5521 ObjectOffset1 /= Scale1;
5522 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
5523 if (ObjectOffset2 % Scale2 != 0)
5524 return false;
5525 ObjectOffset2 /= Scale2;
5526 ObjectOffset1 += Offset1;
5527 ObjectOffset2 += Offset2;
5528 return ObjectOffset1 + 1 == ObjectOffset2;
5529 }
5530
5531 return FI1 == FI2;
5532}
5533
5534/// Detect opportunities for ldp/stp formation.
5535///
5536/// Only called for LdSt for which getMemOperandWithOffset returns true.
///
/// Returns true only when all of the following hold: both base operands are of
/// the same kind (and name the same register when register-based), ClusterSize
/// is at most 2, both instructions pass isPairableLdStInst and
/// isCandidateToMergeOrPair, their opcodes are accepted by canPairLdStOpc, the
/// first instruction's scaled offset lies in [-64, 63], and the second access
/// sits exactly one scaled unit after the first (decided by shouldClusterFI
/// for frame-index bases).
5538 ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
5539 bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
5540 int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
5541 unsigned NumBytes) const {
5542 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
5543 const MachineOperand &BaseOp1 = *BaseOps1.front();
5544 const MachineOperand &BaseOp2 = *BaseOps2.front();
// The instructions being compared are the ones that own the base operands.
5545 const MachineInstr &FirstLdSt = *BaseOp1.getParent();
5546 const MachineInstr &SecondLdSt = *BaseOp2.getParent();
5547 if (BaseOp1.getType() != BaseOp2.getType())
5548 return false;
5549
5550 assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
5551 "Only base registers and frame indices are supported.");
5552
5553 // Check for both base regs and base FI.
5554 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
5555 return false;
5556
5557 // Only cluster up to a single pair.
5558 if (ClusterSize > 2)
5559 return false;
5560
5561 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
5562 return false;
5563
5564 // Can we pair these instructions based on their opcodes?
5565 unsigned FirstOpc = FirstLdSt.getOpcode();
5566 unsigned SecondOpc = SecondLdSt.getOpcode();
5567 if (!canPairLdStOpc(FirstOpc, SecondOpc))
5568 return false;
5569
5570 // Can't merge volatiles or load/stores that have a hint to avoid pair
5571 // formation, for example.
5572 if (!isCandidateToMergeOrPair(FirstLdSt) ||
5573 !isCandidateToMergeOrPair(SecondLdSt))
5574 return false;
5575
5576 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
// Offsets of unscaled opcodes are converted to scaled units first; if that
// conversion fails the pair is rejected.
5577 int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
5578 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
5579 return false;
5580
5581 int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
5582 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
5583 return false;
5584
5585 // Pairwise instructions have a 7-bit signed offset field.
5586 if (Offset1 > 63 || Offset1 < -64)
5587 return false;
5588
5589 // The caller should already have ordered First/SecondLdSt by offset.
5590 // Note: except for non-equal frame index bases
5591 if (BaseOp1.isFI()) {
5592 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
5593 "Caller should have ordered offsets.");
5594
// Frame-index bases are judged by shouldClusterFI, using the frame info of the
// function that contains the first instruction.
5595 const MachineFrameInfo &MFI =
5596 FirstLdSt.getParent()->getParent()->getFrameInfo();
5597 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
5598 BaseOp2.getIndex(), Offset2, SecondOpc);
5599 }
5600
5601 assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
5602
// Register bases: the second access must be the very next scaled slot.
5603 return Offset1 + 1 == Offset2;
5604}
5605
// Appends a register operand to MIB with the given State and returns the
// result of the addReg call.
//  - SubIdx == 0: Reg itself is added.
//  - Reg.isPhysical(): the sub-register is resolved through TRI->getSubReg and
//    added as a plain register.
//  - otherwise: Reg is added with SubIdx as its sub-register index.
5607 MCRegister Reg, unsigned SubIdx,
5608 RegState State,
5609 const TargetRegisterInfo *TRI) {
5610 if (!SubIdx)
5611 return MIB.addReg(Reg, State);
5612
5613 if (Reg.isPhysical())
5614 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
5615 return MIB.addReg(Reg, State, SubIdx);
5616}
5617
/// Reports whether the destination register number lies fewer than NumRegs
/// positions above the source register number, counting modulo 32.
static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
                                        unsigned NumRegs) {
  // The non-negative remainder of (DestReg - SrcReg) modulo 32 is wanted here.
  // Unsigned subtraction wraps, so keeping the low five bits yields it
  // directly.
  const unsigned ForwardDistance = (DestReg - SrcReg) & 0x1f;
  return ForwardDistance < NumRegs;
}
5624
// Copies a register tuple one sub-register at a time. For each entry of
// Indices, one `Opcode` instruction is inserted before I in MBB that defines
// the DestReg sub-register and reads the matching SrcReg sub-register twice;
// only the second read carries the kill state derived from KillSrc.
5627 const DebugLoc &DL, MCRegister DestReg,
5628 MCRegister SrcReg, bool KillSrc,
5629 unsigned Opcode,
5630 ArrayRef<unsigned> Indices) const {
5631 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
5633 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
5634 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
5635 unsigned NumRegs = Indices.size();
5636
// Default: walk the sub-registers in ascending order. If
// forwardCopyWillClobberTuple reports a clobber for these encodings, walk them
// in descending order instead.
5637 int SubReg = 0, End = NumRegs, Incr = 1;
5638 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
5639 SubReg = NumRegs - 1;
5640 End = -1;
5641 Incr = -1;
5642 }
5643
5644 for (; SubReg != End; SubReg += Incr) {
5645 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
5646 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
5647 AddSubReg(MIB, SrcReg, Indices[SubReg], {}, TRI);
5648 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
5649 }
5650}
5651
// Copies a GPR register tuple one sub-register at a time. For each entry of
// Indices, one `Opcode` instruction is inserted before I in MBB with the
// operands, in order: DestReg sub-register (def), ZeroReg, SrcReg sub-register
// (kill state derived from KillSrc), immediate 0. Sub-registers are always
// processed in ascending order.
5654 const DebugLoc &DL, MCRegister DestReg,
5655 MCRegister SrcReg, bool KillSrc,
5656 unsigned Opcode, unsigned ZeroReg,
5657 llvm::ArrayRef<unsigned> Indices) const {
5659 unsigned NumRegs = Indices.size();
5660
// Debug-only sanity check: both encodings must be multiples of the tuple size.
5661#ifndef NDEBUG
5662 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
5663 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
5664 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
5665 "GPR reg sequences should not be able to overlap");
5666#endif
5667
5668 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
5669 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
5670 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
5671 MIB.addReg(ZeroReg);
5672 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
5673 MIB.addImm(0);
5674 }
5675}
5676
5677/// Returns true if the instruction at I is in a streaming call site region,
5678/// within a single basic block.
5679/// A "call site streaming region" starts after smstart and ends at smstop
5680/// around a call to a streaming function. This walks backward from I.
///
/// The answer is taken from the nearest preceding MSRpstatesvcrImm1 or
/// MSRpstatePseudo in MBB whose operand 0 is SVCRSM or SVCRSMZA: true if its
/// operand 1 is 1, false otherwise. Returns false when the function has no
/// streaming mode changes or when no such instruction precedes I in the block.
5683 MachineFunction &MF = *MBB.getParent();
5685 if (!AFI->hasStreamingModeChanges())
5686 return false;
5687 // Walk backwards to find smstart/smstop
// The range [MBB.begin(), I) excludes the instruction at I itself.
5688 for (MachineInstr &MI : reverse(make_range(MBB.begin(), I))) {
5689 unsigned Opc = MI.getOpcode();
5690 if (Opc == AArch64::MSRpstatesvcrImm1 || Opc == AArch64::MSRpstatePseudo) {
5691 // Check if this is SM change (not ZA)
5692 int64_t PState = MI.getOperand(0).getImm();
5693 if (PState == AArch64SVCR::SVCRSM || PState == AArch64SVCR::SVCRSMZA) {
5694 // Operand 1 is 1 for start, 0 for stop
5695 return MI.getOperand(1).getImm() == 1;
5696 }
5697 }
5698 }
5699 return false;
5700}
5701
5702/// Returns true if in a streaming call site region without SME-FA64.
///
/// The subtarget check comes first, so the backward scan done by
/// isInStreamingCallSiteRegion is skipped when Subtarget.hasSMEFA64() is true.
5703static bool mustAvoidNeonAtMBBI(const AArch64Subtarget &Subtarget,
5706 return !Subtarget.hasSMEFA64() && isInStreamingCallSiteRegion(MBB, I);
5707}
5708
5711 const DebugLoc &DL, Register DestReg,
5712 Register SrcReg, bool KillSrc,
5713 bool RenamableDest,
5714 bool RenamableSrc) const {
5715 ++NumCopyInstrs;
5716 if (AArch64::GPR32spRegClass.contains(DestReg) &&
5717 AArch64::GPR32spRegClass.contains(SrcReg)) {
5718 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
5719 // If either operand is WSP, expand to ADD #0.
5720 if (Subtarget.hasZeroCycleRegMoveGPR64() &&
5721 !Subtarget.hasZeroCycleRegMoveGPR32()) {
5722 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
5723 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5724 &AArch64::GPR64spRegClass);
5725 MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
5726 &AArch64::GPR64spRegClass);
5727 // This instruction is reading and writing X registers. This may upset
5728 // the register scavenger and machine verifier, so we need to indicate
5729 // that we are reading an undefined value from SrcRegX, but a proper
5730 // value from SrcReg.
5731 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
5732 .addReg(SrcRegX, RegState::Undef)
5733 .addImm(0)
5735 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5736 ++NumZCRegMoveInstrsGPR;
5737 } else {
5738 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
5739 .addReg(SrcReg, getKillRegState(KillSrc))
5740 .addImm(0)
5742 if (Subtarget.hasZeroCycleRegMoveGPR32())
5743 ++NumZCRegMoveInstrsGPR;
5744 }
5745 } else if (Subtarget.hasZeroCycleRegMoveGPR64() &&
5746 !Subtarget.hasZeroCycleRegMoveGPR32()) {
5747 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
5748 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5749 &AArch64::GPR64spRegClass);
5750 assert(DestRegX.isValid() && "Destination super-reg not valid");
5751 MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
5752 &AArch64::GPR64spRegClass);
5753 assert(SrcRegX.isValid() && "Source super-reg not valid");
5754 // This instruction is reading and writing X registers. This may upset
5755 // the register scavenger and machine verifier, so we need to indicate
5756 // that we are reading an undefined value from SrcRegX, but a proper
5757 // value from SrcReg.
5758 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
5759 .addReg(AArch64::XZR)
5760 .addReg(SrcRegX, RegState::Undef)
5761 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5762 ++NumZCRegMoveInstrsGPR;
5763 } else {
5764 // Otherwise, expand to ORR WZR.
5765 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
5766 .addReg(AArch64::WZR)
5767 .addReg(SrcReg, getKillRegState(KillSrc));
5768 if (Subtarget.hasZeroCycleRegMoveGPR32())
5769 ++NumZCRegMoveInstrsGPR;
5770 }
5771 return;
5772 }
5773
5774 // GPR32 zeroing
5775 if (AArch64::GPR32spRegClass.contains(DestReg) && SrcReg == AArch64::WZR) {
5776 if (Subtarget.hasZeroCycleZeroingGPR64() &&
5777 !Subtarget.hasZeroCycleZeroingGPR32()) {
5778 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5779 &AArch64::GPR64spRegClass);
5780 assert(DestRegX.isValid() && "Destination super-reg not valid");
5781 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestRegX)
5782 .addImm(0)
5784 ++NumZCZeroingInstrsGPR;
5785 } else if (Subtarget.hasZeroCycleZeroingGPR32()) {
5786 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
5787 .addImm(0)
5789 ++NumZCZeroingInstrsGPR;
5790 } else {
5791 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
5792 .addReg(AArch64::WZR)
5793 .addReg(AArch64::WZR);
5794 }
5795 return;
5796 }
5797
5798 if (AArch64::GPR64spRegClass.contains(DestReg) &&
5799 AArch64::GPR64spRegClass.contains(SrcReg)) {
5800 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
5801 // If either operand is SP, expand to ADD #0.
5802 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
5803 .addReg(SrcReg, getKillRegState(KillSrc))
5804 .addImm(0)
5806 if (Subtarget.hasZeroCycleRegMoveGPR64())
5807 ++NumZCRegMoveInstrsGPR;
5808 } else {
5809 // Otherwise, expand to ORR XZR.
5810 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
5811 .addReg(AArch64::XZR)
5812 .addReg(SrcReg, getKillRegState(KillSrc));
5813 if (Subtarget.hasZeroCycleRegMoveGPR64())
5814 ++NumZCRegMoveInstrsGPR;
5815 }
5816 return;
5817 }
5818
5819 // GPR64 zeroing
5820 if (AArch64::GPR64spRegClass.contains(DestReg) && SrcReg == AArch64::XZR) {
5821 if (Subtarget.hasZeroCycleZeroingGPR64()) {
5822 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
5823 .addImm(0)
5825 ++NumZCZeroingInstrsGPR;
5826 } else {
5827 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
5828 .addReg(AArch64::XZR)
5829 .addReg(AArch64::XZR);
5830 }
5831 return;
5832 }
5833
5834 // Copy a Predicate register by ORRing with itself.
5835 if (AArch64::PPRRegClass.contains(DestReg) &&
5836 AArch64::PPRRegClass.contains(SrcReg)) {
5837 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5838 "Unexpected SVE register.");
5839 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
5840 .addReg(SrcReg) // Pg
5841 .addReg(SrcReg)
5842 .addReg(SrcReg, getKillRegState(KillSrc));
5843 return;
5844 }
5845
5846 // Copy a predicate-as-counter register by ORRing with itself as if it
5847 // were a regular predicate (mask) register.
5848 bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
5849 bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
5850 if (DestIsPNR || SrcIsPNR) {
5851 auto ToPPR = [](MCRegister R) -> MCRegister {
5852 return (R - AArch64::PN0) + AArch64::P0;
5853 };
5854 MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg.asMCReg();
5855 MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg.asMCReg();
5856
5857 if (PPRSrcReg != PPRDestReg) {
5858 auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
5859 .addReg(PPRSrcReg) // Pg
5860 .addReg(PPRSrcReg)
5861 .addReg(PPRSrcReg, getKillRegState(KillSrc));
5862 if (DestIsPNR)
5863 NewMI.addDef(DestReg, RegState::Implicit);
5864 }
5865 return;
5866 }
5867
5868 // Copy a Z register by ORRing with itself.
5869 if (AArch64::ZPRRegClass.contains(DestReg) &&
5870 AArch64::ZPRRegClass.contains(SrcReg)) {
5871 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5872 "Unexpected SVE register.");
5873 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
5874 .addReg(SrcReg)
5875 .addReg(SrcReg, getKillRegState(KillSrc));
5876 return;
5877 }
5878
5879 // Copy a Z register pair by copying the individual sub-registers.
5880 if ((AArch64::ZPR2RegClass.contains(DestReg) ||
5881 AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
5882 (AArch64::ZPR2RegClass.contains(SrcReg) ||
5883 AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
5884 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5885 "Unexpected SVE register.");
5886 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
5887 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5888 Indices);
5889 return;
5890 }
5891
5892 // Copy a Z register triple by copying the individual sub-registers.
5893 if (AArch64::ZPR3RegClass.contains(DestReg) &&
5894 AArch64::ZPR3RegClass.contains(SrcReg)) {
5895 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5896 "Unexpected SVE register.");
5897 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5898 AArch64::zsub2};
5899 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5900 Indices);
5901 return;
5902 }
5903
5904 // Copy a Z register quad by copying the individual sub-registers.
5905 if ((AArch64::ZPR4RegClass.contains(DestReg) ||
5906 AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
5907 (AArch64::ZPR4RegClass.contains(SrcReg) ||
5908 AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
5909 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5910 "Unexpected SVE register.");
5911 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5912 AArch64::zsub2, AArch64::zsub3};
5913 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5914 Indices);
5915 return;
5916 }
5917
5918 // Copy a DDDD register quad by copying the individual sub-registers.
5919 if (AArch64::DDDDRegClass.contains(DestReg) &&
5920 AArch64::DDDDRegClass.contains(SrcReg)) {
5921 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5922 AArch64::dsub2, AArch64::dsub3};
5923 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5924 Indices);
5925 return;
5926 }
5927
5928 // Copy a DDD register triple by copying the individual sub-registers.
5929 if (AArch64::DDDRegClass.contains(DestReg) &&
5930 AArch64::DDDRegClass.contains(SrcReg)) {
5931 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5932 AArch64::dsub2};
5933 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5934 Indices);
5935 return;
5936 }
5937
5938 // Copy a DD register pair by copying the individual sub-registers.
5939 if (AArch64::DDRegClass.contains(DestReg) &&
5940 AArch64::DDRegClass.contains(SrcReg)) {
5941 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
5942 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5943 Indices);
5944 return;
5945 }
5946
5947 // Copy a QQQQ register quad by copying the individual sub-registers.
5948 if (AArch64::QQQQRegClass.contains(DestReg) &&
5949 AArch64::QQQQRegClass.contains(SrcReg)) {
5950 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5951 AArch64::qsub2, AArch64::qsub3};
5952 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5953 Indices);
5954 return;
5955 }
5956
5957 // Copy a QQQ register triple by copying the individual sub-registers.
5958 if (AArch64::QQQRegClass.contains(DestReg) &&
5959 AArch64::QQQRegClass.contains(SrcReg)) {
5960 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5961 AArch64::qsub2};
5962 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5963 Indices);
5964 return;
5965 }
5966
5967 // Copy a QQ register pair by copying the individual sub-registers.
5968 if (AArch64::QQRegClass.contains(DestReg) &&
5969 AArch64::QQRegClass.contains(SrcReg)) {
5970 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
5971 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5972 Indices);
5973 return;
5974 }
5975
5976 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
5977 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
5978 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
5979 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
5980 AArch64::XZR, Indices);
5981 return;
5982 }
5983
5984 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
5985 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
5986 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
5987 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
5988 AArch64::WZR, Indices);
5989 return;
5990 }
5991
5992 if (AArch64::FPR128RegClass.contains(DestReg) &&
5993 AArch64::FPR128RegClass.contains(SrcReg)) {
5994 // In streaming regions, NEON is illegal but streaming-SVE is available.
5995 // Use SVE for copies if we're in a streaming region and SME is available.
5996 // With +sme-fa64, NEON is legal in streaming mode so we can use it.
5997 if ((Subtarget.isSVEorStreamingSVEAvailable() &&
5998 !Subtarget.isNeonAvailable()) ||
5999 mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
6000 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
6001 .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
6002 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
6003 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
6004 } else if (Subtarget.isNeonAvailable()) {
6005 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
6006 .addReg(SrcReg)
6007 .addReg(SrcReg, getKillRegState(KillSrc));
6008 if (Subtarget.hasZeroCycleRegMoveFPR128())
6009 ++NumZCRegMoveInstrsFPR;
6010 } else {
6011 BuildMI(MBB, I, DL, get(AArch64::STRQpre))
6012 .addReg(AArch64::SP, RegState::Define)
6013 .addReg(SrcReg, getKillRegState(KillSrc))
6014 .addReg(AArch64::SP)
6015 .addImm(-16);
6016 BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
6017 .addReg(AArch64::SP, RegState::Define)
6018 .addReg(DestReg, RegState::Define)
6019 .addReg(AArch64::SP)
6020 .addImm(16);
6021 }
6022 return;
6023 }
6024
6025 if (AArch64::FPR64RegClass.contains(DestReg) &&
6026 AArch64::FPR64RegClass.contains(SrcReg)) {
6027 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
6028 !Subtarget.hasZeroCycleRegMoveFPR64() &&
6029 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
6030 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
6031 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
6032 &AArch64::FPR128RegClass);
6033 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
6034 &AArch64::FPR128RegClass);
6035 // This instruction is reading and writing Q registers. This may upset
6036 // the register scavenger and machine verifier, so we need to indicate
6037 // that we are reading an undefined value from SrcRegQ, but a proper
6038 // value from SrcReg.
6039 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
6040 .addReg(SrcRegQ, RegState::Undef)
6041 .addReg(SrcRegQ, RegState::Undef)
6042 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
6043 ++NumZCRegMoveInstrsFPR;
6044 } else {
6045 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
6046 .addReg(SrcReg, getKillRegState(KillSrc));
6047 if (Subtarget.hasZeroCycleRegMoveFPR64())
6048 ++NumZCRegMoveInstrsFPR;
6049 }
6050 return;
6051 }
6052
6053 if (AArch64::FPR32RegClass.contains(DestReg) &&
6054 AArch64::FPR32RegClass.contains(SrcReg)) {
6055 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
6056 !Subtarget.hasZeroCycleRegMoveFPR64() &&
6057 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
6058 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
6059 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
6060 &AArch64::FPR128RegClass);
6061 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
6062 &AArch64::FPR128RegClass);
6063 // This instruction is reading and writing Q registers. This may upset
6064 // the register scavenger and machine verifier, so we need to indicate
6065 // that we are reading an undefined value from SrcRegQ, but a proper
6066 // value from SrcReg.
6067 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
6068 .addReg(SrcRegQ, RegState::Undef)
6069 .addReg(SrcRegQ, RegState::Undef)
6070 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
6071 ++NumZCRegMoveInstrsFPR;
6072 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
6073 !Subtarget.hasZeroCycleRegMoveFPR32()) {
6074 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
6075 &AArch64::FPR64RegClass);
6076 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
6077 &AArch64::FPR64RegClass);
6078 // This instruction is reading and writing D registers. This may upset
6079 // the register scavenger and machine verifier, so we need to indicate
6080 // that we are reading an undefined value from SrcRegD, but a proper
6081 // value from SrcReg.
6082 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
6083 .addReg(SrcRegD, RegState::Undef)
6084 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
6085 ++NumZCRegMoveInstrsFPR;
6086 } else {
6087 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
6088 .addReg(SrcReg, getKillRegState(KillSrc));
6089 if (Subtarget.hasZeroCycleRegMoveFPR32())
6090 ++NumZCRegMoveInstrsFPR;
6091 }
6092 return;
6093 }
6094
6095 if (AArch64::FPR16RegClass.contains(DestReg) &&
6096 AArch64::FPR16RegClass.contains(SrcReg)) {
6097 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
6098 !Subtarget.hasZeroCycleRegMoveFPR64() &&
6099 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
6100 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
6101 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
6102 &AArch64::FPR128RegClass);
6103 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
6104 &AArch64::FPR128RegClass);
6105 // This instruction is reading and writing Q registers. This may upset
6106 // the register scavenger and machine verifier, so we need to indicate
6107 // that we are reading an undefined value from SrcRegQ, but a proper
6108 // value from SrcReg.
6109 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
6110 .addReg(SrcRegQ, RegState::Undef)
6111 .addReg(SrcRegQ, RegState::Undef)
6112 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
6113 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
6114 !Subtarget.hasZeroCycleRegMoveFPR32()) {
6115 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
6116 &AArch64::FPR64RegClass);
6117 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
6118 &AArch64::FPR64RegClass);
6119 // This instruction is reading and writing D registers. This may upset
6120 // the register scavenger and machine verifier, so we need to indicate
6121 // that we are reading an undefined value from SrcRegD, but a proper
6122 // value from SrcReg.
6123 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
6124 .addReg(SrcRegD, RegState::Undef)
6125 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
6126 } else {
6127 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
6128 &AArch64::FPR32RegClass);
6129 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
6130 &AArch64::FPR32RegClass);
6131 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
6132 .addReg(SrcReg, getKillRegState(KillSrc));
6133 }
6134 return;
6135 }
6136
6137 if (AArch64::FPR8RegClass.contains(DestReg) &&
6138 AArch64::FPR8RegClass.contains(SrcReg)) {
6139 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
6140 !Subtarget.hasZeroCycleRegMoveFPR64() &&
6141 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
6142 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
6143 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
6144 &AArch64::FPR128RegClass);
6145 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
6146 &AArch64::FPR128RegClass);
6147 // This instruction is reading and writing Q registers. This may upset
6148 // the register scavenger and machine verifier, so we need to indicate
6149 // that we are reading an undefined value from SrcRegQ, but a proper
6150 // value from SrcReg.
6151 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
6152 .addReg(SrcRegQ, RegState::Undef)
6153 .addReg(SrcRegQ, RegState::Undef)
6154 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
6155 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
6156 !Subtarget.hasZeroCycleRegMoveFPR32()) {
6157 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
6158 &AArch64::FPR64RegClass);
6159 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
6160 &AArch64::FPR64RegClass);
6161 // This instruction is reading and writing D registers. This may upset
6162 // the register scavenger and machine verifier, so we need to indicate
6163 // that we are reading an undefined value from SrcRegD, but a proper
6164 // value from SrcReg.
6165 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
6166 .addReg(SrcRegD, RegState::Undef)
6167 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
6168 } else {
6169 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
6170 &AArch64::FPR32RegClass);
6171 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
6172 &AArch64::FPR32RegClass);
6173 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
6174 .addReg(SrcReg, getKillRegState(KillSrc));
6175 }
6176 return;
6177 }
6178
6179 // Copies between GPR64 and FPR64.
6180 if (AArch64::FPR64RegClass.contains(DestReg) &&
6181 AArch64::GPR64RegClass.contains(SrcReg)) {
6182 if (AArch64::XZR == SrcReg) {
6183 BuildMI(MBB, I, DL, get(AArch64::FMOVD0), DestReg);
6184 } else {
6185 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
6186 .addReg(SrcReg, getKillRegState(KillSrc));
6187 }
6188 return;
6189 }
6190 if (AArch64::GPR64RegClass.contains(DestReg) &&
6191 AArch64::FPR64RegClass.contains(SrcReg)) {
6192 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
6193 .addReg(SrcReg, getKillRegState(KillSrc));
6194 return;
6195 }
6196 // Copies between GPR32 and FPR32.
6197 if (AArch64::FPR32RegClass.contains(DestReg) &&
6198 AArch64::GPR32RegClass.contains(SrcReg)) {
6199 if (AArch64::WZR == SrcReg) {
6200 BuildMI(MBB, I, DL, get(AArch64::FMOVS0), DestReg);
6201 } else {
6202 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
6203 .addReg(SrcReg, getKillRegState(KillSrc));
6204 }
6205 return;
6206 }
6207 if (AArch64::GPR32RegClass.contains(DestReg) &&
6208 AArch64::FPR32RegClass.contains(SrcReg)) {
6209 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
6210 .addReg(SrcReg, getKillRegState(KillSrc));
6211 return;
6212 }
6213
6214 if (DestReg == AArch64::NZCV) {
6215 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
6216 BuildMI(MBB, I, DL, get(AArch64::MSR))
6217 .addImm(AArch64SysReg::NZCV)
6218 .addReg(SrcReg, getKillRegState(KillSrc))
6219 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
6220 return;
6221 }
6222
6223 if (SrcReg == AArch64::NZCV) {
6224 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
6225 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
6226 .addImm(AArch64SysReg::NZCV)
6227 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
6228 return;
6229 }
6230
6231#ifndef NDEBUG
6232 errs() << RI.getRegAsmName(DestReg) << " = COPY " << RI.getRegAsmName(SrcReg)
6233 << "\n";
6234#endif
6235 llvm_unreachable("unimplemented reg-to-reg copy");
6236}
6237
// Builds a single instruction described by MCID whose operands are, in order:
// the SubIdx0 part of SrcReg, the SubIdx1 part of SrcReg, the frame index FI,
// an immediate 0 and the memory operand MMO.
// NOTE(review): the opening of this definition (original lines 6238-6239:
// return type, name and leading parameters) is missing from this listing, so
// TRI and MBB are presumably parameters -- confirm against the full source.
6240 MachineBasicBlock::iterator InsertBefore,
6241 const MCInstrDesc &MCID,
6242 Register SrcReg, bool IsKill,
6243 unsigned SubIdx0, unsigned SubIdx1, int FI,
6244 MachineMemOperand *MMO) {
6245 Register SrcReg0 = SrcReg;
6246 Register SrcReg1 = SrcReg;
// A physical register is resolved to its two concrete sub-registers and the
// sub-register indices are cleared. A virtual register is passed twice, each
// time qualified by the corresponding sub-register index.
6247 if (SrcReg.isPhysical()) {
6248 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
6249 SubIdx0 = 0;
6250 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
6251 SubIdx1 = 0;
6252 }
// Both register operands carry the same kill state. The instruction is built
// with an empty DebugLoc.
6253 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
6254 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
6255 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
6256 .addFrameIndex(FI)
6257 .addImm(0)
6258 .addMemOperand(MMO);
6259}
6260
// Stores SrcReg to stack slot FI. The opcode is chosen from the spill size
// and register class of RC; the frame object is then tagged with StackID and
// the register, frame-index, optional immediate-0 and memory operands are
// appended to the instruction.
// NOTE(review): this listing omits several original lines of this definition
// (the embedded numbering skips 6261-6262, 6270, 6272, 6276, 6291, 6306,
// 6319, 6333, 6341, 6364, 6369, 6381, 6393, 6398 and 6405). These include the
// start of the signature and the declarations of PNRReg and MI, so those
// names are used below without a visible definition -- check the full source.
6263 Register SrcReg, bool isKill, int FI,
6264 const TargetRegisterClass *RC,
6265 Register VReg,
6266 MachineInstr::MIFlag Flags) const {
6267 MachineFunction &MF = *MBB.getParent();
6268 MachineFrameInfo &MFI = MF.getFrameInfo();
6269
6271 MachineMemOperand *MMO =
6273 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
// Opc stays 0 if no case below matches; the assert after the switch traps it.
6274 unsigned Opc = 0;
// Offset: whether an immediate 0 is appended after the frame index. The ST1*
// multi-register opcodes below clear it.
6275 bool Offset = true;
6277 unsigned StackID = TargetStackID::Default;
6278 switch (RI.getSpillSize(*RC)) {
6279 case 1:
6280 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
6281 Opc = AArch64::STRBui;
6282 break;
6283 case 2: {
6284 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
6285 Opc = AArch64::STRHui;
6286 else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
6287 AArch64::PPRRegClass.hasSubClassEq(RC)) {
6288 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6289 "Unexpected register store without SVE store instructions");
6290 Opc = AArch64::STR_PXI;
6292 }
6293 break;
6294 }
6295 case 4:
6296 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
6297 Opc = AArch64::STRWui;
// Virtual sources are constrained to GPR32; a physical source is asserted not
// to be WSP.
6298 if (SrcReg.isVirtual())
6299 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
6300 else
6301 assert(SrcReg != AArch64::WSP);
6302 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6303 Opc = AArch64::STRSui;
6304 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
6305 Opc = AArch64::STR_PPXI;
6307 }
6308 break;
6309 case 8:
6310 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
6311 Opc = AArch64::STRXui;
// Same treatment as the 32-bit case: constrain to GPR64, and reject SP.
6312 if (SrcReg.isVirtual())
6313 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
6314 else
6315 assert(SrcReg != AArch64::SP);
6316 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
6317 Opc = AArch64::STRDui;
6318 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
// W register pairs: the visible arguments pass STPWi with the sube32/subo32
// sub-register indices, then the function returns early. The first line of
// this call is missing from the listing.
6320 get(AArch64::STPWi), SrcReg, isKill,
6321 AArch64::sube32, AArch64::subo32, FI, MMO);
6322 return;
6323 }
6324 break;
6325 case 16:
6326 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6327 Opc = AArch64::STRQui;
6328 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6329 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6330 Opc = AArch64::ST1Twov1d;
6331 Offset = false;
6332 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
// X register pairs: same early-return path as above, using STPXi with
// sube64/subo64. The first line of this call is also missing.
6334 get(AArch64::STPXi), SrcReg, isKill,
6335 AArch64::sube64, AArch64::subo64, FI, MMO);
6336 return;
6337 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6338 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6339 "Unexpected register store without SVE store instructions");
6340 Opc = AArch64::STR_ZXI;
6342 }
6343 break;
6344 case 24:
6345 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6346 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6347 Opc = AArch64::ST1Threev1d;
6348 Offset = false;
6349 }
6350 break;
6351 case 32:
6352 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6353 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6354 Opc = AArch64::ST1Fourv1d;
6355 Offset = false;
6356 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6357 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6358 Opc = AArch64::ST1Twov2d;
6359 Offset = false;
6360 } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6361 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6362 "Unexpected register store without SVE store instructions");
6363 Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS;
6365 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6366 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6367 "Unexpected register store without SVE store instructions");
6368 Opc = AArch64::STR_ZZXI;
6370 }
6371 break;
6372 case 48:
6373 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6374 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6375 Opc = AArch64::ST1Threev2d;
6376 Offset = false;
6377 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6378 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6379 "Unexpected register store without SVE store instructions");
6380 Opc = AArch64::STR_ZZZXI;
6382 }
6383 break;
6384 case 64:
6385 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6386 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6387 Opc = AArch64::ST1Fourv2d;
6388 Offset = false;
6389 } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6390 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6391 "Unexpected register store without SVE store instructions");
6392 Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS;
6394 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6395 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6396 "Unexpected register store without SVE store instructions");
6397 Opc = AArch64::STR_ZZZZXI;
6399 }
6400 break;
6401 }
6402 assert(Opc && "Unknown register class");
6403 MFI.setStackID(FI, StackID);
6404
// The statement that begins this operand chain (and defines MI) is on a line
// missing from this listing.
6406 .addReg(SrcReg, getKillRegState(isKill))
6407 .addFrameIndex(FI);
6408
6409 if (Offset)
6410 MI.addImm(0);
// When PNRReg is valid it is added as an implicit def of the instruction.
6411 if (PNRReg.isValid())
6412 MI.addDef(PNRReg, RegState::Implicit);
6413 MI.addMemOperand(MMO);
6414}
6415
// Builds a single instruction described by MCID that defines the SubIdx0 and
// SubIdx1 parts of DestReg, followed by the frame index FI, an immediate 0
// and the memory operand MMO.
// NOTE(review): the opening of this definition (original lines 6416-6417:
// return type, name and leading parameters) is missing from this listing, so
// TRI and MBB are presumably parameters -- confirm against the full source.
6418 MachineBasicBlock::iterator InsertBefore,
6419 const MCInstrDesc &MCID,
6420 Register DestReg, unsigned SubIdx0,
6421 unsigned SubIdx1, int FI,
6422 MachineMemOperand *MMO) {
6423 Register DestReg0 = DestReg;
6424 Register DestReg1 = DestReg;
// Sub-register defs of a virtual register are flagged undef. For a physical
// register the concrete sub-registers are defined instead, without the undef
// flag and with the sub-register indices cleared.
6425 bool IsUndef = true;
6426 if (DestReg.isPhysical()) {
6427 DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
6428 SubIdx0 = 0;
6429 DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
6430 SubIdx1 = 0;
6431 IsUndef = false;
6432 }
// The instruction is built with an empty DebugLoc.
6433 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
6434 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
6435 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
6436 .addFrameIndex(FI)
6437 .addImm(0)
6438 .addMemOperand(MMO);
6439}
6440
// Loads DestReg from stack slot FI. The opcode is chosen from the spill size
// and register class of RC; the frame object is then tagged with StackID and
// the defined register, frame-index, optional immediate-0 and memory operands
// are appended to the instruction.
// NOTE(review): this listing omits several original lines of this definition
// (the embedded numbering skips 6441-6442, 6449, 6451, 6457, 6473, 6488,
// 6501, 6515, 6523, 6546, 6551, 6563, 6575, 6580 and 6588). These include the
// start of the signature and the declarations of PNRReg and MI, so those
// names (and TRI) are used below without a visible definition -- check the
// full source.
6443 Register DestReg, int FI,
6444 const TargetRegisterClass *RC,
6445 Register VReg, unsigned SubReg,
6446 MachineInstr::MIFlag Flags) const {
6447 MachineFunction &MF = *MBB.getParent();
6448 MachineFrameInfo &MFI = MF.getFrameInfo();
6450 MachineMemOperand *MMO =
6452 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
6453
// Opc stays 0 if no case below matches; the assert after the switch traps it.
6454 unsigned Opc = 0;
// Offset: whether an immediate 0 is appended after the frame index. The LD1*
// multi-register opcodes below clear it.
6455 bool Offset = true;
6456 unsigned StackID = TargetStackID::Default;
6458 switch (TRI.getSpillSize(*RC)) {
6459 case 1:
6460 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
6461 Opc = AArch64::LDRBui;
6462 break;
6463 case 2: {
6464 bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
6465 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
6466 Opc = AArch64::LDRHui;
6467 else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
6468 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6469 "Unexpected register load without SVE load instructions");
// For a PNR destination, remember it so it can be added as an implicit def
// after the instruction is built (see the end of the function).
6470 if (IsPNR)
6471 PNRReg = DestReg;
6472 Opc = AArch64::LDR_PXI;
6474 }
6475 break;
6476 }
6477 case 4:
6478 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
6479 Opc = AArch64::LDRWui;
// Virtual destinations are constrained to GPR32; a physical destination is
// asserted not to be WSP.
6480 if (DestReg.isVirtual())
6481 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
6482 else
6483 assert(DestReg != AArch64::WSP);
6484 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6485 Opc = AArch64::LDRSui;
6486 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
6487 Opc = AArch64::LDR_PPXI;
6489 }
6490 break;
6491 case 8:
6492 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
6493 Opc = AArch64::LDRXui;
// Same treatment as the 32-bit case: constrain to GPR64, and reject SP.
6494 if (DestReg.isVirtual())
6495 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
6496 else
6497 assert(DestReg != AArch64::SP);
6498 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
6499 Opc = AArch64::LDRDui;
6500 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
// W register pairs: the visible arguments pass LDPWi with the sube32/subo32
// sub-register indices, then the function returns early. The first line of
// this call is missing from the listing.
6502 get(AArch64::LDPWi), DestReg, AArch64::sube32,
6503 AArch64::subo32, FI, MMO);
6504 return;
6505 }
6506 break;
6507 case 16:
6508 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6509 Opc = AArch64::LDRQui;
6510 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6511 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6512 Opc = AArch64::LD1Twov1d;
6513 Offset = false;
6514 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
// X register pairs: same early-return path as above, using LDPXi with
// sube64/subo64. The first line of this call is also missing.
6516 get(AArch64::LDPXi), DestReg, AArch64::sube64,
6517 AArch64::subo64, FI, MMO);
6518 return;
6519 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6520 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6521 "Unexpected register load without SVE load instructions");
6522 Opc = AArch64::LDR_ZXI;
6524 }
6525 break;
6526 case 24:
6527 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6528 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6529 Opc = AArch64::LD1Threev1d;
6530 Offset = false;
6531 }
6532 break;
6533 case 32:
6534 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6535 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6536 Opc = AArch64::LD1Fourv1d;
6537 Offset = false;
6538 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6539 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6540 Opc = AArch64::LD1Twov2d;
6541 Offset = false;
6542 } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6543 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6544 "Unexpected register load without SVE load instructions");
6545 Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS;
6547 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6548 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6549 "Unexpected register load without SVE load instructions");
6550 Opc = AArch64::LDR_ZZXI;
6552 }
6553 break;
6554 case 48:
6555 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6556 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6557 Opc = AArch64::LD1Threev2d;
6558 Offset = false;
6559 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6560 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6561 "Unexpected register load without SVE load instructions");
6562 Opc = AArch64::LDR_ZZZXI;
6564 }
6565 break;
6566 case 64:
6567 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6568 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6569 Opc = AArch64::LD1Fourv2d;
6570 Offset = false;
6571 } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6572 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6573 "Unexpected register load without SVE load instructions");
6574 Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS;
6576 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6577 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6578 "Unexpected register load without SVE load instructions");
6579 Opc = AArch64::LDR_ZZZZXI;
6581 }
6582 break;
6583 }
6584
6585 assert(Opc && "Unknown register class");
6586 MFI.setStackID(FI, StackID);
6587
// The statement that begins this operand chain (and defines MI) is on a line
// missing from this listing.
6589 .addReg(DestReg, getDefRegState(true))
6590 .addFrameIndex(FI);
6591 if (Offset)
6592 MI.addImm(0);
// The implicit def is only added when PNRReg is valid and not virtual.
6593 if (PNRReg.isValid() && !PNRReg.isVirtual())
6594 MI.addDef(PNRReg, RegState::Implicit);
6595 MI.addMemOperand(MMO);
6596}
6597
// Returns true if any non-debug instruction in the range that starts just
// after DefMI and ends just before UseMI (both excluded) modifies or reads
// the NZCV register.
// NOTE(review): the first line of the signature (original line 6598: return
// type, name and the DefMI parameter) is missing from this listing.
6599 const MachineInstr &UseMI,
6600 const TargetRegisterInfo *TRI) {
6601 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
6602 UseMI.getIterator()),
6603 [TRI](const MachineInstr &I) {
6604 return I.modifiesRegister(AArch64::NZCV, TRI) ||
6605 I.readsRegister(AArch64::NZCV, TRI);
6606 });
6607}
6608
6609void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6610 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
6611 // The smallest scalable element supported by scaled SVE addressing
6612 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6613 // byte offset must always be a multiple of 2.
6614 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6615
6616 // VGSized offsets are divided by '2', because the VG register is the
6617 // the number of 64bit granules as opposed to 128bit vector chunks,
6618 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
6619 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
6620 // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
6621 ByteSized = Offset.getFixed();
6622 VGSized = Offset.getScalable() / 2;
6623}
6624
6625/// Returns the offset in parts to which this frame offset can be
6626/// decomposed for the purpose of describing a frame offset.
6627/// For non-scalable offsets this is simply its byte size.
6628void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6629 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
6630 int64_t &NumDataVectors) {
6631 // The smallest scalable element supported by scaled SVE addressing
6632 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6633 // byte offset must always be a multiple of 2.
6634 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6635
6636 NumBytes = Offset.getFixed();
6637 NumDataVectors = 0;
6638 NumPredicateVectors = Offset.getScalable() / 2;
6639 // This method is used to get the offsets to adjust the frame offset.
6640 // If the function requires ADDPL to be used and needs more than two ADDPL
6641 // instructions, part of the offset is folded into NumDataVectors so that it
6642 // uses ADDVL for part of it, reducing the number of ADDPL instructions.
6643 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
6644 NumPredicateVectors > 62) {
6645 NumDataVectors = NumPredicateVectors / 8;
6646 NumPredicateVectors -= NumDataVectors * 8;
6647 }
6648}
6649
6650// Convenience function to create a DWARF expression for: Constant `Operation`.
6651// This helper emits compact sequences for common cases. For example, for `-15
6652// DW_OP_plus`, this helper would create DW_OP_lit15 DW_OP_minus.
// NOTE(review): the signature (original lines 6653-6654) and original line
// 6665 are missing from this listing. Expr, Constant and Operation are
// presumably the parameters -- confirm against the full source.
6655 if (Operation == dwarf::DW_OP_plus && Constant < 0 && -Constant <= 31) {
6656 // -Constant (1 to 31)
// Adding a small negative constant is rewritten as subtracting its magnitude,
// which fits in a single DW_OP_lit<N> opcode.
6657 Expr.push_back(dwarf::DW_OP_lit0 - Constant);
6658 Operation = dwarf::DW_OP_minus;
6659 } else if (Constant >= 0 && Constant <= 31) {
6660 // Literal value 0 to 31
6661 Expr.push_back(dwarf::DW_OP_lit0 + Constant);
6662 } else {
6663 // Signed constant
// Only the DW_OP_consts opcode is visible here; its operand is presumably
// appended on the missing line 6665.
6664 Expr.push_back(dwarf::DW_OP_consts);
6666 }
// The (possibly rewritten) operation always follows the constant.
6667 return Expr.push_back(Operation);
6668}
6669
6670// Convenience function to create a DWARF expression for a register.
// Appends DW_OP_bregx to Expr and finishes with a single 0 byte.
// NOTE(review): original line 6673 is missing from this listing; it
// presumably appends the encoded RegNum between the opcode and the trailing
// 0 -- confirm against the full source.
6671static void appendReadRegExpr(SmallVectorImpl<char> &Expr, unsigned RegNum) {
6672 Expr.push_back((char)dwarf::DW_OP_bregx);
6674 Expr.push_back(0);
6675}
6676
6677// Convenience function to create a DWARF expression for loading a register from
6678// a CFA offset.
// Appends DW_OP_dup, the constant-plus sequence for OffsetFromDefCFA, and
// DW_OP_deref to Expr.
// NOTE(review): the first line of the signature (original line 6679) is
// missing from this listing.
6680 int64_t OffsetFromDefCFA) {
6681 // This assumes the top of the DWARF stack contains the CFA.
6682 Expr.push_back(dwarf::DW_OP_dup);
6683 // Add the offset to the register.
6684 appendConstantExpr(Expr, OffsetFromDefCFA, dwarf::DW_OP_plus);
6685 // Dereference the address (loads a 64 bit value).
6686 Expr.push_back(dwarf::DW_OP_deref);
6687}
6688
6689// Convenience function to create a comment for
6690// (+/-) NumBytes (* RegScale)?
6691static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment,
6692 StringRef RegScale = {}) {
6693 if (NumBytes) {
6694 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
6695 if (!RegScale.empty())
6696 Comment << ' ' << RegScale;
6697 }
6698}
6699
6700// Creates an MCCFIInstruction:
6701// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
// The expression evaluates Reg + NumBytes, plus VG * NumVGScaledBytes when
// the scalable part of Offset is non-zero. A human-readable form of the same
// expression is attached to the escape as its comment.
// NOTE(review): the first line of the signature (original line 6702) is
// missing from this listing; TRI is presumably its first parameter.
6703 unsigned Reg,
6704 const StackOffset &Offset) {
6705 int64_t NumBytes, NumVGScaledBytes;
6706 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
6707 NumVGScaledBytes);
6708 std::string CommentBuffer;
6709 llvm::raw_string_ostream Comment(CommentBuffer);
6710
// SP and FP get short fixed names in the comment; other registers are
// printed via printReg.
6711 if (Reg == AArch64::SP)
6712 Comment << "sp";
6713 else if (Reg == AArch64::FP)
6714 Comment << "fp";
6715 else
6716 Comment << printReg(Reg, &TRI);
6717
6718 // Build up the expression (Reg + NumBytes + VG * NumVGScaledBytes)
6719 SmallString<64> Expr;
6720 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
// The register is encoded by adding it to DW_OP_breg0, hence the 0..31 limit.
6721 assert(DwarfReg <= 31 && "DwarfReg out of bounds (0..31)");
6722 // Reg + NumBytes
6723 Expr.push_back(dwarf::DW_OP_breg0 + DwarfReg);
6724 appendLEB128<LEB128Sign::Signed>(Expr, NumBytes);
6725 appendOffsetComment(NumBytes, Comment);
6726 if (NumVGScaledBytes) {
6727 // + VG * NumVGScaledBytes
6728 appendOffsetComment(NumVGScaledBytes, Comment, "* VG");
6729 appendReadRegExpr(Expr, TRI.getDwarfRegNum(AArch64::VG, true));
6730 appendConstantExpr(Expr, NumVGScaledBytes, dwarf::DW_OP_mul);
6731 Expr.push_back(dwarf::DW_OP_plus);
6732 }
6733
6734 // Wrap this into DW_CFA_def_cfa_expression.
6735 SmallString<64> DefCfaExpr;
6736 DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
6737 appendLEB128<LEB128Sign::Unsigned>(DefCfaExpr, Expr.size());
6738 DefCfaExpr.append(Expr.str());
6739 return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
6740 Comment.str());
6741}
6742
// Returns the CFI instruction that defines the CFA as Reg + Offset, choosing
// between three forms:
//  - Offset has a scalable part: the expression built by
//    createDefCFAExpression.
//  - Reg equals FrameReg and the last adjustment was not scalable: only the
//    CFA offset is updated (cfiDefCfaOffset).
//  - otherwise: both the register and the fixed offset are given (cfiDefCfa).
// NOTE(review): the first line of the signature (original line 6743) is
// missing from this listing; TRI is presumably its first parameter.
6744 unsigned FrameReg, unsigned Reg,
6745 const StackOffset &Offset,
6746 bool LastAdjustmentWasScalable) {
6747 if (Offset.getScalable())
6748 return createDefCFAExpression(TRI, Reg, Offset);
6749
6750 if (FrameReg == Reg && !LastAdjustmentWasScalable)
6751 return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));
6752
6753 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6754 return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
6755}
6756
// Returns a CFI instruction describing Reg at OffsetFromDefCFA relative to
// the CFA. Without a scalable part this is a plain createOffset; otherwise it
// is a DW_CFA_expression escape computing
// CFA + VG * NumVGScaledBytes + NumBytes. When IncomingVGOffsetFromDefCFA is
// set, the VG value is loaded from that CFA-relative offset instead of being
// read from the VG register.
// NOTE(review): the opening of the signature (original lines 6757-6758) is
// missing from this listing; TRI and Reg are presumably its leading
// parameters.
6759 const StackOffset &OffsetFromDefCFA,
6760 std::optional<int64_t> IncomingVGOffsetFromDefCFA) {
6761 int64_t NumBytes, NumVGScaledBytes;
6762 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6763 OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
6764
6765 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6766
6767 // Non-scalable offsets can use DW_CFA_offset directly.
6768 if (!NumVGScaledBytes)
6769 return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
6770
6771 std::string CommentBuffer;
6772 llvm::raw_string_ostream Comment(CommentBuffer);
6773 Comment << printReg(Reg, &TRI) << " @ cfa";
6774
6775 // Build up expression (CFA + VG * NumVGScaledBytes + NumBytes)
6776 assert(NumVGScaledBytes && "Expected scalable offset");
6777 SmallString<64> OffsetExpr;
6778 // + VG * NumVGScaledBytes
// VGRegScale is only the label used in the comment text; the expression
// bytes come from the append*Expr calls.
6779 StringRef VGRegScale;
6780 if (IncomingVGOffsetFromDefCFA) {
6781 appendLoadRegExpr(OffsetExpr, *IncomingVGOffsetFromDefCFA);
6782 VGRegScale = "* IncomingVG";
6783 } else {
6784 appendReadRegExpr(OffsetExpr, TRI.getDwarfRegNum(AArch64::VG, true));
6785 VGRegScale = "* VG";
6786 }
6787 appendConstantExpr(OffsetExpr, NumVGScaledBytes, dwarf::DW_OP_mul);
6788 appendOffsetComment(NumVGScaledBytes, Comment, VGRegScale);
6789 OffsetExpr.push_back(dwarf::DW_OP_plus);
6790 if (NumBytes) {
6791 // + NumBytes
6792 appendOffsetComment(NumBytes, Comment);
6793 appendConstantExpr(OffsetExpr, NumBytes, dwarf::DW_OP_plus);
6794 }
6795
6796 // Wrap this into DW_CFA_expression
// Layout: opcode, ULEB128 register number, ULEB128 expression length, then
// the expression bytes.
6797 SmallString<64> CfaExpr;
6798 CfaExpr.push_back(dwarf::DW_CFA_expression);
6799 appendLEB128<LEB128Sign::Unsigned>(CfaExpr, DwarfReg);
6800 appendLEB128<LEB128Sign::Unsigned>(CfaExpr, OffsetExpr.size());
6801 CfaExpr.append(OffsetExpr.str());
6802
6803 return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
6804 Comment.str());
6805}
6806
6807// Helper function to emit a frame offset adjustment from a given
6808// pointer (SrcReg), stored into DestReg. This function is explicit
6809// in that it requires the opcode.
// The adjustment is emitted as one or more Opc instructions, each covering as
// much of Offset as the immediate encoding allows. CFI and Windows SEH
// pseudo-instructions are emitted alongside when requested.
// NOTE(review): the opening of the signature (original lines 6810-6811) and
// original line 6885 are missing from this listing; MBB and MBBI are
// presumably the leading parameters -- confirm against the full source.
6812 const DebugLoc &DL, unsigned DestReg,
6813 unsigned SrcReg, int64_t Offset, unsigned Opc,
6814 const TargetInstrInfo *TII,
6815 MachineInstr::MIFlag Flag, bool NeedsWinCFI,
6816 bool *HasWinCFI, bool EmitCFAOffset,
6817 StackOffset CFAOffset, unsigned FrameReg) {
6818 int Sign = 1;
// MaxEncoding is the largest unshifted immediate the opcode accepts and
// ShiftSize is the optional left shift (0 when the opcode has none).
6819 unsigned MaxEncoding, ShiftSize;
6820 switch (Opc) {
6821 case AArch64::ADDXri:
6822 case AArch64::ADDSXri:
6823 case AArch64::SUBXri:
6824 case AArch64::SUBSXri:
6825 MaxEncoding = 0xfff;
6826 ShiftSize = 12;
6827 break;
6828 case AArch64::ADDVL_XXI:
6829 case AArch64::ADDPL_XXI:
6830 case AArch64::ADDSVL_XXI:
6831 case AArch64::ADDSPL_XXI:
6832 MaxEncoding = 31;
6833 ShiftSize = 0;
// For these opcodes a negative offset is handled as a magnitude plus a sign,
// with one more value available on the negative side (32 instead of 31).
6834 if (Offset < 0) {
6835 MaxEncoding = 32;
6836 Sign = -1;
6837 Offset = -Offset;
6838 }
6839 break;
6840 default:
6841 llvm_unreachable("Unsupported opcode");
6842 }
6843
6844 // `Offset` can be in bytes or in "scalable bytes".
6845 int VScale = 1;
6846 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
6847 VScale = 16;
6848 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
6849 VScale = 2;
6850
6851 // FIXME: If the offset won't fit in 24-bits, compute the offset into a
6852 // scratch register. If DestReg is a virtual register, use it as the
6853 // scratch register; otherwise, create a new virtual register (to be
6854 // replaced by the scavenger at the end of PEI). That case can be optimized
6855 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
6856 // register can be loaded with offset%8 and the add/sub can use an extending
6857 // instruction with LSL#3.
6858 // Currently the function handles any offsets but generates a poor sequence
6859 // of code.
6860 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
6861
6862 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
// Intermediate results go to TmpReg. When DestReg is XZR a fresh virtual
// register is used instead; the final instruction (Offset == 0 below) still
// writes DestReg.
6863 Register TmpReg = DestReg;
6864 if (TmpReg == AArch64::XZR)
6865 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
6866 &AArch64::GPR64RegClass);
6867 do {
// ThisVal is the part of Offset handled by this iteration, capped at
// MaxEncodableValue. If it does not fit unshifted, only its shifted-down
// value is encoded and the discarded low bits stay in Offset for a later
// iteration.
6868 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
6869 unsigned LocalShiftSize = 0;
6870 if (ThisVal > MaxEncoding) {
6871 ThisVal = ThisVal >> ShiftSize;
6872 LocalShiftSize = ShiftSize;
6873 }
6874 assert((ThisVal >> ShiftSize) <= MaxEncoding &&
6875 "Encoding cannot handle value that big");
6876
6877 Offset -= ThisVal << LocalShiftSize;
6878 if (Offset == 0)
6879 TmpReg = DestReg;
6880 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
6881 .addReg(SrcReg)
6882 .addImm(Sign * (int)ThisVal);
// The operand of this addImm call is on the missing line 6885.
6883 if (ShiftSize)
6884 MBI = MBI.addImm(
6886 MBI = MBI.setMIFlag(Flag);
6887
// Track the effect of this instruction on the CFA offset: subtracting
// adjustments increase it, adding adjustments decrease it.
6888 auto Change =
6889 VScale == 1
6890 ? StackOffset::getFixed(ThisVal << LocalShiftSize)
6891 : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
6892 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
6893 CFAOffset += Change;
6894 else
6895 CFAOffset -= Change;
// A CFI instruction is only emitted for an instruction that writes DestReg.
6896 if (EmitCFAOffset && DestReg == TmpReg) {
6897 MachineFunction &MF = *MBB.getParent();
6898 const TargetSubtargetInfo &STI = MF.getSubtarget();
6899 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
6900
6901 unsigned CFIIndex = MF.addFrameInst(
6902 createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
6903 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
6904 .addCFIIndex(CFIIndex)
6905 .setMIFlags(Flag);
6906 }
6907
// Windows SEH unwind pseudo-instructions: SEH_AllocZ for scalable SP
// adjustments, SEH_SetFP/SEH_AddFP for moves between SP and FP, and
// SEH_StackAlloc for other SP adjustments. *HasWinCFI is set (when the
// pointer is non-null) whenever one is emitted.
6908 if (NeedsWinCFI) {
6909 int Imm = (int)(ThisVal << LocalShiftSize);
6910 if (VScale != 1 && DestReg == AArch64::SP) {
6911 if (HasWinCFI)
6912 *HasWinCFI = true;
6913 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AllocZ))
6914 .addImm(ThisVal)
6915 .setMIFlag(Flag);
6916 } else if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
6917 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
6918 assert(VScale == 1 && "Expected non-scalable operation");
6919 if (HasWinCFI)
6920 *HasWinCFI = true;
6921 if (Imm == 0)
6922 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
6923 else
6924 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
6925 .addImm(Imm)
6926 .setMIFlag(Flag);
6927 assert(Offset == 0 && "Expected remaining offset to be zero to "
6928 "emit a single SEH directive");
6929 } else if (DestReg == AArch64::SP) {
6930 assert(VScale == 1 && "Expected non-scalable operation");
6931 if (HasWinCFI)
6932 *HasWinCFI = true;
6933 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
6934 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
6935 .addImm(Imm)
6936 .setMIFlag(Flag);
6937 }
6938 }
6939
// Later iterations continue from the register just written.
6940 SrcReg = TmpReg;
6941 } while (Offset);
6942}
6943
6946 unsigned DestReg, unsigned SrcReg,
6948 MachineInstr::MIFlag Flag, bool SetNZCV,
6949 bool NeedsWinCFI, bool *HasWinCFI,
6950 bool EmitCFAOffset, StackOffset CFAOffset,
6951 unsigned FrameReg) {
6952 // If a function is marked as arm_locally_streaming, then the runtime value of
6953 // vscale in the prologue/epilogue is different from the runtime value of vscale
6954 // in the function's body. To avoid having to consider multiple vscales,
6955 // we can use `addsvl` to allocate any scalable stack-slots, which under
6956 // most circumstances will be only locals, not callee-save slots.
6957 const Function &F = MBB.getParent()->getFunction();
6958 bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
6959
6960 int64_t Bytes, NumPredicateVectors, NumDataVectors;
6961 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6962 Offset, Bytes, NumPredicateVectors, NumDataVectors);
6963
6964 // Insert ADDSXri for scalable offset at the end.
6965 bool NeedsFinalDefNZCV = SetNZCV && (NumPredicateVectors || NumDataVectors);
6966 if (NeedsFinalDefNZCV)
6967 SetNZCV = false;
6968
6969 // First emit non-scalable frame offsets, or a simple 'mov'.
6970 if (Bytes || (!Offset && SrcReg != DestReg)) {
6971 assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
6972 "SP increment/decrement not 8-byte aligned");
6973 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
6974 if (Bytes < 0) {
6975 Bytes = -Bytes;
6976 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
6977 }
6978 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
6979 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6980 FrameReg);
6981 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
6982 ? StackOffset::getFixed(-Bytes)
6983 : StackOffset::getFixed(Bytes);
6984 SrcReg = DestReg;
6985 FrameReg = DestReg;
6986 }
6987
6988 assert(!(NeedsWinCFI && NumPredicateVectors) &&
6989 "WinCFI can't allocate fractions of an SVE data vector");
6990
6991 if (NumDataVectors) {
6992 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
6993 UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI, TII,
6994 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6995 FrameReg);
6996 CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
6997 SrcReg = DestReg;
6998 }
6999
7000 if (NumPredicateVectors) {
7001 assert(DestReg != AArch64::SP && "Unaligned access to SP");
7002 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
7003 UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI, TII,
7004 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
7005 FrameReg);
7006 }
7007
7008 if (NeedsFinalDefNZCV)
7009 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDSXri), DestReg)
7010 .addReg(DestReg)
7011 .addImm(0)
7012 .addImm(0);
7013}
7014
7017 int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS,
7018 VirtRegMap *VRM) const {
7020 // This is a bit of a hack. Consider this instruction:
7021 //
7022 // %0 = COPY %sp; GPR64all:%0
7023 //
7024 // We explicitly chose GPR64all for the virtual register so such a copy might
7025 // be eliminated by RegisterCoalescer. However, that may not be possible, and
7026 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
7027 // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
7028 //
7029 // To prevent that, we are going to constrain the %0 register class here.
7030 if (MI.isFullCopy()) {
7031 Register DstReg = MI.getOperand(0).getReg();
7032 Register SrcReg = MI.getOperand(1).getReg();
7033 if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
7034 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
7035 return nullptr;
7036 }
7037 if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
7038 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
7039 return nullptr;
7040 }
7041 // Nothing can be folded with a copy from/to NZCV.
7042 if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
7043 return nullptr;
7044 }
7045
7046 // Handle the case where a copy is being spilled or filled but the source
7047 // and destination register class don't match. For example:
7048 //
7049 // %0 = COPY %xzr; GPR64common:%0
7050 //
7051 // In this case we can still safely fold away the COPY and generate the
7052 // following spill code:
7053 //
7054 // STRXui %xzr, %stack.0
7055 //
7056 // This also eliminates spilled cross register class COPYs (e.g. between x and
7057 // d regs) of the same size. For example:
7058 //
7059 // %0 = COPY %1; GPR64:%0, FPR64:%1
7060 //
7061 // will be filled as
7062 //
7063 // LDRDui %0, fi<#0>
7064 //
7065 // instead of
7066 //
7067 // LDRXui %Temp, fi<#0>
7068 // %0 = FMOV %Temp
7069 //
7070 if (MI.isCopy() && Ops.size() == 1 &&
7071 // Make sure we're only folding the explicit COPY defs/uses.
7072 (Ops[0] == 0 || Ops[0] == 1)) {
7073 bool IsSpill = Ops[0] == 0;
7074 bool IsFill = !IsSpill;
7076 const MachineRegisterInfo &MRI = MF.getRegInfo();
7077 MachineBasicBlock &MBB = *MI.getParent();
7078 const MachineOperand &DstMO = MI.getOperand(0);
7079 const MachineOperand &SrcMO = MI.getOperand(1);
7080 Register DstReg = DstMO.getReg();
7081 Register SrcReg = SrcMO.getReg();
7082 // This is slightly expensive to compute for physical regs since
7083 // getMinimalPhysRegClass is slow.
7084 auto getRegClass = [&](unsigned Reg) {
7085 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
7086 : TRI.getMinimalPhysRegClass(Reg);
7087 };
7088
7089 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
7090 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
7091 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
7092 "Mismatched register size in non subreg COPY");
7093 if (IsSpill)
7094 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
7095 getRegClass(SrcReg), Register());
7096 else
7097 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
7098 getRegClass(DstReg), Register());
7099 return &*--InsertPt;
7100 }
7101
7102 // Handle cases like spilling def of:
7103 //
7104 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
7105 //
7106 // where the physical register source can be widened and stored to the full
7107 // virtual reg destination stack slot, in this case producing:
7108 //
7109 // STRXui %xzr, %stack.0
7110 //
7111 if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
7112 TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {
7113 assert(SrcMO.getSubReg() == 0 &&
7114 "Unexpected subreg on physical register");
7115 storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
7116 FrameIndex, &AArch64::GPR64RegClass, Register());
7117 return &*--InsertPt;
7118 }
7119
7120 // Handle cases like filling use of:
7121 //
7122 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
7123 //
7124 // where we can load the full virtual reg source stack slot, into the subreg
7125 // destination, in this case producing:
7126 //
7127 // LDRWui %0:sub_32<def,read-undef>, %stack.0
7128 //
7129 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
7130 const TargetRegisterClass *FillRC = nullptr;
7131 switch (DstMO.getSubReg()) {
7132 default:
7133 break;
7134 case AArch64::sub_32:
7135 if (AArch64::GPR64RegClass.hasSubClassEq(getRegClass(DstReg)))
7136 FillRC = &AArch64::GPR32RegClass;
7137 break;
7138 case AArch64::ssub:
7139 FillRC = &AArch64::FPR32RegClass;
7140 break;
7141 case AArch64::dsub:
7142 FillRC = &AArch64::FPR64RegClass;
7143 break;
7144 }
7145
7146 if (FillRC) {
7147 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
7148 TRI.getRegSizeInBits(*FillRC) &&
7149 "Mismatched regclass size on folded subreg COPY");
7150 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC,
7151 Register());
7152 MachineInstr &LoadMI = *--InsertPt;
7153 MachineOperand &LoadDst = LoadMI.getOperand(0);
7154 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
7155 LoadDst.setSubReg(DstMO.getSubReg());
7156 LoadDst.setIsUndef();
7157 return &LoadMI;
7158 }
7159 }
7160 }
7161
7162 // Cannot fold.
7163 return nullptr;
7164}
7165
7167 StackOffset &SOffset,
7168 bool *OutUseUnscaledOp,
7169 unsigned *OutUnscaledOp,
7170 int64_t *EmittableOffset) {
7171 // Set output values in case of early exit.
7172 if (EmittableOffset)
7173 *EmittableOffset = 0;
7174 if (OutUseUnscaledOp)
7175 *OutUseUnscaledOp = false;
7176 if (OutUnscaledOp)
7177 *OutUnscaledOp = 0;
7178
7179 // Exit early for structured vector spills/fills as they can't take an
7180 // immediate offset.
7181 switch (MI.getOpcode()) {
7182 default:
7183 break;
7184 case AArch64::LD1Rv1d:
7185 case AArch64::LD1Rv2s:
7186 case AArch64::LD1Rv2d:
7187 case AArch64::LD1Rv4h:
7188 case AArch64::LD1Rv4s:
7189 case AArch64::LD1Rv8b:
7190 case AArch64::LD1Rv8h:
7191 case AArch64::LD1Rv16b:
7192 case AArch64::LD1Twov2d:
7193 case AArch64::LD1Threev2d:
7194 case AArch64::LD1Fourv2d:
7195 case AArch64::LD1Twov1d:
7196 case AArch64::LD1Threev1d:
7197 case AArch64::LD1Fourv1d:
7198 case AArch64::ST1Twov2d:
7199 case AArch64::ST1Threev2d:
7200 case AArch64::ST1Fourv2d:
7201 case AArch64::ST1Twov1d:
7202 case AArch64::ST1Threev1d:
7203 case AArch64::ST1Fourv1d:
7204 case AArch64::ST1i8:
7205 case AArch64::ST1i16:
7206 case AArch64::ST1i32:
7207 case AArch64::ST1i64:
7208 case AArch64::IRG:
7209 case AArch64::IRGstack:
7210 case AArch64::STGloop:
7211 case AArch64::STZGloop:
7213 }
7214
7215 // Get the min/max offset and the scale.
7216 TypeSize ScaleValue(0U, false), Width(0U, false);
7217 int64_t MinOff, MaxOff;
7218 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
7219 MaxOff))
7220 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
7221
7222 // Construct the complete offset.
7223 bool IsMulVL = ScaleValue.isScalable();
7224 unsigned Scale = ScaleValue.getKnownMinValue();
7225 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
7226
7227 const MachineOperand &ImmOpnd =
7228 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
7229 Offset += ImmOpnd.getImm() * Scale;
7230
7231 // If the offset doesn't match the scale, we rewrite the instruction to
7232 // use the unscaled instruction instead. Likewise, if we have a negative
7233 // offset and there is an unscaled op to use.
7234 std::optional<unsigned> UnscaledOp =
7236 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
7237 if (useUnscaledOp &&
7238 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
7239 MaxOff))
7240 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
7241
7242 Scale = ScaleValue.getKnownMinValue();
7243 assert(IsMulVL == ScaleValue.isScalable() &&
7244 "Unscaled opcode has different value for scalable");
7245
7246 int64_t Remainder = Offset % Scale;
7247 assert(!(Remainder && useUnscaledOp) &&
7248 "Cannot have remainder when using unscaled op");
7249
7250 assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
7251 int64_t NewOffset = Offset / Scale;
7252 if (MinOff <= NewOffset && NewOffset <= MaxOff)
7253 Offset = Remainder;
7254 else {
7255 NewOffset = NewOffset < 0 ? MinOff : MaxOff;
7256 Offset = Offset - (NewOffset * Scale);
7257 }
7258
7259 if (EmittableOffset)
7260 *EmittableOffset = NewOffset;
7261 if (OutUseUnscaledOp)
7262 *OutUseUnscaledOp = useUnscaledOp;
7263 if (OutUnscaledOp && UnscaledOp)
7264 *OutUnscaledOp = *UnscaledOp;
7265
7266 if (IsMulVL)
7267 SOffset = StackOffset::get(SOffset.getFixed(), Offset);
7268 else
7269 SOffset = StackOffset::get(Offset, SOffset.getScalable());
7271 (SOffset ? 0 : AArch64FrameOffsetIsLegal);
7272}
7273
7275 unsigned FrameReg, StackOffset &Offset,
7276 const AArch64InstrInfo *TII) {
7277 unsigned Opcode = MI.getOpcode();
7278 unsigned ImmIdx = FrameRegIdx + 1;
7279
7280 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
7281 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
7282 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
7283 MI.getOperand(0).getReg(), FrameReg, Offset, TII,
7284 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
7285 MI.eraseFromParent();
7286 Offset = StackOffset();
7287 return true;
7288 }
7289
7290 int64_t NewOffset;
7291 unsigned UnscaledOp;
7292 bool UseUnscaledOp;
7293 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
7294 &UnscaledOp, &NewOffset);
7297 // Replace the FrameIndex with FrameReg.
7298 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
7299 if (UseUnscaledOp)
7300 MI.setDesc(TII->get(UnscaledOp));
7301
7302 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
7303 return !Offset;
7304 }
7305
7306 return false;
7307}
7308
7314
7315MCInst AArch64InstrInfo::getNop() const { return MCInstBuilder(AArch64::NOP); }
7316
7317// AArch64 supports MachineCombiner.
7318bool AArch64InstrInfo::useMachineCombiner() const { return true; }
7319
7320// True when Opc sets flag
7321static bool isCombineInstrSettingFlag(unsigned Opc) {
7322 switch (Opc) {
7323 case AArch64::ADDSWrr:
7324 case AArch64::ADDSWri:
7325 case AArch64::ADDSXrr:
7326 case AArch64::ADDSXri:
7327 case AArch64::SUBSWrr:
7328 case AArch64::SUBSXrr:
7329 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7330 case AArch64::SUBSWri:
7331 case AArch64::SUBSXri:
7332 return true;
7333 default:
7334 break;
7335 }
7336 return false;
7337}
7338
7339// 32b Opcodes that can be combined with a MUL
7340static bool isCombineInstrCandidate32(unsigned Opc) {
7341 switch (Opc) {
7342 case AArch64::ADDWrr:
7343 case AArch64::ADDWri:
7344 case AArch64::SUBWrr:
7345 case AArch64::ADDSWrr:
7346 case AArch64::ADDSWri:
7347 case AArch64::SUBSWrr:
7348 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7349 case AArch64::SUBWri:
7350 case AArch64::SUBSWri:
7351 return true;
7352 default:
7353 break;
7354 }
7355 return false;
7356}
7357
7358// 64b Opcodes that can be combined with a MUL
7359static bool isCombineInstrCandidate64(unsigned Opc) {
7360 switch (Opc) {
7361 case AArch64::ADDXrr:
7362 case AArch64::ADDXri:
7363 case AArch64::SUBXrr:
7364 case AArch64::ADDSXrr:
7365 case AArch64::ADDSXri:
7366 case AArch64::SUBSXrr:
7367 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7368 case AArch64::SUBXri:
7369 case AArch64::SUBSXri:
7370 case AArch64::ADDv8i8:
7371 case AArch64::ADDv16i8:
7372 case AArch64::ADDv4i16:
7373 case AArch64::ADDv8i16:
7374 case AArch64::ADDv2i32:
7375 case AArch64::ADDv4i32:
7376 case AArch64::SUBv8i8:
7377 case AArch64::SUBv16i8:
7378 case AArch64::SUBv4i16:
7379 case AArch64::SUBv8i16:
7380 case AArch64::SUBv2i32:
7381 case AArch64::SUBv4i32:
7382 return true;
7383 default:
7384 break;
7385 }
7386 return false;
7387}
7388
7389// FP Opcodes that can be combined with a FMUL.
7390static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
7391 switch (Inst.getOpcode()) {
7392 default:
7393 break;
7394 case AArch64::FADDHrr:
7395 case AArch64::FADDSrr:
7396 case AArch64::FADDDrr:
7397 case AArch64::FADDv4f16:
7398 case AArch64::FADDv8f16:
7399 case AArch64::FADDv2f32:
7400 case AArch64::FADDv2f64:
7401 case AArch64::FADDv4f32:
7402 case AArch64::FSUBHrr:
7403 case AArch64::FSUBSrr:
7404 case AArch64::FSUBDrr:
7405 case AArch64::FSUBv4f16:
7406 case AArch64::FSUBv8f16:
7407 case AArch64::FSUBv2f32:
7408 case AArch64::FSUBv2f64:
7409 case AArch64::FSUBv4f32:
7411 // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
7412 // the target options or if FADD/FSUB has the contract fast-math flag.
7413 return Options.AllowFPOpFusion == FPOpFusion::Fast ||
7415 }
7416 return false;
7417}
7418
7419// Opcodes that can be combined with a MUL
7423
7424//
7425// Utility routine that checks if \param MO is defined by an
7426// \param CombineOpc instruction in the basic block \param MBB
7428 unsigned CombineOpc, unsigned ZeroReg = 0,
7429 bool CheckZeroReg = false) {
7430 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7431 MachineInstr *MI = nullptr;
7432
7433 if (MO.isReg() && MO.getReg().isVirtual())
7434 MI = MRI.getUniqueVRegDef(MO.getReg());
7435 // And it needs to be in the trace (otherwise, it won't have a depth).
7436 if (!MI || MI->getParent() != &MBB || MI->getOpcode() != CombineOpc)
7437 return false;
7438 // Must only be used by the user we combine with.
7439 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
7440 return false;
7441
7442 if (CheckZeroReg) {
7443 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
7444 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
7445 MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
7446 // The third input reg must be zero.
7447 if (MI->getOperand(3).getReg() != ZeroReg)
7448 return false;
7449 }
7450
7451 if (isCombineInstrSettingFlag(CombineOpc) &&
7452 MI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == -1)
7453 return false;
7454
7455 return true;
7456}
7457
7458//
7459// Is \param MO defined by an integer multiply and can be combined?
7461 unsigned MulOpc, unsigned ZeroReg) {
7462 return canCombine(MBB, MO, MulOpc, ZeroReg, true);
7463}
7464
7465//
7466// Is \param MO defined by a floating-point multiply and can be combined?
7468 unsigned MulOpc) {
7469 return canCombine(MBB, MO, MulOpc);
7470}
7471
7472// TODO: There are many more machine instruction opcodes to match:
7473// 1. Other data types (integer, vectors)
7474// 2. Other math / logic operations (xor, or)
7475// 3. Other forms of the same operation (intrinsics and other variants)
7476bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
7477 bool Invert) const {
7478 if (Invert)
7479 return false;
7480 switch (Inst.getOpcode()) {
7481 // == Floating-point types ==
7482 // -- Floating-point instructions --
7483 case AArch64::FADDHrr:
7484 case AArch64::FADDSrr:
7485 case AArch64::FADDDrr:
7486 case AArch64::FMULHrr:
7487 case AArch64::FMULSrr:
7488 case AArch64::FMULDrr:
7489 case AArch64::FMULX16:
7490 case AArch64::FMULX32:
7491 case AArch64::FMULX64:
7492 // -- Advanced SIMD instructions --
7493 case AArch64::FADDv4f16:
7494 case AArch64::FADDv8f16:
7495 case AArch64::FADDv2f32:
7496 case AArch64::FADDv4f32:
7497 case AArch64::FADDv2f64:
7498 case AArch64::FMULv4f16:
7499 case AArch64::FMULv8f16:
7500 case AArch64::FMULv2f32:
7501 case AArch64::FMULv4f32:
7502 case AArch64::FMULv2f64:
7503 case AArch64::FMULXv4f16:
7504 case AArch64::FMULXv8f16:
7505 case AArch64::FMULXv2f32:
7506 case AArch64::FMULXv4f32:
7507 case AArch64::FMULXv2f64:
7508 // -- SVE instructions --
7509 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
7510 // in the SVE instruction set (though there are predicated ones).
7511 case AArch64::FADD_ZZZ_H:
7512 case AArch64::FADD_ZZZ_S:
7513 case AArch64::FADD_ZZZ_D:
7514 case AArch64::FMUL_ZZZ_H:
7515 case AArch64::FMUL_ZZZ_S:
7516 case AArch64::FMUL_ZZZ_D:
7519
7520 // == Integer types ==
7521 // -- Base instructions --
7522 // Opcodes MULWrr and MULXrr don't exist because
7523 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
7524 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
7525 // The machine-combiner does not support three-source-operands machine
7526 // instruction. So we cannot reassociate MULs.
7527 case AArch64::ADDWrr:
7528 case AArch64::ADDXrr:
7529 case AArch64::ANDWrr:
7530 case AArch64::ANDXrr:
7531 case AArch64::ORRWrr:
7532 case AArch64::ORRXrr:
7533 case AArch64::EORWrr:
7534 case AArch64::EORXrr:
7535 case AArch64::EONWrr:
7536 case AArch64::EONXrr:
7537 // -- Advanced SIMD instructions --
7538 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
7539 // in the Advanced SIMD instruction set.
7540 case AArch64::ADDv8i8:
7541 case AArch64::ADDv16i8:
7542 case AArch64::ADDv4i16:
7543 case AArch64::ADDv8i16:
7544 case AArch64::ADDv2i32:
7545 case AArch64::ADDv4i32:
7546 case AArch64::ADDv1i64:
7547 case AArch64::ADDv2i64:
7548 case AArch64::MULv8i8:
7549 case AArch64::MULv16i8:
7550 case AArch64::MULv4i16:
7551 case AArch64::MULv8i16:
7552 case AArch64::MULv2i32:
7553 case AArch64::MULv4i32:
7554 case AArch64::ANDv8i8:
7555 case AArch64::ANDv16i8:
7556 case AArch64::ORRv8i8:
7557 case AArch64::ORRv16i8:
7558 case AArch64::EORv8i8:
7559 case AArch64::EORv16i8:
7560 // -- SVE instructions --
7561 case AArch64::ADD_ZZZ_B:
7562 case AArch64::ADD_ZZZ_H:
7563 case AArch64::ADD_ZZZ_S:
7564 case AArch64::ADD_ZZZ_D:
7565 case AArch64::MUL_ZZZ_B:
7566 case AArch64::MUL_ZZZ_H:
7567 case AArch64::MUL_ZZZ_S:
7568 case AArch64::MUL_ZZZ_D:
7569 case AArch64::AND_ZZZ:
7570 case AArch64::ORR_ZZZ:
7571 case AArch64::EOR_ZZZ:
7572 return true;
7573
7574 default:
7575 return false;
7576 }
7577}
7578
7579/// Find instructions that can be turned into madd.
7581 SmallVectorImpl<unsigned> &Patterns) {
7582 unsigned Opc = Root.getOpcode();
7583 MachineBasicBlock &MBB = *Root.getParent();
7584 bool Found = false;
7585
7587 return false;
7589 int Cmp_NZCV =
7590 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
7591 // When NZCV is live bail out.
7592 if (Cmp_NZCV == -1)
7593 return false;
7594 unsigned NewOpc = convertToNonFlagSettingOpc(Root);
7595 // When opcode can't change bail out.
7596 // CHECKME: do we miss any cases for opcode conversion?
7597 if (NewOpc == Opc)
7598 return false;
7599 Opc = NewOpc;
7600 }
7601
7602 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
7603 unsigned Pattern) {
7604 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
7605 Patterns.push_back(Pattern);
7606 Found = true;
7607 }
7608 };
7609
7610 auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
7611 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
7612 Patterns.push_back(Pattern);
7613 Found = true;
7614 }
7615 };
7616
7618
7619 switch (Opc) {
7620 default:
7621 break;
7622 case AArch64::ADDWrr:
7623 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7624 "ADDWrr does not have register operands");
7625 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
7626 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
7627 break;
7628 case AArch64::ADDXrr:
7629 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
7630 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
7631 break;
7632 case AArch64::SUBWrr:
7633 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
7634 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
7635 break;
7636 case AArch64::SUBXrr:
7637 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
7638 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
7639 break;
7640 case AArch64::ADDWri:
7641 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
7642 break;
7643 case AArch64::ADDXri:
7644 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
7645 break;
7646 case AArch64::SUBWri:
7647 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
7648 break;
7649 case AArch64::SUBXri:
7650 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
7651 break;
7652 case AArch64::ADDv8i8:
7653 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
7654 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
7655 break;
7656 case AArch64::ADDv16i8:
7657 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
7658 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
7659 break;
7660 case AArch64::ADDv4i16:
7661 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
7662 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
7663 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
7664 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
7665 break;
7666 case AArch64::ADDv8i16:
7667 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
7668 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
7669 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
7670 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
7671 break;
7672 case AArch64::ADDv2i32:
7673 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
7674 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
7675 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
7676 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
7677 break;
7678 case AArch64::ADDv4i32:
7679 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
7680 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
7681 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
7682 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
7683 break;
7684 case AArch64::SUBv8i8:
7685 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
7686 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
7687 break;
7688 case AArch64::SUBv16i8:
7689 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
7690 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
7691 break;
7692 case AArch64::SUBv4i16:
7693 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
7694 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
7695 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
7696 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
7697 break;
7698 case AArch64::SUBv8i16:
7699 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
7700 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
7701 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
7702 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
7703 break;
7704 case AArch64::SUBv2i32:
7705 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
7706 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
7707 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
7708 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
7709 break;
7710 case AArch64::SUBv4i32:
7711 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
7712 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
7713 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
7714 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
7715 break;
7716 }
7717 return Found;
7718}
7719
7720bool AArch64InstrInfo::isAccumulationOpcode(unsigned Opcode) const {
7721 switch (Opcode) {
7722 default:
7723 break;
7724 case AArch64::UABALB_ZZZ_D:
7725 case AArch64::UABALB_ZZZ_H:
7726 case AArch64::UABALB_ZZZ_S:
7727 case AArch64::UABALT_ZZZ_D:
7728 case AArch64::UABALT_ZZZ_H:
7729 case AArch64::UABALT_ZZZ_S:
7730 case AArch64::SABALB_ZZZ_D:
7731 case AArch64::SABALB_ZZZ_S:
7732 case AArch64::SABALB_ZZZ_H:
7733 case AArch64::SABALT_ZZZ_D:
7734 case AArch64::SABALT_ZZZ_S:
7735 case AArch64::SABALT_ZZZ_H:
7736 case AArch64::UABALv16i8_v8i16:
7737 case AArch64::UABALv2i32_v2i64:
7738 case AArch64::UABALv4i16_v4i32:
7739 case AArch64::UABALv4i32_v2i64:
7740 case AArch64::UABALv8i16_v4i32:
7741 case AArch64::UABALv8i8_v8i16:
7742 case AArch64::UABAv16i8:
7743 case AArch64::UABAv2i32:
7744 case AArch64::UABAv4i16:
7745 case AArch64::UABAv4i32:
7746 case AArch64::UABAv8i16:
7747 case AArch64::UABAv8i8:
7748 case AArch64::SABALv16i8_v8i16:
7749 case AArch64::SABALv2i32_v2i64:
7750 case AArch64::SABALv4i16_v4i32:
7751 case AArch64::SABALv4i32_v2i64:
7752 case AArch64::SABALv8i16_v4i32:
7753 case AArch64::SABALv8i8_v8i16:
7754 case AArch64::SABAv16i8:
7755 case AArch64::SABAv2i32:
7756 case AArch64::SABAv4i16:
7757 case AArch64::SABAv4i32:
7758 case AArch64::SABAv8i16:
7759 case AArch64::SABAv8i8:
7760 return true;
7761 }
7762
7763 return false;
7764}
7765
7766unsigned AArch64InstrInfo::getAccumulationStartOpcode(
7767 unsigned AccumulationOpcode) const {
7768 switch (AccumulationOpcode) {
7769 default:
7770 llvm_unreachable("Unsupported accumulation Opcode!");
7771 case AArch64::UABALB_ZZZ_D:
7772 return AArch64::UABDLB_ZZZ_D;
7773 case AArch64::UABALB_ZZZ_H:
7774 return AArch64::UABDLB_ZZZ_H;
7775 case AArch64::UABALB_ZZZ_S:
7776 return AArch64::UABDLB_ZZZ_S;
7777 case AArch64::UABALT_ZZZ_D:
7778 return AArch64::UABDLT_ZZZ_D;
7779 case AArch64::UABALT_ZZZ_H:
7780 return AArch64::UABDLT_ZZZ_H;
7781 case AArch64::UABALT_ZZZ_S:
7782 return AArch64::UABDLT_ZZZ_S;
7783 case AArch64::UABALv16i8_v8i16:
7784 return AArch64::UABDLv16i8_v8i16;
7785 case AArch64::UABALv2i32_v2i64:
7786 return AArch64::UABDLv2i32_v2i64;
7787 case AArch64::UABALv4i16_v4i32:
7788 return AArch64::UABDLv4i16_v4i32;
7789 case AArch64::UABALv4i32_v2i64:
7790 return AArch64::UABDLv4i32_v2i64;
7791 case AArch64::UABALv8i16_v4i32:
7792 return AArch64::UABDLv8i16_v4i32;
7793 case AArch64::UABALv8i8_v8i16:
7794 return AArch64::UABDLv8i8_v8i16;
7795 case AArch64::UABAv16i8:
7796 return AArch64::UABDv16i8;
7797 case AArch64::UABAv2i32:
7798 return AArch64::UABDv2i32;
7799 case AArch64::UABAv4i16:
7800 return AArch64::UABDv4i16;
7801 case AArch64::UABAv4i32:
7802 return AArch64::UABDv4i32;
7803 case AArch64::UABAv8i16:
7804 return AArch64::UABDv8i16;
7805 case AArch64::UABAv8i8:
7806 return AArch64::UABDv8i8;
7807 case AArch64::SABALB_ZZZ_D:
7808 return AArch64::SABDLB_ZZZ_D;
7809 case AArch64::SABALB_ZZZ_S:
7810 return AArch64::SABDLB_ZZZ_S;
7811 case AArch64::SABALB_ZZZ_H:
7812 return AArch64::SABDLB_ZZZ_H;
7813 case AArch64::SABALT_ZZZ_D:
7814 return AArch64::SABDLT_ZZZ_D;
7815 case AArch64::SABALT_ZZZ_S:
7816 return AArch64::SABDLT_ZZZ_S;
7817 case AArch64::SABALT_ZZZ_H:
7818 return AArch64::SABDLT_ZZZ_H;
7819 case AArch64::SABALv16i8_v8i16:
7820 return AArch64::SABDLv16i8_v8i16;
7821 case AArch64::SABALv2i32_v2i64:
7822 return AArch64::SABDLv2i32_v2i64;
7823 case AArch64::SABALv4i16_v4i32:
7824 return AArch64::SABDLv4i16_v4i32;
7825 case AArch64::SABALv4i32_v2i64:
7826 return AArch64::SABDLv4i32_v2i64;
7827 case AArch64::SABALv8i16_v4i32:
7828 return AArch64::SABDLv8i16_v4i32;
7829 case AArch64::SABALv8i8_v8i16:
7830 return AArch64::SABDLv8i8_v8i16;
7831 case AArch64::SABAv16i8:
7832 return AArch64::SABDv16i8;
7833 case AArch64::SABAv2i32:
7834 return AArch64::SABAv2i32;
7835 case AArch64::SABAv4i16:
7836 return AArch64::SABDv4i16;
7837 case AArch64::SABAv4i32:
7838 return AArch64::SABDv4i32;
7839 case AArch64::SABAv8i16:
7840 return AArch64::SABDv8i16;
7841 case AArch64::SABAv8i8:
7842 return AArch64::SABDv8i8;
7843 }
7844}
7845
7846/// Floating-Point Support
7847
7848 /// Find instructions that can be turned into madd.
///
/// Root is expected to be a scalar or vector FADD/FSUB. Operands 1 and 2 of
/// Root are tested with canCombineWithFMUL against the listed FMUL/FNMUL
/// opcodes; every successful test appends one pattern to Patterns.
/// Returns true when at least one pattern was appended.
// NOTE(review): the line declaring this function's name and first parameter is
// absent from this listing; the call in getMachineCombinerPatterns suggests it
// is getFMAPatterns(Root, Patterns) -- confirm against the real source.
7850 SmallVectorImpl<unsigned> &Patterns) {
7851
// Nothing to do unless Root passes the FP combine-candidate check.
7852 if (!isCombineInstrCandidateFP(Root))
7853 return false;
7854
7855 MachineBasicBlock &MBB = *Root.getParent();
7856 bool Found = false;
7857
// Records Pattern when operand number Operand of Root can be combined with an
// instruction of the given Opcode; the result says whether it was recorded.
7858 auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
7859 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
7860 Patterns.push_back(Pattern);
7861 return true;
7862 }
7863 return false;
7864 };
7865
// NOTE(review): one source line is missing from the listing here; `MCP` used
// below is presumably an alias introduced on that line -- confirm.
7867
7868 switch (Root.getOpcode()) {
7869 default:
7870 assert(false && "Unsupported FP instruction in combiner\n");
7871 break;
7872 case AArch64::FADDHrr:
7873 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7874 "FADDHrr does not have register operands");
7875
// Both operand positions are always tried, so both patterns may be recorded.
7876 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
7877 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
7878 break;
7879 case AArch64::FADDSrr:
7880 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7881 "FADDSrr does not have register operands");
7882
// Within each statement `||` short-circuits: the second opcode is only tried
// for that operand when the first one did not match.
7883 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
7884 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
7885
7886 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
7887 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
7888 break;
7889 case AArch64::FADDDrr:
7890 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
7891 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
7892
7893 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
7894 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
7895 break;
// Vector FADD: for each operand the indexed FMUL is tried before the plain
// vector FMUL.
7896 case AArch64::FADDv4f16:
7897 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
7898 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
7899
7900 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
7901 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
7902 break;
7903 case AArch64::FADDv8f16:
7904 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
7905 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
7906
7907 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
7908 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
7909 break;
7910 case AArch64::FADDv2f32:
7911 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
7912 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
7913
7914 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
7915 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
7916 break;
7917 case AArch64::FADDv2f64:
7918 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
7919 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
7920
7921 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
7922 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
7923 break;
7924 case AArch64::FADDv4f32:
7925 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
7926 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
7927
7928 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
7929 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
7930 break;
// Scalar FSUB: besides FMUL on either operand, an FNMUL feeding operand 1 is
// also recognised.
7931 case AArch64::FSUBHrr:
7932 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
7933 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
7934 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
7935 break;
7936 case AArch64::FSUBSrr:
7937 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
7938
7939 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
7940 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
7941
7942 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
7943 break;
7944 case AArch64::FSUBDrr:
7945 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
7946
7947 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
7948 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
7949
7950 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
7951 break;
// Vector FSUB: operand 2 is tested before operand 1, so an operand-2 pattern
// is pushed ahead of an operand-1 pattern in Patterns.
7952 case AArch64::FSUBv4f16:
7953 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
7954 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
7955
7956 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
7957 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
7958 break;
7959 case AArch64::FSUBv8f16:
7960 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
7961 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
7962
7963 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
7964 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
7965 break;
7966 case AArch64::FSUBv2f32:
7967 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
7968 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
7969
7970 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
7971 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
7972 break;
7973 case AArch64::FSUBv2f64:
7974 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
7975 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
7976
7977 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
7978 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
7979 break;
7980 case AArch64::FSUBv4f32:
7981 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
7982 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
7983
7984 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
7985 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
7986 break;
7987 }
7988 return Found;
7989}
7990
// Find vector FMUL instructions with an operand defined by a lane DUP
// (looking through one COPY of a virtual register) and record the matching
// indexed-FMUL pattern. Returns true when at least one pattern was appended
// to Patterns; opcodes other than the listed FMULs return false.
// NOTE(review): the line declaring this function's name and first parameter is
// absent from this listing; presumably getFMULPatterns(Root, Patterns), per
// the call in getMachineCombinerPatterns -- confirm.
7992 SmallVectorImpl<unsigned> &Patterns) {
7993 MachineBasicBlock &MBB = *Root.getParent();
7994 bool Found = false;
7995
// Records Pattern when operand number Operand of Root is a virtual register
// whose unique definition (after skipping one COPY) has the given Opcode.
7996 auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
7997 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7998 MachineOperand &MO = Root.getOperand(Operand);
7999 MachineInstr *MI = nullptr;
8000 if (MO.isReg() && MO.getReg().isVirtual())
8001 MI = MRI.getUniqueVRegDef(MO.getReg());
8002 // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
8003 if (MI && MI->getOpcode() == TargetOpcode::COPY &&
8004 MI->getOperand(1).getReg().isVirtual())
8005 MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
8006 if (MI && MI->getOpcode() == Opcode) {
8007 Patterns.push_back(Pattern);
8008 return true;
8009 }
8010 return false;
8011 };
8012
// NOTE(review): one source line is missing from the listing here; `MCP` used
// below is presumably an alias introduced on that line -- confirm.
8014
// Each FMUL width is paired with the lane DUP of the same arrangement; both
// operand positions are always tried.
8015 switch (Root.getOpcode()) {
8016 default:
8017 return false;
8018 case AArch64::FMULv2f32:
8019 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
8020 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
8021 break;
8022 case AArch64::FMULv2f64:
8023 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
8024 Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
8025 break;
8026 case AArch64::FMULv4f16:
8027 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
8028 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
8029 break;
8030 case AArch64::FMULv4f32:
8031 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
8032 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
8033 break;
8034 case AArch64::FMULv8f16:
8035 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
8036 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
8037 break;
8038 }
8039
8040 return Found;
8041}
8042
// Find an FNEG (FNEGDr / FNEGSr) whose operand 1 is produced by an FMADD of
// the same width and record the FNMADD pattern. Returns true when the pattern
// was appended to Patterns.
// NOTE(review): the line declaring this function's name and first parameter is
// absent from this listing; presumably getFNEGPatterns(Root, Patterns), per
// the call in getMachineCombinerPatterns -- confirm.
8044 SmallVectorImpl<unsigned> &Patterns) {
8045 unsigned Opc = Root.getOpcode();
8046 MachineBasicBlock &MBB = *Root.getParent();
8047 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8048
8049 auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
8050 MachineOperand &MO = Root.getOperand(1);
// NOTE(review): the listing drops the line that declares `MI` (between 8050
// and 8052) and three lines of the condition below (8054-8056); only the
// visible conjuncts are described here.
// Visible requirements: MI has the requested opcode, its result has exactly
// one non-debug use, and MI carries the FmNsz flag.
8052 if (MI != nullptr && (MI->getOpcode() == Opcode) &&
8053 MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
8057 MI->getFlag(MachineInstr::MIFlag::FmNsz)) {
8058 Patterns.push_back(Pattern);
8059 return true;
8060 }
8061 return false;
8062 };
8063
8064 switch (Opc) {
8065 default:
8066 break;
8067 case AArch64::FNEGDr:
8068 return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
8069 case AArch64::FNEGSr:
8070 return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
8071 }
8072
8073 return false;
8074}
8075
8076/// Return true when a code sequence can improve throughput. It
8077/// should be called only for instructions in loops.
8078/// \param Pattern - combiner pattern
// NOTE(review): the declaring line (8079) and the case labels (8083-8187) are
// absent from this listing, so the set of patterns that return true cannot be
// read from here. Any pattern not listed falls through `default` and yields
// false.
8080 switch (Pattern) {
8081 default:
8082 break;
8188 return true;
8189 } // end switch (Pattern)
8190 return false;
8191}
8192
8193/// Find other MI combine patterns.
///
/// Root must be one of SUBWrr, SUBSWrr, SUBXrr or SUBSXrr, and its operand 2
/// must be combinable (per canCombine) with an ADD/ADDS of either width.
// NOTE(review): the line declaring this function's name and first parameter is
// absent from this listing; presumably getMiscPatterns(Root, Patterns), per
// the call in getMachineCombinerPatterns -- confirm.
8195 SmallVectorImpl<unsigned> &Patterns) {
8196 // A - (B + C) ==> (A - B) - C or (A - C) - B
8197 unsigned Opc = Root.getOpcode();
8198 MachineBasicBlock &MBB = *Root.getParent();
8199
8200 switch (Opc) {
8201 case AArch64::SUBWrr:
8202 case AArch64::SUBSWrr:
8203 case AArch64::SUBXrr:
8204 case AArch64::SUBSXrr:
8205 // Found candidate root.
8206 break;
8207 default:
8208 return false;
8209 }
8210
// NOTE(review): the first line of this condition (8211) is missing from the
// listing. The visible part rejects Root when findRegisterDefOperandIdx for
// NZCV returns -1.
8212 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) ==
8213 -1)
8214 return false;
8215
8216 if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
8217 canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
8218 canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
8219 canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
// NOTE(review): lines 8220-8221 are missing from the listing; presumably they
// append the patterns to Patterns -- confirm.
8222 return true;
8223 }
8224
8225 return false;
8226}
8227
8228/// Check if the given instruction forms a gather load pattern that can be
8229/// optimized for better Memory-Level Parallelism (MLP). This function
8230/// identifies chains of NEON lane load instructions that load data from
8231/// different memory addresses into individual lanes of a 128-bit vector
8232/// register, then attempts to split the pattern into parallel loads to break
8233/// the serial dependency between instructions.
8234///
8235/// Pattern Matched:
8236/// Initial scalar load -> SUBREG_TO_REG (lane 0) -> LD1i* (lane 1) ->
8237/// LD1i* (lane 2) -> ... -> LD1i* (lane N-1, Root)
8238///
8239/// Transformed Into:
8240/// Two parallel vector loads using fewer lanes each, followed by ZIP1v2i64
8241/// to combine the results, enabling better memory-level parallelism.
8242///
8243/// Supported Element Types:
8244/// - 32-bit elements (LD1i32, 4 lanes total)
8245/// - 16-bit elements (LD1i16, 8 lanes total)
8246/// - 8-bit elements (LD1i8, 16 lanes total)
// NOTE(review): the line declaring the function name and first parameter
// (8247) is absent from this listing; getLoadPatterns calls it as
// getGatherLanePattern(Root, Patterns, Opcode, NumLanes).
8248 SmallVectorImpl<unsigned> &Patterns,
8249 unsigned LoadLaneOpCode, unsigned NumLanes) {
8250 const MachineFunction *MF = Root.getMF();
8251
8252 // Early exit if optimizing for size.
8253 if (MF->getFunction().hasMinSize())
8254 return false;
8255
8256 const MachineRegisterInfo &MRI = MF->getRegInfo();
// NOTE(review): line 8257 is missing from the listing; `TRI` used further
// down is presumably declared there -- confirm.
8258
8259 // The root of the pattern must load into the last lane of the vector.
8260 if (Root.getOperand(2).getImm() != NumLanes - 1)
8261 return false;
8262
8263 // Check that we have load into all lanes except lane 0.
8264 // For each load we also want to check that:
8265 // 1. It has a single non-debug use (since we will be replacing the virtual
8266 // register)
8267 // 2. That the addressing mode only uses a single pointer operand
8268 auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
// Root already covers lane NumLanes-1, so the lanes still to be found while
// walking up the chain are 1 .. NumLanes-2.
8269 auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
8270 SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end());
// NOTE(review): line 8271 is missing from the listing; `LoadInstrs` used
// below is presumably declared there -- confirm.
8272 while (!RemainingLanes.empty() && CurrInstr &&
8273 CurrInstr->getOpcode() == LoadLaneOpCode &&
8274 MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
8275 CurrInstr->getNumOperands() == 4) {
8276 RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
8277 LoadInstrs.push_back(CurrInstr);
8278 CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
8279 }
8280
8281 // Check that we have found a match for lanes N-1.. 1.
8282 if (!RemainingLanes.empty())
8283 return false;
8284
// NOTE(review): CurrInstr is dereferenced below without a null check, although
// the loop above tolerates a null CurrInstr. If the last getUniqueVRegDef in
// the loop returns null on the iteration that empties RemainingLanes, this
// dereferences null -- confirm that a unique def is guaranteed here.
8285 // Match the SUBREG_TO_REG sequence.
8286 if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
8287 return false;
8288
8289 // Verify that the subreg to reg loads an integer into the first lane.
8290 auto Lane0LoadReg = CurrInstr->getOperand(1).getReg();
8291 unsigned SingleLaneSizeInBits = 128 / NumLanes;
8292 if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
8293 return false;
8294
8295 // Verify that it also has a single non debug use.
8296 if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
8297 return false;
8298
8299 LoadInstrs.push_back(MRI.getUniqueVRegDef(Lane0LoadReg));
8300
8301 // If there is any chance of aliasing, do not apply the pattern.
8302 // Walk backward through the MBB starting from Root.
8303 // Exit early if we've encountered all load instructions or hit the search
8304 // limit.
8305 auto MBBItr = Root.getIterator();
8306 unsigned RemainingSteps = GatherOptSearchLimit;
8307 SmallPtrSet<const MachineInstr *, 16> RemainingLoadInstrs;
8308 RemainingLoadInstrs.insert(LoadInstrs.begin(), LoadInstrs.end());
8309 const MachineBasicBlock *MBB = Root.getParent();
8310
// The walk stops at MBB->begin() without visiting that first instruction.
8311 for (; MBBItr != MBB->begin() && RemainingSteps > 0 &&
8312 !RemainingLoadInstrs.empty();
8313 --MBBItr, --RemainingSteps) {
8314 const MachineInstr &CurrInstr = *MBBItr;
8315
8316 // Remove this instruction from remaining loads if it's one we're tracking.
8317 RemainingLoadInstrs.erase(&CurrInstr);
8318
8319 // Check for potential aliasing with any of the load instructions to
8320 // optimize.
8321 if (CurrInstr.isLoadFoldBarrier())
8322 return false;
8323 }
8324
8325 // If we hit the search limit without finding all load instructions,
8326 // don't match the pattern.
8327 if (RemainingSteps == 0 && !RemainingLoadInstrs.empty())
8328 return false;
8329
// NOTE(review): the statement inside each case (8332, 8335, 8338) is missing
// from the listing; presumably each appends the lane-width-specific gather
// pattern to Patterns -- confirm.
8330 switch (NumLanes) {
8331 case 4:
8333 break;
8334 case 8:
8336 break;
8337 case 16:
8339 break;
8340 default:
8341 llvm_unreachable("Got bad number of lanes for gather pattern.");
8342 }
8343
8344 return true;
8345}
8346
8347/// Search for patterns of LD instructions we can optimize.
///
/// Dispatches on Root's opcode to getGatherLanePattern with the lane count
/// that goes with each single-lane load (LD1i32: 4, LD1i16: 8, LD1i8: 16);
/// every other opcode returns false.
// NOTE(review): the line declaring this function's name and first parameter is
// absent from this listing; presumably getLoadPatterns(Root, Patterns), per
// the call in getMachineCombinerPatterns -- confirm.
8349 SmallVectorImpl<unsigned> &Patterns) {
8350
8351 // The pattern searches for loads into single lanes.
8352 switch (Root.getOpcode()) {
8353 case AArch64::LD1i32:
8354 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 4);
8355 case AArch64::LD1i16:
8356 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 8);
8357 case AArch64::LD1i8:
8358 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 16);
8359 default:
8360 return false;
8361 }
8362}
8363
8364/// Generate optimized instruction sequence for gather load patterns to improve
8365/// Memory-Level Parallelism (MLP). This function transforms a chain of
8366/// sequential NEON lane loads into parallel vector loads that can execute
8367/// concurrently.
///
/// New instructions are appended to InsInstrs, replaced ones to DelInstrs, and
/// each newly created virtual register is recorded in InstrIdxForVirtReg with
/// the InsInstrs index of its defining instruction.
// NOTE(review): the lines carrying the function name and its first parameters
// (8369-8371) are absent from this listing; Root, InsInstrs, DelInstrs and TII
// are used below and are presumably declared there -- confirm.
8368static void
8372 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8373 unsigned Pattern, unsigned NumLanes) {
8374 MachineFunction &MF = *Root.getParent()->getParent();
8375 MachineRegisterInfo &MRI = MF.getRegInfo();
8377
8378 // Gather the initial load instructions to build the pattern.
8379 SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
8380 MachineInstr *CurrInstr = &Root;
// Collects Root plus the NumLanes-2 lane loads above it; afterwards CurrInstr
// is the definition feeding the first lane load of the chain.
8381 for (unsigned i = 0; i < NumLanes - 1; ++i) {
8382 LoadToLaneInstrs.push_back(CurrInstr);
8383 CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
8384 }
8385
8386 // Sort the load instructions according to the lane.
// Descending lane order: the lane-0 load is appended last below, and the
// reversed view then yields ascending lanes starting at lane 0.
8387 llvm::sort(LoadToLaneInstrs,
8388 [](const MachineInstr *A, const MachineInstr *B) {
8389 return A->getOperand(2).getImm() > B->getOperand(2).getImm();
8390 });
8391
8392 MachineInstr *SubregToReg = CurrInstr;
8393 LoadToLaneInstrs.push_back(
8394 MRI.getUniqueVRegDef(SubregToReg->getOperand(1).getReg()));
8395 auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
8396
8397 const TargetRegisterClass *FPR128RegClass =
8398 MRI.getRegClass(Root.getOperand(0).getReg());
8399
8400 // Helper lambda to create a LD1 instruction.
// The new load reuses Root's opcode and OriginalInstr's memory operands,
// writes a fresh virtual register, and is appended to InsInstrs.
8401 auto CreateLD1Instruction = [&](MachineInstr *OriginalInstr,
8402 Register SrcRegister, unsigned Lane,
8403 Register OffsetRegister,
8404 bool OffsetRegisterKillState) {
8405 auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
8406 MachineInstrBuilder LoadIndexIntoRegister =
8407 BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
8408 NewRegister)
8409 .addReg(SrcRegister)
8410 .addImm(Lane)
8411 .addReg(OffsetRegister, getKillRegState(OffsetRegisterKillState))
8412 .setMemRefs(OriginalInstr->memoperands());
8413 InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
8414 InsInstrs.push_back(LoadIndexIntoRegister);
8415 return NewRegister;
8416 };
8417
8418 // Helper to create load instruction based on the NumLanes in the NEON
8419 // register we are rewriting.
// NOTE(review): the last line of the parameter list (8422) is missing from the
// listing; `MMOs` used below is presumably declared there -- confirm.
8420 auto CreateLDRInstruction =
8421 [&](unsigned NumLanes, Register DestReg, Register OffsetReg,
8423 unsigned Opcode;
8424 switch (NumLanes) {
8425 case 4:
8426 Opcode = AArch64::LDRSui;
8427 break;
8428 case 8:
8429 Opcode = AArch64::LDRHui;
8430 break;
8431 case 16:
8432 Opcode = AArch64::LDRBui;
8433 break;
8434 default:
// NOTE(review): line 8435 (the call this message belongs to) is missing from
// the listing.
8436 "Got unsupported number of lanes in machine-combiner gather pattern");
8437 }
8438 // Immediate offset load
8439 return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
8440 .addReg(OffsetReg)
8441 .addImm(0)
8442 .setMemRefs(MMOs);
8443 };
8444
8445 // Load the remaining lanes into register 0.
// Lanes 1 .. NumLanes/2-1 are chained onto the existing SUBREG_TO_REG result;
// the lane immediate is Index + 1 because lane 0 is already populated.
8446 auto LanesToLoadToReg0 =
8447 llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
8448 LoadToLaneInstrsAscending.begin() + NumLanes / 2);
8449 Register PrevReg = SubregToReg->getOperand(0).getReg();
8450 for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
8451 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
8452 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8453 OffsetRegOperand.getReg(),
8454 OffsetRegOperand.isKill());
8455 DelInstrs.push_back(LoadInstr);
8456 }
8457 Register LastLoadReg0 = PrevReg;
8458
8459 // First load into register 1. Perform an integer load to zero out the upper
8460 // lanes in a single instruction.
// OriginalSplitLoad is the original load for lane NumLanes/2; it is replaced
// by a scalar load into a register of the same class as the lane-0 load's
// result.
8461 MachineInstr *Lane0Load = *LoadToLaneInstrsAscending.begin();
8462 MachineInstr *OriginalSplitLoad =
8463 *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
8464 Register DestRegForMiddleIndex = MRI.createVirtualRegister(
8465 MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
8466
8467 const MachineOperand &OriginalSplitToLoadOffsetOperand =
8468 OriginalSplitLoad->getOperand(3);
8469 MachineInstrBuilder MiddleIndexLoadInstr =
8470 CreateLDRInstruction(NumLanes, DestRegForMiddleIndex,
8471 OriginalSplitToLoadOffsetOperand.getReg(),
8472 OriginalSplitLoad->memoperands());
8473
8474 InstrIdxForVirtReg.insert(
8475 std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
8476 InsInstrs.push_back(MiddleIndexLoadInstr);
8477 DelInstrs.push_back(OriginalSplitLoad);
8478
8479 // Subreg To Reg instruction for register 1.
8480 Register DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
8481 unsigned SubregType;
8482 switch (NumLanes) {
8483 case 4:
8484 SubregType = AArch64::ssub;
8485 break;
8486 case 8:
8487 SubregType = AArch64::hsub;
8488 break;
8489 case 16:
8490 SubregType = AArch64::bsub;
8491 break;
8492 default:
// NOTE(review): line 8493 (the call this message belongs to) is missing from
// the listing.
8494 "Got invalid NumLanes for machine-combiner gather pattern");
8495 }
8496
8497 auto SubRegToRegInstr =
8498 BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
8499 DestRegForSubregToReg)
8500 .addReg(DestRegForMiddleIndex, getKillRegState(true))
8501 .addImm(SubregType);
8502 InstrIdxForVirtReg.insert(
8503 std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
8504 InsInstrs.push_back(SubRegToRegInstr);
8505
8506 // Load remaining lanes into register 1.
// The upper-half loads are re-numbered to lanes 1 .. NumLanes/2-1 of the
// second register (again Index + 1).
8507 auto LanesToLoadToReg1 =
8508 llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
8509 LoadToLaneInstrsAscending.end());
8510 PrevReg = SubRegToRegInstr->getOperand(0).getReg();
8511 for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
8512 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
8513 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8514 OffsetRegOperand.getReg(),
8515 OffsetRegOperand.isKill());
8516
8517 // Do not add the last reg to DelInstrs - it will be removed later.
8518 if (Index == NumLanes / 2 - 2) {
8519 break;
8520 }
8521 DelInstrs.push_back(LoadInstr);
8522 }
8523 Register LastLoadReg1 = PrevReg;
8524
8525 // Create the final zip instruction to combine the results.
// The ZIP1 defines Root's original destination register.
8526 MachineInstrBuilder ZipInstr =
8527 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
8528 Root.getOperand(0).getReg())
8529 .addReg(LastLoadReg0)
8530 .addReg(LastLoadReg1);
8531 InsInstrs.push_back(ZipInstr);
8532}
8533
8547
8548/// Return true when there is potentially a faster code sequence for an
8549/// instruction chain ending in \p Root. All potential patterns are listed in
8550/// the \p Pattern vector. Pattern should be sorted in priority order since the
8551/// pattern evaluator stops checking as soon as it finds a faster sequence.
///
/// The AArch64 pattern finders are tried in a fixed order and the first one
/// that reports a match ends the search; only when none matches is the generic
/// TargetInstrInfo implementation consulted.
8552
8553bool AArch64InstrInfo::getMachineCombinerPatterns(
8554 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
8555 bool DoRegPressureReduce) const {
8556 // Integer patterns
8557 if (getMaddPatterns(Root, Patterns))
8558 return true;
8559 // Floating point patterns
8560 if (getFMULPatterns(Root, Patterns))
8561 return true;
8562 if (getFMAPatterns(Root, Patterns))
8563 return true;
8564 if (getFNEGPatterns(Root, Patterns))
8565 return true;
8566
8567 // Other patterns
8568 if (getMiscPatterns(Root, Patterns))
8569 return true;
8570
8571 // Load patterns
8572 if (getLoadPatterns(Root, Patterns))
8573 return true;
8574
// Fall back to the target-independent patterns.
8575 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
8576 DoRegPressureReduce);
8577}
8578
8580/// genFusedMultiply - Generate fused multiply instructions.
8581/// This function supports both integer and floating point instructions.
8582/// A typical example:
8583/// F|MUL I=A,B,0
8584/// F|ADD R,I,C
8585/// ==> F|MADD R,A,B,C
8586/// \param MF Containing MachineFunction
8587/// \param MRI Register information
8588/// \param TII Target information
8589/// \param Root is the F|ADD instruction
8590/// \param [out] InsInstrs is a vector of machine instructions and will
8591/// contain the generated madd instruction
8592/// \param IdxMulOpd is index of operand in Root that is the result of
8593/// the F|MUL. In the example above IdxMulOpd is 1.
8594/// \param MaddOpc the opcode of the f|madd instruction
8595/// \param RC Register class of operands
8596/// \param kind of fma instruction (addressing mode) to be generated
8597/// \param ReplacedAddend is the result register from the instruction
8598/// replacing the non-combined operand, if any.
/// \returns the F|MUL instruction that defines operand \p IdxMulOpd of Root.
// NOTE(review): the lines carrying the function name with the MF/MRI
// parameters (8600) and the `kind` parameter (8604) are absent from this
// listing.
8599static MachineInstr *
8601 const TargetInstrInfo *TII, MachineInstr &Root,
8602 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
8603 unsigned MaddOpc, const TargetRegisterClass *RC,
8605 const Register *ReplacedAddend = nullptr) {
8606 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8607
8608 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
8609 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
8610 Register ResultReg = Root.getOperand(0).getReg();
8611 Register SrcReg0 = MUL->getOperand(1).getReg();
8612 bool Src0IsKill = MUL->getOperand(1).isKill();
8613 Register SrcReg1 = MUL->getOperand(2).getReg();
8614 bool Src1IsKill = MUL->getOperand(2).isKill();
8615
// SrcReg2 is the addend: either the freshly generated replacement register or
// Root's other (non-multiply) operand.
8616 Register SrcReg2;
8617 bool Src2IsKill;
8618 if (ReplacedAddend) {
8619 // If we just generated a new addend, we must be its only use.
8620 SrcReg2 = *ReplacedAddend;
8621 Src2IsKill = true;
8622 } else {
8623 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
8624 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
8625 }
8626
// Constrain every virtual register involved to RC.
8627 if (ResultReg.isVirtual())
8628 MRI.constrainRegClass(ResultReg, RC);
8629 if (SrcReg0.isVirtual())
8630 MRI.constrainRegClass(SrcReg0, RC);
8631 if (SrcReg1.isVirtual())
8632 MRI.constrainRegClass(SrcReg1, RC);
8633 if (SrcReg2.isVirtual())
8634 MRI.constrainRegClass(SrcReg2, RC);
8635
// NOTE(review): line 8636 is missing from the listing; `MIB` is presumably
// declared there -- confirm.
// Operand order depends on the kind: Default puts the addend last, while
// Indexed and Accumulator put it first; Indexed also forwards the lane
// immediate taken from operand 3 of the multiply.
8637 if (kind == FMAInstKind::Default)
8638 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8639 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8640 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8641 .addReg(SrcReg2, getKillRegState(Src2IsKill));
8642 else if (kind == FMAInstKind::Indexed)
8643 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8644 .addReg(SrcReg2, getKillRegState(Src2IsKill))
8645 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8646 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8647 .addImm(MUL->getOperand(3).getImm());
8648 else if (kind == FMAInstKind::Accumulator)
8649 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8650 .addReg(SrcReg2, getKillRegState(Src2IsKill))
8651 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8652 .addReg(SrcReg1, getKillRegState(Src1IsKill));
8653 else
8654 assert(false && "Invalid FMA instruction kind \n");
// NOTE(review): when asserts are compiled out, an invalid kind still reaches
// the push_back below with whatever MIB holds at that point.
8655 // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
8656 InsInstrs.push_back(MIB);
8657 return MUL;
8658}
8659
// Replace FNEG(FMADD a, b, c) with a single FNMADD a, b, c that defines
// Root's result register. The FNMADD width follows the register class of the
// FMADD result (FPR32 -> FNMADDSrrr, FPR64 -> FNMADDDrrr); any other class
// yields nullptr and nothing is generated. On success the new instruction is
// appended to InsInstrs and the FMADD defining Root's operand 1 is returned.
// NOTE(review): the lines carrying the function name and the MF/MRI and
// InsInstrs parameters (8661, 8663) are absent from this listing.
8660static MachineInstr *
8662 const TargetInstrInfo *TII, MachineInstr &Root,
8664 MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
8665
8666 unsigned Opc = 0;
8667 const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
8668 if (AArch64::FPR32RegClass.hasSubClassEq(RC))
8669 Opc = AArch64::FNMADDSrrr;
8670 else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
8671 Opc = AArch64::FNMADDDrrr;
8672 else
8673 return nullptr;
8674
8675 Register ResultReg = Root.getOperand(0).getReg();
8676 Register SrcReg0 = MAD->getOperand(1).getReg();
8677 Register SrcReg1 = MAD->getOperand(2).getReg();
8678 Register SrcReg2 = MAD->getOperand(3).getReg();
8679 bool Src0IsKill = MAD->getOperand(1).isKill();
8680 bool Src1IsKill = MAD->getOperand(2).isKill();
8681 bool Src2IsKill = MAD->getOperand(3).isKill();
// Constrain every virtual register involved to the FMADD's register class.
8682 if (ResultReg.isVirtual())
8683 MRI.constrainRegClass(ResultReg, RC);
8684 if (SrcReg0.isVirtual())
8685 MRI.constrainRegClass(SrcReg0, RC);
8686 if (SrcReg1.isVirtual())
8687 MRI.constrainRegClass(SrcReg1, RC);
8688 if (SrcReg2.isVirtual())
8689 MRI.constrainRegClass(SrcReg2, RC);
8690
// NOTE(review): line 8691 is missing from the listing; presumably it declares
// `MIB` as the result of the BuildMI chain below -- confirm.
8692 BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
8693 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8694 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8695 .addReg(SrcReg2, getKillRegState(Src2IsKill));
8696 InsInstrs.push_back(MIB);
8697
8698 return MAD;
8699}
8700
8701/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
///
/// \p IdxDupOp selects which operand of Root (1 or 2) is the DUP result; the
/// other operand becomes the first source of the new multiply. The new
/// instruction is appended to InsInstrs and a pointer to Root is returned.
// NOTE(review): the lines carrying the function name and its first parameters
// (8703-8704) are absent from this listing; Root and InsInstrs are used below
// and are presumably declared there -- confirm.
8702static MachineInstr *
8705 unsigned IdxDupOp, unsigned MulOpc,
8706 const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
8707 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
8708 "Invalid index of FMUL operand");
8709
8710 MachineFunction &MF = *Root.getMF();
// NOTE(review): line 8711 is missing from the listing; `TII` used below is
// presumably declared there -- confirm.
8712
8713 MachineInstr *Dup =
8714 MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());
8715
// Look through a COPY sitting between the DUP and the FMUL.
8716 if (Dup->getOpcode() == TargetOpcode::COPY)
8717 Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());
8718
// The DUP's source register gains a new use in the indexed multiply, so its
// kill flags are cleared and its class is constrained to RC.
8719 Register DupSrcReg = Dup->getOperand(1).getReg();
8720 MRI.clearKillFlags(DupSrcReg);
8721 MRI.constrainRegClass(DupSrcReg, RC);
8722
8723 unsigned DupSrcLane = Dup->getOperand(2).getImm();
8724
8725 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
8726 MachineOperand &MulOp = Root.getOperand(IdxMulOp);
8727
8728 Register ResultReg = Root.getOperand(0).getReg();
8729
// NOTE(review): line 8730 is missing from the listing; `MIB` is presumably
// declared there -- confirm.
8731 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg)
8732 .add(MulOp)
8733 .addReg(DupSrcReg)
8734 .addImm(DupSrcLane);
8735
8736 InsInstrs.push_back(MIB);
8737 return &Root;
8738}
8739
8740/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
8741/// instructions.
8742///
8743/// \see genFusedMultiply
// NOTE(review): the lines with the return type, name and first parameters
// (8744-8746) and the tail of the call below (8749) are absent from this
// listing; the FMAInstKind argument is presumably on the missing line.
8747 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8748 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8750}
8751
8752/// genNeg - Helper to generate an intermediate negation of the second operand
8753/// of Root
///
/// Builds a MnegOpc instruction whose only source is Root's operand 2 and
/// whose result is a new virtual register of class \p RC. The instruction is
/// appended to InsInstrs, the register is recorded in InstrIdxForVirtReg with
/// index 0, and the register is returned.
// NOTE(review): the lines with the return type, name and the MF/MRI and
// InsInstrs parameters (8754, 8756) are absent from this listing.
8755 const TargetInstrInfo *TII, MachineInstr &Root,
8757 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8758 unsigned MnegOpc, const TargetRegisterClass *RC) {
8759 Register NewVR = MRI.createVirtualRegister(RC);
// NOTE(review): line 8760 is missing from the listing; presumably it declares
// `MIB` as the result of the BuildMI chain below -- confirm.
8761 BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR)
8762 .add(Root.getOperand(2));
8763 InsInstrs.push_back(MIB);
8764
// The map is asserted to be empty before the entry with index 0 is inserted.
8765 assert(InstrIdxForVirtReg.empty());
8766 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8767
8768 return NewVR;
8769}
8770
8771/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
8772/// instructions with an additional negation of the accumulator
///
/// First emits the negation via genNeg, then emits the fused multiply in
/// FMAInstKind::Accumulator form with the negated register as the addend.
/// Only IdxMulOpd == 1 is supported (asserted).
// NOTE(review): the lines with the return type, name and first parameters
// (8773-8775) are absent from this listing.
8776 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8777 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8778 assert(IdxMulOpd == 1);
8779
8780 Register NewVR =
8781 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8782 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8783 FMAInstKind::Accumulator, &NewVR);
8784}
8785
8786/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
8787/// instructions.
8788///
8789/// \see genFusedMultiply
// NOTE(review): the lines with the return type, name and first parameters
// (8790-8792) and the tail of the call below (8795) are absent from this
// listing; the FMAInstKind argument is presumably on the missing line.
8793 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8794 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8796}
8797
8798/// Indexed counterpart of genFusedMultiplyAccNeg - Helper to generate fused
8799/// multiply accumulate instructions with an additional negation of the accumulator
///
/// First emits the negation via genNeg, then emits the fused multiply in
/// FMAInstKind::Indexed form with the negated register as the addend.
/// Only IdxMulOpd == 1 is supported (asserted).
// NOTE(review): the original header repeated the genFusedMultiplyAccNeg name;
// the lines with this function's own name and first parameters (8800-8802)
// are absent from this listing -- confirm the name against the real source.
8803 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8804 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8805 assert(IdxMulOpd == 1);
8806
8807 Register NewVR =
8808 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8809
8810 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8811 FMAInstKind::Indexed, &NewVR);
8812}
8813
8814/// genMaddR - Generate madd instruction and combine mul and add using
8815/// an extra virtual register
8816/// Example - an ADD intermediate needs to be stored in a register:
8817/// MUL I=A,B,0
8818/// ADD R,I,Imm
8819/// ==> ORR V, ZR, Imm
8820/// ==> MADD R,A,B,V
8821/// \param MF Containing MachineFunction
8822/// \param MRI Register information
8823/// \param TII Target information
8824/// \param Root is the ADD instruction
8825/// \param [out] InsInstrs is a vector of machine instructions and will
8826/// contain the generated madd instruction
8827/// \param IdxMulOpd is index of operand in Root that is the result of
8828/// the MUL. In the example above IdxMulOpd is 1.
8829/// \param MaddOpc the opcode of the madd instruction
8830/// \param VR is a virtual register that holds the value of an ADD operand
8831/// (V in the example above).
8832/// \param RC Register class of operands
/// \returns the MUL instruction that defines operand \p IdxMulOpd of Root.
// NOTE(review): the lines with the return type, name and the MF/MRI and
// InsInstrs parameters (8833, 8835) are absent from this listing.
8834 const TargetInstrInfo *TII, MachineInstr &Root,
8836 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
8837 const TargetRegisterClass *RC) {
8838 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8839
8840 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
8841 Register ResultReg = Root.getOperand(0).getReg();
8842 Register SrcReg0 = MUL->getOperand(1).getReg();
8843 bool Src0IsKill = MUL->getOperand(1).isKill();
8844 Register SrcReg1 = MUL->getOperand(2).getReg();
8845 bool Src1IsKill = MUL->getOperand(2).isKill();
8846
8847 if (ResultReg.isVirtual())
8848 MRI.constrainRegClass(ResultReg, RC);
8849 if (SrcReg0.isVirtual())
8850 MRI.constrainRegClass(SrcReg0, RC);
8851 if (SrcReg1.isVirtual())
8852 MRI.constrainRegClass(SrcReg1, RC);
// NOTE(review): line 8853 is missing from the listing; presumably it guards
// the constrainRegClass call on VR below -- confirm.
8854 MRI.constrainRegClass(VR, RC);
8855
// NOTE(review): line 8856 is missing from the listing; presumably it declares
// `MIB` as the result of the BuildMI chain below -- confirm.
// VR is added without a kill flag.
8857 BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8858 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8859 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8860 .addReg(VR);
8861 // Insert the MADD
8862 InsInstrs.push_back(MIB);
8863 return MUL;
8864}
8865
8866/// Do the following transformation
8867/// A - (B + C) ==> (A - B) - C
8868/// A - (B + C) ==> (A - C) - B
///
/// \p IdxOpd1 (1 or 2) selects which operand of the ADD plays the role of B,
/// i.e. is subtracted first; the other ADD operand is subtracted second.
/// The two new SUBs are appended to InsInstrs, and the ADD and Root are
/// appended to DelInstrs.
// NOTE(review): the lines with the return type, name and the MF/MRI,
// InsInstrs and DelInstrs parameters (8869, 8871-8872) are absent from this
// listing.
8870 const TargetInstrInfo *TII, MachineInstr &Root,
8873 unsigned IdxOpd1,
8874 DenseMap<Register, unsigned> &InstrIdxForVirtReg) {
8875 assert(IdxOpd1 == 1 || IdxOpd1 == 2);
8876 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
8877 MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
8878
8879 Register ResultReg = Root.getOperand(0).getReg();
8880 Register RegA = Root.getOperand(1).getReg();
8881 bool RegAIsKill = Root.getOperand(1).isKill();
8882 Register RegB = AddMI->getOperand(IdxOpd1).getReg();
8883 bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
8884 Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
8885 bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
// NOTE(review): the initializer of NewVR (line 8887) is missing from the
// listing; presumably a new virtual register for the intermediate (A - B)
// result -- confirm.
8886 Register NewVR =
8888
// The flag-setting SUBS forms are replaced by their plain SUB equivalents.
8889 unsigned Opcode = Root.getOpcode();
8890 if (Opcode == AArch64::SUBSWrr)
8891 Opcode = AArch64::SUBWrr;
8892 else if (Opcode == AArch64::SUBSXrr)
8893 Opcode = AArch64::SUBXrr;
8894 else
8895 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
8896 "Unexpected instruction opcode.");
8897
// Merge the flags of Root and the ADD, then drop the no-signed-wrap and
// no-unsigned-wrap flags for the reassociated instructions.
8898 uint32_t Flags = Root.mergeFlagsWith(*AddMI);
8899 Flags &= ~MachineInstr::NoSWrap;
8900 Flags &= ~MachineInstr::NoUWrap;
8901
8902 MachineInstrBuilder MIB1 =
8903 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
8904 .addReg(RegA, getKillRegState(RegAIsKill))
8905 .addReg(RegB, getKillRegState(RegBIsKill))
8906 .setMIFlags(Flags);
8907 MachineInstrBuilder MIB2 =
8908 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
8909 .addReg(NewVR, getKillRegState(true))
8910 .addReg(RegC, getKillRegState(RegCIsKill))
8911 .setMIFlags(Flags);
8912
// NewVR is defined by the first inserted instruction (index 0 in InsInstrs).
8913 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8914 InsInstrs.push_back(MIB1);
8915 InsInstrs.push_back(MIB2);
8916 DelInstrs.push_back(AddMI);
8917 DelInstrs.push_back(&Root);
8918}
8919
8920unsigned AArch64InstrInfo::getReduceOpcodeForAccumulator(
8921 unsigned int AccumulatorOpCode) const {
8922 switch (AccumulatorOpCode) {
8923 case AArch64::UABALB_ZZZ_D:
8924 case AArch64::SABALB_ZZZ_D:
8925 case AArch64::UABALT_ZZZ_D:
8926 case AArch64::SABALT_ZZZ_D:
8927 return AArch64::ADD_ZZZ_D;
8928 case AArch64::UABALB_ZZZ_H:
8929 case AArch64::SABALB_ZZZ_H:
8930 case AArch64::UABALT_ZZZ_H:
8931 case AArch64::SABALT_ZZZ_H:
8932 return AArch64::ADD_ZZZ_H;
8933 case AArch64::UABALB_ZZZ_S:
8934 case AArch64::SABALB_ZZZ_S:
8935 case AArch64::UABALT_ZZZ_S:
8936 case AArch64::SABALT_ZZZ_S:
8937 return AArch64::ADD_ZZZ_S;
8938 case AArch64::UABALv16i8_v8i16:
8939 case AArch64::SABALv8i8_v8i16:
8940 case AArch64::SABAv8i16:
8941 case AArch64::UABAv8i16:
8942 return AArch64::ADDv8i16;
8943 case AArch64::SABALv2i32_v2i64:
8944 case AArch64::UABALv2i32_v2i64:
8945 case AArch64::SABALv4i32_v2i64:
8946 return AArch64::ADDv2i64;
8947 case AArch64::UABALv4i16_v4i32:
8948 case AArch64::SABALv4i16_v4i32:
8949 case AArch64::SABALv8i16_v4i32:
8950 case AArch64::SABAv4i32:
8951 case AArch64::UABAv4i32:
8952 return AArch64::ADDv4i32;
8953 case AArch64::UABALv4i32_v2i64:
8954 return AArch64::ADDv2i64;
8955 case AArch64::UABALv8i16_v4i32:
8956 return AArch64::ADDv4i32;
8957 case AArch64::UABALv8i8_v8i16:
8958 case AArch64::SABALv16i8_v8i16:
8959 return AArch64::ADDv8i16;
8960 case AArch64::UABAv16i8:
8961 case AArch64::SABAv16i8:
8962 return AArch64::ADDv16i8;
8963 case AArch64::UABAv4i16:
8964 case AArch64::SABAv4i16:
8965 return AArch64::ADDv4i16;
8966 case AArch64::UABAv2i32:
8967 case AArch64::SABAv2i32:
8968 return AArch64::ADDv2i32;
8969 case AArch64::UABAv8i8:
8970 case AArch64::SABAv8i8:
8971 return AArch64::ADDv8i8;
8972 default:
8973 llvm_unreachable("Unknown accumulator opcode");
8974 }
8975}
8976
8977/// When getMachineCombinerPatterns() finds potential patterns,
8978/// this function generates the instructions that could replace the
8979/// original code sequence.
///
/// \param Root the instruction the pattern is anchored on; on the common exit
///        path at the bottom it is always queued in DelInstrs.
/// \param Pattern selects which rewrite to build; values not handled by the
///        switch are forwarded to TargetInstrInfo::genAlternativeCodeSequence.
/// \param InsInstrs receives the replacement instructions, in order.
/// \param DelInstrs receives the instructions made redundant.
/// \param InstrIdxForVirtReg receives an entry for each new virtual register
///        created here; the insertions visible in this function all use 0.
///
/// NOTE(review): this listing omits the `case` labels of the switch as well
/// as several `if (...)` conditions and continuation lines. The comments
/// added below describe only the statements that are visible.
8980void AArch64InstrInfo::genAlternativeCodeSequence(
8981 MachineInstr &Root, unsigned Pattern,
8984 DenseMap<Register, unsigned> &InstrIdxForVirtReg) const {
8985 MachineBasicBlock &MBB = *Root.getParent();
8986 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8987 MachineFunction &MF = *MBB.getParent();
8988 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8989
     // MUL stays null unless a case below assigns the result of one of the
     // gen* helpers to it; the epilogue only deletes/merges flags when set.
8990 MachineInstr *MUL = nullptr;
8991 const TargetRegisterClass *RC;
8992 unsigned Opc;
8993 switch (Pattern) {
8994 default:
8995 // Reassociate instructions.
8996 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
8997 DelInstrs, InstrIdxForVirtReg);
8998 return;
     // The two genSubAdd2SubSub cases return directly: the helper is handed
     // InsInstrs and DelInstrs, so the common epilogue is skipped.
9000 // A - (B + C)
9001 // ==> (A - B) - C
9002 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
9003 InstrIdxForVirtReg);
9004 return;
9006 // A - (B + C)
9007 // ==> (A - C) - B
9008 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
9009 InstrIdxForVirtReg);
9010 return;
9013 // MUL I=A,B,0
9014 // ADD R,I,C
9015 // ==> MADD R,A,B,C
9016 // --- Create(MADD);
9018 Opc = AArch64::MADDWrrr;
9019 RC = &AArch64::GPR32RegClass;
9020 } else {
9021 Opc = AArch64::MADDXrrr;
9022 RC = &AArch64::GPR64RegClass;
9023 }
9024 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9025 break;
9028 // MUL I=A,B,0
9029 // ADD R,C,I
9030 // ==> MADD R,A,B,C
9031 // --- Create(MADD);
9033 Opc = AArch64::MADDWrrr;
9034 RC = &AArch64::GPR32RegClass;
9035 } else {
9036 Opc = AArch64::MADDXrrr;
9037 RC = &AArch64::GPR64RegClass;
9038 }
9039 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9040 break;
9045 // MUL I=A,B,0
9046 // ADD/SUB R,I,Imm
9047 // ==> MOV V, Imm/-Imm
9048 // ==> MADD R,A,B,V
9049 // --- Create(MADD);
     // NOTE(review): this RC is declared again although a function-level RC
     // exists above; inside this case it appears to shadow that variable.
9050 const TargetRegisterClass *RC;
9051 unsigned BitSize, MovImm;
9054 MovImm = AArch64::MOVi32imm;
     // This assignment is overwritten three lines below, so the register
     // class actually used is GPR32RegClass.
9055 RC = &AArch64::GPR32spRegClass;
9056 BitSize = 32;
9057 Opc = AArch64::MADDWrrr;
9058 RC = &AArch64::GPR32RegClass;
9059 } else {
9060 MovImm = AArch64::MOVi64imm;
     // Likewise overwritten below; GPR64RegClass is what ends up in RC.
9061 RC = &AArch64::GPR64spRegClass;
9062 BitSize = 64;
9063 Opc = AArch64::MADDXrrr;
9064 RC = &AArch64::GPR64RegClass;
9065 }
9066 Register NewVR = MRI.createVirtualRegister(RC);
9067 uint64_t Imm = Root.getOperand(2).getImm();
9068
     // When operand 3 is an immediate it is used as a left-shift amount for
     // the immediate taken from operand 2.
9069 if (Root.getOperand(3).isImm()) {
9070 unsigned Val = Root.getOperand(3).getImm();
9071 Imm = Imm << Val;
9072 }
9073 bool IsSub = Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1 ||
     // For the SUB forms the negated immediate is the value to materialize.
9075 uint64_t UImm = SignExtend64(IsSub ? -Imm : Imm, BitSize);
9076 // Check that the immediate can be composed via a single instruction.
9078 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
     // Returning here happens before anything was appended to InsInstrs by
     // this case.
9079 if (Insn.size() != 1)
9080 return;
9081 MachineInstrBuilder MIB1 =
9082 BuildMI(MF, MIMetadata(Root), TII->get(MovImm), NewVR)
9083 .addImm(IsSub ? -Imm : Imm);
9084 InsInstrs.push_back(MIB1);
9085 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9086 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
9087 break;
9088 }
9091 // MUL I=A,B,0
9092 // SUB R,I, C
9093 // ==> SUB V, 0, C
9094 // ==> MADD R,A,B,V // = -C + A*B
9095 // --- Create(MADD);
     // SubRC is the class of the temporary holding -C; RC/Opc describe the
     // MADD that consumes it.
9096 const TargetRegisterClass *SubRC;
9097 unsigned SubOpc, ZeroReg;
9099 SubOpc = AArch64::SUBWrr;
9100 SubRC = &AArch64::GPR32spRegClass;
9101 ZeroReg = AArch64::WZR;
9102 Opc = AArch64::MADDWrrr;
9103 RC = &AArch64::GPR32RegClass;
9104 } else {
9105 SubOpc = AArch64::SUBXrr;
9106 SubRC = &AArch64::GPR64spRegClass;
9107 ZeroReg = AArch64::XZR;
9108 Opc = AArch64::MADDXrrr;
9109 RC = &AArch64::GPR64RegClass;
9110 }
9111 Register NewVR = MRI.createVirtualRegister(SubRC);
9112 // SUB NewVR, 0, C
9113 MachineInstrBuilder MIB1 =
9114 BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR)
9115 .addReg(ZeroReg)
9116 .add(Root.getOperand(2));
9117 InsInstrs.push_back(MIB1);
9118 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9119 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
9120 break;
9121 }
9124 // MUL I=A,B,0
9125 // SUB R,C,I
9126 // ==> MSUB R,A,B,C (computes C - A*B)
9127 // --- Create(MSUB);
9129 Opc = AArch64::MSUBWrrr;
9130 RC = &AArch64::GPR32RegClass;
9131 } else {
9132 Opc = AArch64::MSUBXrrr;
9133 RC = &AArch64::GPR64RegClass;
9134 }
9135 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9136 break;
     // Vector integer multiply-add cases: each picks the MLA opcode and the
     // 64- or 128-bit FPR class for one vector type. The literal 1 / 2 passed
     // to the helper presumably names the Root operand produced by the MUL
     // (compare the "ADD R,I,C" / "ADD R,C,I" comments above) -- TODO confirm.
9138 Opc = AArch64::MLAv8i8;
9139 RC = &AArch64::FPR64RegClass;
9140 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9141 break;
9143 Opc = AArch64::MLAv8i8;
9144 RC = &AArch64::FPR64RegClass;
9145 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9146 break;
9148 Opc = AArch64::MLAv16i8;
9149 RC = &AArch64::FPR128RegClass;
9150 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9151 break;
9153 Opc = AArch64::MLAv16i8;
9154 RC = &AArch64::FPR128RegClass;
9155 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9156 break;
9158 Opc = AArch64::MLAv4i16;
9159 RC = &AArch64::FPR64RegClass;
9160 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9161 break;
9163 Opc = AArch64::MLAv4i16;
9164 RC = &AArch64::FPR64RegClass;
9165 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9166 break;
9168 Opc = AArch64::MLAv8i16;
9169 RC = &AArch64::FPR128RegClass;
9170 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9171 break;
9173 Opc = AArch64::MLAv8i16;
9174 RC = &AArch64::FPR128RegClass;
9175 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9176 break;
9178 Opc = AArch64::MLAv2i32;
9179 RC = &AArch64::FPR64RegClass;
9180 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9181 break;
9183 Opc = AArch64::MLAv2i32;
9184 RC = &AArch64::FPR64RegClass;
9185 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9186 break;
9188 Opc = AArch64::MLAv4i32;
9189 RC = &AArch64::FPR128RegClass;
9190 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9191 break;
9193 Opc = AArch64::MLAv4i32;
9194 RC = &AArch64::FPR128RegClass;
9195 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9196 break;
9197
     // Vector integer multiply-subtract cases. The "1" variants pass a NEG
     // opcode and InstrIdxForVirtReg to genFusedMultiplyAccNeg together with
     // an MLA opcode; the "2" variants use the MLS opcode directly.
9199 Opc = AArch64::MLAv8i8;
9200 RC = &AArch64::FPR64RegClass;
9201 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9202 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
9203 RC);
9204 break;
9206 Opc = AArch64::MLSv8i8;
9207 RC = &AArch64::FPR64RegClass;
9208 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9209 break;
9211 Opc = AArch64::MLAv16i8;
9212 RC = &AArch64::FPR128RegClass;
9213 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9214 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
9215 RC);
9216 break;
9218 Opc = AArch64::MLSv16i8;
9219 RC = &AArch64::FPR128RegClass;
9220 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9221 break;
9223 Opc = AArch64::MLAv4i16;
9224 RC = &AArch64::FPR64RegClass;
9225 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9226 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
9227 RC);
9228 break;
9230 Opc = AArch64::MLSv4i16;
9231 RC = &AArch64::FPR64RegClass;
9232 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9233 break;
9235 Opc = AArch64::MLAv8i16;
9236 RC = &AArch64::FPR128RegClass;
9237 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9238 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
9239 RC);
9240 break;
9242 Opc = AArch64::MLSv8i16;
9243 RC = &AArch64::FPR128RegClass;
9244 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9245 break;
9247 Opc = AArch64::MLAv2i32;
9248 RC = &AArch64::FPR64RegClass;
9249 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9250 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
9251 RC);
9252 break;
9254 Opc = AArch64::MLSv2i32;
9255 RC = &AArch64::FPR64RegClass;
9256 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9257 break;
9259 Opc = AArch64::MLAv4i32;
9260 RC = &AArch64::FPR128RegClass;
9261 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9262 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
9263 RC);
9264 break;
9266 Opc = AArch64::MLSv4i32;
9267 RC = &AArch64::FPR128RegClass;
9268 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9269 break;
9270
     // Same multiply-add selection as above, but with the "_indexed" MLA
     // opcodes and the genFusedMultiplyIdx helper.
9272 Opc = AArch64::MLAv4i16_indexed;
9273 RC = &AArch64::FPR64RegClass;
9274 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9275 break;
9277 Opc = AArch64::MLAv4i16_indexed;
9278 RC = &AArch64::FPR64RegClass;
9279 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9280 break;
9282 Opc = AArch64::MLAv8i16_indexed;
9283 RC = &AArch64::FPR128RegClass;
9284 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9285 break;
9287 Opc = AArch64::MLAv8i16_indexed;
9288 RC = &AArch64::FPR128RegClass;
9289 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9290 break;
9292 Opc = AArch64::MLAv2i32_indexed;
9293 RC = &AArch64::FPR64RegClass;
9294 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9295 break;
9297 Opc = AArch64::MLAv2i32_indexed;
9298 RC = &AArch64::FPR64RegClass;
9299 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9300 break;
9302 Opc = AArch64::MLAv4i32_indexed;
9303 RC = &AArch64::FPR128RegClass;
9304 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9305 break;
9307 Opc = AArch64::MLAv4i32_indexed;
9308 RC = &AArch64::FPR128RegClass;
9309 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9310 break;
9311
     // Indexed multiply-subtract: NEG + indexed MLA for the "1" variants,
     // indexed MLS for the "2" variants.
9313 Opc = AArch64::MLAv4i16_indexed;
9314 RC = &AArch64::FPR64RegClass;
9315 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9316 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
9317 RC);
9318 break;
9320 Opc = AArch64::MLSv4i16_indexed;
9321 RC = &AArch64::FPR64RegClass;
9322 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9323 break;
9325 Opc = AArch64::MLAv8i16_indexed;
9326 RC = &AArch64::FPR128RegClass;
9327 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9328 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
9329 RC);
9330 break;
9332 Opc = AArch64::MLSv8i16_indexed;
9333 RC = &AArch64::FPR128RegClass;
9334 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9335 break;
9337 Opc = AArch64::MLAv2i32_indexed;
9338 RC = &AArch64::FPR64RegClass;
9339 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9340 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
9341 RC);
9342 break;
9344 Opc = AArch64::MLSv2i32_indexed;
9345 RC = &AArch64::FPR64RegClass;
9346 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9347 break;
9349 Opc = AArch64::MLAv4i32_indexed;
9350 RC = &AArch64::FPR128RegClass;
9351 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9352 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
9353 RC);
9354 break;
9356 Opc = AArch64::MLSv4i32_indexed;
9357 RC = &AArch64::FPR128RegClass;
9358 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9359 break;
9360
9361 // Floating Point Support
     // Scalar FMADD for half / single / double, multiply in operand 1 ...
9363 Opc = AArch64::FMADDHrrr;
9364 RC = &AArch64::FPR16RegClass;
9365 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9366 break;
9368 Opc = AArch64::FMADDSrrr;
9369 RC = &AArch64::FPR32RegClass;
9370 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9371 break;
9373 Opc = AArch64::FMADDDrrr;
9374 RC = &AArch64::FPR64RegClass;
9375 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9376 break;
9377
     // ... and in operand 2.
9379 Opc = AArch64::FMADDHrrr;
9380 RC = &AArch64::FPR16RegClass;
9381 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9382 break;
9384 Opc = AArch64::FMADDSrrr;
9385 RC = &AArch64::FPR32RegClass;
9386 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9387 break;
9389 Opc = AArch64::FMADDDrrr;
9390 RC = &AArch64::FPR64RegClass;
9391 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9392 break;
9393
     // NOTE(review): from here on many genFusedMultiply calls end in "RC,"
     // because the line carrying their final argument is missing from this
     // listing.
9395 Opc = AArch64::FMLAv1i32_indexed;
9396 RC = &AArch64::FPR32RegClass;
9397 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9399 break;
9401 Opc = AArch64::FMLAv1i32_indexed;
9402 RC = &AArch64::FPR32RegClass;
9403 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9405 break;
9406
9408 Opc = AArch64::FMLAv1i64_indexed;
9409 RC = &AArch64::FPR64RegClass;
9410 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9412 break;
9414 Opc = AArch64::FMLAv1i64_indexed;
9415 RC = &AArch64::FPR64RegClass;
9416 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9418 break;
9419
9421 RC = &AArch64::FPR64RegClass;
9422 Opc = AArch64::FMLAv4i16_indexed;
9423 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9425 break;
9427 RC = &AArch64::FPR64RegClass;
9428 Opc = AArch64::FMLAv4f16;
9429 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9431 break;
9433 RC = &AArch64::FPR64RegClass;
9434 Opc = AArch64::FMLAv4i16_indexed;
9435 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9437 break;
9439 RC = &AArch64::FPR64RegClass;
9440 Opc = AArch64::FMLAv4f16;
9441 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9443 break;
9444
9447 RC = &AArch64::FPR64RegClass;
9449 Opc = AArch64::FMLAv2i32_indexed;
9450 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9452 } else {
9453 Opc = AArch64::FMLAv2f32;
9454 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9456 }
9457 break;
9460 RC = &AArch64::FPR64RegClass;
9462 Opc = AArch64::FMLAv2i32_indexed;
9463 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9465 } else {
9466 Opc = AArch64::FMLAv2f32;
9467 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9469 }
9470 break;
9471
9473 RC = &AArch64::FPR128RegClass;
9474 Opc = AArch64::FMLAv8i16_indexed;
9475 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9477 break;
9479 RC = &AArch64::FPR128RegClass;
9480 Opc = AArch64::FMLAv8f16;
9481 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9483 break;
9485 RC = &AArch64::FPR128RegClass;
9486 Opc = AArch64::FMLAv8i16_indexed;
9487 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9489 break;
9491 RC = &AArch64::FPR128RegClass;
9492 Opc = AArch64::FMLAv8f16;
9493 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9495 break;
9496
9499 RC = &AArch64::FPR128RegClass;
9501 Opc = AArch64::FMLAv2i64_indexed;
9502 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9504 } else {
9505 Opc = AArch64::FMLAv2f64;
9506 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9508 }
9509 break;
9512 RC = &AArch64::FPR128RegClass;
9514 Opc = AArch64::FMLAv2i64_indexed;
9515 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9517 } else {
9518 Opc = AArch64::FMLAv2f64;
9519 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9521 }
9522 break;
9523
9526 RC = &AArch64::FPR128RegClass;
9528 Opc = AArch64::FMLAv4i32_indexed;
9529 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9531 } else {
9532 Opc = AArch64::FMLAv4f32;
9533 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9535 }
9536 break;
9537
9540 RC = &AArch64::FPR128RegClass;
9542 Opc = AArch64::FMLAv4i32_indexed;
9543 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9545 } else {
9546 Opc = AArch64::FMLAv4f32;
9547 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9549 }
9550 break;
9551
     // Scalar FNMSUB / FNMADD (multiply in operand 1) and FMSUB (multiply in
     // operand 2) for half / single / double.
9553 Opc = AArch64::FNMSUBHrrr;
9554 RC = &AArch64::FPR16RegClass;
9555 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9556 break;
9558 Opc = AArch64::FNMSUBSrrr;
9559 RC = &AArch64::FPR32RegClass;
9560 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9561 break;
9563 Opc = AArch64::FNMSUBDrrr;
9564 RC = &AArch64::FPR64RegClass;
9565 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9566 break;
9567
9569 Opc = AArch64::FNMADDHrrr;
9570 RC = &AArch64::FPR16RegClass;
9571 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9572 break;
9574 Opc = AArch64::FNMADDSrrr;
9575 RC = &AArch64::FPR32RegClass;
9576 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9577 break;
9579 Opc = AArch64::FNMADDDrrr;
9580 RC = &AArch64::FPR64RegClass;
9581 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9582 break;
9583
9585 Opc = AArch64::FMSUBHrrr;
9586 RC = &AArch64::FPR16RegClass;
9587 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9588 break;
9590 Opc = AArch64::FMSUBSrrr;
9591 RC = &AArch64::FPR32RegClass;
9592 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9593 break;
9595 Opc = AArch64::FMSUBDrrr;
9596 RC = &AArch64::FPR64RegClass;
9597 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9598 break;
9599
9601 Opc = AArch64::FMLSv1i32_indexed;
9602 RC = &AArch64::FPR32RegClass;
9603 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9605 break;
9606
9608 Opc = AArch64::FMLSv1i64_indexed;
9609 RC = &AArch64::FPR64RegClass;
9610 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9612 break;
9613
     // FNEG-based cases (this one and the four similar ones below): Root's
     // operand 2 is negated into a new register by an FNEG that is pushed
     // first, and the address of that register is handed to genFusedMultiply
     // together with an FMLA opcode.
9616 RC = &AArch64::FPR64RegClass;
9617 Register NewVR = MRI.createVirtualRegister(RC);
9618 MachineInstrBuilder MIB1 =
9619 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)
9620 .add(Root.getOperand(2));
9621 InsInstrs.push_back(MIB1);
9622 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9624 Opc = AArch64::FMLAv4f16;
9625 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9626 FMAInstKind::Accumulator, &NewVR);
9627 } else {
9628 Opc = AArch64::FMLAv4i16_indexed;
9629 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9630 FMAInstKind::Indexed, &NewVR);
9631 }
9632 break;
9633 }
9635 RC = &AArch64::FPR64RegClass;
9636 Opc = AArch64::FMLSv4f16;
9637 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9639 break;
9641 RC = &AArch64::FPR64RegClass;
9642 Opc = AArch64::FMLSv4i16_indexed;
9643 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9645 break;
9646
9649 RC = &AArch64::FPR64RegClass;
9651 Opc = AArch64::FMLSv2i32_indexed;
9652 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9654 } else {
9655 Opc = AArch64::FMLSv2f32;
9656 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9658 }
9659 break;
9660
9663 RC = &AArch64::FPR128RegClass;
9664 Register NewVR = MRI.createVirtualRegister(RC);
9665 MachineInstrBuilder MIB1 =
9666 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)
9667 .add(Root.getOperand(2));
9668 InsInstrs.push_back(MIB1);
9669 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9671 Opc = AArch64::FMLAv8f16;
9672 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9673 FMAInstKind::Accumulator, &NewVR);
9674 } else {
9675 Opc = AArch64::FMLAv8i16_indexed;
9676 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9677 FMAInstKind::Indexed, &NewVR);
9678 }
9679 break;
9680 }
9682 RC = &AArch64::FPR128RegClass;
9683 Opc = AArch64::FMLSv8f16;
9684 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9686 break;
9688 RC = &AArch64::FPR128RegClass;
9689 Opc = AArch64::FMLSv8i16_indexed;
9690 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9692 break;
9693
9696 RC = &AArch64::FPR128RegClass;
9698 Opc = AArch64::FMLSv2i64_indexed;
9699 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9701 } else {
9702 Opc = AArch64::FMLSv2f64;
9703 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9705 }
9706 break;
9707
9710 RC = &AArch64::FPR128RegClass;
9712 Opc = AArch64::FMLSv4i32_indexed;
9713 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9715 } else {
9716 Opc = AArch64::FMLSv4f32;
9717 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9719 }
9720 break;
9723 RC = &AArch64::FPR64RegClass;
9724 Register NewVR = MRI.createVirtualRegister(RC);
9725 MachineInstrBuilder MIB1 =
9726 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)
9727 .add(Root.getOperand(2));
9728 InsInstrs.push_back(MIB1);
9729 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9731 Opc = AArch64::FMLAv2i32_indexed;
9732 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9733 FMAInstKind::Indexed, &NewVR);
9734 } else {
9735 Opc = AArch64::FMLAv2f32;
9736 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9737 FMAInstKind::Accumulator, &NewVR);
9738 }
9739 break;
9740 }
9743 RC = &AArch64::FPR128RegClass;
9744 Register NewVR = MRI.createVirtualRegister(RC);
9745 MachineInstrBuilder MIB1 =
9746 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)
9747 .add(Root.getOperand(2));
9748 InsInstrs.push_back(MIB1);
9749 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9751 Opc = AArch64::FMLAv4i32_indexed;
9752 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9753 FMAInstKind::Indexed, &NewVR);
9754 } else {
9755 Opc = AArch64::FMLAv4f32;
9756 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9757 FMAInstKind::Accumulator, &NewVR);
9758 }
9759 break;
9760 }
9763 RC = &AArch64::FPR128RegClass;
9764 Register NewVR = MRI.createVirtualRegister(RC);
9765 MachineInstrBuilder MIB1 =
9766 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)
9767 .add(Root.getOperand(2));
9768 InsInstrs.push_back(MIB1);
9769 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9771 Opc = AArch64::FMLAv2i64_indexed;
9772 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9773 FMAInstKind::Indexed, &NewVR);
9774 } else {
9775 Opc = AArch64::FMLAv2f64;
9776 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9777 FMAInstKind::Accumulator, &NewVR);
9778 }
9779 break;
9780 }
     // genIndexedMultiply cases: MUL is not assigned here, so only Root is
     // queued for deletion by the epilogue. NOTE(review): the condition that
     // chooses IdxDupOp is on a line missing from this listing; only its
     // ": 2" alternative is visible.
9783 unsigned IdxDupOp =
9785 : 2;
9786 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
9787 &AArch64::FPR128RegClass, MRI);
9788 break;
9789 }
9792 unsigned IdxDupOp =
9794 : 2;
9795 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
9796 &AArch64::FPR128RegClass, MRI);
9797 break;
9798 }
9801 unsigned IdxDupOp =
9803 : 2;
     // The two 16-bit-element forms use FPR128_loRegClass rather than
     // FPR128RegClass.
9804 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
9805 &AArch64::FPR128_loRegClass, MRI);
9806 break;
9807 }
9810 unsigned IdxDupOp =
9812 : 2;
9813 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
9814 &AArch64::FPR128RegClass, MRI);
9815 break;
9816 }
9819 unsigned IdxDupOp =
9821 : 2;
9822 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
9823 &AArch64::FPR128_loRegClass, MRI);
9824 break;
9825 }
9827 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
9828 break;
9829 }
     // Gather-lane cases: the last argument (4, 8 or 16) differs per case.
     // generateGatherLanePattern also receives DelInstrs, yet the epilogue
     // below still appends Root -- NOTE(review): confirm the helper does not
     // queue Root itself.
9831 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9832 Pattern, 4);
9833 break;
9834 }
9836 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9837 Pattern, 8);
9838 break;
9839 }
9841 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9842 Pattern, 16);
9843 break;
9844 }
9845
9846 } // end switch (Pattern)
9847 // Record MUL and ADD/SUB for deletion
9848 if (MUL)
9849 DelInstrs.push_back(MUL);
9850 DelInstrs.push_back(&Root);
9851
9852 // Set the flags on the inserted instructions to be the merged flags of the
9853 // instructions that we have combined.
     // Without a folded MUL, Root's own flags are applied unchanged.
9854 uint32_t Flags = Root.getFlags();
9855 if (MUL)
9856 Flags = Root.mergeFlagsWith(*MUL);
9857 for (auto *MI : InsInstrs)
9858 MI->setFlags(Flags);
9859}
9860
9861/// Replace a csinc-branch sequence by a simple conditional branch
9862///
9863/// Examples:
9864/// 1. \code
9865/// csinc w9, wzr, wzr, <condition code>
9866/// tbnz w9, #0, 0x44
9867/// \endcode
9868/// to
9869/// \code
9870/// b.<inverted condition code>
9871/// \endcode
9872///
9873/// 2. \code
9874/// csinc w9, wzr, wzr, <condition code>
9875/// tbz w9, #0, 0x44
9876/// \endcode
9877/// to
9878/// \code
9879/// b.<condition code>
9880/// \endcode
9881///
9882/// Replace a compare and branch sequence by a TBZ/TBNZ instruction when the
9883/// compare's constant operand is a power of 2.
9884///
9885/// Examples:
9886/// \code
9887/// and w8, w8, #0x400
9888/// cbnz w8, L1
9889/// \endcode
9890/// to
9891/// \code
9892/// tbnz w8, #10, L1
9893/// \endcode
9894///
9895/// \param MI Conditional Branch
9896/// \return True when the simple conditional branch is generated
9897///
/// NOTE(review): the function's signature line is not visible in this
/// listing, and three statement heads inside the body are missing as well
/// (flagged where they occur).
9899 bool IsNegativeBranch = false;
9900 bool IsTestAndBranch = false;
     // Operand index of the branch target within MI: 1 for CBZ/CBNZ, 2 for
     // TBZ/TBNZ (set in the switch below).
9901 unsigned TargetBBInMI = 0;
9902 switch (MI.getOpcode()) {
9903 default:
9904 llvm_unreachable("Unknown branch instruction?");
     // These branch forms are never rewritten here.
9905 case AArch64::Bcc:
9906 case AArch64::CBWPri:
9907 case AArch64::CBXPri:
9908 case AArch64::CBBAssertExt:
9909 case AArch64::CBHAssertExt:
9910 case AArch64::CBWPrr:
9911 case AArch64::CBXPrr:
9912 return false;
9913 case AArch64::CBZW:
9914 case AArch64::CBZX:
9915 TargetBBInMI = 1;
9916 break;
9917 case AArch64::CBNZW:
9918 case AArch64::CBNZX:
9919 TargetBBInMI = 1;
9920 IsNegativeBranch = true;
9921 break;
9922 case AArch64::TBZW:
9923 case AArch64::TBZX:
9924 TargetBBInMI = 2;
9925 IsTestAndBranch = true;
9926 break;
9927 case AArch64::TBNZW:
9928 case AArch64::TBNZX:
9929 TargetBBInMI = 2;
9930 IsNegativeBranch = true;
9931 IsTestAndBranch = true;
9932 break;
9933 }
9934 // So we increment a zero register and test for bits other
9935 // than bit 0? Conservatively bail out in case the verifier
9936 // missed this case.
9937 if (IsTestAndBranch && MI.getOperand(1).getImm())
9938 return false;
9939
9940 // Find Definition.
9941 assert(MI.getParent() && "Incomplete machine instruction\n");
9942 MachineBasicBlock *MBB = MI.getParent();
9943 MachineFunction *MF = MBB->getParent();
9944 MachineRegisterInfo *MRI = &MF->getRegInfo();
     // Only a virtual tested register is followed back to its definition.
9945 Register VReg = MI.getOperand(0).getReg();
9946 if (!VReg.isVirtual())
9947 return false;
9948
9949 MachineInstr *DefMI = MRI->getVRegDef(VReg);
9950
9951 // Look through COPY instructions to find definition.
     // The walk stops (and the optimization is abandoned) as soon as a copy
     // source has more than one non-debug use or more than one definition.
9952 while (DefMI->isCopy()) {
9953 Register CopyVReg = DefMI->getOperand(1).getReg();
9954 if (!MRI->hasOneNonDBGUse(CopyVReg))
9955 return false;
9956 if (!MRI->hasOneDef(CopyVReg))
9957 return false;
9958 DefMI = MRI->getVRegDef(CopyVReg);
9959 }
9960
9961 switch (DefMI->getOpcode()) {
9962 default:
9963 return false;
9964 // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
9965 case AArch64::ANDWri:
9966 case AArch64::ANDXri: {
     // Only CBZ/CBNZ are folded; the AND must be in the branch's block and
     // the tested register must have a single non-debug use.
9967 if (IsTestAndBranch)
9968 return false;
9969 if (DefMI->getParent() != MBB)
9970 return false;
9971 if (!MRI->hasOneNonDBGUse(VReg))
9972 return false;
9973
9974 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
     // NOTE(review): the line that declares Mask and begins this call is
     // missing from the listing; only the argument list is visible.
9976 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
9977 if (!isPowerOf2_64(Mask))
9978 return false;
9979
9980 MachineOperand &MO = DefMI->getOperand(1);
9981 Register NewReg = MO.getReg();
9982 if (!NewReg.isVirtual())
9983 return false;
9984
9985 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
9986
9987 MachineBasicBlock &RefToMBB = *MBB;
     // Operand 1 is the target here because only CBZ/CBNZ reach this point.
9988 MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
9989 DebugLoc DL = MI.getDebugLoc();
     // Bit index of the single set mask bit; it selects the W or X form.
9990 unsigned Imm = Log2_64(Mask);
9991 unsigned Opc = (Imm < 32)
9992 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
9993 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
9994 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
9995 .addReg(NewReg)
9996 .addImm(Imm)
9997 .addMBB(TBB);
9998 // The AND's source register is now also read by the new TBZ/TBNZ, so the
     // AND must no longer mark it as killed.
9999 MO.setIsKill(false);
10000
10001 // For immediate smaller than 32, we need to use the 32-bit
10002 // variant (W) in all cases. Indeed the 64-bit variant does not
10003 // allow to encode them.
10004 // Therefore, if the input register is 64-bit, we need to take the
10005 // 32-bit sub-part.
10006 if (!Is32Bit && Imm < 32)
10007 NewMI->getOperand(0).setSubReg(AArch64::sub_32);
10008 MI.eraseFromParent();
10009 return true;
10010 }
10011 // Look for CSINC
10012 case AArch64::CSINCWr:
10013 case AArch64::CSINCXr: {
     // Both CSINC inputs must be the zero register (WZR/WZR or XZR/XZR).
10014 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
10015 DefMI->getOperand(2).getReg() == AArch64::WZR) &&
10016 !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
10017 DefMI->getOperand(2).getReg() == AArch64::XZR))
10018 return false;
10019
     // Give up when the query below finds an NZCV def operand on DefMI.
     // NOTE(review): the meaning of the trailing `true` argument is not
     // visible here (presumably it restricts the search to dead defs).
10020 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
10021 true) != -1)
10022 return false;
10023
10024 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
10025 // Convert only when the condition code is not modified between
10026 // the CSINC and the branch. The CC may be used by other
10027 // instructions in between.
     // NOTE(review): the `if (...)` guarding this return is missing from the
     // listing.
10029 return false;
10030 MachineBasicBlock &RefToMBB = *MBB;
10031 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
10032 DebugLoc DL = MI.getDebugLoc();
     // NOTE(review): the statement controlled by this `if` is on a line that
     // is missing from the listing (the numbering jumps from 10033 to 10035).
     // Per the header comment the negative form branches on the inverted
     // condition, and the Bcc below is emitted in both cases.
10033 if (IsNegativeBranch)
10035 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
10036 MI.eraseFromParent();
10037 return true;
10038 }
10039 }
10040}
10041
10042std::pair<unsigned, unsigned>
10043AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
10044 const unsigned Mask = AArch64II::MO_FRAGMENT;
10045 return std::make_pair(TF & Mask, TF & ~Mask);
10046}
10047
// Returns an ArrayRef over a function-local static table that pairs target
// flag values with the names used to serialize them.
// NOTE(review): presumably these are the values selected by the MO_FRAGMENT
// mask in decomposeMachineOperandsTargetFlags -- confirm. The line carrying
// this function's return type is not visible in this listing.
10049AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
10050 using namespace AArch64II;
10051
10052 static const std::pair<unsigned, const char *> TargetFlags[] = {
10053 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
10054 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
10055 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
10056 {MO_HI12, "aarch64-hi12"}};
10057 return ArrayRef(TargetFlags);
10058}
10059
10061AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
10062 using namespace AArch64II;
10063
10064 static const std::pair<unsigned, const char *> TargetFlags[] = {
10065 {MO_COFFSTUB, "aarch64-coffstub"},
10066 {MO_GOT, "aarch64-got"},
10067 {MO_NC, "aarch64-nc"},
10068 {MO_S, "aarch64-s"},
10069 {MO_TLS, "aarch64-tls"},
10070 {MO_DLLIMPORT, "aarch64-dllimport"},
10071 {MO_PREL, "aarch64-prel"},
10072 {MO_TAGGED, "aarch64-tagged"},
10073 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
10074 };
10075 return ArrayRef(TargetFlags);
10076}
10077
10079AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
10080 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
10081 {{MOSuppressPair, "aarch64-suppress-pair"},
10082 {MOStridedAccess, "aarch64-strided-access"}};
10083 return ArrayRef(TargetFlags);
10084}
10085
/// Constants defining how certain sequences should be outlined.
/// This encompasses how an outlined function should be called, and what kind of
/// frame should be emitted for that outlined function.
///
/// \p MachineOutlinerDefault implies that the function should be called with
/// a save and restore of LR to the stack.
///
/// That is,
///
///   I1     Save LR                    OUTLINED_FUNCTION:
///   I2 --> BL OUTLINED_FUNCTION       I1
///   I3     Restore LR                 I2
///                                     I3
///                                     RET
///
/// * Call construction overhead: 3 (save + BL + restore)
/// * Frame construction overhead: 1 (ret)
/// * Requires stack fixups? Yes
///
/// \p MachineOutlinerTailCall implies that the function is being created from
/// a sequence of instructions ending in a return.
///
/// That is,
///
///   I1                                OUTLINED_FUNCTION:
///   I2 --> B OUTLINED_FUNCTION        I1
///   RET                               I2
///                                     RET
///
/// * Call construction overhead: 1 (B)
/// * Frame construction overhead: 0 (Return included in sequence)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerNoLRSave implies that the function should be called using
/// a BL instruction, but doesn't require LR to be saved and restored. This
/// happens when LR is known to be dead.
///
/// That is,
///
///   I1                                OUTLINED_FUNCTION:
///   I2 --> BL OUTLINED_FUNCTION       I1
///   I3                                I2
///                                     I3
///                                     RET
///
/// * Call construction overhead: 1 (BL)
/// * Frame construction overhead: 1 (RET)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerThunk implies that the function is being created from
/// a sequence of instructions ending in a call. The outlined function is
/// called with a BL instruction, and the outlined function tail-calls the
/// original call destination.
///
/// That is,
///
///   I1                                OUTLINED_FUNCTION:
///   I2 --> BL OUTLINED_FUNCTION       I1
///   BL f                              I2
///                                     B f
///
/// * Call construction overhead: 1 (BL)
/// * Frame construction overhead: 0
/// * Requires stack fixups? No
///
/// \p MachineOutlinerRegSave implies that the function should be called with a
/// save and restore of LR to an available register. This allows us to avoid
/// stack fixups. Note that this outlining variant is compatible with the
/// NoLRSave case.
///
/// That is,
///
///   I1     Save LR                    OUTLINED_FUNCTION:
///   I2 --> BL OUTLINED_FUNCTION       I1
///   I3     Restore LR                 I2
///                                     I3
///                                     RET
///
/// * Call construction overhead: 3 (save + BL + restore)
/// * Frame construction overhead: 1 (ret)
/// * Requires stack fixups? No
enum MachineOutlinerClass {
  MachineOutlinerDefault,  ///< Emit a save, restore, call, and return.
  MachineOutlinerTailCall, ///< Only emit a branch.
  MachineOutlinerNoLRSave, ///< Emit a call and return.
  MachineOutlinerThunk,    ///< Emit a call and tail-call.
  MachineOutlinerRegSave   ///< Same as default, but save to a register.
};
10173
10179
10181AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
10182 MachineFunction *MF = C.getMF();
10183 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
10184 const AArch64RegisterInfo *ARI =
10185 static_cast<const AArch64RegisterInfo *>(&TRI);
10186 // Check if there is an available register across the sequence that we can
10187 // use.
10188 for (unsigned Reg : AArch64::GPR64RegClass) {
10189 if (!ARI->isReservedReg(*MF, Reg) &&
10190 Reg != AArch64::LR && // LR is not reserved, but don't use it.
10191 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
10192 Reg != AArch64::X17 && // Ditto for X17.
10193 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
10194 C.isAvailableInsideSeq(Reg, TRI))
10195 return Reg;
10196 }
10197 return Register();
10198}
10199
10200static bool
10202 const outliner::Candidate &b) {
10203 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
10204 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
10205
10206 return MFIa->getSignReturnAddressCondition() ==
10208}
10209
10210static bool
10212 const outliner::Candidate &b) {
10213 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
10214 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
10215
10216 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
10217}
10218
10220 const outliner::Candidate &b) {
10221 const AArch64Subtarget &SubtargetA =
10223 const AArch64Subtarget &SubtargetB =
10224 b.getMF()->getSubtarget<AArch64Subtarget>();
10225 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
10226}
10227
10228std::optional<std::unique_ptr<outliner::OutlinedFunction>>
10229AArch64InstrInfo::getOutliningCandidateInfo(
10230 const MachineModuleInfo &MMI,
10231 std::vector<outliner::Candidate> &RepeatedSequenceLocs,
10232 unsigned MinRepeats) const {
10233 unsigned SequenceSize = 0;
10234 for (auto &MI : RepeatedSequenceLocs[0])
10235 SequenceSize += getInstSizeInBytes(MI);
10236
10237 unsigned NumBytesToCreateFrame = 0;
10238
10239 // Avoid splitting ADRP ADD/LDR pair into outlined functions.
10240 // These instructions are fused together by the scheduler.
10241 // Any candidate where ADRP is the last instruction should be rejected
10242 // as that will lead to splitting ADRP pair.
10243 MachineInstr &LastMI = RepeatedSequenceLocs[0].back();
10244 MachineInstr &FirstMI = RepeatedSequenceLocs[0].front();
10245 if (LastMI.getOpcode() == AArch64::ADRP &&
10246 (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_PAGE) != 0 &&
10247 (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_GOT) != 0) {
10248 return std::nullopt;
10249 }
10250
10251 // Similarly any candidate where the first instruction is ADD/LDR with a
10252 // page offset should be rejected to avoid ADRP splitting.
10253 if ((FirstMI.getOpcode() == AArch64::ADDXri ||
10254 FirstMI.getOpcode() == AArch64::LDRXui) &&
10255 (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_PAGEOFF) != 0 &&
10256 (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) != 0) {
10257 return std::nullopt;
10258 }
10259
10260 // We only allow outlining for functions having exactly matching return
10261 // address signing attributes, i.e., all share the same value for the
10262 // attribute "sign-return-address" and all share the same type of key they
10263 // are signed with.
10264 // Additionally we require all functions to simultaneously either support
10265 // v8.3a features or not. Otherwise an outlined function could get signed
10266 // using dedicated v8.3 instructions and a call from a function that doesn't
10267 // support v8.3 instructions would therefore be invalid.
10268 if (std::adjacent_find(
10269 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
10270 [](const outliner::Candidate &a, const outliner::Candidate &b) {
10271 // Return true if a and b are non-equal w.r.t. return address
10272 // signing or support of v8.3a features
10273 if (outliningCandidatesSigningScopeConsensus(a, b) &&
10274 outliningCandidatesSigningKeyConsensus(a, b) &&
10275 outliningCandidatesV8_3OpsConsensus(a, b)) {
10276 return false;
10277 }
10278 return true;
10279 }) != RepeatedSequenceLocs.end()) {
10280 return std::nullopt;
10281 }
10282
10283 // Since at this point all candidates agree on their return address signing
10284 // picking just one is fine. If the candidate functions potentially sign their
10285 // return addresses, the outlined function should do the same. Note that in
10286 // the case of "sign-return-address"="non-leaf" this is an assumption: It is
10287 // not certainly true that the outlined function will have to sign its return
10288 // address but this decision is made later, when the decision to outline
10289 // has already been made.
10290 // The same holds for the number of additional instructions we need: On
10291 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
10292 // necessary. However, at this point we don't know if the outlined function
10293 // will have a RET instruction so we assume the worst.
10294 const TargetRegisterInfo &TRI = getRegisterInfo();
10295 // Performing a tail call may require extra checks when PAuth is enabled.
10296 // If PAuth is disabled, set it to zero for uniformity.
10297 unsigned NumBytesToCheckLRInTCEpilogue = 0;
10298 const auto RASignCondition = RepeatedSequenceLocs[0]
10299 .getMF()
10300 ->getInfo<AArch64FunctionInfo>()
10301 ->getSignReturnAddressCondition();
10302 if (RASignCondition != SignReturnAddress::None) {
10303 // One PAC and one AUT instructions
10304 NumBytesToCreateFrame += 8;
10305
10306 // PAuth is enabled - set extra tail call cost, if any.
10307 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
10308 *RepeatedSequenceLocs[0].getMF());
10309 NumBytesToCheckLRInTCEpilogue =
10311 // Checking the authenticated LR value may significantly impact
10312 // SequenceSize, so account for it for more precise results.
10313 if (isTailCallReturnInst(RepeatedSequenceLocs[0].back()))
10314 SequenceSize += NumBytesToCheckLRInTCEpilogue;
10315
10316 // We have to check if sp modifying instructions would get outlined.
10317 // If so we only allow outlining if sp is unchanged overall, so matching
10318 // sub and add instructions are okay to outline, all other sp modifications
10319 // are not
10320 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
10321 int SPValue = 0;
10322 for (auto &MI : C) {
10323 if (MI.modifiesRegister(AArch64::SP, &TRI)) {
10324 switch (MI.getOpcode()) {
10325 case AArch64::ADDXri:
10326 case AArch64::ADDWri:
10327 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10328 assert(MI.getOperand(2).isImm() &&
10329 "Expected operand to be immediate");
10330 assert(MI.getOperand(1).isReg() &&
10331 "Expected operand to be a register");
10332 // Check if the add just increments sp. If so, we search for
10333 // matching sub instructions that decrement sp. If not, the
10334 // modification is illegal
10335 if (MI.getOperand(1).getReg() == AArch64::SP)
10336 SPValue += MI.getOperand(2).getImm();
10337 else
10338 return true;
10339 break;
10340 case AArch64::SUBXri:
10341 case AArch64::SUBWri:
10342 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10343 assert(MI.getOperand(2).isImm() &&
10344 "Expected operand to be immediate");
10345 assert(MI.getOperand(1).isReg() &&
10346 "Expected operand to be a register");
10347 // Check if the sub just decrements sp. If so, we search for
10348 // matching add instructions that increment sp. If not, the
10349 // modification is illegal
10350 if (MI.getOperand(1).getReg() == AArch64::SP)
10351 SPValue -= MI.getOperand(2).getImm();
10352 else
10353 return true;
10354 break;
10355 default:
10356 return true;
10357 }
10358 }
10359 }
10360 if (SPValue)
10361 return true;
10362 return false;
10363 };
10364 // Remove candidates with illegal stack modifying instructions
10365 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
10366
10367 // If the sequence doesn't have enough candidates left, then we're done.
10368 if (RepeatedSequenceLocs.size() < MinRepeats)
10369 return std::nullopt;
10370 }
10371
10372 // Properties about candidate MBBs that hold for all of them.
10373 unsigned FlagsSetInAll = 0xF;
10374
10375 // Compute liveness information for each candidate, and set FlagsSetInAll.
10376 for (outliner::Candidate &C : RepeatedSequenceLocs)
10377 FlagsSetInAll &= C.Flags;
10378
10379 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
10380
10381 // Helper lambda which sets call information for every candidate.
10382 auto SetCandidateCallInfo =
10383 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
10384 for (outliner::Candidate &C : RepeatedSequenceLocs)
10385 C.setCallInfo(CallID, NumBytesForCall);
10386 };
10387
10388 unsigned FrameID = MachineOutlinerDefault;
10389 NumBytesToCreateFrame += 4;
10390
10391 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
10392 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
10393 });
10394
10395 // We check to see if CFI Instructions are present, and if they are
10396 // we find the number of CFI Instructions in the candidates.
10397 unsigned CFICount = 0;
10398 for (auto &I : RepeatedSequenceLocs[0]) {
10399 if (I.isCFIInstruction())
10400 CFICount++;
10401 }
10402
10403 // We compare the number of found CFI Instructions to the number of CFI
10404 // instructions in the parent function for each candidate. We must check this
10405 // since if we outline one of the CFI instructions in a function, we have to
10406 // outline them all for correctness. If we do not, the address offsets will be
10407 // incorrect between the two sections of the program.
10408 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10409 std::vector<MCCFIInstruction> CFIInstructions =
10410 C.getMF()->getFrameInstructions();
10411
10412 if (CFICount > 0 && CFICount != CFIInstructions.size())
10413 return std::nullopt;
10414 }
10415
10416 // Returns true if an instructions is safe to fix up, false otherwise.
10417 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
10418 if (MI.isCall())
10419 return true;
10420
10421 if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
10422 !MI.readsRegister(AArch64::SP, &TRI))
10423 return true;
10424
10425 // Any modification of SP will break our code to save/restore LR.
10426 // FIXME: We could handle some instructions which add a constant
10427 // offset to SP, with a bit more work.
10428 if (MI.modifiesRegister(AArch64::SP, &TRI))
10429 return false;
10430
10431 // At this point, we have a stack instruction that we might need to
10432 // fix up. We'll handle it if it's a load or store.
10433 if (MI.mayLoadOrStore()) {
10434 const MachineOperand *Base; // Filled with the base operand of MI.
10435 int64_t Offset; // Filled with the offset of MI.
10436 bool OffsetIsScalable;
10437
10438 // Does it allow us to offset the base operand and is the base the
10439 // register SP?
10440 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
10441 !Base->isReg() || Base->getReg() != AArch64::SP)
10442 return false;
10443
10444 // Fixe-up code below assumes bytes.
10445 if (OffsetIsScalable)
10446 return false;
10447
10448 // Find the minimum/maximum offset for this instruction and check
10449 // if fixing it up would be in range.
10450 int64_t MinOffset,
10451 MaxOffset; // Unscaled offsets for the instruction.
10452 // The scale to multiply the offsets by.
10453 TypeSize Scale(0U, false), DummyWidth(0U, false);
10454 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
10455
10456 Offset += 16; // Update the offset to what it would be if we outlined.
10457 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
10458 Offset > MaxOffset * (int64_t)Scale.getFixedValue())
10459 return false;
10460
10461 // It's in range, so we can outline it.
10462 return true;
10463 }
10464
10465 // FIXME: Add handling for instructions like "add x0, sp, #8".
10466
10467 // We can't fix it up, so don't outline it.
10468 return false;
10469 };
10470
10471 // True if it's possible to fix up each stack instruction in this sequence.
10472 // Important for frames/call variants that modify the stack.
10473 bool AllStackInstrsSafe =
10474 llvm::all_of(RepeatedSequenceLocs[0], IsSafeToFixup);
10475
10476 // If the last instruction in any candidate is a terminator, then we should
10477 // tail call all of the candidates.
10478 if (RepeatedSequenceLocs[0].back().isTerminator()) {
10479 FrameID = MachineOutlinerTailCall;
10480 NumBytesToCreateFrame = 0;
10481 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
10482 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
10483 }
10484
10485 else if (LastInstrOpcode == AArch64::BL ||
10486 ((LastInstrOpcode == AArch64::BLR ||
10487 LastInstrOpcode == AArch64::BLRNoIP) &&
10488 !HasBTI)) {
10489 // FIXME: Do we need to check if the code after this uses the value of LR?
10490 FrameID = MachineOutlinerThunk;
10491 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
10492 SetCandidateCallInfo(MachineOutlinerThunk, 4);
10493 }
10494
10495 else {
10496 // We need to decide how to emit calls + frames. We can always emit the same
10497 // frame if we don't need to save to the stack. If we have to save to the
10498 // stack, then we need a different frame.
10499 unsigned NumBytesNoStackCalls = 0;
10500 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
10501
10502 // Check if we have to save LR.
10503 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10504 bool LRAvailable =
10506 ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
10507 : true;
10508 // If we have a noreturn caller, then we're going to be conservative and
10509 // say that we have to save LR. If we don't have a ret at the end of the
10510 // block, then we can't reason about liveness accurately.
10511 //
10512 // FIXME: We can probably do better than always disabling this in
10513 // noreturn functions by fixing up the liveness info.
10514 bool IsNoReturn =
10515 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
10516
10517 // Is LR available? If so, we don't need a save.
10518 if (LRAvailable && !IsNoReturn) {
10519 NumBytesNoStackCalls += 4;
10520 C.setCallInfo(MachineOutlinerNoLRSave, 4);
10521 CandidatesWithoutStackFixups.push_back(C);
10522 }
10523
10524 // Is an unused register available? If so, we won't modify the stack, so
10525 // we can outline with the same frame type as those that don't save LR.
10526 else if (findRegisterToSaveLRTo(C)) {
10527 NumBytesNoStackCalls += 12;
10528 C.setCallInfo(MachineOutlinerRegSave, 12);
10529 CandidatesWithoutStackFixups.push_back(C);
10530 }
10531
10532 // Is SP used in the sequence at all? If not, we don't have to modify
10533 // the stack, so we are guaranteed to get the same frame.
10534 else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
10535 NumBytesNoStackCalls += 12;
10536 C.setCallInfo(MachineOutlinerDefault, 12);
10537 CandidatesWithoutStackFixups.push_back(C);
10538 }
10539
10540 // If we outline this, we need to modify the stack. Pretend we don't
10541 // outline this by saving all of its bytes.
10542 else {
10543 NumBytesNoStackCalls += SequenceSize;
10544 }
10545 }
10546
10547 // If there are no places where we have to save LR, then note that we
10548 // don't have to update the stack. Otherwise, give every candidate the
10549 // default call type, as long as it's safe to do so.
10550 if (!AllStackInstrsSafe ||
10551 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
10552 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
10553 FrameID = MachineOutlinerNoLRSave;
10554 if (RepeatedSequenceLocs.size() < MinRepeats)
10555 return std::nullopt;
10556 } else {
10557 SetCandidateCallInfo(MachineOutlinerDefault, 12);
10558
10559 // Bugzilla ID: 46767
10560 // TODO: Check if fixing up the stack more than once is safe so we can
10561 // outline these.
10562 //
10563 // An outline resulting in a caller that requires stack fixups at the
10564 // callsite to a callee that also requires stack fixups can happen when
10565 // there are no available registers at the candidate callsite for a
10566 // candidate that itself also has calls.
10567 //
10568 // In other words if function_containing_sequence in the following pseudo
10569 // assembly requires that we save LR at the point of the call, but there
10570 // are no available registers: in this case we save using SP and as a
10571 // result the SP offsets requires stack fixups by multiples of 16.
10572 //
10573 // function_containing_sequence:
10574 // ...
10575 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10576 // call OUTLINED_FUNCTION_N
10577 // restore LR from SP
10578 // ...
10579 //
10580 // OUTLINED_FUNCTION_N:
10581 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10582 // ...
10583 // bl foo
10584 // restore LR from SP
10585 // ret
10586 //
10587 // Because the code to handle more than one stack fixup does not
10588 // currently have the proper checks for legality, these cases will assert
10589 // in the AArch64 MachineOutliner. This is because the code to do this
10590 // needs more hardening, testing, better checks that generated code is
10591 // legal, etc and because it is only verified to handle a single pass of
10592 // stack fixup.
10593 //
10594 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
10595 // these cases until they are known to be handled. Bugzilla 46767 is
10596 // referenced in comments at the assert site.
10597 //
10598 // To avoid asserting (or generating non-legal code on noassert builds)
10599 // we remove all candidates which would need more than one stack fixup by
10600 // pruning the cases where the candidate has calls while also having no
10601 // available LR and having no available general purpose registers to copy
10602 // LR to (ie one extra stack save/restore).
10603 //
10604 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10605 erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
10606 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
10607 return (llvm::any_of(C, IsCall)) &&
10608 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
10609 !findRegisterToSaveLRTo(C));
10610 });
10611 }
10612 }
10613
10614 // If we dropped all of the candidates, bail out here.
10615 if (RepeatedSequenceLocs.size() < MinRepeats)
10616 return std::nullopt;
10617 }
10618
10619 // Does every candidate's MBB contain a call? If so, then we might have a call
10620 // in the range.
10621 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10622 // Check if the range contains a call. These require a save + restore of the
10623 // link register.
10624 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
10625 bool ModStackToSaveLR = false;
10626 if (any_of(drop_end(FirstCand),
10627 [](const MachineInstr &MI) { return MI.isCall(); }))
10628 ModStackToSaveLR = true;
10629
10630 // Handle the last instruction separately. If this is a tail call, then the
10631 // last instruction is a call. We don't want to save + restore in this case.
10632 // However, it could be possible that the last instruction is a call without
10633 // it being valid to tail call this sequence. We should consider this as
10634 // well.
10635 else if (FrameID != MachineOutlinerThunk &&
10636 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
10637 ModStackToSaveLR = true;
10638
10639 if (ModStackToSaveLR) {
10640 // We can't fix up the stack. Bail out.
10641 if (!AllStackInstrsSafe)
10642 return std::nullopt;
10643
10644 // Save + restore LR.
10645 NumBytesToCreateFrame += 8;
10646 }
10647 }
10648
10649 // If we have CFI instructions, we can only outline if the outlined section
10650 // can be a tail call
10651 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
10652 return std::nullopt;
10653
10654 return std::make_unique<outliner::OutlinedFunction>(
10655 RepeatedSequenceLocs, SequenceSize, NumBytesToCreateFrame, FrameID);
10656}
10657
10658void AArch64InstrInfo::mergeOutliningCandidateAttributes(
10659 Function &F, std::vector<outliner::Candidate> &Candidates) const {
10660 // If a bunch of candidates reach this point they must agree on their return
10661 // address signing. It is therefore enough to just consider the signing
10662 // behaviour of one of them
10663 const auto &CFn = Candidates.front().getMF()->getFunction();
10664
10665 if (CFn.hasFnAttribute("ptrauth-returns"))
10666 F.addFnAttr(CFn.getFnAttribute("ptrauth-returns"));
10667 if (CFn.hasFnAttribute("ptrauth-auth-traps"))
10668 F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps"));
10669 // Since all candidates belong to the same module, just copy the
10670 // function-level attributes of an arbitrary function.
10671 if (CFn.hasFnAttribute("sign-return-address"))
10672 F.addFnAttr(CFn.getFnAttribute("sign-return-address"));
10673 if (CFn.hasFnAttribute("sign-return-address-key"))
10674 F.addFnAttr(CFn.getFnAttribute("sign-return-address-key"));
10675
10676 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
10677}
10678
10679bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
10680 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
10681 const Function &F = MF.getFunction();
10682
10683 // Can F be deduplicated by the linker? If it can, don't outline from it.
10684 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
10685 return false;
10686
10687 // Don't outline from functions with section markings; the program could
10688 // expect that all the code is in the named section.
10689 // FIXME: Allow outlining from multiple functions with the same section
10690 // marking.
10691 if (F.hasSection())
10692 return false;
10693
10694 // Outlining from functions with redzones is unsafe since the outliner may
10695 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
10696 // outline from it.
10697 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
10698 if (!AFI || AFI->hasRedZone().value_or(true))
10699 return false;
10700
10701 // FIXME: Determine whether it is safe to outline from functions which contain
10702 // streaming-mode changes. We may need to ensure any smstart/smstop pairs are
10703 // outlined together and ensure it is safe to outline with async unwind info,
10704 // required for saving & restoring VG around calls.
10705 if (AFI->hasStreamingModeChanges())
10706 return false;
10707
10708 // FIXME: Teach the outliner to generate/handle Windows unwind info.
10710 return false;
10711
10712 // It's safe to outline from MF.
10713 return true;
10714}
10715
10717AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
10718 unsigned &Flags) const {
10720 "Must track liveness!");
10722 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
10723 Ranges;
10724 // According to the AArch64 Procedure Call Standard, the following are
10725 // undefined on entry/exit from a function call:
10726 //
10727 // * Registers x16, x17, (and thus w16, w17)
10728 // * Condition codes (and thus the NZCV register)
10729 //
10730 // If any of these registers are used inside or live across an outlined
10731 // function, then they may be modified later, either by the compiler or
10732 // some other tool (like the linker).
10733 //
10734 // To avoid outlining in these situations, partition each block into ranges
10735 // where these registers are dead. We will only outline from those ranges.
10736 LiveRegUnits LRU(getRegisterInfo());
10737 auto AreAllUnsafeRegsDead = [&LRU]() {
10738 return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
10739 LRU.available(AArch64::NZCV);
10740 };
10741
10742 // We need to know if LR is live across an outlining boundary later on in
10743 // order to decide how we'll create the outlined call, frame, etc.
10744 //
10745 // It's pretty expensive to check this for *every candidate* within a block.
10746 // That's some potentially n^2 behaviour, since in the worst case, we'd need
10747 // to compute liveness from the end of the block for O(n) candidates within
10748 // the block.
10749 //
10750 // So, to improve the average case, let's keep track of liveness from the end
10751 // of the block to the beginning of *every outlinable range*. If we know that
10752 // LR is available in every range we could outline from, then we know that
10753 // we don't need to check liveness for any candidate within that range.
10754 bool LRAvailableEverywhere = true;
10755 // Compute liveness bottom-up.
10756 LRU.addLiveOuts(MBB);
10757 // Update flags that require info about the entire MBB.
10758 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
10759 if (MI.isCall() && !MI.isTerminator())
10761 };
10762 // Range: [RangeBegin, RangeEnd)
10763 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
10764 unsigned RangeLen;
10765 auto CreateNewRangeStartingAt =
10766 [&RangeBegin, &RangeEnd,
10767 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
10768 RangeBegin = NewBegin;
10769 RangeEnd = std::next(RangeBegin);
10770 RangeLen = 0;
10771 };
10772 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
10773 // At least one unsafe register is not dead. We do not want to outline at
10774 // this point. If it is long enough to outline from and does not cross a
10775 // bundle boundary, save the range [RangeBegin, RangeEnd).
10776 if (RangeLen <= 1)
10777 return;
10778 if (!RangeBegin.isEnd() && RangeBegin->isBundledWithPred())
10779 return;
10780 if (!RangeEnd.isEnd() && RangeEnd->isBundledWithPred())
10781 return;
10782 Ranges.emplace_back(RangeBegin, RangeEnd);
10783 };
10784 // Find the first point where all unsafe registers are dead.
10785 // FIND: <safe instr> <-- end of first potential range
10786 // SKIP: <unsafe def>
10787 // SKIP: ... everything between ...
10788 // SKIP: <unsafe use>
10789 auto FirstPossibleEndPt = MBB.instr_rbegin();
10790 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
10791 if (!FirstPossibleEndPt->isDebugInstr())
10792 LRU.stepBackward(*FirstPossibleEndPt);
10793 // Update flags that impact how we outline across the entire block,
10794 // regardless of safety.
10795 UpdateWholeMBBFlags(*FirstPossibleEndPt);
10796 if (AreAllUnsafeRegsDead())
10797 break;
10798 }
10799 // If we exhausted the entire block, we have no safe ranges to outline.
10800 if (FirstPossibleEndPt == MBB.instr_rend())
10801 return Ranges;
10802 // Current range.
10803 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
10804 // StartPt points to the first place where all unsafe registers
10805 // are dead (if there is any such point). Begin partitioning the MBB into
10806 // ranges.
10807 for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {
10808 if (!MI.isDebugInstr())
10809 LRU.stepBackward(MI);
10810 UpdateWholeMBBFlags(MI);
10811 if (!AreAllUnsafeRegsDead()) {
10812 SaveRangeIfNonEmpty();
10813 CreateNewRangeStartingAt(MI.getIterator());
10814 continue;
10815 }
10816 LRAvailableEverywhere &= LRU.available(AArch64::LR);
10817 RangeBegin = MI.getIterator();
10818 ++RangeLen;
10819 }
10820 // Above loop misses the last (or only) range. If we are still safe, then
10821 // let's save the range.
10822 if (AreAllUnsafeRegsDead())
10823 SaveRangeIfNonEmpty();
10824 if (Ranges.empty())
10825 return Ranges;
10826 // We found the ranges bottom-up. Mapping expects the top-down. Reverse
10827 // the order.
10828 std::reverse(Ranges.begin(), Ranges.end());
10829 // If there is at least one outlinable range where LR is unavailable
10830 // somewhere, remember that.
10831 if (!LRAvailableEverywhere)
10833 return Ranges;
10834}
10835
10837AArch64InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
10839 unsigned Flags) const {
10840 MachineInstr &MI = *MIT;
10841
10842 // Don't outline anything used for return address signing. The outlined
10843 // function will get signed later if needed
10844 switch (MI.getOpcode()) {
10845 case AArch64::PACM:
10846 case AArch64::PACIASP:
10847 case AArch64::PACIBSP:
10848 case AArch64::PACIASPPC:
10849 case AArch64::PACIBSPPC:
10850 case AArch64::AUTIASP:
10851 case AArch64::AUTIBSP:
10852 case AArch64::AUTIASPPCi:
10853 case AArch64::AUTIASPPCr:
10854 case AArch64::AUTIBSPPCi:
10855 case AArch64::AUTIBSPPCr:
10856 case AArch64::RETAA:
10857 case AArch64::RETAB:
10858 case AArch64::RETAASPPCi:
10859 case AArch64::RETAASPPCr:
10860 case AArch64::RETABSPPCi:
10861 case AArch64::RETABSPPCr:
10862 case AArch64::EMITBKEY:
10863 case AArch64::PAUTH_PROLOGUE:
10864 case AArch64::PAUTH_EPILOGUE:
10866 }
10867
10868 // We can only outline these if we will tail call the outlined function, or
10869 // fix up the CFI offsets. Currently, CFI instructions are outlined only if
10870 // in a tail call.
10871 //
10872 // FIXME: If the proper fixups for the offset are implemented, this should be
10873 // possible.
10874 if (MI.isCFIInstruction())
10876
10877 // Is this a terminator for a basic block?
10878 if (MI.isTerminator())
10879 // TargetInstrInfo::getOutliningType has already filtered out anything
10880 // that would break this, so we can allow it here.
10882
10883 // Make sure none of the operands are un-outlinable.
10884 for (const MachineOperand &MOP : MI.operands()) {
10885 // A check preventing CFI indices was here before, but only CFI
10886 // instructions should have those.
10887 assert(!MOP.isCFIIndex());
10888
10889 // If it uses LR or W30 explicitly, then don't touch it.
10890 if (MOP.isReg() && !MOP.isImplicit() &&
10891 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
10893 }
10894
10895 // Special cases for instructions that can always be outlined, but will fail
10896 // the later tests. e.g, ADRPs, which are PC-relative use LR, but can always
10897 // be outlined because they don't require a *specific* value to be in LR.
10898 if (MI.getOpcode() == AArch64::ADRP)
10900
10901 // If MI is a call we might be able to outline it. We don't want to outline
10902 // any calls that rely on the position of items on the stack. When we outline
10903 // something containing a call, we have to emit a save and restore of LR in
10904 // the outlined function. Currently, this always happens by saving LR to the
10905 // stack. Thus, if we outline, say, half the parameters for a function call
10906 // plus the call, then we'll break the callee's expectations for the layout
10907 // of the stack.
10908 //
10909 // FIXME: Allow calls to functions which construct a stack frame, as long
10910 // as they don't access arguments on the stack.
10911 // FIXME: Figure out some way to analyze functions defined in other modules.
10912 // We should be able to compute the memory usage based on the IR calling
10913 // convention, even if we can't see the definition.
10914 if (MI.isCall()) {
10915 // Get the function associated with the call. Look at each operand and find
10916 // the one that represents the callee and get its name.
10917 const Function *Callee = nullptr;
10918 for (const MachineOperand &MOP : MI.operands()) {
10919 if (MOP.isGlobal()) {
10920 Callee = dyn_cast<Function>(MOP.getGlobal());
10921 break;
10922 }
10923 }
10924
10925 // Never outline calls to mcount. There isn't any rule that would require
10926 // this, but the Linux kernel's "ftrace" feature depends on it.
10927 if (Callee && Callee->getName() == "\01_mcount")
10929
10930 // If we don't know anything about the callee, assume it depends on the
10931 // stack layout of the caller. In that case, it's only legal to outline
10932 // as a tail-call. Explicitly list the call instructions we know about so we
10933 // don't get unexpected results with call pseudo-instructions.
10934 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
10935 if (MI.getOpcode() == AArch64::BLR ||
10936 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
10937 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
10938
10939 if (!Callee)
10940 return UnknownCallOutlineType;
10941
10942 // We have a function we have information about. Check it if it's something
10943 // can safely outline.
10944 MachineFunction *CalleeMF = MMI.getMachineFunction(*Callee);
10945
10946 // We don't know what's going on with the callee at all. Don't touch it.
10947 if (!CalleeMF)
10948 return UnknownCallOutlineType;
10949
10950 // Check if we know anything about the callee saves on the function. If we
10951 // don't, then don't touch it, since that implies that we haven't
10952 // computed anything about its stack frame yet.
10953 MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
10954 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
10955 MFI.getNumObjects() > 0)
10956 return UnknownCallOutlineType;
10957
10958 // At this point, we can say that CalleeMF ought to not pass anything on the
10959 // stack. Therefore, we can outline it.
10961 }
10962
10963 // Don't touch the link register or W30.
10964 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
10965 MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
10967
10968 // Don't outline BTI instructions, because that will prevent the outlining
10969 // site from being indirectly callable.
10970 if (hasBTISemantics(MI))
10972
10974}
10975
10976void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
10977 for (MachineInstr &MI : MBB) {
10978 const MachineOperand *Base;
10979 TypeSize Width(0, false);
10980 int64_t Offset;
10981 bool OffsetIsScalable;
10982
10983 // Is this a load or store with an immediate offset with SP as the base?
10984 if (!MI.mayLoadOrStore() ||
10985 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
10986 &RI) ||
10987 (Base->isReg() && Base->getReg() != AArch64::SP))
10988 continue;
10989
10990 // It is, so we have to fix it up.
10991 TypeSize Scale(0U, false);
10992 int64_t Dummy1, Dummy2;
10993
10994 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
10995 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
10996 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
10997 assert(Scale != 0 && "Unexpected opcode!");
10998 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
10999
11000 // We've pushed the return address to the stack, so add 16 to the offset.
11001 // This is safe, since we already checked if it would overflow when we
11002 // checked if this instruction was legal to outline.
11003 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
11004 StackOffsetOperand.setImm(NewImm);
11005 }
11006}
11007
11009 const AArch64InstrInfo *TII,
11010 bool ShouldSignReturnAddr) {
11011 if (!ShouldSignReturnAddr)
11012 return;
11013
11014 BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))
11016 TII->createPauthEpilogueInstr(MBB, DebugLoc());
11017}
11018
11019void AArch64InstrInfo::buildOutlinedFrame(
11021 const outliner::OutlinedFunction &OF) const {
11022
11023 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
11024
11025 if (OF.FrameConstructionID == MachineOutlinerTailCall)
11026 FI->setOutliningStyle("Tail Call");
11027 else if (OF.FrameConstructionID == MachineOutlinerThunk) {
11028 // For thunk outlining, rewrite the last instruction from a call to a
11029 // tail-call.
11030 MachineInstr *Call = &*--MBB.instr_end();
11031 unsigned TailOpcode;
11032 if (Call->getOpcode() == AArch64::BL) {
11033 TailOpcode = AArch64::TCRETURNdi;
11034 } else {
11035 assert(Call->getOpcode() == AArch64::BLR ||
11036 Call->getOpcode() == AArch64::BLRNoIP);
11037 TailOpcode = AArch64::TCRETURNriALL;
11038 }
11039 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
11040 .add(Call->getOperand(0))
11041 .addImm(0);
11042 MBB.insert(MBB.end(), TC);
11044
11045 FI->setOutliningStyle("Thunk");
11046 }
11047
11048 bool IsLeafFunction = true;
11049
11050 // Is there a call in the outlined range?
11051 auto IsNonTailCall = [](const MachineInstr &MI) {
11052 return MI.isCall() && !MI.isReturn();
11053 };
11054
11055 if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
11056 // Fix up the instructions in the range, since we're going to modify the
11057 // stack.
11058
11059 // Bugzilla ID: 46767
11060 // TODO: Check if fixing up twice is safe so we can outline these.
11061 assert(OF.FrameConstructionID != MachineOutlinerDefault &&
11062 "Can only fix up stack references once");
11063 fixupPostOutline(MBB);
11064
11065 IsLeafFunction = false;
11066
11067 // LR has to be a live in so that we can save it.
11068 if (!MBB.isLiveIn(AArch64::LR))
11069 MBB.addLiveIn(AArch64::LR);
11070
11073
11074 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
11075 OF.FrameConstructionID == MachineOutlinerThunk)
11076 Et = std::prev(MBB.end());
11077
11078 // Insert a save before the outlined region
11079 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
11080 .addReg(AArch64::SP, RegState::Define)
11081 .addReg(AArch64::LR)
11082 .addReg(AArch64::SP)
11083 .addImm(-16);
11084 It = MBB.insert(It, STRXpre);
11085
11086 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
11087 CFIInstBuilder CFIBuilder(MBB, It, MachineInstr::FrameSetup);
11088
11089 // Add a CFI saying the stack was moved 16 B down.
11090 CFIBuilder.buildDefCFAOffset(16);
11091
11092 // Add a CFI saying that the LR that we want to find is now 16 B higher
11093 // than before.
11094 CFIBuilder.buildOffset(AArch64::LR, -16);
11095 }
11096
11097 // Insert a restore before the terminator for the function.
11098 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
11099 .addReg(AArch64::SP, RegState::Define)
11100 .addReg(AArch64::LR, RegState::Define)
11101 .addReg(AArch64::SP)
11102 .addImm(16);
11103 Et = MBB.insert(Et, LDRXpost);
11104 }
11105
11106 auto RASignCondition = FI->getSignReturnAddressCondition();
11107 bool ShouldSignReturnAddr = AArch64FunctionInfo::shouldSignReturnAddress(
11108 RASignCondition, !IsLeafFunction);
11109
11110 // If this is a tail call outlined function, then there's already a return.
11111 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
11112 OF.FrameConstructionID == MachineOutlinerThunk) {
11113 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
11114 return;
11115 }
11116
11117 // It's not a tail call, so we have to insert the return ourselves.
11118
11119 // LR has to be a live in so that we can return to it.
11120 if (!MBB.isLiveIn(AArch64::LR))
11121 MBB.addLiveIn(AArch64::LR);
11122
11123 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
11124 .addReg(AArch64::LR);
11125 MBB.insert(MBB.end(), ret);
11126
11127 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
11128
11129 FI->setOutliningStyle("Function");
11130
11131 // Did we have to modify the stack by saving the link register?
11132 if (OF.FrameConstructionID != MachineOutlinerDefault)
11133 return;
11134
11135 // We modified the stack.
11136 // Walk over the basic block and fix up all the stack accesses.
11137 fixupPostOutline(MBB);
11138}
11139
11140MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
11143
11144 // Are we tail calling?
11145 if (C.CallConstructionID == MachineOutlinerTailCall) {
11146 // If yes, then we can just branch to the label.
11147 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
11148 .addGlobalAddress(M.getNamedValue(MF.getName()))
11149 .addImm(0));
11150 return It;
11151 }
11152
11153 // Are we saving the link register?
11154 if (C.CallConstructionID == MachineOutlinerNoLRSave ||
11155 C.CallConstructionID == MachineOutlinerThunk) {
11156 // No, so just insert the call.
11157 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
11158 .addGlobalAddress(M.getNamedValue(MF.getName())));
11159 return It;
11160 }
11161
11162 // We want to return the spot where we inserted the call.
11164
11165 // Instructions for saving and restoring LR around the call instruction we're
11166 // going to insert.
11167 MachineInstr *Save;
11168 MachineInstr *Restore;
11169 // Can we save to a register?
11170 if (C.CallConstructionID == MachineOutlinerRegSave) {
11171 // FIXME: This logic should be sunk into a target-specific interface so that
11172 // we don't have to recompute the register.
11173 Register Reg = findRegisterToSaveLRTo(C);
11174 assert(Reg && "No callee-saved register available?");
11175
11176 // LR has to be a live in so that we can save it.
11177 if (!MBB.isLiveIn(AArch64::LR))
11178 MBB.addLiveIn(AArch64::LR);
11179
11180 // Save and restore LR from Reg.
11181 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
11182 .addReg(AArch64::XZR)
11183 .addReg(AArch64::LR)
11184 .addImm(0);
11185 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
11186 .addReg(AArch64::XZR)
11187 .addReg(Reg)
11188 .addImm(0);
11189 } else {
11190 // We have the default case. Save and restore from SP.
11191 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
11192 .addReg(AArch64::SP, RegState::Define)
11193 .addReg(AArch64::LR)
11194 .addReg(AArch64::SP)
11195 .addImm(-16);
11196 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
11197 .addReg(AArch64::SP, RegState::Define)
11198 .addReg(AArch64::LR, RegState::Define)
11199 .addReg(AArch64::SP)
11200 .addImm(16);
11201 }
11202
11203 It = MBB.insert(It, Save);
11204 It++;
11205
11206 // Insert the call.
11207 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
11208 .addGlobalAddress(M.getNamedValue(MF.getName())));
11209 CallPt = It;
11210 It++;
11211
11212 It = MBB.insert(It, Restore);
11213 return CallPt;
11214}
11215
11216bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
11217 MachineFunction &MF) const {
11218 return MF.getFunction().hasMinSize();
11219}
11220
11221void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
11223 DebugLoc &DL,
11224 bool AllowSideEffects) const {
11225 const MachineFunction &MF = *MBB.getParent();
11226 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
11227 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
11228
11229 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
11230 BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);
11231 } else if (STI.isSVEorStreamingSVEAvailable()) {
11232 BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)
11233 .addImm(0)
11234 .addImm(0);
11235 } else if (STI.isNeonAvailable()) {
11236 BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)
11237 .addImm(0);
11238 } else {
11239 // This is a streaming-compatible function without SVE. We don't have full
11240 // Neon (just FPRs), so we can at most use the first 64-bit sub-register.
11241 // So given `movi v..` would be illegal use `fmov d..` instead.
11242 assert(STI.hasNEON() && "Expected to have NEON.");
11243 Register Reg64 = TRI.getSubReg(Reg, AArch64::dsub);
11244 BuildMI(MBB, Iter, DL, get(AArch64::FMOVD0), Reg64);
11245 }
11246}
11247
11248std::optional<DestSourcePair>
11250
11251 // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg
11252 // and zero immediate operands used as an alias for mov instruction.
11253 if (((MI.getOpcode() == AArch64::ORRWrs &&
11254 MI.getOperand(1).getReg() == AArch64::WZR &&
11255 MI.getOperand(3).getImm() == 0x0) ||
11256 (MI.getOpcode() == AArch64::ORRWrr &&
11257 MI.getOperand(1).getReg() == AArch64::WZR)) &&
11258 // Check that the w->w move is not a zero-extending w->x mov.
11259 (!MI.getOperand(0).getReg().isVirtual() ||
11260 MI.getOperand(0).getSubReg() == 0) &&
11261 (!MI.getOperand(0).getReg().isPhysical() ||
11262 MI.findRegisterDefOperandIdx(getXRegFromWReg(MI.getOperand(0).getReg()),
11263 /*TRI=*/nullptr) == -1))
11264 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11265
11266 if (MI.getOpcode() == AArch64::ORRXrs &&
11267 MI.getOperand(1).getReg() == AArch64::XZR &&
11268 MI.getOperand(3).getImm() == 0x0)
11269 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11270
11271 return std::nullopt;
11272}
11273
11274std::optional<DestSourcePair>
11276 if ((MI.getOpcode() == AArch64::ORRWrs &&
11277 MI.getOperand(1).getReg() == AArch64::WZR &&
11278 MI.getOperand(3).getImm() == 0x0) ||
11279 (MI.getOpcode() == AArch64::ORRWrr &&
11280 MI.getOperand(1).getReg() == AArch64::WZR))
11281 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11282 return std::nullopt;
11283}
11284
11285std::optional<RegImmPair>
11286AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
11287 int Sign = 1;
11288 int64_t Offset = 0;
11289
11290 // TODO: Handle cases where Reg is a super- or sub-register of the
11291 // destination register.
11292 const MachineOperand &Op0 = MI.getOperand(0);
11293 if (!Op0.isReg() || Reg != Op0.getReg())
11294 return std::nullopt;
11295
11296 switch (MI.getOpcode()) {
11297 default:
11298 return std::nullopt;
11299 case AArch64::SUBWri:
11300 case AArch64::SUBXri:
11301 case AArch64::SUBSWri:
11302 case AArch64::SUBSXri:
11303 Sign *= -1;
11304 [[fallthrough]];
11305 case AArch64::ADDSWri:
11306 case AArch64::ADDSXri:
11307 case AArch64::ADDWri:
11308 case AArch64::ADDXri: {
11309 // TODO: Third operand can be global address (usually some string).
11310 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
11311 !MI.getOperand(2).isImm())
11312 return std::nullopt;
11313 int Shift = MI.getOperand(3).getImm();
11314 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
11315 Offset = Sign * (MI.getOperand(2).getImm() << Shift);
11316 }
11317 }
11318 return RegImmPair{MI.getOperand(1).getReg(), Offset};
11319}
11320
11321/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
11322/// the destination register then, if possible, describe the value in terms of
11323/// the source register.
11324static std::optional<ParamLoadedValue>
11326 const TargetInstrInfo *TII,
11327 const TargetRegisterInfo *TRI) {
11328 auto DestSrc = TII->isCopyLikeInstr(MI);
11329 if (!DestSrc)
11330 return std::nullopt;
11331
11332 Register DestReg = DestSrc->Destination->getReg();
11333 Register SrcReg = DestSrc->Source->getReg();
11334
11335 if (!DestReg.isValid() || !SrcReg.isValid())
11336 return std::nullopt;
11337
11338 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
11339
11340 // If the described register is the destination, just return the source.
11341 if (DestReg == DescribedReg)
11342 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
11343
11344 // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
11345 if (MI.getOpcode() == AArch64::ORRWrs &&
11346 TRI->isSuperRegister(DestReg, DescribedReg))
11347 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
11348
11349 // We may need to describe the lower part of a ORRXrs move.
11350 if (MI.getOpcode() == AArch64::ORRXrs &&
11351 TRI->isSubRegister(DestReg, DescribedReg)) {
11352 Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
11353 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
11354 }
11355
11356 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
11357 "Unhandled ORR[XW]rs copy case");
11358
11359 return std::nullopt;
11360}
11361
11362bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
11363 // Functions cannot be split to different sections on AArch64 if they have
11364 // a red zone. This is because relaxing a cross-section branch may require
11365 // incrementing the stack pointer to spill a register, which would overwrite
11366 // the red zone.
11367 if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))
11368 return false;
11369
11371}
11372
11373bool AArch64InstrInfo::isMBBSafeToSplitToCold(
11374 const MachineBasicBlock &MBB) const {
11375 // Asm Goto blocks can contain conditional branches to goto labels, which can
11376 // get moved out of range of the branch instruction.
11377 auto isAsmGoto = [](const MachineInstr &MI) {
11378 return MI.getOpcode() == AArch64::INLINEASM_BR;
11379 };
11380 if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
11381 return false;
11382
11383 // Because jump tables are label-relative instead of table-relative, they all
11384 // must be in the same section or relocation fixup handling will fail.
11385
11386 // Check if MBB is a jump table target
11387 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
11388 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
11389 return llvm::is_contained(JTE.MBBs, &MBB);
11390 };
11391 if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))
11392 return false;
11393
11394 // Check if MBB contains a jump table lookup
11395 for (const MachineInstr &MI : MBB) {
11396 switch (MI.getOpcode()) {
11397 case TargetOpcode::G_BRJT:
11398 case AArch64::JumpTableDest32:
11399 case AArch64::JumpTableDest16:
11400 case AArch64::JumpTableDest8:
11401 return false;
11402 default:
11403 continue;
11404 }
11405 }
11406
11407 // MBB isn't a special case, so it's safe to be split to the cold section.
11408 return true;
11409}
11410
11411std::optional<ParamLoadedValue>
11412AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
11413 Register Reg) const {
11414 const MachineFunction *MF = MI.getMF();
11415 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
11416 switch (MI.getOpcode()) {
11417 case AArch64::MOVZWi:
11418 case AArch64::MOVZXi: {
11419 // MOVZWi may be used for producing zero-extended 32-bit immediates in
11420 // 64-bit parameters, so we need to consider super-registers.
11421 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
11422 return std::nullopt;
11423
11424 if (!MI.getOperand(1).isImm())
11425 return std::nullopt;
11426 int64_t Immediate = MI.getOperand(1).getImm();
11427 int Shift = MI.getOperand(2).getImm();
11428 return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
11429 nullptr);
11430 }
11431 case AArch64::ORRWrs:
11432 case AArch64::ORRXrs:
11433 return describeORRLoadedValue(MI, Reg, this, TRI);
11434 }
11435
11437}
11438
11439bool AArch64InstrInfo::isExtendLikelyToBeFolded(
11440 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
11441 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
11442 ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
11443 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
11444
11445 // Anyexts are nops.
11446 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
11447 return true;
11448
11449 Register DefReg = ExtMI.getOperand(0).getReg();
11450 if (!MRI.hasOneNonDBGUse(DefReg))
11451 return false;
11452
11453 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
11454 // addressing mode.
11455 auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
11456 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
11457}
11458
11459uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
11460 return get(Opc).TSFlags & AArch64::ElementSizeMask;
11461}
11462
11463bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
11464 return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
11465}
11466
11467bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
11468 return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
11469}
11470
11471unsigned int
11472AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
11473 return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
11474}
11475
11476bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
11477 unsigned Scale) const {
11478 if (Offset && Scale)
11479 return false;
11480
11481 // Check Reg + Imm
11482 if (!Scale) {
11483 // 9-bit signed offset
11484 if (isInt<9>(Offset))
11485 return true;
11486
11487 // 12-bit unsigned offset
11488 unsigned Shift = Log2_64(NumBytes);
11489 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
11490 // Must be a multiple of NumBytes (NumBytes is a power of 2)
11491 (Offset >> Shift) << Shift == Offset)
11492 return true;
11493 return false;
11494 }
11495
11496 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
11497 return Scale == 1 || (Scale > 0 && Scale == NumBytes);
11498}
11499
11501 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
11502 return AArch64::BLRNoIP;
11503 else
11504 return AArch64::BLR;
11505}
11506
11508 DebugLoc DL) const {
11509 MachineBasicBlock::iterator InsertPt = MBB.getFirstTerminator();
11510 auto Builder = BuildMI(MBB, InsertPt, DL, get(AArch64::PAUTH_EPILOGUE))
11512
11513 MachineFunction &MF = *MBB.getParent();
11514 const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
11515 auto &AFL = *static_cast<const AArch64FrameLowering *>(
11516 MF.getSubtarget().getFrameLowering());
11517 if (AFL.getArgumentStackToRestore(MF, MBB)) {
11518 Builder.addReg(AArch64::X17, RegState::ImplicitDefine);
11519 Builder.addReg(AArch64::X16, RegState::ImplicitDefine);
11520 if (Subtarget.hasPAuthLR())
11521 Builder.addReg(AArch64::X15, RegState::ImplicitDefine);
11522 return;
11523 }
11524
11525 if (AFI->branchProtectionPAuthLR() && !Subtarget.hasPAuthLR())
11526 Builder.addReg(AArch64::X16, RegState::ImplicitDefine);
11527}
11528
11530AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
11531 Register TargetReg, bool FrameSetup) const {
11532 assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
11533
11534 MachineBasicBlock &MBB = *MBBI->getParent();
11535 MachineFunction &MF = *MBB.getParent();
11536 const AArch64InstrInfo *TII =
11537 MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
11538 int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
11539 DebugLoc DL = MBB.findDebugLoc(MBBI);
11540
11541 MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
11542 MachineBasicBlock *LoopTestMBB =
11543 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11544 MF.insert(MBBInsertPoint, LoopTestMBB);
11545 MachineBasicBlock *LoopBodyMBB =
11546 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11547 MF.insert(MBBInsertPoint, LoopBodyMBB);
11548 MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11549 MF.insert(MBBInsertPoint, ExitMBB);
11550 MachineInstr::MIFlag Flags =
11552
11553 // LoopTest:
11554 // SUB SP, SP, #ProbeSize
11555 emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
11556 AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);
11557
11558 // CMP SP, TargetReg
11559 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
11560 AArch64::XZR)
11561 .addReg(AArch64::SP)
11562 .addReg(TargetReg)
11564 .setMIFlags(Flags);
11565
11566 // B.<Cond> LoopExit
11567 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
11569 .addMBB(ExitMBB)
11570 .setMIFlags(Flags);
11571
11572 // LDR XZR, [SP]
11573 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::LDRXui))
11574 .addDef(AArch64::XZR)
11575 .addReg(AArch64::SP)
11576 .addImm(0)
11580 Align(8)))
11581 .setMIFlags(Flags);
11582
11583 // B loop
11584 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
11585 .addMBB(LoopTestMBB)
11586 .setMIFlags(Flags);
11587
11588 // LoopExit:
11589 // MOV SP, TargetReg
11590 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
11591 .addReg(TargetReg)
11592 .addImm(0)
11594 .setMIFlags(Flags);
11595
11596 // LDR XZR, [SP]
11597 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
11598 .addReg(AArch64::XZR, RegState::Define)
11599 .addReg(AArch64::SP)
11600 .addImm(0)
11601 .setMIFlags(Flags);
11602
11603 ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
11605
11606 LoopTestMBB->addSuccessor(ExitMBB);
11607 LoopTestMBB->addSuccessor(LoopBodyMBB);
11608 LoopBodyMBB->addSuccessor(LoopTestMBB);
11609 MBB.addSuccessor(LoopTestMBB);
11610
11611 // Update liveins.
11612 if (MF.getRegInfo().reservedRegsFrozen())
11613 fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB});
11614
11615 return ExitMBB->begin();
11616}
11617
11618namespace {
11619class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
11620 MachineFunction *MF;
11621 const TargetInstrInfo *TII;
11622 const TargetRegisterInfo *TRI;
11623 MachineRegisterInfo &MRI;
11624
11625 /// The block of the loop
11626 MachineBasicBlock *LoopBB;
11627 /// The conditional branch of the loop
11628 MachineInstr *CondBranch;
11629 /// The compare instruction for loop control
11630 MachineInstr *Comp;
11631 /// The number of the operand of the loop counter value in Comp
11632 unsigned CompCounterOprNum;
11633 /// The instruction that updates the loop counter value
11634 MachineInstr *Update;
11635 /// The number of the operand of the loop counter value in Update
11636 unsigned UpdateCounterOprNum;
11637 /// The initial value of the loop counter
11638 Register Init;
11639 /// True iff Update is a predecessor of Comp
11640 bool IsUpdatePriorComp;
11641
11642 /// The normalized condition used by createTripCountGreaterCondition()
11644
11645public:
11646 AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
11647 MachineInstr *Comp, unsigned CompCounterOprNum,
11648 MachineInstr *Update, unsigned UpdateCounterOprNum,
11649 Register Init, bool IsUpdatePriorComp,
11650 const SmallVectorImpl<MachineOperand> &Cond)
11651 : MF(Comp->getParent()->getParent()),
11652 TII(MF->getSubtarget().getInstrInfo()),
11653 TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
11654 LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
11655 CompCounterOprNum(CompCounterOprNum), Update(Update),
11656 UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
11657 IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}
11658
11659 bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
11660 // Make the instructions for loop control be placed in stage 0.
11661 // The predecessors of Comp are considered by the caller.
11662 return MI == Comp;
11663 }
11664
11665 std::optional<bool> createTripCountGreaterCondition(
11666 int TC, MachineBasicBlock &MBB,
11667 SmallVectorImpl<MachineOperand> &CondParam) override {
11668 // A branch instruction will be inserted as "if (Cond) goto epilogue".
11669 // Cond is normalized for such use.
11670 // The predecessors of the branch are assumed to have already been inserted.
11671 CondParam = Cond;
11672 return {};
11673 }
11674
11675 void createRemainingIterationsGreaterCondition(
11676 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
11677 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;
11678
11679 void setPreheader(MachineBasicBlock *NewPreheader) override {}
11680
11681 void adjustTripCount(int TripCountAdjust) override {}
11682
11683 bool isMVEExpanderSupported() override { return true; }
11684};
11685} // namespace
11686
11687/// Clone an instruction from MI. The register of ReplaceOprNum-th operand
11688/// is replaced by ReplaceReg. The output register is newly created.
11689/// The other operands are unchanged from MI.
11690static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
11691 Register ReplaceReg, MachineBasicBlock &MBB,
11692 MachineBasicBlock::iterator InsertTo) {
11693 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
11694 const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
11695 MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
11696 Register Result = 0;
11697 for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
11698 if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) {
11699 Result = MRI.createVirtualRegister(
11700 MRI.getRegClass(NewMI->getOperand(0).getReg()));
11701 NewMI->getOperand(I).setReg(Result);
11702 } else if (I == ReplaceOprNum) {
11703 MRI.constrainRegClass(ReplaceReg, TII->getRegClass(NewMI->getDesc(), I));
11704 NewMI->getOperand(I).setReg(ReplaceReg);
11705 }
11706 }
11707 MBB.insert(InsertTo, NewMI);
11708 return Result;
11709}
11710
11711void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
11714 // Create and accumulate conditions for next TC iterations.
11715 // Example:
11716 // SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
11717 // # iteration of the kernel
11718 //
11719 // # insert the following instructions
11720 // cond = CSINCXr 0, 0, C, implicit $nzcv
11721 // counter = ADDXri counter, 1 # clone from this->Update
11722 // SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
11723 // cond = CSINCXr cond, cond, C, implicit $nzcv
11724 // ... (repeat TC times)
11725 // SUBSXri cond, 0, implicit-def $nzcv
11726
11727 assert(CondBranch->getOpcode() == AArch64::Bcc);
11728 // CondCode to exit the loop
11730 (AArch64CC::CondCode)CondBranch->getOperand(0).getImm();
11731 if (CondBranch->getOperand(1).getMBB() == LoopBB)
11733
11734 // Accumulate conditions to exit the loop
11735 Register AccCond = AArch64::XZR;
11736
11737 // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
11738 auto AccumulateCond = [&](Register CurCond,
11740 Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
11741 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
11742 .addReg(NewCond, RegState::Define)
11743 .addReg(CurCond)
11744 .addReg(CurCond)
11746 return NewCond;
11747 };
11748
11749 if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
11750 // Update and Comp for I==0 are already exists in MBB
11751 // (MBB is an unrolled kernel)
11752 Register Counter;
11753 for (int I = 0; I <= TC; ++I) {
11754 Register NextCounter;
11755 if (I != 0)
11756 NextCounter =
11757 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
11758
11759 AccCond = AccumulateCond(AccCond, CC);
11760
11761 if (I != TC) {
11762 if (I == 0) {
11763 if (Update != Comp && IsUpdatePriorComp) {
11764 Counter =
11765 LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
11766 NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
11767 MBB.end());
11768 } else {
11769 // can use already calculated value
11770 NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
11771 }
11772 } else if (Update != Comp) {
11773 NextCounter =
11774 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11775 }
11776 }
11777 Counter = NextCounter;
11778 }
11779 } else {
11780 Register Counter;
11781 if (LastStage0Insts.empty()) {
11782 // use initial counter value (testing if the trip count is sufficient to
11783 // be executed by pipelined code)
11784 Counter = Init;
11785 if (IsUpdatePriorComp)
11786 Counter =
11787 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11788 } else {
11789 // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
11790 Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
11791 }
11792
11793 for (int I = 0; I <= TC; ++I) {
11794 Register NextCounter;
11795 NextCounter =
11796 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
11797 AccCond = AccumulateCond(AccCond, CC);
11798 if (I != TC && Update != Comp)
11799 NextCounter =
11800 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11801 Counter = NextCounter;
11802 }
11803 }
11804
11805 // If AccCond == 0, the remainder is greater than TC.
11806 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))
11807 .addReg(AArch64::XZR, RegState::Define | RegState::Dead)
11808 .addReg(AccCond)
11809 .addImm(0)
11810 .addImm(0);
11811 Cond.clear();
11813}
11814
11815static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
11816 Register &RegMBB, Register &RegOther) {
11817 assert(Phi.getNumOperands() == 5);
11818 if (Phi.getOperand(2).getMBB() == MBB) {
11819 RegMBB = Phi.getOperand(1).getReg();
11820 RegOther = Phi.getOperand(3).getReg();
11821 } else {
11822 assert(Phi.getOperand(4).getMBB() == MBB);
11823 RegMBB = Phi.getOperand(3).getReg();
11824 RegOther = Phi.getOperand(1).getReg();
11825 }
11826}
11827
11829 if (!Reg.isVirtual())
11830 return false;
11831 const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
11832 return MRI.getVRegDef(Reg)->getParent() != BB;
11833}
11834
11835/// If Reg is an induction variable, return true and set some parameters
11836static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
11837 MachineInstr *&UpdateInst,
11838 unsigned &UpdateCounterOprNum, Register &InitReg,
11839 bool &IsUpdatePriorComp) {
11840 // Example:
11841 //
11842 // Preheader:
11843 // InitReg = ...
11844 // LoopBB:
11845 // Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
11846 // Reg = COPY Reg0 ; COPY is ignored.
11847 // Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
11848 // ; Reg is the value calculated in the previous
11849 // ; iteration, so IsUpdatePriorComp == false.
11850
11851 if (LoopBB->pred_size() != 2)
11852 return false;
11853 if (!Reg.isVirtual())
11854 return false;
11855 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
11856 UpdateInst = nullptr;
11857 UpdateCounterOprNum = 0;
11858 InitReg = 0;
11859 IsUpdatePriorComp = true;
11860 Register CurReg = Reg;
11861 while (true) {
11862 MachineInstr *Def = MRI.getVRegDef(CurReg);
11863 if (Def->getParent() != LoopBB)
11864 return false;
11865 if (Def->isCopy()) {
11866 // Ignore copy instructions unless they contain subregisters
11867 if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
11868 return false;
11869 CurReg = Def->getOperand(1).getReg();
11870 } else if (Def->isPHI()) {
11871 if (InitReg != 0)
11872 return false;
11873 if (!UpdateInst)
11874 IsUpdatePriorComp = false;
11875 extractPhiReg(*Def, LoopBB, CurReg, InitReg);
11876 } else {
11877 if (UpdateInst)
11878 return false;
11879 switch (Def->getOpcode()) {
11880 case AArch64::ADDSXri:
11881 case AArch64::ADDSWri:
11882 case AArch64::SUBSXri:
11883 case AArch64::SUBSWri:
11884 case AArch64::ADDXri:
11885 case AArch64::ADDWri:
11886 case AArch64::SUBXri:
11887 case AArch64::SUBWri:
11888 UpdateInst = Def;
11889 UpdateCounterOprNum = 1;
11890 break;
11891 case AArch64::ADDSXrr:
11892 case AArch64::ADDSWrr:
11893 case AArch64::SUBSXrr:
11894 case AArch64::SUBSWrr:
11895 case AArch64::ADDXrr:
11896 case AArch64::ADDWrr:
11897 case AArch64::SUBXrr:
11898 case AArch64::SUBWrr:
11899 UpdateInst = Def;
11900 if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))
11901 UpdateCounterOprNum = 1;
11902 else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))
11903 UpdateCounterOprNum = 2;
11904 else
11905 return false;
11906 break;
11907 default:
11908 return false;
11909 }
11910 CurReg = Def->getOperand(UpdateCounterOprNum).getReg();
11911 }
11912
11913 if (!CurReg.isVirtual())
11914 return false;
11915 if (Reg == CurReg)
11916 break;
11917 }
11918
11919 if (!UpdateInst)
11920 return false;
11921
11922 return true;
11923}
11924
11925std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
11927 // Accept loops that meet the following conditions
11928 // * The conditional branch is BCC
11929 // * The compare instruction is ADDS/SUBS/WHILEXX
11930 // * One operand of the compare is an induction variable and the other is a
11931 // loop invariant value
11932 // * The induction variable is incremented/decremented by a single instruction
11933 // * Does not contain CALL or instructions which have unmodeled side effects
11934
11935 for (MachineInstr &MI : *LoopBB)
11936 if (MI.isCall() || MI.hasUnmodeledSideEffects())
11937 // This instruction may use NZCV, which interferes with the instruction to
11938 // be inserted for loop control.
11939 return nullptr;
11940
11941 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
11943 if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
11944 return nullptr;
11945
11946 // Infinite loops are not supported
11947 if (TBB == LoopBB && FBB == LoopBB)
11948 return nullptr;
11949
11950 // Must be conditional branch
11951 if (TBB != LoopBB && FBB == nullptr)
11952 return nullptr;
11953
11954 assert((TBB == LoopBB || FBB == LoopBB) &&
11955 "The Loop must be a single-basic-block loop");
11956
11957 MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
11959
11960 if (CondBranch->getOpcode() != AArch64::Bcc)
11961 return nullptr;
11962
11963 // Normalization for createTripCountGreaterCondition()
11964 if (TBB == LoopBB)
11966
11967 MachineInstr *Comp = nullptr;
11968 unsigned CompCounterOprNum = 0;
11969 for (MachineInstr &MI : reverse(*LoopBB)) {
11970 if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
11971 // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
11972 // operands is a loop invariant value
11973
11974 switch (MI.getOpcode()) {
11975 case AArch64::SUBSXri:
11976 case AArch64::SUBSWri:
11977 case AArch64::ADDSXri:
11978 case AArch64::ADDSWri:
11979 Comp = &MI;
11980 CompCounterOprNum = 1;
11981 break;
11982 case AArch64::ADDSWrr:
11983 case AArch64::ADDSXrr:
11984 case AArch64::SUBSWrr:
11985 case AArch64::SUBSXrr:
11986 Comp = &MI;
11987 break;
11988 default:
11989 if (isWhileOpcode(MI.getOpcode())) {
11990 Comp = &MI;
11991 break;
11992 }
11993 return nullptr;
11994 }
11995
11996 if (CompCounterOprNum == 0) {
11997 if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB))
11998 CompCounterOprNum = 2;
11999 else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB))
12000 CompCounterOprNum = 1;
12001 else
12002 return nullptr;
12003 }
12004 break;
12005 }
12006 }
12007 if (!Comp)
12008 return nullptr;
12009
12010 MachineInstr *Update = nullptr;
12011 Register Init;
12012 bool IsUpdatePriorComp;
12013 unsigned UpdateCounterOprNum;
12014 if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,
12015 Update, UpdateCounterOprNum, Init, IsUpdatePriorComp))
12016 return nullptr;
12017
12018 return std::make_unique<AArch64PipelinerLoopInfo>(
12019 LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum,
12020 Init, IsUpdatePriorComp, Cond);
12021}
12022
12023/// verifyInstruction - Perform target specific instruction verification.
12024bool AArch64InstrInfo::verifyInstruction(const MachineInstr &MI,
12025 StringRef &ErrInfo) const {
12026 // Verify that immediate offsets on load/store instructions are within range.
12027 // Stack objects with an FI operand are excluded as they can be fixed up
12028 // during PEI.
12029 TypeSize Scale(0U, false), Width(0U, false);
12030 int64_t MinOffset, MaxOffset;
12031 if (getMemOpInfo(MI.getOpcode(), Scale, Width, MinOffset, MaxOffset)) {
12032 unsigned ImmIdx = getLoadStoreImmIdx(MI.getOpcode());
12033 if (MI.getOperand(ImmIdx).isImm() && !MI.getOperand(ImmIdx - 1).isFI()) {
12034 int64_t Imm = MI.getOperand(ImmIdx).getImm();
12035 if (Imm < MinOffset || Imm > MaxOffset) {
12036 ErrInfo = "Unexpected immediate on load/store instruction";
12037 return false;
12038 }
12039 }
12040 }
12041
12042 const MCInstrDesc &MCID = MI.getDesc();
12043 for (unsigned Op = 0; Op < MCID.getNumOperands(); Op++) {
12044 const MachineOperand &MO = MI.getOperand(Op);
12045 switch (MCID.operands()[Op].OperandType) {
12047 if (!MO.isImm() || MO.getImm() != 0) {
12048 ErrInfo = "OPERAND_IMPLICIT_IMM_0 should be 0";
12049 return false;
12050 }
12051 break;
12053 if (!MO.isImm() ||
12055 (AArch64_AM::getShiftValue(MO.getImm()) != 8 &&
12056 AArch64_AM::getShiftValue(MO.getImm()) != 16)) {
12057 ErrInfo = "OPERAND_SHIFT_MSL should be msl shift of 8 or 16";
12058 return false;
12059 }
12060 break;
12061 default:
12062 break;
12063 }
12064 }
12065 return true;
12066}
12067
12068#define GET_INSTRINFO_HELPERS
12069#define GET_INSTRMAP_INFO
12070#include "AArch64GenInstrInfo.inc"
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, unsigned NumRegs)
static cl::opt< unsigned > BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of Bcc instructions (DEBUG)"))
static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned MnegOpc, const TargetRegisterClass *RC)
genNeg - Helper to generate an intermediate negation of the second operand of Root
static bool isFrameStoreOpcode(int Opcode)
static cl::opt< unsigned > GatherOptSearchLimit("aarch64-search-limit", cl::Hidden, cl::init(2048), cl::desc("Restrict range of instructions to search for the " "machine-combiner gather pattern optimization"))
static bool getMaddPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Find instructions that can be turned into madd.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr)
Find a condition code used by the instruction.
static MachineInstr * genFusedMultiplyAcc(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC)
genFusedMultiplyAcc - Helper to generate fused multiply accumulate instructions.
static MachineInstr * genFusedMultiplyAccNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned IdxMulOpd, unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC)
genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate instructions with an additional...
static bool isCombineInstrCandidate64(unsigned Opc)
static bool isFrameLoadOpcode(int Opcode)
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg)
static bool areCFlagsAccessedBetweenInstrs(MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, const TargetRegisterInfo *TRI, const AccessKind AccessToCheck=AK_All)
True when condition flags are accessed (either by writing or reading) on the instruction trace starti...
static bool getFMAPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Floating-Point Support.
static bool isADDSRegImm(unsigned Opcode)
static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI)
static bool isANDOpcode(MachineInstr &MI)
static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment, StringRef RegScale={})
static unsigned sForm(MachineInstr &Instr)
Get opcode of S version of Instr.
static bool isCombineInstrSettingFlag(unsigned Opc)
static bool getFNEGPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB, MachineInstr *&UpdateInst, unsigned &UpdateCounterOprNum, Register &InitReg, bool &IsUpdatePriorComp)
If Reg is an induction variable, return true and set some parameters.
static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc)
static bool mustAvoidNeonAtMBBI(const AArch64Subtarget &Subtarget, MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
Returns true if in a streaming call site region without SME-FA64.
static bool isPostIndexLdStOpcode(unsigned Opcode)
Return true if the opcode is a post-index ld/st instruction, which really loads from base+0.
static std::optional< unsigned > getLFIInstSizeInBytes(const MachineInstr &MI)
Return the maximum number of bytes of code the specified instruction may be after LFI rewriting.
static unsigned getBranchDisplacementBits(unsigned Opc)
static cl::opt< unsigned > CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9), cl::desc("Restrict range of CB instructions (DEBUG)"))
static std::optional< ParamLoadedValue > describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI)
If the given ORR instruction is a copy, and DescribedReg overlaps with the destination register then,...
static bool getFMULPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
static void appendReadRegExpr(SmallVectorImpl< char > &Expr, unsigned RegNum)
static MachineInstr * genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, const TargetRegisterClass *RC)
genMaddR - Generate madd instruction and combine mul and add using an extra virtual register Example ...
static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum, Register ReplaceReg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertTo)
Clone an instruction from MI.
static bool scaleOffset(unsigned Opc, int64_t &Offset)
static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc)
unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale)
static MachineInstr * genFusedMultiplyIdx(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC)
genFusedMultiplyIdx - Helper to generate fused multiply accumulate instructions.
static MachineInstr * genIndexedMultiply(MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxDupOp, unsigned MulOpc, const TargetRegisterClass *RC, MachineRegisterInfo &MRI)
Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
static bool isSUBSRegImm(unsigned Opcode)
static bool UpdateOperandRegClass(MachineInstr &Instr)
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
static bool isInStreamingCallSiteRegion(MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
Returns true if the instruction at I is in a streaming call site region, within a single basic block.
static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr, int CmpValue, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > &CCUseInstrs, bool &IsInvertCC)
unsigned unscaledOffsetOpcode(unsigned Opcode)
static bool getLoadPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Search for patterns of LD instructions we can optimize.
static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI)
Check if CmpInstr can be substituted by MI.
static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC)
static bool isCombineInstrCandidateFP(const MachineInstr &Inst)
static void appendLoadRegExpr(SmallVectorImpl< char > &Expr, int64_t OffsetFromDefCFA)
static void appendConstantExpr(SmallVectorImpl< char > &Expr, int64_t Constant, dwarf::LocationAtom Operation)
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI)
Return the opcode that does not set flags when possible - otherwise return the original opcode.
static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool isCombineInstrCandidate32(unsigned Opc)
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, SmallVectorImpl< MachineOperand > &Cond)
static unsigned offsetExtendOpcode(unsigned Opcode)
MachineOutlinerMBBFlags
@ LRUnavailableSomewhere
@ UnsafeRegsDead
static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, Register DestReg, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO)
static void generateGatherLanePattern(MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned Pattern, unsigned NumLanes)
Generate optimized instruction sequence for gather load patterns to improve Memory-Level Parallelism ...
static bool getMiscPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Find other MI combine patterns.
static bool outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static const MachineInstrBuilder & AddSubReg(const MachineInstrBuilder &MIB, MCRegister Reg, unsigned SubIdx, RegState State, const TargetRegisterInfo *TRI)
static bool outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, int64_t Offset1, unsigned Opcode1, int FI2, int64_t Offset2, unsigned Opcode2)
static cl::opt< unsigned > TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"))
static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB, Register &RegMBB, Register &RegOther)
static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, unsigned Reg, const StackOffset &Offset)
static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB)
static MachineInstr * genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC, FMAInstKind kind=FMAInstKind::Default, const Register *ReplacedAddend=nullptr)
genFusedMultiply - Generate fused multiply instructions.
static bool getGatherLanePattern(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, unsigned LoadLaneOpCode, unsigned NumLanes)
Check if the given instruction forms a gather load pattern that can be optimized for better Memory-Le...
static MachineInstr * genFusedMultiplyIdxNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned IdxMulOpd, unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC)
genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate instructions with an additional...
static bool isCombineInstrCandidate(unsigned Opc)
static unsigned regOffsetOpcode(unsigned Opcode)
MachineOutlinerClass
Constants defining how certain sequences should be outlined.
@ MachineOutlinerTailCall
Emit a save, restore, call, and return.
@ MachineOutlinerRegSave
Emit a call and tail-call.
@ MachineOutlinerNoLRSave
Only emit a branch.
@ MachineOutlinerThunk
Emit a call and return.
@ MachineOutlinerDefault
static cl::opt< unsigned > BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26), cl::desc("Restrict range of B instructions (DEBUG)"))
static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB)
Check if AArch64::NZCV should be alive in successors of MBB.
static void emitFrameOffsetAdj(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, int64_t Offset, unsigned Opc, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFAOffset, StackOffset CFAOffset, unsigned FrameReg)
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize)
static cl::opt< unsigned > CBZDisplacementBits("aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"))
static void genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, unsigned IdxOpd1, DenseMap< Register, unsigned > &InstrIdxForVirtReg)
Do the following transformation A - (B + C) ==> (A - B) - C A - (B + C) ==> (A - C) - B.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, unsigned *NewReg=nullptr)
static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, const AArch64InstrInfo *TII, bool ShouldSignReturnAddr)
static MachineInstr * genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs)
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc, unsigned ZeroReg)
static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, Register SrcReg, bool IsKill, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
DXIL Forward Handle Accesses
@ Default
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Machine Check Debug Module
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
PowerPC Reduce CR logical Operation
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition Debug.h:72
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, unsigned CombineOpc=0)
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
SignReturnAddress getSignReturnAddressCondition() const
void setOutliningStyle(const std::string &Style)
std::optional< bool > hasRedZone() const
static bool shouldSignReturnAddress(SignReturnAddress Condition, bool IsLRSpilled)
static bool isHForm(const MachineInstr &MI)
Returns whether the instruction is in H form (16 bit operands)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool hasBTISemantics(const MachineInstr &MI)
Returns whether the instruction can be compatible with non-zero BTYPE.
static bool isQForm(const MachineInstr &MI)
Returns whether the instruction is in Q form (128 bit operands)
static bool getMemOpInfo(unsigned Opcode, TypeSize &Scale, TypeSize &Width, int64_t &MinOffset, int64_t &MaxOffset)
Returns true if opcode Opc is a memory operation.
static bool isTailCallReturnInst(const MachineInstr &MI)
Returns true if MI is one of the TCRETURN* instructions.
static bool isFPRCopy(const MachineInstr &MI)
Does this instruction rename an FPR without modifying bits?
MachineInstr * emitLdStWithAddr(MachineInstr &MemI, const ExtAddrMode &AM) const override
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies value from one register to an...
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
GetInstSize - Return the number of bytes of code the specified instruction may be.
static bool isZExtLoad(const MachineInstr &MI)
Returns whether the instruction is a zero-extending load.
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
static bool isGPRCopy(const MachineInstr &MI)
Does this instruction rename a GPR without modifying bits?
static unsigned convertToFlagSettingOpc(unsigned Opc)
Return the opcode that set flags when possible.
void createPauthEpilogueInstr(MachineBasicBlock &MBB, DebugLoc DL) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
bool canInsertSelect(const MachineBasicBlock &, ArrayRef< MachineOperand > Cond, Register, Register, Register, int &, int &, int &) const override
Register isLoadFromStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
Check for post-frame ptr elimination stack locations as well.
static const MachineOperand & getLdStOffsetOp(const MachineInstr &MI)
Returns the immediate offset operator of a load/store.
bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, Register &DstReg, unsigned &SubIdx) const override
static std::optional< unsigned > getUnscaledLdSt(unsigned Opc)
Returns the unscaled load/store for the scaled load/store opcode, if there is a corresponding unscale...
static bool hasUnscaledLdStOffset(unsigned Opc)
Return true if it has an unscaled load/store offset.
static const MachineOperand & getLdStAmountOp(const MachineInstr &MI)
Returns the shift amount operator of a load/store.
static bool isPreLdSt(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed load/store.
std::optional< ExtAddrMode > getAddrModeFromMemoryOp(const MachineInstr &MemI, const TargetRegisterInfo *TRI) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &MI, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const override
bool analyzeBranchPredicate(MachineBasicBlock &MBB, MachineBranchPredicate &MBP, bool AllowModify) const override
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isPairableLdStInst(const MachineInstr &MI)
Return true if pairing the given load or store may be paired with another.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSExtLoad(const MachineInstr &MI)
Returns whether the instruction is a sign-extending load.
const AArch64RegisterInfo & getRegisterInfo() const
getRegisterInfo - TargetInstrInfo is a superset of MRegister info.
static bool isPreSt(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed store.
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
AArch64InstrInfo(const AArch64Subtarget &STI)
static bool isPairedLdSt(const MachineInstr &MI)
Returns whether the instruction is a paired load/store.
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
bool getMemOperandWithOffsetWidth(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, TypeSize &Width, const TargetRegisterInfo *TRI) const
If OffsetIsScalable is set to 'true', the offset is scaled by vscale.
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isStridedAccess(const MachineInstr &MI)
Return true if the given load or store is a strided memory access.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
Detect opportunities for ldp/stp formation.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
bool isThroughputPattern(unsigned Pattern) const override
Return true when a code sequence can improve throughput.
MachineOperand & getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const
Return the immediate offset of the base register in a load/store LdSt.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg, const MachineInstr &AddrI, ExtAddrMode &AM) const override
static bool isLdStPairSuppressed(const MachineInstr &MI)
Return true if pairing the given load or store is hinted to be unprofitable.
Register isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
Check for post-frame ptr elimination stack locations as well.
std::unique_ptr< TargetInstrInfo::PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
optimizeCompareInstr - Convert the instruction supplying the argument to the comparison into one that...
static unsigned getLoadStoreImmIdx(unsigned Opc)
Returns the index for the immediate for a given instruction.
static bool isGPRZero(const MachineInstr &MI)
Does this instruction set its full destination register to zero?
void copyGPRRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, unsigned Opcode, unsigned ZeroReg, llvm::ArrayRef< unsigned > Indices) const
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
analyzeCompare - For a comparison instruction, return the source registers in SrcReg and SrcReg2,...
CombinerObjective getCombinerObjective(unsigned Pattern) const override
static bool isFpOrNEON(Register Reg)
Returns whether the physical register is FP or NEON.
bool isAsCheapAsAMove(const MachineInstr &MI) const override
std::optional< DestSourcePair > isCopyLikeInstrImpl(const MachineInstr &MI) const override
static void suppressLdStPair(MachineInstr &MI)
Hint that pairing the given load or store is unprofitable.
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static bool isPreLd(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed load.
void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, unsigned Opcode, llvm::ArrayRef< unsigned > Indices) const
bool optimizeCondBranch(MachineInstr &MI) const override
Replace csincr-branch sequence by simple conditional branch.
static int getMemScale(unsigned Opc)
Scaling factor for (scaled or unscaled) load or store.
bool isCandidateToMergeOrPair(const MachineInstr &MI) const
Return true if this is a load/store that can be potentially paired/merged.
MCInst getNop() const override
static const MachineOperand & getLdStBaseOp(const MachineInstr &MI)
Returns the base register operand of a load/store.
bool isReservedReg(const MachineFunction &MF, MCRegister Reg) const
const AArch64RegisterInfo * getRegisterInfo() const override
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
Get the first element.
Definition ArrayRef.h:144
size_t size() const
Get the array size.
Definition ArrayRef.h:141
This is an important base class in LLVM.
Definition Constant.h:43
A debug info location.
Definition DebugLoc.h:124
bool empty() const
Definition DenseMap.h:173
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:286
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:714
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:711
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
static LocationSize precise(uint64_t Value)
This class is intended to be used as a base class for asm properties and features specific to the tar...
Definition MCAsmInfo.h:66
bool usesWindowsCFI() const
Definition MCAsmInfo.h:674
static MCCFIInstruction cfiDefCfa(MCSymbol *L, unsigned Register, int64_t Offset, SMLoc Loc={})
.cfi_def_cfa defines a rule for computing CFA as: take address from Register and add Offset to it.
Definition MCDwarf.h:615
static MCCFIInstruction createOffset(MCSymbol *L, unsigned Register, int64_t Offset, SMLoc Loc={})
.cfi_offset Previous value of Register is saved at offset Offset from CFA.
Definition MCDwarf.h:657
static MCCFIInstruction cfiDefCfaOffset(MCSymbol *L, int64_t Offset, SMLoc Loc={})
.cfi_def_cfa_offset modifies a rule for computing CFA.
Definition MCDwarf.h:630
static MCCFIInstruction createEscape(MCSymbol *L, StringRef Vals, SMLoc Loc={}, StringRef Comment="")
.cfi_escape Allows the user to add arbitrary bytes to the unwind info.
Definition MCDwarf.h:727
Instances of this class represent a single low-level machine instruction.
Definition MCInst.h:188
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isValid() const
Definition MCRegister.h:84
static constexpr unsigned NoRegister
Definition MCRegister.h:60
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1567
Set of metadata that should be preserved when using BuildMI().
bool isInlineAsmBrIndirectTarget() const
Returns true if this is the indirect dest of an INLINEASM_BR.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
reverse_instr_iterator instr_rbegin()
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
reverse_instr_iterator instr_rend()
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
void setStackID(int ObjectIdx, uint8_t ID)
bool isCalleeSavedInfoValid() const
Has the callee saved info been calculated yet?
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
unsigned getNumObjects() const
Return the number of objects.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
unsigned addFrameInst(const MCCFIInstruction &Inst)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addCFIIndex(unsigned CFIIndex) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
reverse_iterator getReverse() const
Get a reverse iterator to the same node.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
bool isCall(QueryType Type=AnyInBundle) const
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
LLVM_ABI uint32_t mergeFlagsWith(const MachineInstr &Other) const
Return the MIFlags which represent both MachineInstrs.
unsigned getNumOperands() const
Returns the total number of operands.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool registerDefIsDead(Register Reg, const TargetRegisterInfo *TRI) const
Returns true if the register is dead in this machine instruction.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI bool isLoadFoldBarrier() const
Returns true if it is illegal to fold a load across this instruction.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void addRegisterDefined(Register Reg, const TargetRegisterInfo *RegInfo=nullptr)
We have determined MI defines a register.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
A description of a memory reference used in the backend.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
This class contains meta information specific to a module.
LLVM_ABI MachineFunction * getMachineFunction(const Function &F) const
Returns the MachineFunction associated to IR function F if there is one, otherwise nullptr.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineBasicBlock * getMBB() const
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void setIsKill(bool Val=true)
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
bool tracksLiveness() const
tracksLiveness - Returns true when tracking register liveness accurately.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
bool reservedRegsFrozen() const
reservedRegsFrozen - Returns true after freezeReservedRegs() was called to ensure the set of reserved...
bool def_empty(Register RegNo) const
def_empty - Return true if there are no instructions defining the specified register (it may be live-...
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
bool hasOneDef(Register RegNo) const
Return true if there is exactly one operand defining the specified register.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
MI-level patchpoint operands.
Definition StackMaps.h:77
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given patchpoint should emit.
Definition StackMaps.h:105
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
static constexpr bool isVirtualRegister(unsigned Reg)
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:66
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents a location in source code.
Definition SMLoc.h:22
bool erase(PtrType Ptr)
Remove pointer from the set.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
bool empty() const
Definition SmallSet.h:169
bool erase(const T &V)
Definition SmallSet.h:200
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
MI-level stackmap operands.
Definition StackMaps.h:36
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given stackmap should emit.
Definition StackMaps.h:51
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
int64_t getFixed() const
Returns the fixed component of the stack.
Definition TypeSize.h:46
int64_t getScalable() const
Returns the scalable component of the stack.
Definition TypeSize.h:49
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition TypeSize.h:41
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
MI-level Statepoint operands.
Definition StackMaps.h:159
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given statepoint should emit.
Definition StackMaps.h:208
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
Object returned by analyzeLoopForPipelining.
TargetInstrInfo - Interface to description of machine instruction set.
virtual void genAlternativeCodeSequence(MachineInstr &Root, unsigned Pattern, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstIdxForVirtReg) const
When getMachineCombinerPatterns() finds patterns, this function generates the instructions that could...
virtual std::optional< ParamLoadedValue > describeLoadedValue(const MachineInstr &MI, Register Reg) const
Produce the expression describing the MI loading a value into the physical register Reg.
virtual bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
virtual bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const
Test if the given instruction should be considered a scheduling boundary.
virtual CombinerObjective getCombinerObjective(unsigned Pattern) const
Return the objective of a combiner pattern.
virtual bool isFunctionSafeToSplit(const MachineFunction &MF) const
Return true if the function is a viable candidate for machine function splitting.
const MCAsmInfo & getMCAsmInfo() const
Return target specific asm information.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
Value * getOperand(unsigned i) const
Definition User.h:207
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
self_iterator getIterator()
Definition ilist_node.h:123
A raw_ostream that writes to an std::string.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_S
MO_S - Indicates that the bits of the symbol operand represented by MO_G0 etc are signed.
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_PREL
MO_PREL - Indicates that the bits of the symbol operand represented by MO_G0 etc are PC relative.
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_ARM64EC_CALLMANGLE
MO_ARM64EC_CALLMANGLE - Operand refers to the Arm64EC-mangled version of a symbol,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_TAGGED
MO_TAGGED - With MO_PAGE, indicates that the page includes a memory tag in bits 56-63.
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
unsigned getCheckerSizeInBytes(AuthCheckMethod Method)
Returns the number of bytes added by checkAuthenticatedRegister.
static uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize)
decodeLogicalImmediate - Decode a logical immediate value in the form "N:immr:imms" (where the immr a...
static unsigned getShiftValue(unsigned Imm)
getShiftValue - Extract the shift value.
static unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm)
getArithExtendImm - Encode the extend type and shift amount for an arithmetic instruction: imm: 3-bit...
static unsigned getArithShiftValue(unsigned Imm)
getArithShiftValue - get the arithmetic shift value.
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static AArch64_AM::ShiftExtendType getExtendType(unsigned Imm)
getExtendType - Extract the extend type for operands of arithmetic ops.
static AArch64_AM::ShiftExtendType getArithExtendType(unsigned Imm)
static AArch64_AM::ShiftExtendType getShiftType(unsigned Imm)
getShiftType - Extract the shift type.
static unsigned getShifterImm(AArch64_AM::ShiftExtendType ST, unsigned Imm)
getShifterImm - Encode the shift type and amount: imm: 6-bit shift amount shifter: 000 ==> lsl 001 ==...
void expandMOVAddr(unsigned Opcode, unsigned TargetFlags, bool IsTargetMachO, SmallVectorImpl< AddrInsnModel > &Insn)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
static const uint64_t InstrFlagIsWhile
static const uint64_t InstrFlagIsPTestLike
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
constexpr double e
InstrType
Represents how an instruction should be mapped by the outliner.
NodeAddr< InstrNode * > Instr
Definition RDFGraph.h:389
iterator end() const
Definition BasicBlock.h:89
LLVM_ABI Instruction & back() const
LLVM_ABI iterator begin() const
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Offset
Definition DWP.cpp:558
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
static bool isCondBranchOpcode(int Opc)
MCCFIInstruction createDefCFA(const TargetRegisterInfo &TRI, unsigned FrameReg, unsigned Reg, const StackOffset &Offset, bool LastAdjustmentWasScalable=true)
static bool isPTrueOpcode(unsigned Opc)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool succeeded(LogicalResult Result)
Utility function that returns true if the provided LogicalResult corresponds to a success value.
int isAArch64FrameOffsetLegal(const MachineInstr &MI, StackOffset &Offset, bool *OutUseUnscaledOp=nullptr, unsigned *OutUnscaledOp=nullptr, int64_t *EmittableOffset=nullptr)
Check if the Offset is a valid frame offset for MI.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
@ Renamable
Register that may be renamed.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
static bool isIndirectBranchOpcode(int Opc)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
unsigned getBLRCallOpcode(const MachineFunction &MF)
Return opcode to be used for indirect calls.
@ AArch64FrameOffsetIsLegal
Offset is legal.
@ AArch64FrameOffsetCanUpdate
Offset can apply, at least partly.
@ AArch64FrameOffsetCannotUpdate
Offset cannot apply.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition MathExtras.h:284
Op::Description Desc
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
static bool isSEHInstruction(const MachineInstr &MI)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
AArch64MachineCombinerPattern
@ MULSUBv8i16_OP2
@ FMULv4i16_indexed_OP1
@ FMLSv1i32_indexed_OP2
@ MULSUBv2i32_indexed_OP1
@ FMLAv2i32_indexed_OP2
@ MULADDv4i16_indexed_OP2
@ FMLAv1i64_indexed_OP1
@ MULSUBv16i8_OP1
@ FMLAv8i16_indexed_OP2
@ FMULv2i32_indexed_OP1
@ MULSUBv8i16_indexed_OP2
@ FMLAv1i64_indexed_OP2
@ MULSUBv4i16_indexed_OP2
@ FMLAv1i32_indexed_OP1
@ FMLAv2i64_indexed_OP2
@ FMLSv8i16_indexed_OP1
@ MULSUBv2i32_OP1
@ FMULv4i16_indexed_OP2
@ MULSUBv4i32_indexed_OP2
@ FMULv2i64_indexed_OP2
@ FMLAv4i32_indexed_OP1
@ MULADDv4i16_OP2
@ FMULv8i16_indexed_OP2
@ MULSUBv4i16_OP1
@ MULADDv4i32_OP2
@ MULADDv2i32_OP2
@ MULADDv16i8_OP2
@ FMLSv4i16_indexed_OP1
@ MULADDv16i8_OP1
@ FMLAv2i64_indexed_OP1
@ FMLAv1i32_indexed_OP2
@ FMLSv2i64_indexed_OP2
@ MULADDv2i32_OP1
@ MULADDv4i32_OP1
@ MULADDv2i32_indexed_OP1
@ MULSUBv16i8_OP2
@ MULADDv4i32_indexed_OP1
@ MULADDv2i32_indexed_OP2
@ FMLAv4i16_indexed_OP2
@ MULSUBv8i16_OP1
@ FMULv2i32_indexed_OP2
@ FMLSv2i32_indexed_OP2
@ FMLSv4i32_indexed_OP1
@ FMULv2i64_indexed_OP1
@ MULSUBv4i16_OP2
@ FMLSv4i16_indexed_OP2
@ FMLAv2i32_indexed_OP1
@ FMLSv2i32_indexed_OP1
@ FMLAv8i16_indexed_OP1
@ MULSUBv4i16_indexed_OP1
@ FMLSv4i32_indexed_OP2
@ MULADDv4i32_indexed_OP2
@ MULSUBv4i32_OP2
@ MULSUBv8i16_indexed_OP1
@ MULADDv8i16_OP2
@ MULSUBv2i32_indexed_OP2
@ FMULv4i32_indexed_OP2
@ FMLSv2i64_indexed_OP1
@ MULADDv4i16_OP1
@ FMLAv4i32_indexed_OP2
@ MULADDv8i16_indexed_OP1
@ FMULv4i32_indexed_OP1
@ FMLAv4i16_indexed_OP1
@ FMULv8i16_indexed_OP1
@ MULADDv8i16_OP1
@ MULSUBv4i32_indexed_OP1
@ MULSUBv4i32_OP1
@ FMLSv8i16_indexed_OP2
@ MULADDv8i16_indexed_OP2
@ MULSUBv2i32_OP2
@ FMLSv1i64_indexed_OP2
@ MULADDv4i16_indexed_OP1
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, StackOffset Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag=MachineInstr::NoFlags, bool SetNZCV=false, bool NeedsWinCFI=false, bool *HasWinCFI=nullptr, bool EmitCFAOffset=false, StackOffset InitialOffset={}, unsigned FrameReg=AArch64::SP)
emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg plus Offset.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr RegState getDefRegState(bool B)
CombinerObjective
The combiner's goal may differ based on which pattern it is attempting to optimize.
std::optional< UsedNZCV > examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > *CCUseInstrs=nullptr)
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
auto instructionsWithoutDebug(IterT It, IterT End, bool SkipPseudoOp=true)
Construct a range iterator which begins at It and moves forwards until End is reached,...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
static MCRegister getXRegFromWReg(MCRegister Reg)
MCCFIInstruction createCFAOffset(const TargetRegisterInfo &MRI, unsigned Reg, const StackOffset &OffsetFromDefCFA, std::optional< int64_t > IncomingVGOffsetFromDefCFA)
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
static bool isUncondBranchOpcode(int Opc)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2191
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, StackOffset &Offset, const AArch64InstrInfo *TII)
rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the FP.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
static const MachineMemOperand::Flags MOSuppressPair
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
void appendLEB128(SmallVectorImpl< U > &Buffer, T Value)
Definition LEB128.h:236
bool optimizeTerminators(MachineBasicBlock *MBB, const TargetInstrInfo &TII)
std::pair< MachineOperand, DIExpression * > ParamLoadedValue
bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, const MachineInstr &UseMI, const TargetRegisterInfo *TRI)
Return true if there is an instruction /after/ DefMI and before UseMI which either reads or clobbers ...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
static const MachineMemOperand::Flags MOStridedAccess
constexpr RegState getUndefRegState(bool B)
void fullyRecomputeLiveIns(ArrayRef< MachineBasicBlock * > MBBs)
Convenience function for recomputing live-ins for a set of MBBs until the computation converges.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to describe addressing mode similar to ExtAddrMode in CodeGenPrepare.
LLVM_ABI static const MBBSectionID ColdSectionID
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
An individual sequence of instructions to be replaced with a call to an outlined function.
MachineFunction * getMF() const
The information necessary to create an outlined function for some class of candidate.