LLVM 23.0.0git
AArch64SIMDInstrOpt.cpp
Go to the documentation of this file.
//===- AArch64SIMDInstrOpt.cpp - AArch64 SIMD instructions optimizer -----===//
2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
3// See https://llvm.org/LICENSE.txt for license information.
4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5//
6//===----------------------------------------------------------------------===//
7//
8// This file contains a pass that performs optimization on SIMD instructions
9// with high latency by splitting them into more efficient series of
10// instructions.
11//
12// 1. Rewrite certain SIMD instructions with vector element due to their
13// inefficiency on some targets.
14//
15// For example:
16// fmla v0.4s, v1.4s, v2.s[1]
17//
18// Is rewritten into:
19// dup v3.4s, v2.s[1]
20// fmla v0.4s, v1.4s, v3.4s
21//
22// 2. Rewrite interleaved memory access instructions due to their
23// inefficiency on some targets.
24//
25// For example:
26// st2 {v0.4s, v1.4s}, addr
27//
28// Is rewritten into:
29// zip1 v2.4s, v0.4s, v1.4s
30// zip2 v3.4s, v0.4s, v1.4s
31// stp q2, q3, addr
32//
33//===----------------------------------------------------------------------===//
34
#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include <map>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
55
using namespace llvm;

#define DEBUG_TYPE "aarch64-simdinstr-opt"

// Counts instructions rewritten by either sub-pass; bumped in
// optimizeVectElement() and optimizeLdStInterleave().
STATISTIC(NumModifiedInstr,
          "Number of SIMD instructions modified");

// Human-readable pass name, shared by getPassName() and INITIALIZE_PASS.
#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME \
  "AArch64 SIMD instructions optimization pass"
66namespace {
67
68struct AArch64SIMDInstrOpt : public MachineFunctionPass {
69 static char ID;
70
71 const AArch64InstrInfo *TII;
73 TargetSchedModel SchedModel;
74
75 // The two maps below are used to cache decisions instead of recomputing:
76 // This is used to cache instruction replacement decisions within function
77 // units and across function units.
78 std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable;
79 // This is used to cache the decision of whether to leave the interleaved
80 // store instructions replacement pass early or not for a particular target.
81 std::unordered_map<std::string, bool> InterlEarlyExit;
82
83 typedef enum {
84 VectorElem,
85 Interleave
86 } Subpass;
87
88 // Instruction represented by OrigOpc is replaced by instructions in ReplOpc.
89 struct InstReplInfo {
90 unsigned OrigOpc;
91 std::vector<unsigned> ReplOpc;
92 const TargetRegisterClass RC;
93 };
94
95#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
96 {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
97#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
98 OpcR7, OpcR8, OpcR9, RC) \
99 {OpcOrg, \
100 {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC}
101
102 // The Instruction Replacement Table:
103 std::vector<InstReplInfo> IRT = {
104 // ST2 instructions
105 RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
106 AArch64::STPQi, AArch64::FPR128RegClass),
107 RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
108 AArch64::STPQi, AArch64::FPR128RegClass),
109 RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
110 AArch64::STPDi, AArch64::FPR64RegClass),
111 RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
112 AArch64::STPQi, AArch64::FPR128RegClass),
113 RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
114 AArch64::STPDi, AArch64::FPR64RegClass),
115 RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
116 AArch64::STPQi, AArch64::FPR128RegClass),
117 RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
118 AArch64::STPDi, AArch64::FPR64RegClass),
119 // ST4 instructions
120 RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
121 AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
122 AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
123 AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
124 RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
125 AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
126 AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
127 AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
128 RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
129 AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
130 AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
131 AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
132 RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
133 AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
134 AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
135 AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
136 RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
137 AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
138 AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
139 AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
140 RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
141 AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
142 AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
143 AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
144 RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
145 AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
146 AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
147 AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)
148 };
149
150 // A costly instruction is replaced in this work by N efficient instructions
151 // The maximum of N is currently 10 and it is for ST4 case.
152 static const unsigned MaxNumRepl = 10;
153
154 AArch64SIMDInstrOpt() : MachineFunctionPass(ID) {}
155
156 /// Based only on latency of instructions, determine if it is cost efficient
157 /// to replace the instruction InstDesc by the instructions stored in the
158 /// array InstDescRepl.
159 /// Return true if replacement is expected to be faster.
160 bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
161 SmallVectorImpl<const MCInstrDesc*> &ReplInstrMCID);
162
163 /// Determine if we need to exit the instruction replacement optimization
164 /// passes early. This makes sure that no compile time is spent in this pass
165 /// for targets with no need for any of these optimizations.
166 /// Return true if early exit of the pass is recommended.
167 bool shouldExitEarly(MachineFunction *MF, Subpass SP);
168
169 /// Check whether an equivalent DUP instruction has already been
170 /// created or not.
171 /// Return true when the DUP instruction already exists. In this case,
172 /// DestReg will point to the destination of the already created DUP.
173 bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
174 unsigned LaneNumber, unsigned *DestReg) const;
175
176 /// Certain SIMD instructions with vector element operand are not efficient.
177 /// Rewrite them into SIMD instructions with vector operands. This rewrite
178 /// is driven by the latency of the instructions.
179 /// Return true if the SIMD instruction is modified.
180 bool optimizeVectElement(MachineInstr &MI);
181
182 /// Process The REG_SEQUENCE instruction, and extract the source
183 /// operands of the ST2/4 instruction from it.
184 /// Example of such instructions.
185 /// %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
186 /// Return true when the instruction is processed successfully.
187 bool processSeqRegInst(MachineInstr *DefiningMI, unsigned *StReg,
188 RegState *StRegKill, unsigned NumArg) const;
189
190 /// Load/Store Interleaving instructions are not always beneficial.
191 /// Replace them by ZIP instructionand classical load/store.
192 /// Return true if the SIMD instruction is modified.
193 bool optimizeLdStInterleave(MachineInstr &MI);
194
195 /// Return the number of useful source registers for this
196 /// instruction (2 for ST2 and 4 for ST4).
197 unsigned determineSrcReg(MachineInstr &MI) const;
198
199 bool runOnMachineFunction(MachineFunction &Fn) override;
200
201 StringRef getPassName() const override {
203 }
204};
205
// Pass identification, replacement for typeid.
char AArch64SIMDInstrOpt::ID = 0;

} // end anonymous namespace
209
210INITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt",
212
213/// Based only on latency of instructions, determine if it is cost efficient
214/// to replace the instruction InstDesc by the instructions stored in the
215/// array InstDescRepl.
216/// Return true if replacement is expected to be faster.
217bool AArch64SIMDInstrOpt::
218shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
219 SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) {
220 // Check if replacement decision is already available in the cached table.
221 // if so, return it.
222 std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
223 auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
224 auto It = SIMDInstrTable.find(InstID);
225 if (It != SIMDInstrTable.end())
226 return It->second;
227
228 unsigned SCIdx = InstDesc->getSchedClass();
229 const MCSchedClassDesc *SCDesc =
230 SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
231
232 // If a target does not define resources for the instructions
233 // of interest, then return false for no replacement.
234 const MCSchedClassDesc *SCDescRepl;
235 if (!SCDesc->isValid() || SCDesc->isVariant())
236 {
237 SIMDInstrTable[InstID] = false;
238 return false;
239 }
240 for (const auto *IDesc : InstDescRepl)
241 {
242 SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc(
243 IDesc->getSchedClass());
244 if (!SCDescRepl->isValid() || SCDescRepl->isVariant())
245 {
246 SIMDInstrTable[InstID] = false;
247 return false;
248 }
249 }
250
251 // Replacement cost.
252 unsigned ReplCost = 0;
253 for (const auto *IDesc :InstDescRepl)
254 ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());
255
256 if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost)
257 {
258 SIMDInstrTable[InstID] = true;
259 return true;
260 }
261 else
262 {
263 SIMDInstrTable[InstID] = false;
264 return false;
265 }
266}
267
268/// Determine if we need to exit this pass for a kind of instruction replacement
269/// early. This makes sure that no compile time is spent in this pass for
270/// targets with no need for any of these optimizations beyond performing this
271/// check.
272/// Return true if early exit of this pass for a kind of instruction
273/// replacement is recommended for a target.
274bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
275 const MCInstrDesc* OriginalMCID;
277
278 switch (SP) {
279 // For this optimization, check by comparing the latency of a representative
280 // instruction to that of the replacement instructions.
281 // TODO: check for all concerned instructions.
282 case VectorElem:
283 OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
284 ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
285 ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32));
286 if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
287 return false;
288 break;
289
290 // For this optimization, check for all concerned instructions.
291 case Interleave:
292 std::string Subtarget =
293 std::string(SchedModel.getSubtargetInfo()->getCPU());
294 auto It = InterlEarlyExit.find(Subtarget);
295 if (It != InterlEarlyExit.end())
296 return It->second;
297
298 for (auto &I : IRT) {
299 OriginalMCID = &TII->get(I.OrigOpc);
300 for (auto &Repl : I.ReplOpc)
301 ReplInstrMCID.push_back(&TII->get(Repl));
302 if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
303 InterlEarlyExit[Subtarget] = false;
304 return false;
305 }
306 ReplInstrMCID.clear();
307 }
308 InterlEarlyExit[Subtarget] = true;
309 break;
310 }
311
312 return true;
313}
314
315/// Check whether an equivalent DUP instruction has already been
316/// created or not.
317/// Return true when the DUP instruction already exists. In this case,
318/// DestReg will point to the destination of the already created DUP.
319bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
320 unsigned SrcReg, unsigned LaneNumber,
321 unsigned *DestReg) const {
322 for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
323 MII != MIE;) {
324 MII--;
325 MachineInstr *CurrentMI = &*MII;
326
327 if (CurrentMI->getOpcode() == DupOpcode &&
328 CurrentMI->getNumOperands() == 3 &&
329 CurrentMI->getOperand(1).getReg() == SrcReg &&
330 CurrentMI->getOperand(2).getImm() == LaneNumber) {
331 *DestReg = CurrentMI->getOperand(0).getReg();
332 return true;
333 }
334 }
335
336 return false;
337}
338
339/// Certain SIMD instructions with vector element operand are not efficient.
340/// Rewrite them into SIMD instructions with vector operands. This rewrite
341/// is driven by the latency of the instructions.
342/// The instruction of concerns are for the time being FMLA, FMLS, FMUL,
343/// and FMULX and hence they are hardcoded.
344///
345/// For example:
346/// fmla v0.4s, v1.4s, v2.s[1]
347///
348/// Is rewritten into
349/// dup v3.4s, v2.s[1] // DUP not necessary if redundant
350/// fmla v0.4s, v1.4s, v3.4s
351///
352/// Return true if the SIMD instruction is modified.
353bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
354 const MCInstrDesc *MulMCID, *DupMCID;
355 const TargetRegisterClass *RC = &AArch64::FPR128RegClass;
356
357 switch (MI.getOpcode()) {
358 default:
359 return false;
360
361 // 4X32 instructions
362 case AArch64::FMLAv4i32_indexed:
363 DupMCID = &TII->get(AArch64::DUPv4i32lane);
364 MulMCID = &TII->get(AArch64::FMLAv4f32);
365 break;
366 case AArch64::FMLSv4i32_indexed:
367 DupMCID = &TII->get(AArch64::DUPv4i32lane);
368 MulMCID = &TII->get(AArch64::FMLSv4f32);
369 break;
370 case AArch64::FMULXv4i32_indexed:
371 DupMCID = &TII->get(AArch64::DUPv4i32lane);
372 MulMCID = &TII->get(AArch64::FMULXv4f32);
373 break;
374 case AArch64::FMULv4i32_indexed:
375 DupMCID = &TII->get(AArch64::DUPv4i32lane);
376 MulMCID = &TII->get(AArch64::FMULv4f32);
377 break;
378
379 // 2X64 instructions
380 case AArch64::FMLAv2i64_indexed:
381 DupMCID = &TII->get(AArch64::DUPv2i64lane);
382 MulMCID = &TII->get(AArch64::FMLAv2f64);
383 break;
384 case AArch64::FMLSv2i64_indexed:
385 DupMCID = &TII->get(AArch64::DUPv2i64lane);
386 MulMCID = &TII->get(AArch64::FMLSv2f64);
387 break;
388 case AArch64::FMULXv2i64_indexed:
389 DupMCID = &TII->get(AArch64::DUPv2i64lane);
390 MulMCID = &TII->get(AArch64::FMULXv2f64);
391 break;
392 case AArch64::FMULv2i64_indexed:
393 DupMCID = &TII->get(AArch64::DUPv2i64lane);
394 MulMCID = &TII->get(AArch64::FMULv2f64);
395 break;
396
397 // 2X32 instructions
398 case AArch64::FMLAv2i32_indexed:
399 RC = &AArch64::FPR64RegClass;
400 DupMCID = &TII->get(AArch64::DUPv2i32lane);
401 MulMCID = &TII->get(AArch64::FMLAv2f32);
402 break;
403 case AArch64::FMLSv2i32_indexed:
404 RC = &AArch64::FPR64RegClass;
405 DupMCID = &TII->get(AArch64::DUPv2i32lane);
406 MulMCID = &TII->get(AArch64::FMLSv2f32);
407 break;
408 case AArch64::FMULXv2i32_indexed:
409 RC = &AArch64::FPR64RegClass;
410 DupMCID = &TII->get(AArch64::DUPv2i32lane);
411 MulMCID = &TII->get(AArch64::FMULXv2f32);
412 break;
413 case AArch64::FMULv2i32_indexed:
414 RC = &AArch64::FPR64RegClass;
415 DupMCID = &TII->get(AArch64::DUPv2i32lane);
416 MulMCID = &TII->get(AArch64::FMULv2f32);
417 break;
418 }
419
421 ReplInstrMCID.push_back(DupMCID);
422 ReplInstrMCID.push_back(MulMCID);
423 if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
424 ReplInstrMCID))
425 return false;
426
427 const DebugLoc &DL = MI.getDebugLoc();
428 MachineBasicBlock &MBB = *MI.getParent();
429 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
430
431 // Get the operands of the current SIMD arithmetic instruction.
432 Register MulDest = MI.getOperand(0).getReg();
433 Register SrcReg0 = MI.getOperand(1).getReg();
434 RegState Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
435 Register SrcReg1 = MI.getOperand(2).getReg();
436 RegState Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
437 unsigned DupDest;
438
439 // Instructions of interest have either 4 or 5 operands.
440 if (MI.getNumOperands() == 5) {
441 Register SrcReg2 = MI.getOperand(3).getReg();
442 RegState Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
443 unsigned LaneNumber = MI.getOperand(4).getImm();
444 // Create a new DUP instruction. Note that if an equivalent DUP instruction
445 // has already been created before, then use that one instead of creating
446 // a new one.
447 if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
448 DupDest = MRI.createVirtualRegister(RC);
449 BuildMI(MBB, MI, DL, *DupMCID, DupDest)
450 .addReg(SrcReg2, Src2IsKill)
451 .addImm(LaneNumber);
452 }
453 BuildMI(MBB, MI, DL, *MulMCID, MulDest)
454 .addReg(SrcReg0, Src0IsKill)
455 .addReg(SrcReg1, Src1IsKill)
456 .addReg(DupDest, Src2IsKill);
457 } else if (MI.getNumOperands() == 4) {
458 unsigned LaneNumber = MI.getOperand(3).getImm();
459 if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
460 DupDest = MRI.createVirtualRegister(RC);
461 BuildMI(MBB, MI, DL, *DupMCID, DupDest)
462 .addReg(SrcReg1, Src1IsKill)
463 .addImm(LaneNumber);
464 }
465 BuildMI(MBB, MI, DL, *MulMCID, MulDest)
466 .addReg(SrcReg0, Src0IsKill)
467 .addReg(DupDest, Src1IsKill);
468 } else {
469 return false;
470 }
471
472 ++NumModifiedInstr;
473 return true;
474}
475
476/// Load/Store Interleaving instructions are not always beneficial.
477/// Replace them by ZIP instructions and classical load/store.
478///
479/// For example:
480/// st2 {v0.4s, v1.4s}, addr
481///
482/// Is rewritten into:
483/// zip1 v2.4s, v0.4s, v1.4s
484/// zip2 v3.4s, v0.4s, v1.4s
485/// stp q2, q3, addr
486//
487/// For example:
488/// st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr
489///
490/// Is rewritten into:
491/// zip1 v4.4s, v0.4s, v2.4s
492/// zip2 v5.4s, v0.4s, v2.4s
493/// zip1 v6.4s, v1.4s, v3.4s
494/// zip2 v7.4s, v1.4s, v3.4s
495/// zip1 v8.4s, v4.4s, v6.4s
496/// zip2 v9.4s, v4.4s, v6.4s
497/// zip1 v10.4s, v5.4s, v7.4s
498/// zip2 v11.4s, v5.4s, v7.4s
499/// stp q8, q9, addr
500/// stp q10, q11, addr+32
501///
502/// Currently only instructions related to ST2 and ST4 are considered.
503/// Other may be added later.
504/// Return true if the SIMD instruction is modified.
505bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {
506
507 unsigned SeqReg, AddrReg;
508 unsigned StReg[4];
509 RegState StRegKill[4];
510 MachineInstr *DefiningMI;
511 const DebugLoc &DL = MI.getDebugLoc();
512 MachineBasicBlock &MBB = *MI.getParent();
515
516 // If current instruction matches any of the rewriting rules, then
517 // gather information about parameters of the new instructions.
518 bool Match = false;
519 for (auto &I : IRT) {
520 if (MI.getOpcode() == I.OrigOpc) {
521 SeqReg = MI.getOperand(0).getReg();
522 AddrReg = MI.getOperand(1).getReg();
523 DefiningMI = MRI->getUniqueVRegDef(SeqReg);
524 unsigned NumReg = determineSrcReg(MI);
525 if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
526 return false;
527
528 for (auto &Repl : I.ReplOpc) {
529 ReplInstrMCID.push_back(&TII->get(Repl));
530 // Generate destination registers but only for non-store instruction.
531 if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
532 ZipDest.push_back(MRI->createVirtualRegister(&I.RC));
533 }
534 Match = true;
535 break;
536 }
537 }
538
539 if (!Match)
540 return false;
541
542 // Determine if it is profitable to replace MI by the series of instructions
543 // represented in ReplInstrMCID.
544 if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
545 ReplInstrMCID))
546 return false;
547
548 // Generate the replacement instructions composed of ZIP1, ZIP2, and STP (at
549 // this point, the code generation is hardcoded and does not rely on the IRT
550 // table used above given that code generation for ST2 replacement is somewhat
551 // different than for ST4 replacement. We could have added more info into the
552 // table related to how we build new instructions but we may be adding more
553 // complexity with that).
554 switch (MI.getOpcode()) {
555 default:
556 return false;
557
558 case AArch64::ST2Twov16b:
559 case AArch64::ST2Twov8b:
560 case AArch64::ST2Twov8h:
561 case AArch64::ST2Twov4h:
562 case AArch64::ST2Twov4s:
563 case AArch64::ST2Twov2s:
564 case AArch64::ST2Twov2d:
565 // ZIP instructions
566 BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
567 .addReg(StReg[0])
568 .addReg(StReg[1]);
569 BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
570 .addReg(StReg[0], StRegKill[0])
571 .addReg(StReg[1], StRegKill[1]);
572 // STP instructions
573 BuildMI(MBB, MI, DL, *ReplInstrMCID[2])
574 .addReg(ZipDest[0])
575 .addReg(ZipDest[1])
576 .addReg(AddrReg)
577 .addImm(0);
578 break;
579
580 case AArch64::ST4Fourv16b:
581 case AArch64::ST4Fourv8b:
582 case AArch64::ST4Fourv8h:
583 case AArch64::ST4Fourv4h:
584 case AArch64::ST4Fourv4s:
585 case AArch64::ST4Fourv2s:
586 case AArch64::ST4Fourv2d:
587 // ZIP instructions
588 BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
589 .addReg(StReg[0])
590 .addReg(StReg[2]);
591 BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
592 .addReg(StReg[0], StRegKill[0])
593 .addReg(StReg[2], StRegKill[2]);
594 BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2])
595 .addReg(StReg[1])
596 .addReg(StReg[3]);
597 BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3])
598 .addReg(StReg[1], StRegKill[1])
599 .addReg(StReg[3], StRegKill[3]);
600 BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4])
601 .addReg(ZipDest[0])
602 .addReg(ZipDest[2]);
603 BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5])
604 .addReg(ZipDest[0])
605 .addReg(ZipDest[2]);
606 BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6])
607 .addReg(ZipDest[1])
608 .addReg(ZipDest[3]);
609 BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7])
610 .addReg(ZipDest[1])
611 .addReg(ZipDest[3]);
612 // stp instructions
613 BuildMI(MBB, MI, DL, *ReplInstrMCID[8])
614 .addReg(ZipDest[4])
615 .addReg(ZipDest[5])
616 .addReg(AddrReg)
617 .addImm(0);
618 BuildMI(MBB, MI, DL, *ReplInstrMCID[9])
619 .addReg(ZipDest[6])
620 .addReg(ZipDest[7])
621 .addReg(AddrReg)
622 .addImm(2);
623 break;
624 }
625
626 ++NumModifiedInstr;
627 return true;
628}
629
630/// Process The REG_SEQUENCE instruction, and extract the source
631/// operands of the ST2/4 instruction from it.
632/// Example of such instruction.
633/// %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
634/// Return true when the instruction is processed successfully.
635bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
636 unsigned *StReg,
637 RegState *StRegKill,
638 unsigned NumArg) const {
639 assert(DefiningMI != nullptr);
640 if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
641 return false;
642
643 for (unsigned i=0; i<NumArg; i++) {
644 StReg[i] = DefiningMI->getOperand(2*i+1).getReg();
645 StRegKill[i] = getKillRegState(DefiningMI->getOperand(2*i+1).isKill());
646
647 // Validation check for the other arguments.
648 if (DefiningMI->getOperand(2*i+2).isImm()) {
649 switch (DefiningMI->getOperand(2*i+2).getImm()) {
650 default:
651 return false;
652
653 case AArch64::dsub0:
654 case AArch64::dsub1:
655 case AArch64::dsub2:
656 case AArch64::dsub3:
657 case AArch64::qsub0:
658 case AArch64::qsub1:
659 case AArch64::qsub2:
660 case AArch64::qsub3:
661 break;
662 }
663 }
664 else
665 return false;
666 }
667 return true;
668}
669
670/// Return the number of useful source registers for this instruction
671/// (2 for ST2 and 4 for ST4).
672unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
673 switch (MI.getOpcode()) {
674 default:
675 llvm_unreachable("Unsupported instruction for this pass");
676
677 case AArch64::ST2Twov16b:
678 case AArch64::ST2Twov8b:
679 case AArch64::ST2Twov8h:
680 case AArch64::ST2Twov4h:
681 case AArch64::ST2Twov4s:
682 case AArch64::ST2Twov2s:
683 case AArch64::ST2Twov2d:
684 return 2;
685
686 case AArch64::ST4Fourv16b:
687 case AArch64::ST4Fourv8b:
688 case AArch64::ST4Fourv8h:
689 case AArch64::ST4Fourv4h:
690 case AArch64::ST4Fourv4s:
691 case AArch64::ST4Fourv2s:
692 case AArch64::ST4Fourv2d:
693 return 4;
694 }
695}
696
697bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
698 if (skipFunction(MF.getFunction()))
699 return false;
700
701 MRI = &MF.getRegInfo();
702 const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
703 TII = ST.getInstrInfo();
704 SchedModel.init(&ST);
705 if (!SchedModel.hasInstrSchedModel())
706 return false;
707
708 bool Changed = false;
709 for (auto OptimizationKind : {VectorElem, Interleave}) {
710 if (!shouldExitEarly(&MF, OptimizationKind)) {
711 SmallVector<MachineInstr *, 8> RemoveMIs;
712 for (MachineBasicBlock &MBB : MF) {
713 for (MachineInstr &MI : MBB) {
714 bool InstRewrite;
715 if (OptimizationKind == VectorElem)
716 InstRewrite = optimizeVectElement(MI) ;
717 else
718 InstRewrite = optimizeLdStInterleave(MI);
719 if (InstRewrite) {
720 // Add MI to the list of instructions to be removed given that it
721 // has been replaced.
722 RemoveMIs.push_back(&MI);
723 Changed = true;
724 }
725 }
726 }
727 for (MachineInstr *MI : RemoveMIs)
728 MI->eraseFromParent();
729 }
730 }
731
732 return Changed;
733}
734
735/// Returns an instance of the high cost ASIMD instruction replacement
736/// optimization pass.
738 return new AArch64SIMDInstrOpt();
739}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9, RC)
#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC)
#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition MD5.cpp:57
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
Describe properties that are true of each instruction in the target description file.
unsigned getOpcode() const
Return the opcode number for this descriptor.
StringRef getCPU() const
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
unsigned getNumOperands() const
Retuns the total number of operands.
const MachineOperand & getOperand(unsigned i) const
int64_t getImm() const
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
Provide an instruction scheduling machine model to CodeGen passes.
LLVM_ABI bool hasInstrSchedModel() const
Return true if this machine model includes an instruction-level scheduling model.
LLVM_ABI void init(const TargetSubtargetInfo *TSInfo, bool EnableSModel=true, bool EnableSItins=true)
Initialize the machine model for instruction scheduling.
const TargetSubtargetInfo * getSubtargetInfo() const
TargetSubtargetInfo getter.
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
RegState
Flags to represent properties of register accesses.
constexpr RegState getKillRegState(bool B)
FunctionPass * createAArch64SIMDInstrOptPass()
Returns an instance of the high cost ASIMD instruction replacement optimization pass.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
bool isVariant() const
Definition MCSchedule.h:144