LLVM 23.0.0git
HexagonLoopIdiomRecognition.cpp
Go to the documentation of this file.
1//===- HexagonLoopIdiomRecognition.cpp ------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "Hexagon.h"
11#include "llvm/ADT/APInt.h"
12#include "llvm/ADT/DenseMap.h"
13#include "llvm/ADT/SetVector.h"
16#include "llvm/ADT/StringRef.h"
28#include "llvm/IR/Attributes.h"
29#include "llvm/IR/BasicBlock.h"
30#include "llvm/IR/Constant.h"
31#include "llvm/IR/Constants.h"
32#include "llvm/IR/DataLayout.h"
33#include "llvm/IR/DebugLoc.h"
35#include "llvm/IR/Dominators.h"
36#include "llvm/IR/Function.h"
37#include "llvm/IR/IRBuilder.h"
38#include "llvm/IR/InstrTypes.h"
39#include "llvm/IR/Instruction.h"
41#include "llvm/IR/Intrinsics.h"
42#include "llvm/IR/IntrinsicsHexagon.h"
43#include "llvm/IR/Module.h"
44#include "llvm/IR/PassManager.h"
47#include "llvm/IR/Type.h"
48#include "llvm/IR/User.h"
49#include "llvm/IR/Value.h"
51#include "llvm/Pass.h"
55#include "llvm/Support/Debug.h"
64#include <algorithm>
65#include <array>
66#include <cassert>
67#include <cstdint>
68#include <cstdlib>
69#include <deque>
70#include <functional>
71#include <iterator>
72#include <map>
73#include <set>
74#include <utility>
75#include <vector>
76
77#define DEBUG_TYPE "hexagon-lir"
78
79using namespace llvm;
80
81static cl::opt<bool> DisableMemcpyIdiom("disable-memcpy-idiom",
82 cl::Hidden, cl::init(false),
83 cl::desc("Disable generation of memcpy in loop idiom recognition"));
84
85static cl::opt<bool> DisableMemmoveIdiom("disable-memmove-idiom",
86 cl::Hidden, cl::init(false),
87 cl::desc("Disable generation of memmove in loop idiom recognition"));
88
89static cl::opt<unsigned> RuntimeMemSizeThreshold("runtime-mem-idiom-threshold",
90 cl::Hidden, cl::init(0), cl::desc("Threshold (in bytes) for the runtime "
91 "check guarding the memmove."));
92
94 "compile-time-mem-idiom-threshold", cl::Hidden, cl::init(64),
95 cl::desc("Threshold (in bytes) to perform the transformation, if the "
96 "runtime loop count (mem transfer size) is known at compile-time."));
97
98static cl::opt<bool> OnlyNonNestedMemmove("only-nonnested-memmove-idiom",
99 cl::Hidden, cl::init(true),
100 cl::desc("Only enable generating memmove in non-nested loops"));
101
103 "disable-hexagon-volatile-memcpy", cl::Hidden, cl::init(false),
104 cl::desc("Enable Hexagon-specific memcpy for volatile destination."));
105
106static cl::opt<unsigned> SimplifyLimit("hlir-simplify-limit", cl::init(10000),
107 cl::Hidden, cl::desc("Maximum number of simplification steps in HLIR"));
108
109namespace {
110
111class HexagonLoopIdiomRecognize {
112public:
113 explicit HexagonLoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
114 LoopInfo *LF, const TargetLibraryInfo *TLI,
115 ScalarEvolution *SE,
117 : AA(AA), DT(DT), LF(LF), TLI(TLI), SE(SE), ORE(ORE) {}
118
119 bool run(Loop *L);
120
121private:
122 int getSCEVStride(const SCEVAddRecExpr *StoreEv);
123 bool isLegalStore(Loop *CurLoop, StoreInst *SI);
124 void collectStores(Loop *CurLoop, BasicBlock *BB,
125 SmallVectorImpl<StoreInst *> &Stores);
126 bool processCopyingStore(Loop *CurLoop, StoreInst *SI, const SCEV *BECount);
127 bool coverLoop(Loop *L, SmallVectorImpl<Instruction *> &Insts) const;
128 bool runOnLoopBlock(Loop *CurLoop, BasicBlock *BB, const SCEV *BECount,
129 SmallVectorImpl<BasicBlock *> &ExitBlocks);
130 bool runOnCountableLoop(Loop *L);
131
132 AliasAnalysis *AA;
133 const DataLayout *DL;
134 DominatorTree *DT;
135 LoopInfo *LF;
136 const TargetLibraryInfo *TLI;
137 ScalarEvolution *SE;
138 OptimizationRemarkEmitter &ORE;
139 bool HasMemcpy, HasMemmove;
140};
141
142class HexagonLoopIdiomRecognizeLegacyPass : public LoopPass {
143public:
144 static char ID;
145
146 explicit HexagonLoopIdiomRecognizeLegacyPass() : LoopPass(ID) {}
147
148 StringRef getPassName() const override {
149 return "Recognize Hexagon-specific loop idioms";
150 }
151
152 void getAnalysisUsage(AnalysisUsage &AU) const override {
153 AU.addRequired<LoopInfoWrapperPass>();
156 AU.addRequired<AAResultsWrapperPass>();
157 AU.addRequired<ScalarEvolutionWrapperPass>();
158 AU.addRequired<DominatorTreeWrapperPass>();
159 AU.addRequired<TargetLibraryInfoWrapperPass>();
160 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
161 AU.addPreserved<TargetLibraryInfoWrapperPass>();
162 }
163
164 bool runOnLoop(Loop *L, LPPassManager &LPM) override;
165};
166
167struct Simplifier {
168 struct Rule {
169 using FuncType = std::function<Value *(Instruction *, LLVMContext &)>;
170 Rule(StringRef N, FuncType F) : Name(N), Fn(F) {}
171 StringRef Name; // For debugging.
172 FuncType Fn;
173 };
174
175 void addRule(StringRef N, const Rule::FuncType &F) {
176 Rules.push_back(Rule(N, F));
177 }
178
179private:
180 struct WorkListType {
181 WorkListType() = default;
182
183 void push_back(Value *V) {
184 // Do not push back duplicates.
185 if (S.insert(V).second)
186 Q.push_back(V);
187 }
188
189 Value *pop_front_val() {
190 Value *V = Q.front();
191 Q.pop_front();
192 S.erase(V);
193 return V;
194 }
195
196 bool empty() const { return Q.empty(); }
197
198 private:
199 std::deque<Value *> Q;
200 std::set<Value *> S;
201 };
202
203 using ValueSetType = std::set<Value *>;
204
205 std::vector<Rule> Rules;
206
207public:
208 struct Context {
209 using ValueMapType = DenseMap<Value *, Value *>;
210
211 Value *Root;
212 ValueSetType Used; // The set of all cloned values used by Root.
213 ValueSetType Clones; // The set of all cloned values.
214 LLVMContext &Ctx;
215
216 Context(Instruction *Exp)
217 : Ctx(Exp->getParent()->getParent()->getContext()) {
218 initialize(Exp);
219 }
220
221 ~Context() { cleanup(); }
222
223 void print(raw_ostream &OS, const Value *V) const;
224 Value *materialize(BasicBlock *B, BasicBlock::iterator At);
225
226 private:
227 friend struct Simplifier;
228
229 void initialize(Instruction *Exp);
230 void cleanup();
231
232 template <typename FuncT> void traverse(Value *V, FuncT F);
233 void record(Value *V);
234 void use(Value *V);
235 void unuse(Value *V);
236
237 bool equal(const Instruction *I, const Instruction *J) const;
238 Value *find(Value *Tree, Value *Sub) const;
239 Value *subst(Value *Tree, Value *OldV, Value *NewV);
240 void replace(Value *OldV, Value *NewV);
241 void link(Instruction *I, BasicBlock *B, BasicBlock::iterator At);
242 };
243
244 Value *simplify(Context &C);
245};
246
247 struct PE {
248 PE(const Simplifier::Context &c, Value *v = nullptr) : C(c), V(v) {}
249
250 const Simplifier::Context &C;
251 const Value *V;
252 };
253
255 raw_ostream &operator<<(raw_ostream &OS, const PE &P) {
256 P.C.print(OS, P.V ? P.V : P.C.Root);
257 return OS;
258 }
259
260} // end anonymous namespace
261
262char HexagonLoopIdiomRecognizeLegacyPass::ID = 0;
263
264INITIALIZE_PASS_BEGIN(HexagonLoopIdiomRecognizeLegacyPass, "hexagon-loop-idiom",
265 "Recognize Hexagon-specific loop idioms", false, false)
267INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
268INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
274INITIALIZE_PASS_END(HexagonLoopIdiomRecognizeLegacyPass, "hexagon-loop-idiom",
275 "Recognize Hexagon-specific loop idioms", false, false)
276
277template <typename FuncT>
278void Simplifier::Context::traverse(Value *V, FuncT F) {
279 WorkListType Q;
280 Q.push_back(V);
281
282 while (!Q.empty()) {
283 Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
284 if (!U || U->getParent())
285 continue;
286 if (!F(U))
287 continue;
288 for (Value *Op : U->operands())
289 Q.push_back(Op);
290 }
291}
292
293void Simplifier::Context::print(raw_ostream &OS, const Value *V) const {
294 const auto *U = dyn_cast<const Instruction>(V);
295 if (!U) {
296 OS << V << '(' << *V << ')';
297 return;
298 }
299
300 if (U->getParent()) {
301 OS << U << '(';
302 U->printAsOperand(OS, true);
303 OS << ')';
304 return;
305 }
306
307 unsigned N = U->getNumOperands();
308 if (N != 0)
309 OS << U << '(';
310 OS << U->getOpcodeName();
311 for (const Value *Op : U->operands()) {
312 OS << ' ';
313 print(OS, Op);
314 }
315 if (N != 0)
316 OS << ')';
317}
318
319void Simplifier::Context::initialize(Instruction *Exp) {
320 // Perform a deep clone of the expression, set Root to the root
321 // of the clone, and build a map from the cloned values to the
322 // original ones.
323 ValueMapType M;
324 BasicBlock *Block = Exp->getParent();
325 WorkListType Q;
326 Q.push_back(Exp);
327
328 while (!Q.empty()) {
329 Value *V = Q.pop_front_val();
330 if (M.contains(V))
331 continue;
332 if (Instruction *U = dyn_cast<Instruction>(V)) {
333 if (isa<PHINode>(U) || U->getParent() != Block)
334 continue;
335 for (Value *Op : U->operands())
336 Q.push_back(Op);
337 M.insert({U, U->clone()});
338 }
339 }
340
341 for (std::pair<Value*,Value*> P : M) {
342 Instruction *U = cast<Instruction>(P.second);
343 for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
344 auto F = M.find(U->getOperand(i));
345 if (F != M.end())
346 U->setOperand(i, F->second);
347 }
348 }
349
350 auto R = M.find(Exp);
351 assert(R != M.end());
352 Root = R->second;
353
354 record(Root);
355 use(Root);
356}
357
358void Simplifier::Context::record(Value *V) {
359 auto Record = [this](Instruction *U) -> bool {
360 Clones.insert(U);
361 return true;
362 };
363 traverse(V, Record);
364}
365
366void Simplifier::Context::use(Value *V) {
367 auto Use = [this](Instruction *U) -> bool {
368 Used.insert(U);
369 return true;
370 };
371 traverse(V, Use);
372}
373
374void Simplifier::Context::unuse(Value *V) {
375 if (!isa<Instruction>(V) || cast<Instruction>(V)->getParent() != nullptr)
376 return;
377
378 auto Unuse = [this](Instruction *U) -> bool {
379 if (!U->use_empty())
380 return false;
381 Used.erase(U);
382 return true;
383 };
384 traverse(V, Unuse);
385}
386
387Value *Simplifier::Context::subst(Value *Tree, Value *OldV, Value *NewV) {
388 if (Tree == OldV)
389 return NewV;
390 if (OldV == NewV)
391 return Tree;
392
393 WorkListType Q;
394 Q.push_back(Tree);
395 while (!Q.empty()) {
396 Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
397 // If U is not an instruction, or it's not a clone, skip it.
398 if (!U || U->getParent())
399 continue;
400 for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
401 Value *Op = U->getOperand(i);
402 if (Op == OldV) {
403 U->setOperand(i, NewV);
404 unuse(OldV);
405 } else {
406 Q.push_back(Op);
407 }
408 }
409 }
410 return Tree;
411}
412
413void Simplifier::Context::replace(Value *OldV, Value *NewV) {
414 if (Root == OldV) {
415 Root = NewV;
416 use(Root);
417 return;
418 }
419
420 // NewV may be a complex tree that has just been created by one of the
421 // transformation rules. We need to make sure that it is commoned with
422 // the existing Root to the maximum extent possible.
423 // Identify all subtrees of NewV (including NewV itself) that have
424 // equivalent counterparts in Root, and replace those subtrees with
425 // these counterparts.
426 WorkListType Q;
427 Q.push_back(NewV);
428 while (!Q.empty()) {
429 Value *V = Q.pop_front_val();
431 if (!U || U->getParent())
432 continue;
433 if (Value *DupV = find(Root, V)) {
434 if (DupV != V)
435 NewV = subst(NewV, V, DupV);
436 } else {
437 for (Value *Op : U->operands())
438 Q.push_back(Op);
439 }
440 }
441
442 // Now, simply replace OldV with NewV in Root.
443 Root = subst(Root, OldV, NewV);
444 use(Root);
445}
446
447void Simplifier::Context::cleanup() {
448 for (Value *V : Clones) {
450 if (!U->getParent())
451 U->dropAllReferences();
452 }
453
454 for (Value *V : Clones) {
456 if (!U->getParent())
457 U->deleteValue();
458 }
459}
460
461bool Simplifier::Context::equal(const Instruction *I,
462 const Instruction *J) const {
463 if (I == J)
464 return true;
465 if (!I->isSameOperationAs(J))
466 return false;
467 if (isa<PHINode>(I))
468 return I->isIdenticalTo(J);
469
470 for (unsigned i = 0, n = I->getNumOperands(); i != n; ++i) {
471 Value *OpI = I->getOperand(i), *OpJ = J->getOperand(i);
472 if (OpI == OpJ)
473 continue;
474 auto *InI = dyn_cast<const Instruction>(OpI);
475 auto *InJ = dyn_cast<const Instruction>(OpJ);
476 if (InI && InJ) {
477 if (!equal(InI, InJ))
478 return false;
479 } else if (InI != InJ || !InI)
480 return false;
481 }
482 return true;
483}
484
485Value *Simplifier::Context::find(Value *Tree, Value *Sub) const {
487 WorkListType Q;
488 Q.push_back(Tree);
489
490 while (!Q.empty()) {
491 Value *V = Q.pop_front_val();
492 if (V == Sub)
493 return V;
495 if (!U || U->getParent())
496 continue;
497 if (SubI && equal(SubI, U))
498 return U;
499 assert(!isa<PHINode>(U));
500 for (Value *Op : U->operands())
501 Q.push_back(Op);
502 }
503 return nullptr;
504}
505
506void Simplifier::Context::link(Instruction *I, BasicBlock *B,
508 if (I->getParent())
509 return;
510
511 for (Value *Op : I->operands()) {
512 if (Instruction *OpI = dyn_cast<Instruction>(Op))
513 link(OpI, B, At);
514 }
515
516 I->insertInto(B, At);
517}
518
519Value *Simplifier::Context::materialize(BasicBlock *B,
521 if (Instruction *RootI = dyn_cast<Instruction>(Root))
522 link(RootI, B, At);
523 return Root;
524}
525
526Value *Simplifier::simplify(Context &C) {
527 WorkListType Q;
528 Q.push_back(C.Root);
529 unsigned Count = 0;
530 const unsigned Limit = SimplifyLimit;
531
532 while (!Q.empty()) {
533 if (Count++ >= Limit)
534 break;
535 Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
536 if (!U || U->getParent() || !C.Used.count(U))
537 continue;
538 bool Changed = false;
539 for (Rule &R : Rules) {
540 Value *W = R.Fn(U, C.Ctx);
541 if (!W)
542 continue;
543 Changed = true;
544 C.record(W);
545 C.replace(U, W);
546 Q.push_back(C.Root);
547 break;
548 }
549 if (!Changed) {
550 for (Value *Op : U->operands())
551 Q.push_back(Op);
552 }
553 }
554 return Count < Limit ? C.Root : nullptr;
555}
556
557//===----------------------------------------------------------------------===//
558//
559// Implementation of PolynomialMultiplyRecognize
560//
561//===----------------------------------------------------------------------===//
562
563namespace {
564
565 class PolynomialMultiplyRecognize {
566 public:
567 explicit PolynomialMultiplyRecognize(Loop *loop, const DataLayout &dl,
568 const DominatorTree &dt, const TargetLibraryInfo &tli,
569 ScalarEvolution &se)
570 : CurLoop(loop), DL(dl), DT(dt), TLI(tli), SE(se) {}
571
572 bool recognize();
573
574 private:
575 using ValueSeq = SetVector<Value *>;
576
577 IntegerType *getPmpyType() const {
578 LLVMContext &Ctx = CurLoop->getHeader()->getParent()->getContext();
579 return IntegerType::get(Ctx, 32);
580 }
581
582 bool isPromotableTo(Value *V, IntegerType *Ty);
583 void promoteTo(Instruction *In, IntegerType *DestTy, BasicBlock *LoopB);
584 bool promoteTypes(BasicBlock *LoopB, BasicBlock *ExitB);
585
586 Value *getCountIV(BasicBlock *BB);
587 bool findCycle(Value *Out, Value *In, ValueSeq &Cycle);
588 void classifyCycle(Instruction *DivI, ValueSeq &Cycle, ValueSeq &Early,
589 ValueSeq &Late);
590 bool classifyInst(Instruction *UseI, ValueSeq &Early, ValueSeq &Late);
591 bool commutesWithShift(Instruction *I);
592 bool highBitsAreZero(Value *V, unsigned IterCount);
593 bool keepsHighBitsZero(Value *V, unsigned IterCount);
594 bool isOperandShifted(Instruction *I, Value *Op);
595 bool convertShiftsToLeft(BasicBlock *LoopB, BasicBlock *ExitB,
596 unsigned IterCount);
597 void cleanupLoopBody(BasicBlock *LoopB);
598
599 struct ParsedValues {
600 ParsedValues() = default;
601
602 Value *M = nullptr;
603 Value *P = nullptr;
604 Value *Q = nullptr;
605 Value *R = nullptr;
606 Value *X = nullptr;
607 Instruction *Res = nullptr;
608 unsigned IterCount = 0;
609 bool Left = false;
610 bool Inv = false;
611 };
612
613 bool matchLeftShift(SelectInst *SelI, Value *CIV, ParsedValues &PV);
614 bool matchRightShift(SelectInst *SelI, ParsedValues &PV);
615 bool scanSelect(SelectInst *SI, BasicBlock *LoopB, BasicBlock *PrehB,
616 Value *CIV, ParsedValues &PV, bool PreScan);
617 unsigned getInverseMxN(unsigned QP);
618 Value *generate(BasicBlock::iterator At, ParsedValues &PV);
619
620 void setupPreSimplifier(Simplifier &S);
621 void setupPostSimplifier(Simplifier &S);
622
623 Loop *CurLoop;
624 const DataLayout &DL;
625 const DominatorTree &DT;
626 const TargetLibraryInfo &TLI;
627 ScalarEvolution &SE;
628 };
629
630} // end anonymous namespace
631
632Value *PolynomialMultiplyRecognize::getCountIV(BasicBlock *BB) {
633 pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
634 if (std::distance(PI, PE) != 2)
635 return nullptr;
636 BasicBlock *PB = (*PI == BB) ? *std::next(PI) : *PI;
637
638 for (auto I = BB->begin(), E = BB->end(); I != E && isa<PHINode>(I); ++I) {
639 auto *PN = cast<PHINode>(I);
640 Value *InitV = PN->getIncomingValueForBlock(PB);
641 if (!isa<ConstantInt>(InitV) || !cast<ConstantInt>(InitV)->isZero())
642 continue;
643 Value *IterV = PN->getIncomingValueForBlock(BB);
644 auto *BO = dyn_cast<BinaryOperator>(IterV);
645 if (!BO)
646 continue;
647 if (BO->getOpcode() != Instruction::Add)
648 continue;
649 Value *IncV = nullptr;
650 if (BO->getOperand(0) == PN)
651 IncV = BO->getOperand(1);
652 else if (BO->getOperand(1) == PN)
653 IncV = BO->getOperand(0);
654 if (IncV == nullptr)
655 continue;
656
657 if (auto *T = dyn_cast<ConstantInt>(IncV))
658 if (T->isOne())
659 return PN;
660 }
661 return nullptr;
662}
663
665 for (auto UI = I->user_begin(), UE = I->user_end(); UI != UE;) {
666 Use &TheUse = UI.getUse();
667 ++UI;
668 if (auto *II = dyn_cast<Instruction>(TheUse.getUser()))
669 if (BB == II->getParent())
670 II->replaceUsesOfWith(I, J);
671 }
672}
673
674bool PolynomialMultiplyRecognize::matchLeftShift(SelectInst *SelI,
675 Value *CIV, ParsedValues &PV) {
676 // Match the following:
677 // select (X & (1 << i)) != 0 ? R ^ (Q << i) : R
678 // select (X & (1 << i)) == 0 ? R : R ^ (Q << i)
679 // The condition may also check for equality with the masked value, i.e
680 // select (X & (1 << i)) == (1 << i) ? R ^ (Q << i) : R
681 // select (X & (1 << i)) != (1 << i) ? R : R ^ (Q << i);
682
683 Value *CondV = SelI->getCondition();
684 Value *TrueV = SelI->getTrueValue();
685 Value *FalseV = SelI->getFalseValue();
686
687 using namespace PatternMatch;
688
689 CmpPredicate P;
690 Value *A = nullptr, *B = nullptr, *C = nullptr;
691
692 if (!match(CondV, m_ICmp(P, m_And(m_Value(A), m_Value(B)), m_Value(C))) &&
693 !match(CondV, m_ICmp(P, m_Value(C), m_And(m_Value(A), m_Value(B)))))
694 return false;
696 return false;
697 // Matched: select (A & B) == C ? ... : ...
698 // select (A & B) != C ? ... : ...
699
700 Value *X = nullptr, *Sh1 = nullptr;
701 // Check (A & B) for (X & (1 << i)):
702 if (match(A, m_Shl(m_One(), m_Specific(CIV)))) {
703 Sh1 = A;
704 X = B;
705 } else if (match(B, m_Shl(m_One(), m_Specific(CIV)))) {
706 Sh1 = B;
707 X = A;
708 } else {
709 // TODO: Could also check for an induction variable containing single
710 // bit shifted left by 1 in each iteration.
711 return false;
712 }
713
714 bool TrueIfZero;
715
716 // Check C against the possible values for comparison: 0 and (1 << i):
717 if (match(C, m_Zero()))
718 TrueIfZero = (P == CmpInst::ICMP_EQ);
719 else if (C == Sh1)
720 TrueIfZero = (P == CmpInst::ICMP_NE);
721 else
722 return false;
723
724 // So far, matched:
725 // select (X & (1 << i)) ? ... : ...
726 // including variations of the check against zero/non-zero value.
727
728 Value *ShouldSameV = nullptr, *ShouldXoredV = nullptr;
729 if (TrueIfZero) {
730 ShouldSameV = TrueV;
731 ShouldXoredV = FalseV;
732 } else {
733 ShouldSameV = FalseV;
734 ShouldXoredV = TrueV;
735 }
736
737 Value *Q = nullptr, *R = nullptr, *Y = nullptr, *Z = nullptr;
738 Value *T = nullptr;
739 if (match(ShouldXoredV, m_Xor(m_Value(Y), m_Value(Z)))) {
740 // Matched: select +++ ? ... : Y ^ Z
741 // select +++ ? Y ^ Z : ...
742 // where +++ denotes previously checked matches.
743 if (ShouldSameV == Y)
744 T = Z;
745 else if (ShouldSameV == Z)
746 T = Y;
747 else
748 return false;
749 R = ShouldSameV;
750 // Matched: select +++ ? R : R ^ T
751 // select +++ ? R ^ T : R
752 // depending on TrueIfZero.
753
754 } else if (match(ShouldSameV, m_Zero())) {
755 // Matched: select +++ ? 0 : ...
756 // select +++ ? ... : 0
757 if (!SelI->hasOneUse())
758 return false;
759 T = ShouldXoredV;
760 // Matched: select +++ ? 0 : T
761 // select +++ ? T : 0
762
763 Value *U = *SelI->user_begin();
764 if (!match(U, m_c_Xor(m_Specific(SelI), m_Value(R))))
765 return false;
766 // Matched: xor (select +++ ? 0 : T), R
767 // xor (select +++ ? T : 0), R
768 } else
769 return false;
770
771 // The xor input value T is isolated into its own match so that it could
772 // be checked against an induction variable containing a shifted bit
773 // (todo).
774 // For now, check against (Q << i).
775 if (!match(T, m_Shl(m_Value(Q), m_Specific(CIV))) &&
776 !match(T, m_Shl(m_ZExt(m_Value(Q)), m_ZExt(m_Specific(CIV)))))
777 return false;
778 // Matched: select +++ ? R : R ^ (Q << i)
779 // select +++ ? R ^ (Q << i) : R
780
781 PV.X = X;
782 PV.Q = Q;
783 PV.R = R;
784 PV.Left = true;
785 return true;
786}
787
788bool PolynomialMultiplyRecognize::matchRightShift(SelectInst *SelI,
789 ParsedValues &PV) {
790 // Match the following:
791 // select (X & 1) != 0 ? (R >> 1) ^ Q : (R >> 1)
792 // select (X & 1) == 0 ? (R >> 1) : (R >> 1) ^ Q
793 // The condition may also check for equality with the masked value, i.e
794 // select (X & 1) == 1 ? (R >> 1) ^ Q : (R >> 1)
795 // select (X & 1) != 1 ? (R >> 1) : (R >> 1) ^ Q
796
797 Value *CondV = SelI->getCondition();
798 Value *TrueV = SelI->getTrueValue();
799 Value *FalseV = SelI->getFalseValue();
800
801 using namespace PatternMatch;
802
803 Value *C = nullptr;
804 CmpPredicate P;
805 bool TrueIfZero;
806
807 if (match(CondV, m_c_ICmp(P, m_Value(C), m_Zero()))) {
809 return false;
810 // Matched: select C == 0 ? ... : ...
811 // select C != 0 ? ... : ...
812 TrueIfZero = (P == CmpInst::ICMP_EQ);
813 } else if (match(CondV, m_c_ICmp(P, m_Value(C), m_One()))) {
815 return false;
816 // Matched: select C == 1 ? ... : ...
817 // select C != 1 ? ... : ...
818 TrueIfZero = (P == CmpInst::ICMP_NE);
819 } else
820 return false;
821
822 Value *X = nullptr;
823 if (!match(C, m_And(m_Value(X), m_One())))
824 return false;
825 // Matched: select (X & 1) == +++ ? ... : ...
826 // select (X & 1) != +++ ? ... : ...
827
828 Value *R = nullptr, *Q = nullptr;
829 if (TrueIfZero) {
830 // The select's condition is true if the tested bit is 0.
831 // TrueV must be the shift, FalseV must be the xor.
832 if (!match(TrueV, m_LShr(m_Value(R), m_One())))
833 return false;
834 // Matched: select +++ ? (R >> 1) : ...
835 if (!match(FalseV, m_c_Xor(m_Specific(TrueV), m_Value(Q))))
836 return false;
837 // Matched: select +++ ? (R >> 1) : (R >> 1) ^ Q
838 // with commuting ^.
839 } else {
840 // The select's condition is true if the tested bit is 1.
841 // TrueV must be the xor, FalseV must be the shift.
842 if (!match(FalseV, m_LShr(m_Value(R), m_One())))
843 return false;
844 // Matched: select +++ ? ... : (R >> 1)
845 if (!match(TrueV, m_c_Xor(m_Specific(FalseV), m_Value(Q))))
846 return false;
847 // Matched: select +++ ? (R >> 1) ^ Q : (R >> 1)
848 // with commuting ^.
849 }
850
851 PV.X = X;
852 PV.Q = Q;
853 PV.R = R;
854 PV.Left = false;
855 return true;
856}
857
858bool PolynomialMultiplyRecognize::scanSelect(SelectInst *SelI,
859 BasicBlock *LoopB, BasicBlock *PrehB, Value *CIV, ParsedValues &PV,
860 bool PreScan) {
861 using namespace PatternMatch;
862
863 // The basic pattern for R = P.Q is:
864 // for i = 0..31
865 // R = phi (0, R')
866 // if (P & (1 << i)) ; test-bit(P, i)
867 // R' = R ^ (Q << i)
868 //
869 // Similarly, the basic pattern for R = (P/Q).Q - P
870 // for i = 0..31
871 // R = phi(P, R')
872 // if (R & (1 << i))
873 // R' = R ^ (Q << i)
874
875 // There exist idioms, where instead of Q being shifted left, P is shifted
876 // right. This produces a result that is shifted right by 32 bits (the
877 // non-shifted result is 64-bit).
878 //
879 // For R = P.Q, this would be:
880 // for i = 0..31
881 // R = phi (0, R')
882 // if ((P >> i) & 1)
883 // R' = (R >> 1) ^ Q ; R is cycled through the loop, so it must
884 // else ; be shifted by 1, not i.
885 // R' = R >> 1
886 //
887 // And for the inverse:
888 // for i = 0..31
889 // R = phi (P, R')
890 // if (R & 1)
891 // R' = (R >> 1) ^ Q
892 // else
893 // R' = R >> 1
894
895 // The left-shifting idioms share the same pattern:
896 // select (X & (1 << i)) ? R ^ (Q << i) : R
897 // Similarly for right-shifting idioms:
898 // select (X & 1) ? (R >> 1) ^ Q
899
900 if (matchLeftShift(SelI, CIV, PV)) {
901 // If this is a pre-scan, getting this far is sufficient.
902 if (PreScan)
903 return true;
904
905 // Need to make sure that the SelI goes back into R.
906 auto *RPhi = dyn_cast<PHINode>(PV.R);
907 if (!RPhi)
908 return false;
909 if (SelI != RPhi->getIncomingValueForBlock(LoopB))
910 return false;
911 PV.Res = SelI;
912
913 // If X is loop invariant, it must be the input polynomial, and the
914 // idiom is the basic polynomial multiply.
915 if (CurLoop->isLoopInvariant(PV.X)) {
916 PV.P = PV.X;
917 PV.Inv = false;
918 } else {
919 // X is not loop invariant. If X == R, this is the inverse pmpy.
920 // Otherwise, check for an xor with an invariant value. If the
921 // variable argument to the xor is R, then this is still a valid
922 // inverse pmpy.
923 PV.Inv = true;
924 if (PV.X != PV.R) {
925 Value *Var = nullptr, *Inv = nullptr, *X1 = nullptr, *X2 = nullptr;
926 if (!match(PV.X, m_Xor(m_Value(X1), m_Value(X2))))
927 return false;
928 auto *I1 = dyn_cast<Instruction>(X1);
929 auto *I2 = dyn_cast<Instruction>(X2);
930 if (!I1 || I1->getParent() != LoopB) {
931 Var = X2;
932 Inv = X1;
933 } else if (!I2 || I2->getParent() != LoopB) {
934 Var = X1;
935 Inv = X2;
936 } else
937 return false;
938 if (Var != PV.R)
939 return false;
940 PV.M = Inv;
941 }
942 // The input polynomial P still needs to be determined. It will be
943 // the entry value of R.
944 Value *EntryP = RPhi->getIncomingValueForBlock(PrehB);
945 PV.P = EntryP;
946 }
947
948 return true;
949 }
950
951 if (matchRightShift(SelI, PV)) {
952 // If this is an inverse pattern, the Q polynomial must be known at
953 // compile time.
954 if (PV.Inv && !isa<ConstantInt>(PV.Q))
955 return false;
956 if (PreScan)
957 return true;
958 // There is no exact matching of right-shift pmpy.
959 return false;
960 }
961
962 return false;
963}
964
965bool PolynomialMultiplyRecognize::isPromotableTo(Value *Val,
966 IntegerType *DestTy) {
967 IntegerType *T = dyn_cast<IntegerType>(Val->getType());
968 if (!T || T->getBitWidth() > DestTy->getBitWidth())
969 return false;
970 if (T->getBitWidth() == DestTy->getBitWidth())
971 return true;
972 // Non-instructions are promotable. The reason why an instruction may not
973 // be promotable is that it may produce a different result if its operands
974 // and the result are promoted, for example, it may produce more non-zero
975 // bits. While it would still be possible to represent the proper result
976 // in a wider type, it may require adding additional instructions (which
977 // we don't want to do).
979 if (!In)
980 return true;
981 // The bitwidth of the source type is smaller than the destination.
982 // Check if the individual operation can be promoted.
983 switch (In->getOpcode()) {
984 case Instruction::PHI:
985 case Instruction::ZExt:
986 case Instruction::And:
987 case Instruction::Or:
988 case Instruction::Xor:
989 case Instruction::LShr: // Shift right is ok.
990 case Instruction::Select:
991 case Instruction::Trunc:
992 return true;
993 case Instruction::ICmp:
994 if (CmpInst *CI = cast<CmpInst>(In))
995 return CI->isEquality() || CI->isUnsigned();
996 llvm_unreachable("Cast failed unexpectedly");
997 case Instruction::Add:
998 return In->hasNoSignedWrap() && In->hasNoUnsignedWrap();
999 }
1000 return false;
1001}
1002
1003void PolynomialMultiplyRecognize::promoteTo(Instruction *In,
1004 IntegerType *DestTy, BasicBlock *LoopB) {
1005 Type *OrigTy = In->getType();
1006 assert(!OrigTy->isVoidTy() && "Invalid instruction to promote");
1007
1008 // Leave boolean values alone.
1009 if (!In->getType()->isIntegerTy(1))
1010 In->mutateType(DestTy);
1011 unsigned DestBW = DestTy->getBitWidth();
1012
1013 // Handle PHIs.
1014 if (PHINode *P = dyn_cast<PHINode>(In)) {
1015 unsigned N = P->getNumIncomingValues();
1016 for (unsigned i = 0; i != N; ++i) {
1017 BasicBlock *InB = P->getIncomingBlock(i);
1018 if (InB == LoopB)
1019 continue;
1020 Value *InV = P->getIncomingValue(i);
1021 IntegerType *Ty = cast<IntegerType>(InV->getType());
1022 // Do not promote values in PHI nodes of type i1.
1023 if (Ty != P->getType()) {
1024 // If the value type does not match the PHI type, the PHI type
1025 // must have been promoted.
1026 assert(Ty->getBitWidth() < DestBW);
1027 InV = IRBuilder<>(InB->getTerminator()).CreateZExt(InV, DestTy);
1028 P->setIncomingValue(i, InV);
1029 }
1030 }
1031 } else if (ZExtInst *Z = dyn_cast<ZExtInst>(In)) {
1032 Value *Op = Z->getOperand(0);
1033 if (Op->getType() == Z->getType())
1034 Z->replaceAllUsesWith(Op);
1035 Z->eraseFromParent();
1036 return;
1037 }
1038 if (TruncInst *T = dyn_cast<TruncInst>(In)) {
1039 IntegerType *TruncTy = cast<IntegerType>(OrigTy);
1040 Value *Mask = ConstantInt::get(DestTy, (1u << TruncTy->getBitWidth()) - 1);
1041 Value *And = IRBuilder<>(In).CreateAnd(T->getOperand(0), Mask);
1042 T->replaceAllUsesWith(And);
1043 T->eraseFromParent();
1044 return;
1045 }
1046
1047 // Promote immediates.
1048 for (unsigned i = 0, n = In->getNumOperands(); i != n; ++i) {
1049 if (ConstantInt *CI = dyn_cast<ConstantInt>(In->getOperand(i)))
1050 if (CI->getBitWidth() < DestBW)
1051 In->setOperand(i, ConstantInt::get(DestTy, CI->getZExtValue()));
1052 }
1053}
1054
1055bool PolynomialMultiplyRecognize::promoteTypes(BasicBlock *LoopB,
1056 BasicBlock *ExitB) {
1057 assert(LoopB);
1058 // Skip loops where the exit block has more than one predecessor. The values
1059 // coming from the loop block will be promoted to another type, and so the
1060 // values coming into the exit block from other predecessors would also have
1061 // to be promoted.
1062 if (!ExitB || (ExitB->getSinglePredecessor() != LoopB))
1063 return false;
1064 IntegerType *DestTy = getPmpyType();
1065 // Check if the exit values have types that are no wider than the type
1066 // that we want to promote to.
1067 unsigned DestBW = DestTy->getBitWidth();
1068 for (PHINode &P : ExitB->phis()) {
1069 if (P.getNumIncomingValues() != 1)
1070 return false;
1071 assert(P.getIncomingBlock(0) == LoopB);
1072 IntegerType *T = dyn_cast<IntegerType>(P.getType());
1073 if (!T || T->getBitWidth() > DestBW)
1074 return false;
1075 }
1076
1077 // Check all instructions in the loop.
1078 for (Instruction &In : *LoopB)
1079 if (!In.isTerminator() && !isPromotableTo(&In, DestTy))
1080 return false;
1081
1082 // Perform the promotion.
1084 for (Instruction *In : LoopIns)
1085 if (!In->isTerminator())
1086 promoteTo(In, DestTy, LoopB);
1087
1088 // Fix up the PHI nodes in the exit block.
1090 for (auto I = ExitB->begin(); I != End; ++I) {
1091 PHINode *P = dyn_cast<PHINode>(I);
1092 if (!P)
1093 break;
1094 Type *Ty0 = P->getIncomingValue(0)->getType();
1095 Type *PTy = P->getType();
1096 if (PTy != Ty0) {
1097 assert(Ty0 == DestTy);
1098 // In order to create the trunc, P must have the promoted type.
1099 P->mutateType(Ty0);
1100 Value *T = IRBuilder<>(ExitB, End).CreateTrunc(P, PTy);
1101 // In order for the RAUW to work, the types of P and T must match.
1102 P->mutateType(PTy);
1103 P->replaceAllUsesWith(T);
1104 // Final update of the P's type.
1105 P->mutateType(Ty0);
1106 cast<Instruction>(T)->setOperand(0, P);
1107 }
1108 }
1109
1110 return true;
1111}
1112
1113bool PolynomialMultiplyRecognize::findCycle(Value *Out, Value *In,
1114 ValueSeq &Cycle) {
1115 // Out = ..., In, ...
1116 if (Out == In)
1117 return true;
1118
1119 auto *BB = cast<Instruction>(Out)->getParent();
1120 bool HadPhi = false;
1121
1122 for (auto *U : Out->users()) {
1123 auto *I = dyn_cast<Instruction>(&*U);
1124 if (I == nullptr || I->getParent() != BB)
1125 continue;
1126 // Make sure that there are no multi-iteration cycles, e.g.
1127 // p1 = phi(p2)
1128 // p2 = phi(p1)
1129 // The cycle p1->p2->p1 would span two loop iterations.
1130 // Check that there is only one phi in the cycle.
1131 bool IsPhi = isa<PHINode>(I);
1132 if (IsPhi && HadPhi)
1133 return false;
1134 HadPhi |= IsPhi;
1135 if (!Cycle.insert(I))
1136 return false;
1137 if (findCycle(I, In, Cycle))
1138 break;
1139 Cycle.remove(I);
1140 }
1141 return !Cycle.empty();
1142}
1143
1144void PolynomialMultiplyRecognize::classifyCycle(Instruction *DivI,
1145 ValueSeq &Cycle, ValueSeq &Early, ValueSeq &Late) {
1146 // All the values in the cycle that are between the phi node and the
1147 // divider instruction will be classified as "early", all other values
1148 // will be "late".
1149
1150 bool IsE = true;
1151 unsigned I, N = Cycle.size();
1152 for (I = 0; I < N; ++I) {
1153 Value *V = Cycle[I];
1154 if (DivI == V)
1155 IsE = false;
1156 else if (!isa<PHINode>(V))
1157 continue;
1158 // Stop if found either.
1159 break;
1160 }
1161 // "I" is the index of either DivI or the phi node, whichever was first.
1162 // "E" is "false" or "true" respectively.
1163 ValueSeq &First = !IsE ? Early : Late;
1164 for (unsigned J = 0; J < I; ++J)
1165 First.insert(Cycle[J]);
1166
1167 ValueSeq &Second = IsE ? Early : Late;
1168 Second.insert(Cycle[I]);
1169 for (++I; I < N; ++I) {
1170 Value *V = Cycle[I];
1171 if (DivI == V || isa<PHINode>(V))
1172 break;
1173 Second.insert(V);
1174 }
1175
1176 for (; I < N; ++I)
1177 First.insert(Cycle[I]);
1178}
1179
1180bool PolynomialMultiplyRecognize::classifyInst(Instruction *UseI,
1181 ValueSeq &Early, ValueSeq &Late) {
1182 // Select is an exception, since the condition value does not have to be
1183 // classified in the same way as the true/false values. The true/false
1184 // values do have to be both early or both late.
1185 if (UseI->getOpcode() == Instruction::Select) {
1186 Value *TV = UseI->getOperand(1), *FV = UseI->getOperand(2);
1187 if (Early.count(TV) || Early.count(FV)) {
1188 if (Late.count(TV) || Late.count(FV))
1189 return false;
1190 Early.insert(UseI);
1191 } else if (Late.count(TV) || Late.count(FV)) {
1192 if (Early.count(TV) || Early.count(FV))
1193 return false;
1194 Late.insert(UseI);
1195 }
1196 return true;
1197 }
1198
1199 // Not sure what would be the example of this, but the code below relies
1200 // on having at least one operand.
1201 if (UseI->getNumOperands() == 0)
1202 return true;
1203
1204 bool AE = true, AL = true;
1205 for (auto &I : UseI->operands()) {
1206 if (Early.count(&*I))
1207 AL = false;
1208 else if (Late.count(&*I))
1209 AE = false;
1210 }
1211 // If the operands appear "all early" and "all late" at the same time,
1212 // then it means that none of them are actually classified as either.
1213 // This is harmless.
1214 if (AE && AL)
1215 return true;
1216 // Conversely, if they are neither "all early" nor "all late", then
1217 // we have a mixture of early and late operands that is not a known
1218 // exception.
1219 if (!AE && !AL)
1220 return false;
1221
1222 // Check that we have covered the two special cases.
1223 assert(AE != AL);
1224
1225 if (AE)
1226 Early.insert(UseI);
1227 else
1228 Late.insert(UseI);
1229 return true;
1230}
1231
1232bool PolynomialMultiplyRecognize::commutesWithShift(Instruction *I) {
1233 switch (I->getOpcode()) {
1234 case Instruction::And:
1235 case Instruction::Or:
1236 case Instruction::Xor:
1237 case Instruction::LShr:
1238 case Instruction::Shl:
1239 case Instruction::Select:
1240 case Instruction::ICmp:
1241 case Instruction::PHI:
1242 break;
1243 default:
1244 return false;
1245 }
1246 return true;
1247}
1248
1249bool PolynomialMultiplyRecognize::highBitsAreZero(Value *V,
1250 unsigned IterCount) {
1251 auto *T = dyn_cast<IntegerType>(V->getType());
1252 if (!T)
1253 return false;
1254
1255 KnownBits Known(T->getBitWidth());
1256 computeKnownBits(V, Known, DL);
1257 return Known.countMinLeadingZeros() >= IterCount;
1258}
1259
1260bool PolynomialMultiplyRecognize::keepsHighBitsZero(Value *V,
1261 unsigned IterCount) {
1262 // Assume that all inputs to the value have the high bits zero.
1263 // Check if the value itself preserves the zeros in the high bits.
1264 if (auto *C = dyn_cast<ConstantInt>(V))
1265 return C->getValue().countl_zero() >= IterCount;
1266
1267 if (auto *I = dyn_cast<Instruction>(V)) {
1268 switch (I->getOpcode()) {
1269 case Instruction::And:
1270 case Instruction::Or:
1271 case Instruction::Xor:
1272 case Instruction::LShr:
1273 case Instruction::Select:
1274 case Instruction::ICmp:
1275 case Instruction::PHI:
1276 case Instruction::ZExt:
1277 return true;
1278 }
1279 }
1280
1281 return false;
1282}
1283
1284bool PolynomialMultiplyRecognize::isOperandShifted(Instruction *I, Value *Op) {
1285 unsigned Opc = I->getOpcode();
1286 if (Opc == Instruction::Shl || Opc == Instruction::LShr)
1287 return Op != I->getOperand(1);
1288 return true;
1289}
1290
1291bool PolynomialMultiplyRecognize::convertShiftsToLeft(BasicBlock *LoopB,
1292 BasicBlock *ExitB, unsigned IterCount) {
1293 Value *CIV = getCountIV(LoopB);
1294 if (CIV == nullptr)
1295 return false;
1296 auto *CIVTy = dyn_cast<IntegerType>(CIV->getType());
1297 if (CIVTy == nullptr)
1298 return false;
1299
1300 ValueSeq RShifts;
1301 ValueSeq Early, Late, Cycled;
1302
1303 // Find all value cycles that contain logical right shifts by 1.
1304 for (Instruction &I : *LoopB) {
1305 using namespace PatternMatch;
1306
1307 Value *V = nullptr;
1308 if (!match(&I, m_LShr(m_Value(V), m_One())))
1309 continue;
1310 ValueSeq C;
1311 if (!findCycle(&I, V, C))
1312 continue;
1313
1314 // Found a cycle.
1315 C.insert(&I);
1316 classifyCycle(&I, C, Early, Late);
1317 Cycled.insert_range(C);
1318 RShifts.insert(&I);
1319 }
1320
1321 // Find the set of all values affected by the shift cycles, i.e. all
1322 // cycled values, and (recursively) all their users.
1323 ValueSeq Users(llvm::from_range, Cycled);
1324 for (unsigned i = 0; i < Users.size(); ++i) {
1325 Value *V = Users[i];
1326 if (!isa<IntegerType>(V->getType()))
1327 return false;
1328 auto *R = cast<Instruction>(V);
1329 // If the instruction does not commute with shifts, the loop cannot
1330 // be unshifted.
1331 if (!commutesWithShift(R))
1332 return false;
1333 for (User *U : R->users()) {
1334 auto *T = cast<Instruction>(U);
1335 // Skip users from outside of the loop. They will be handled later.
1336 // Also, skip the right-shifts and phi nodes, since they mix early
1337 // and late values.
1338 if (T->getParent() != LoopB || RShifts.count(T) || isa<PHINode>(T))
1339 continue;
1340
1341 Users.insert(T);
1342 if (!classifyInst(T, Early, Late))
1343 return false;
1344 }
1345 }
1346
1347 if (Users.empty())
1348 return false;
1349
1350 // Verify that high bits remain zero.
1351 ValueSeq Internal(llvm::from_range, Users);
1352 ValueSeq Inputs;
1353 for (unsigned i = 0; i < Internal.size(); ++i) {
1354 auto *R = dyn_cast<Instruction>(Internal[i]);
1355 if (!R)
1356 continue;
1357 for (Value *Op : R->operands()) {
1358 auto *T = dyn_cast<Instruction>(Op);
1359 if (T && T->getParent() != LoopB)
1360 Inputs.insert(Op);
1361 else
1362 Internal.insert(Op);
1363 }
1364 }
1365 for (Value *V : Inputs)
1366 if (!highBitsAreZero(V, IterCount))
1367 return false;
1368 for (Value *V : Internal)
1369 if (!keepsHighBitsZero(V, IterCount))
1370 return false;
1371
1372 // Finally, the work can be done. Unshift each user.
1373 IRBuilder<> IRB(LoopB);
1374 std::map<Value*,Value*> ShiftMap;
1375
1376 using CastMapType = std::map<std::pair<Value *, Type *>, Value *>;
1377
1378 CastMapType CastMap;
1379
1380 auto upcast = [](CastMapType &CM, IRBuilder<> &IRB, Value *V,
1381 IntegerType *Ty) -> Value * {
1382 auto [H, Inserted] = CM.try_emplace(std::make_pair(V, Ty));
1383 if (Inserted)
1384 H->second = IRB.CreateIntCast(V, Ty, false);
1385 return H->second;
1386 };
1387
1388 for (auto I = LoopB->begin(), E = LoopB->end(); I != E; ++I) {
1389 using namespace PatternMatch;
1390
1391 if (isa<PHINode>(I) || !Users.count(&*I))
1392 continue;
1393
1394 // Match lshr x, 1.
1395 Value *V = nullptr;
1396 if (match(&*I, m_LShr(m_Value(V), m_One()))) {
1397 replaceAllUsesOfWithIn(&*I, V, LoopB);
1398 continue;
1399 }
1400 // For each non-cycled operand, replace it with the corresponding
1401 // value shifted left.
1402 for (auto &J : I->operands()) {
1403 Value *Op = J.get();
1404 if (!isOperandShifted(&*I, Op))
1405 continue;
1406 if (Users.count(Op))
1407 continue;
1408 // Skip shifting zeros.
1410 continue;
1411 // Check if we have already generated a shift for this value.
1412 auto F = ShiftMap.find(Op);
1413 Value *W = (F != ShiftMap.end()) ? F->second : nullptr;
1414 if (W == nullptr) {
1415 IRB.SetInsertPoint(&*I);
1416 // First, the shift amount will be CIV or CIV+1, depending on
1417 // whether the value is early or late. Instead of creating CIV+1,
1418 // do a single shift of the value.
1419 Value *ShAmt = CIV, *ShVal = Op;
1420 auto *VTy = cast<IntegerType>(ShVal->getType());
1421 auto *ATy = cast<IntegerType>(ShAmt->getType());
1422 if (Late.count(&*I))
1423 ShVal = IRB.CreateShl(Op, ConstantInt::get(VTy, 1));
1424 // Second, the types of the shifted value and the shift amount
1425 // must match.
1426 if (VTy != ATy) {
1427 if (VTy->getBitWidth() < ATy->getBitWidth())
1428 ShVal = upcast(CastMap, IRB, ShVal, ATy);
1429 else
1430 ShAmt = upcast(CastMap, IRB, ShAmt, VTy);
1431 }
1432 // Ready to generate the shift and memoize it.
1433 W = IRB.CreateShl(ShVal, ShAmt);
1434 ShiftMap.insert(std::make_pair(Op, W));
1435 }
1436 I->replaceUsesOfWith(Op, W);
1437 }
1438 }
1439
1440 // Update the users outside of the loop to account for having left
1441 // shifts. They would normally be shifted right in the loop, so shift
1442 // them right after the loop exit.
1443 // Take advantage of the loop-closed SSA form, which has all the post-
1444 // loop values in phi nodes.
1445 IRB.SetInsertPoint(ExitB, ExitB->getFirstInsertionPt());
1446 for (auto P = ExitB->begin(), Q = ExitB->end(); P != Q; ++P) {
1447 if (!isa<PHINode>(P))
1448 break;
1449 auto *PN = cast<PHINode>(P);
1450 Value *U = PN->getIncomingValueForBlock(LoopB);
1451 if (!Users.count(U))
1452 continue;
1453 Value *S = IRB.CreateLShr(PN, ConstantInt::get(PN->getType(), IterCount));
1454 PN->replaceAllUsesWith(S);
1455 // The above RAUW will create
1456 // S = lshr S, IterCount
1457 // so we need to fix it back into
1458 // S = lshr PN, IterCount
1459 cast<User>(S)->replaceUsesOfWith(S, PN);
1460 }
1461
1462 return true;
1463}
1464
1465void PolynomialMultiplyRecognize::cleanupLoopBody(BasicBlock *LoopB) {
1466 for (auto &I : *LoopB)
1467 if (Value *SV = simplifyInstruction(&I, {DL, &TLI, &DT}))
1468 I.replaceAllUsesWith(SV);
1469
1470 for (Instruction &I : llvm::make_early_inc_range(*LoopB))
1472}
1473
1474unsigned PolynomialMultiplyRecognize::getInverseMxN(unsigned QP) {
1475 // Arrays of coefficients of Q and the inverse, C.
1476 // Q[i] = coefficient at x^i.
1477 std::array<char,32> Q, C;
1478
1479 for (unsigned i = 0; i < 32; ++i) {
1480 Q[i] = QP & 1;
1481 QP >>= 1;
1482 }
1483 assert(Q[0] == 1);
1484
1485 // Find C, such that
1486 // (Q[n]*x^n + ... + Q[1]*x + Q[0]) * (C[n]*x^n + ... + C[1]*x + C[0]) = 1
1487 //
1488 // For it to have a solution, Q[0] must be 1. Since this is Z2[x], the
1489 // operations * and + are & and ^ respectively.
1490 //
1491 // Find C[i] recursively, by comparing i-th coefficient in the product
1492 // with 0 (or 1 for i=0).
1493 //
1494 // C[0] = 1, since C[0] = Q[0], and Q[0] = 1.
1495 C[0] = 1;
1496 for (unsigned i = 1; i < 32; ++i) {
1497 // Solve for C[i] in:
1498 // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i]Q[0] = 0
1499 // This is equivalent to
1500 // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i] = 0
1501 // which is
1502 // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] = C[i]
1503 unsigned T = 0;
1504 for (unsigned j = 0; j < i; ++j)
1505 T = T ^ (C[j] & Q[i-j]);
1506 C[i] = T;
1507 }
1508
1509 unsigned QV = 0;
1510 for (unsigned i = 0; i < 32; ++i)
1511 if (C[i])
1512 QV |= (1 << i);
1513
1514 return QV;
1515}
1516
1517Value *PolynomialMultiplyRecognize::generate(BasicBlock::iterator At,
1518 ParsedValues &PV) {
1519 IRBuilder<> B(&*At);
1520 Module *M = At->getParent()->getParent()->getParent();
1521 Function *PMF =
1522 Intrinsic::getOrInsertDeclaration(M, Intrinsic::hexagon_M4_pmpyw);
1523
1524 Value *P = PV.P, *Q = PV.Q, *P0 = P;
1525 unsigned IC = PV.IterCount;
1526
1527 if (PV.M != nullptr)
1528 P0 = P = B.CreateXor(P, PV.M);
1529
1530 // Create a bit mask to clear the high bits beyond IterCount.
1531 auto *BMI = ConstantInt::get(P->getType(), APInt::getLowBitsSet(32, IC));
1532
1533 if (PV.IterCount != 32)
1534 P = B.CreateAnd(P, BMI);
1535
1536 if (PV.Inv) {
1537 auto *QI = dyn_cast<ConstantInt>(PV.Q);
1538 assert(QI && QI->getBitWidth() <= 32);
1539
1540 // Again, clearing bits beyond IterCount.
1541 unsigned M = (1 << PV.IterCount) - 1;
1542 unsigned Tmp = (QI->getZExtValue() | 1) & M;
1543 unsigned QV = getInverseMxN(Tmp) & M;
1544 auto *QVI = ConstantInt::get(QI->getType(), QV);
1545 P = B.CreateCall(PMF, {P, QVI});
1546 P = B.CreateTrunc(P, QI->getType());
1547 if (IC != 32)
1548 P = B.CreateAnd(P, BMI);
1549 }
1550
1551 Value *R = B.CreateCall(PMF, {P, Q});
1552
1553 if (PV.M != nullptr)
1554 R = B.CreateXor(R, B.CreateIntCast(P0, R->getType(), false));
1555
1556 return R;
1557}
1558
1559static bool hasZeroSignBit(const Value *V) {
1560 if (const auto *CI = dyn_cast<const ConstantInt>(V))
1561 return CI->getValue().isNonNegative();
1563 if (!I)
1564 return false;
1565 switch (I->getOpcode()) {
1566 case Instruction::LShr:
1567 if (const auto SI = dyn_cast<const ConstantInt>(I->getOperand(1)))
1568 return SI->getZExtValue() > 0;
1569 return false;
1570 case Instruction::Or:
1571 case Instruction::Xor:
1572 return hasZeroSignBit(I->getOperand(0)) &&
1573 hasZeroSignBit(I->getOperand(1));
1574 case Instruction::And:
1575 return hasZeroSignBit(I->getOperand(0)) ||
1576 hasZeroSignBit(I->getOperand(1));
1577 }
1578 return false;
1579}
1580
1581void PolynomialMultiplyRecognize::setupPreSimplifier(Simplifier &S) {
1582 S.addRule("sink-zext",
1583 // Sink zext past bitwise operations.
1584 [](Instruction *I, LLVMContext &Ctx) -> Value* {
1585 if (I->getOpcode() != Instruction::ZExt)
1586 return nullptr;
1587 Instruction *T = dyn_cast<Instruction>(I->getOperand(0));
1588 if (!T)
1589 return nullptr;
1590 switch (T->getOpcode()) {
1591 case Instruction::And:
1592 case Instruction::Or:
1593 case Instruction::Xor:
1594 break;
1595 default:
1596 return nullptr;
1597 }
1598 IRBuilder<> B(Ctx);
1599 return B.CreateBinOp(cast<BinaryOperator>(T)->getOpcode(),
1600 B.CreateZExt(T->getOperand(0), I->getType()),
1601 B.CreateZExt(T->getOperand(1), I->getType()));
1602 });
1603 S.addRule("xor/and -> and/xor",
1604 // (xor (and x a) (and y a)) -> (and (xor x y) a)
1605 [](Instruction *I, LLVMContext &Ctx) -> Value* {
1606 if (I->getOpcode() != Instruction::Xor)
1607 return nullptr;
1608 Instruction *And0 = dyn_cast<Instruction>(I->getOperand(0));
1609 Instruction *And1 = dyn_cast<Instruction>(I->getOperand(1));
1610 if (!And0 || !And1)
1611 return nullptr;
1612 if (And0->getOpcode() != Instruction::And ||
1613 And1->getOpcode() != Instruction::And)
1614 return nullptr;
1615 if (And0->getOperand(1) != And1->getOperand(1))
1616 return nullptr;
1617 IRBuilder<> B(Ctx);
1618 return B.CreateAnd(B.CreateXor(And0->getOperand(0), And1->getOperand(0)),
1619 And0->getOperand(1));
1620 });
1621 S.addRule("sink binop into select",
1622 // (Op (select c x y) z) -> (select c (Op x z) (Op y z))
1623 // (Op x (select c y z)) -> (select c (Op x y) (Op x z))
1624 [](Instruction *I, LLVMContext &Ctx) -> Value* {
1625 BinaryOperator *BO = dyn_cast<BinaryOperator>(I);
1626 if (!BO)
1627 return nullptr;
1629 if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(0))) {
1630 IRBuilder<> B(Ctx);
1631 Value *X = Sel->getTrueValue(), *Y = Sel->getFalseValue();
1632 Value *Z = BO->getOperand(1);
1633 return B.CreateSelect(Sel->getCondition(),
1634 B.CreateBinOp(Op, X, Z),
1635 B.CreateBinOp(Op, Y, Z));
1636 }
1637 if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(1))) {
1638 IRBuilder<> B(Ctx);
1639 Value *X = BO->getOperand(0);
1640 Value *Y = Sel->getTrueValue(), *Z = Sel->getFalseValue();
1641 return B.CreateSelect(Sel->getCondition(),
1642 B.CreateBinOp(Op, X, Y),
1643 B.CreateBinOp(Op, X, Z));
1644 }
1645 return nullptr;
1646 });
1647 S.addRule("fold select-select",
1648 // (select c (select c x y) z) -> (select c x z)
1649 // (select c x (select c y z)) -> (select c x z)
1650 [](Instruction *I, LLVMContext &Ctx) -> Value* {
1651 SelectInst *Sel = dyn_cast<SelectInst>(I);
1652 if (!Sel)
1653 return nullptr;
1654 IRBuilder<> B(Ctx);
1655 Value *C = Sel->getCondition();
1656 if (SelectInst *Sel0 = dyn_cast<SelectInst>(Sel->getTrueValue())) {
1657 if (Sel0->getCondition() == C)
1658 return B.CreateSelect(C, Sel0->getTrueValue(), Sel->getFalseValue());
1659 }
1660 if (SelectInst *Sel1 = dyn_cast<SelectInst>(Sel->getFalseValue())) {
1661 if (Sel1->getCondition() == C)
1662 return B.CreateSelect(C, Sel->getTrueValue(), Sel1->getFalseValue());
1663 }
1664 return nullptr;
1665 });
1666 S.addRule("or-signbit -> xor-signbit",
1667 // (or (lshr x 1) 0x800.0) -> (xor (lshr x 1) 0x800.0)
1668 [](Instruction *I, LLVMContext &Ctx) -> Value* {
1669 if (I->getOpcode() != Instruction::Or)
1670 return nullptr;
1671 ConstantInt *Msb = dyn_cast<ConstantInt>(I->getOperand(1));
1672 if (!Msb || !Msb->getValue().isSignMask())
1673 return nullptr;
1674 if (!hasZeroSignBit(I->getOperand(0)))
1675 return nullptr;
1676 return IRBuilder<>(Ctx).CreateXor(I->getOperand(0), Msb);
1677 });
1678 S.addRule("sink lshr into binop",
1679 // (lshr (BitOp x y) c) -> (BitOp (lshr x c) (lshr y c))
1680 [](Instruction *I, LLVMContext &Ctx) -> Value* {
1681 if (I->getOpcode() != Instruction::LShr)
1682 return nullptr;
1683 BinaryOperator *BitOp = dyn_cast<BinaryOperator>(I->getOperand(0));
1684 if (!BitOp)
1685 return nullptr;
1686 switch (BitOp->getOpcode()) {
1687 case Instruction::And:
1688 case Instruction::Or:
1689 case Instruction::Xor:
1690 break;
1691 default:
1692 return nullptr;
1693 }
1694 IRBuilder<> B(Ctx);
1695 Value *S = I->getOperand(1);
1696 return B.CreateBinOp(BitOp->getOpcode(),
1697 B.CreateLShr(BitOp->getOperand(0), S),
1698 B.CreateLShr(BitOp->getOperand(1), S));
1699 });
1700 S.addRule("expose bitop-const",
1701 // (BitOp1 (BitOp2 x a) b) -> (BitOp2 x (BitOp1 a b))
1702 [](Instruction *I, LLVMContext &Ctx) -> Value* {
1703 auto IsBitOp = [](unsigned Op) -> bool {
1704 switch (Op) {
1705 case Instruction::And:
1706 case Instruction::Or:
1707 case Instruction::Xor:
1708 return true;
1709 }
1710 return false;
1711 };
1712 BinaryOperator *BitOp1 = dyn_cast<BinaryOperator>(I);
1713 if (!BitOp1 || !IsBitOp(BitOp1->getOpcode()))
1714 return nullptr;
1715 BinaryOperator *BitOp2 = dyn_cast<BinaryOperator>(BitOp1->getOperand(0));
1716 if (!BitOp2 || !IsBitOp(BitOp2->getOpcode()))
1717 return nullptr;
1718 ConstantInt *CA = dyn_cast<ConstantInt>(BitOp2->getOperand(1));
1719 ConstantInt *CB = dyn_cast<ConstantInt>(BitOp1->getOperand(1));
1720 if (!CA || !CB)
1721 return nullptr;
1722 IRBuilder<> B(Ctx);
1723 Value *X = BitOp2->getOperand(0);
1724 return B.CreateBinOp(BitOp2->getOpcode(), X,
1725 B.CreateBinOp(BitOp1->getOpcode(), CA, CB));
1726 });
1727 S.addRule("select with trunc cond to select with icmp cond",
1728 // select (trunc x to i1) -> select (icmp ne (and x, 1), 0)
1729 // select (xor (trunc x to i1) 1) -> select (icmp eq (and x, 1), 0)
1730 [](Instruction *I, LLVMContext &Ctx) -> Value * {
1731 SelectInst *Sel = dyn_cast<SelectInst>(I);
1732 if (!Sel)
1733 return nullptr;
1734 Value *C = Sel->getCondition();
1735 Value *X;
1736 using namespace PatternMatch;
1737 if (!(match(C, m_Trunc(m_Value(X))) ||
1738 match(C, m_Not(m_Trunc(m_Value(X))))))
1739 return nullptr;
1740
1741 IRBuilder<> B(Ctx);
1742 Type *Ty = X->getType();
1743 Value *And = B.CreateAnd(X, ConstantInt::get(Ty, 1));
1744 Value *Icmp = B.CreateICmp(isa<TruncInst>(C) ? ICmpInst::ICMP_NE
1745 : ICmpInst::ICMP_EQ,
1746 And, ConstantInt::get(Ty, 0));
1747 return B.CreateSelect(Icmp, Sel->getTrueValue(),
1748 Sel->getFalseValue());
1749 });
1750}
1751
1752void PolynomialMultiplyRecognize::setupPostSimplifier(Simplifier &S) {
1753 S.addRule("(and (xor (and x a) y) b) -> (and (xor x y) b), if b == b&a",
1754 [](Instruction *I, LLVMContext &Ctx) -> Value* {
1755 if (I->getOpcode() != Instruction::And)
1756 return nullptr;
1757 Instruction *Xor = dyn_cast<Instruction>(I->getOperand(0));
1758 ConstantInt *C0 = dyn_cast<ConstantInt>(I->getOperand(1));
1759 if (!Xor || !C0)
1760 return nullptr;
1761 if (Xor->getOpcode() != Instruction::Xor)
1762 return nullptr;
1763 Instruction *And0 = dyn_cast<Instruction>(Xor->getOperand(0));
1764 Instruction *And1 = dyn_cast<Instruction>(Xor->getOperand(1));
1765 // Pick the first non-null and.
1766 if (!And0 || And0->getOpcode() != Instruction::And)
1767 std::swap(And0, And1);
1768 ConstantInt *C1 = dyn_cast<ConstantInt>(And0->getOperand(1));
1769 if (!C1)
1770 return nullptr;
1771 uint32_t V0 = C0->getZExtValue();
1772 uint32_t V1 = C1->getZExtValue();
1773 if (V0 != (V0 & V1))
1774 return nullptr;
1775 IRBuilder<> B(Ctx);
1776 return B.CreateAnd(B.CreateXor(And0->getOperand(0), And1), C0);
1777 });
1778}
1779
1780bool PolynomialMultiplyRecognize::recognize() {
1781 LLVM_DEBUG(dbgs() << "Starting PolynomialMultiplyRecognize on loop\n"
1782 << *CurLoop << '\n');
1783 // Restrictions:
1784 // - The loop must consist of a single block.
1785 // - The iteration count must be known at compile-time.
1786 // - The loop must have an induction variable starting from 0, and
1787 // incremented in each iteration of the loop.
1788 BasicBlock *LoopB = CurLoop->getHeader();
1789 LLVM_DEBUG(dbgs() << "Loop header:\n" << *LoopB);
1790
1791 if (LoopB != CurLoop->getLoopLatch())
1792 return false;
1793 BasicBlock *ExitB = CurLoop->getExitBlock();
1794 if (ExitB == nullptr)
1795 return false;
1796 BasicBlock *EntryB = CurLoop->getLoopPreheader();
1797 if (EntryB == nullptr)
1798 return false;
1799
1800 unsigned IterCount = 0;
1801 const SCEV *CT = SE.getBackedgeTakenCount(CurLoop);
1803 return false;
1804 if (auto *CV = dyn_cast<SCEVConstant>(CT))
1805 IterCount = CV->getValue()->getZExtValue() + 1;
1806
1807 Value *CIV = getCountIV(LoopB);
1808 if (CIV == nullptr)
1809 return false;
1810 ParsedValues PV;
1811 Simplifier PreSimp;
1812 PV.IterCount = IterCount;
1813 LLVM_DEBUG(dbgs() << "Loop IV: " << *CIV << "\nIterCount: " << IterCount
1814 << '\n');
1815
1816 setupPreSimplifier(PreSimp);
1817
1818 // Perform a preliminary scan of select instructions to see if any of them
1819 // looks like a generator of the polynomial multiply steps. Assume that a
1820 // loop can only contain a single transformable operation, so stop the
1821 // traversal after the first reasonable candidate was found.
1822 // XXX: Currently this approach can modify the loop before being 100% sure
1823 // that the transformation can be carried out.
1824 bool FoundPreScan = false;
1825 auto FeedsPHI = [LoopB](const Value *V) -> bool {
1826 for (const Value *U : V->users()) {
1827 if (const auto *P = dyn_cast<const PHINode>(U))
1828 if (P->getParent() == LoopB)
1829 return true;
1830 }
1831 return false;
1832 };
1833 for (Instruction &In : *LoopB) {
1834 SelectInst *SI = dyn_cast<SelectInst>(&In);
1835 if (!SI || !FeedsPHI(SI))
1836 continue;
1837
1838 Simplifier::Context C(SI);
1839 Value *T = PreSimp.simplify(C);
1840 SelectInst *SelI = (T && isa<SelectInst>(T)) ? cast<SelectInst>(T) : SI;
1841 LLVM_DEBUG(dbgs() << "scanSelect(pre-scan): " << PE(C, SelI) << '\n');
1842 if (scanSelect(SelI, LoopB, EntryB, CIV, PV, true)) {
1843 FoundPreScan = true;
1844 if (SelI != SI) {
1845 Value *NewSel = C.materialize(LoopB, SI->getIterator());
1846 SI->replaceAllUsesWith(NewSel);
1848 }
1849 break;
1850 }
1851 }
1852
1853 if (!FoundPreScan) {
1854 LLVM_DEBUG(dbgs() << "Have not found candidates for pmpy\n");
1855 return false;
1856 }
1857
1858 if (!PV.Left) {
1859 // The right shift version actually only returns the higher bits of
1860 // the result (each iteration discards the LSB). If we want to convert it
1861 // to a left-shifting loop, the working data type must be at least as
1862 // wide as the target's pmpy instruction.
1863 if (!promoteTypes(LoopB, ExitB))
1864 return false;
1865 // Run post-promotion simplifications.
1866 Simplifier PostSimp;
1867 setupPostSimplifier(PostSimp);
1868 for (Instruction &In : *LoopB) {
1869 SelectInst *SI = dyn_cast<SelectInst>(&In);
1870 if (!SI || !FeedsPHI(SI))
1871 continue;
1872 Simplifier::Context C(SI);
1873 Value *T = PostSimp.simplify(C);
1874 SelectInst *SelI = dyn_cast_or_null<SelectInst>(T);
1875 if (SelI != SI) {
1876 Value *NewSel = C.materialize(LoopB, SI->getIterator());
1877 SI->replaceAllUsesWith(NewSel);
1879 }
1880 break;
1881 }
1882
1883 if (!convertShiftsToLeft(LoopB, ExitB, IterCount))
1884 return false;
1885 cleanupLoopBody(LoopB);
1886 }
1887
1888 // Scan the loop again, find the generating select instruction.
1889 bool FoundScan = false;
1890 for (Instruction &In : *LoopB) {
1891 SelectInst *SelI = dyn_cast<SelectInst>(&In);
1892 if (!SelI)
1893 continue;
1894 LLVM_DEBUG(dbgs() << "scanSelect: " << *SelI << '\n');
1895 FoundScan = scanSelect(SelI, LoopB, EntryB, CIV, PV, false);
1896 if (FoundScan)
1897 break;
1898 }
1899 assert(FoundScan);
1900
1901 LLVM_DEBUG({
1902 StringRef PP = (PV.M ? "(P+M)" : "P");
1903 if (!PV.Inv)
1904 dbgs() << "Found pmpy idiom: R = " << PP << ".Q\n";
1905 else
1906 dbgs() << "Found inverse pmpy idiom: R = (" << PP << "/Q).Q) + "
1907 << PP << "\n";
1908 dbgs() << " Res:" << *PV.Res << "\n P:" << *PV.P << "\n";
1909 if (PV.M)
1910 dbgs() << " M:" << *PV.M << "\n";
1911 dbgs() << " Q:" << *PV.Q << "\n";
1912 dbgs() << " Iteration count:" << PV.IterCount << "\n";
1913 });
1914
1915 BasicBlock::iterator At(EntryB->getTerminator());
1916 Value *PM = generate(At, PV);
1917 if (PM == nullptr)
1918 return false;
1919
1920 if (PM->getType() != PV.Res->getType())
1921 PM = IRBuilder<>(&*At).CreateIntCast(PM, PV.Res->getType(), false);
1922
1923 PV.Res->replaceAllUsesWith(PM);
1924 PV.Res->eraseFromParent();
1925 return true;
1926}
1927
1928int HexagonLoopIdiomRecognize::getSCEVStride(const SCEVAddRecExpr *S) {
1929 if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getOperand(1)))
1930 return SC->getAPInt().getSExtValue();
1931 return 0;
1932}
1933
1934bool HexagonLoopIdiomRecognize::isLegalStore(Loop *CurLoop, StoreInst *SI) {
1935 // Allow volatile stores if HexagonVolatileMemcpy is enabled.
1936 if (!(SI->isVolatile() && HexagonVolatileMemcpy) && !SI->isSimple())
1937 return false;
1938
1939 Value *StoredVal = SI->getValueOperand();
1940 Value *StorePtr = SI->getPointerOperand();
1941
1942 // Reject stores that are so large that they overflow an unsigned.
1943 uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
1944 if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
1945 return false;
1946
1947 // See if the pointer expression is an AddRec like {base,+,1} on the current
1948 // loop, which indicates a strided store. If we have something else, it's a
1949 // random store we can't handle.
1950 auto *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1951 if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) {
1952 ORE.emit([&]() {
1953 return OptimizationRemarkMissed(DEBUG_TYPE, "NonAffineStorePtr",
1954 SI->getDebugLoc(), SI->getParent())
1955 << "store pointer is not an affine AddRec";
1956 });
1957 return false;
1958 }
1959
1960 // Check to see if the stride matches the size of the store. If so, then we
1961 // know that every byte is touched in the loop.
1962 int Stride = getSCEVStride(StoreEv);
1963 if (Stride == 0)
1964 return false;
1965 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
1966 if (StoreSize != unsigned(std::abs(Stride))) {
1967 ORE.emit([&]() {
1968 return OptimizationRemarkMissed(DEBUG_TYPE, "StrideSizeMismatch",
1969 SI->getDebugLoc(), SI->getParent())
1970 << "stride does not match store size";
1971 });
1972 return false;
1973 }
1974
1975 // The store must be feeding a non-volatile load.
1976 LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
1977 if (!LI || !LI->isSimple()) {
1978 ORE.emit([&]() {
1979 return OptimizationRemarkMissed(DEBUG_TYPE, "StoreNotFeedingLoad",
1980 SI->getDebugLoc(), SI->getParent())
1981 << "store value is not a simple load";
1982 });
1983 return false;
1984 }
1985
1986 // See if the pointer expression is an AddRec like {base,+,1} on the current
1987 // loop, which indicates a strided load. If we have something else, it's a
1988 // random load we can't handle.
1989 Value *LoadPtr = LI->getPointerOperand();
1990 auto *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
1991 if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine()) {
1992 ORE.emit([&]() {
1993 return OptimizationRemarkMissed(DEBUG_TYPE, "NonAffineLoadPtr",
1994 LI->getDebugLoc(), LI->getParent())
1995 << "load pointer is not an affine AddRec";
1996 });
1997 return false;
1998 }
1999
2000 // The store and load must share the same stride.
2001 if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
2002 return false;
2003
2004 // Success. This store can be converted into a memcpy.
2005 return true;
2006}
2007
2008/// mayLoopAccessLocation - Return true if the specified loop might access the
2009/// specified pointer location, which is a loop-strided access. The 'Access'
2010/// argument specifies what the verboten forms of access are (read or write).
2011static bool
2013 const SCEV *BECount, unsigned StoreSize,
2016 // Get the location that may be stored across the loop. Since the access
2017 // is strided positively through memory, we say that the modified location
2018 // starts at the pointer and has infinite size.
2020
2021 // If the loop iterates a fixed number of times, we can refine the access
2022 // size to be exactly the size of the memset, which is (BECount+1)*StoreSize
2023 if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
2024 AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
2025 StoreSize);
2026
2027 // TODO: For this to be really effective, we have to dive into the pointer
2028 // operand in the store. Store to &A[i] of 100 will always return may alias
2029 // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
2030 // which will then no-alias a store to &A[100].
2031 MemoryLocation StoreLoc(Ptr, AccessSize);
2032
2033 for (auto *B : L->blocks())
2034 for (auto &I : *B)
2035 if (Ignored.count(&I) == 0 &&
2036 isModOrRefSet(AA.getModRefInfo(&I, StoreLoc) & Access))
2037 return true;
2038
2039 return false;
2040}
2041
2042void HexagonLoopIdiomRecognize::collectStores(Loop *CurLoop, BasicBlock *BB,
2043 SmallVectorImpl<StoreInst*> &Stores) {
2044 Stores.clear();
2045 for (Instruction &I : *BB)
2046 if (StoreInst *SI = dyn_cast<StoreInst>(&I))
2047 if (isLegalStore(CurLoop, SI))
2048 Stores.push_back(SI);
2049}
2050
2051bool HexagonLoopIdiomRecognize::processCopyingStore(Loop *CurLoop,
2052 StoreInst *SI, const SCEV *BECount) {
2053 assert((SI->isSimple() || (SI->isVolatile() && HexagonVolatileMemcpy)) &&
2054 "Expected only non-volatile stores, or Hexagon-specific memcpy"
2055 "to volatile destination.");
2056
2057 Value *StorePtr = SI->getPointerOperand();
2058 auto *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
2059 unsigned Stride = getSCEVStride(StoreEv);
2060 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
2061 if (Stride != StoreSize)
2062 return false;
2063
2064 // See if the pointer expression is an AddRec like {base,+,1} on the current
2065 // loop, which indicates a strided load. If we have something else, it's a
2066 // random load we can't handle.
2067 auto *LI = cast<LoadInst>(SI->getValueOperand());
2068 auto *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
2069
2070 // The trip count of the loop and the base pointer of the addrec SCEV is
2071 // guaranteed to be loop invariant, which means that it should dominate the
2072 // header. This allows us to insert code for it in the preheader.
2073 BasicBlock *Preheader = CurLoop->getLoopPreheader();
2074 Instruction *ExpPt = Preheader->getTerminator();
2075 IRBuilder<> Builder(ExpPt);
2076 SCEVExpander Expander(*SE, "hexagon-loop-idiom");
2077
2078 Type *IntPtrTy = Builder.getIntPtrTy(*DL, SI->getPointerAddressSpace());
2079
2080 // Okay, we have a strided store "p[i]" of a loaded value. We can turn
2081 // this into a memcpy/memmove in the loop preheader now if we want. However,
2082 // this would be unsafe to do if there is anything else in the loop that may
2083 // read or write the memory region we're storing to. For memcpy, this
2084 // includes the load that feeds the stores. Check for an alias by generating
2085 // the base address and checking everything.
2086 Value *StoreBasePtr = Expander.expandCodeFor(StoreEv->getStart(),
2087 Builder.getPtrTy(SI->getPointerAddressSpace()), ExpPt);
2088 Value *LoadBasePtr = nullptr;
2089
2090 bool Overlap = false;
2091 bool DestVolatile = SI->isVolatile();
2092 Type *BECountTy = BECount->getType();
2093
2094 if (DestVolatile) {
2095 // The trip count must fit in i32, since it is the type of the "num_words"
2096 // argument to hexagon_memcpy_forward_vp4cp4n2.
2097 if (StoreSize != 4 || DL->getTypeSizeInBits(BECountTy) > 32) {
2098CleanupAndExit:
2099 // If we generated new code for the base pointer, clean up.
2100 Expander.clear();
2101 if (StoreBasePtr && (LoadBasePtr != StoreBasePtr)) {
2103 StoreBasePtr = nullptr;
2104 }
2105 if (LoadBasePtr) {
2107 LoadBasePtr = nullptr;
2108 }
2109 return false;
2110 }
2111 }
2112
2113 SmallPtrSet<Instruction*, 2> Ignore1;
2114 Ignore1.insert(SI);
2115 if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
2116 StoreSize, *AA, Ignore1)) {
2117 // Check if the load is the offending instruction.
2118 Ignore1.insert(LI);
2119 if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
2120 BECount, StoreSize, *AA, Ignore1)) {
2121 // Still bad. Nothing we can do.
2122 ORE.emit([&]() {
2123 return OptimizationRemarkMissed(DEBUG_TYPE, "MemoryAlias",
2124 SI->getDebugLoc(), SI->getParent())
2125 << "memory aliasing prevents memcpy/memmove";
2126 });
2127 goto CleanupAndExit;
2128 }
2129 // It worked with the load ignored.
2130 Overlap = true;
2131 }
2132
2133 if (!Overlap) {
2134 if (DisableMemcpyIdiom || !HasMemcpy) {
2135 ORE.emit([&]() {
2136 return OptimizationRemarkMissed(DEBUG_TYPE, "MemcpyDisabled",
2137 SI->getDebugLoc(), SI->getParent())
2138 << "memcpy idiom is disabled or unavailable";
2139 });
2140 goto CleanupAndExit;
2141 }
2142 } else {
2143 // Don't generate memmove if this function will be inlined. This is
2144 // because the caller will undergo this transformation after inlining.
2145 Function *Func = CurLoop->getHeader()->getParent();
2146 if (Func->hasFnAttribute(Attribute::AlwaysInline))
2147 goto CleanupAndExit;
2148
2149 // In case of a memmove, the call to memmove will be executed instead
2150 // of the loop, so we need to make sure that there is nothing else in
2151 // the loop than the load, store and instructions that these two depend
2152 // on.
2153 SmallVector<Instruction*,2> Insts;
2154 Insts.push_back(SI);
2155 Insts.push_back(LI);
2156 if (!coverLoop(CurLoop, Insts)) {
2157 ORE.emit([&]() {
2158 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtraLoopInstructions",
2159 SI->getDebugLoc(), SI->getParent())
2160 << "loop contains instructions beyond load/store pair";
2161 });
2162 goto CleanupAndExit;
2163 }
2164
2165 if (DisableMemmoveIdiom || !HasMemmove) {
2166 ORE.emit([&]() {
2167 return OptimizationRemarkMissed(DEBUG_TYPE, "MemmoveDisabled",
2168 SI->getDebugLoc(), SI->getParent())
2169 << "memmove idiom is disabled or unavailable";
2170 });
2171 goto CleanupAndExit;
2172 }
2173 bool IsNested = CurLoop->getParentLoop() != nullptr;
2174 if (IsNested && OnlyNonNestedMemmove) {
2175 ORE.emit([&]() {
2176 return OptimizationRemarkMissed(DEBUG_TYPE, "NestedLoop",
2177 SI->getDebugLoc(), SI->getParent())
2178 << "memmove skipped for nested loop";
2179 });
2180 goto CleanupAndExit;
2181 }
2182 }
2183
2184 // For a memcpy, we have to make sure that the input array is not being
2185 // mutated by the loop.
2186 LoadBasePtr = Expander.expandCodeFor(LoadEv->getStart(),
2187 Builder.getPtrTy(LI->getPointerAddressSpace()), ExpPt);
2188
2189 SmallPtrSet<Instruction*, 2> Ignore2;
2190 Ignore2.insert(SI);
2191 if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
2192 StoreSize, *AA, Ignore2))
2193 goto CleanupAndExit;
2194
2195 // Check the stride.
2196 bool StridePos = getSCEVStride(LoadEv) >= 0;
2197
2198 // Currently, the volatile memcpy only emulates traversing memory forward.
2199 if (!StridePos && DestVolatile)
2200 goto CleanupAndExit;
2201
2202 bool RuntimeCheck = (Overlap || DestVolatile);
2203
2204 BasicBlock *ExitB;
2205 if (RuntimeCheck) {
2206 // The runtime check needs a single exit block.
2207 SmallVector<BasicBlock*, 8> ExitBlocks;
2208 CurLoop->getUniqueExitBlocks(ExitBlocks);
2209 if (ExitBlocks.size() != 1)
2210 goto CleanupAndExit;
2211 ExitB = ExitBlocks[0];
2212 }
2213
2214 // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
2215 // pointer size if it isn't already.
2216 LLVMContext &Ctx = SI->getContext();
2217 BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
2218 DebugLoc DLoc = SI->getDebugLoc();
2219
2220 const SCEV *NumBytesS =
2221 SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);
2222 if (StoreSize != 1)
2223 NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
2225 Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtrTy, ExpPt);
2226 if (Instruction *In = dyn_cast<Instruction>(NumBytes))
2227 if (Value *Simp = simplifyInstruction(In, {*DL, TLI, DT}))
2228 NumBytes = Simp;
2229
2230 CallInst *NewCall;
2231
2232 if (RuntimeCheck) {
2233 unsigned Threshold = RuntimeMemSizeThreshold;
2234 if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes)) {
2235 uint64_t C = CI->getZExtValue();
2236 if (Threshold != 0 && C < Threshold)
2237 goto CleanupAndExit;
2239 goto CleanupAndExit;
2240 }
2241
2242 BasicBlock *Header = CurLoop->getHeader();
2243 Function *Func = Header->getParent();
2244 Loop *ParentL = LF->getLoopFor(Preheader);
2245 StringRef HeaderName = Header->getName();
2246
2247 // Create a new (empty) preheader, and update the PHI nodes in the
2248 // header to use the new preheader.
2249 BasicBlock *NewPreheader = BasicBlock::Create(Ctx, HeaderName+".rtli.ph",
2250 Func, Header);
2251 if (ParentL)
2252 ParentL->addBasicBlockToLoop(NewPreheader, *LF);
2253 IRBuilder<>(NewPreheader).CreateBr(Header);
2254 for (auto &In : *Header) {
2255 PHINode *PN = dyn_cast<PHINode>(&In);
2256 if (!PN)
2257 break;
2258 int bx = PN->getBasicBlockIndex(Preheader);
2259 if (bx >= 0)
2260 PN->setIncomingBlock(bx, NewPreheader);
2261 }
2262 DT->addNewBlock(NewPreheader, Preheader);
2263 DT->changeImmediateDominator(Header, NewPreheader);
2264
2265 // Check for safe conditions to execute memmove.
2266 // If stride is positive, copying things from higher to lower addresses
2267 // is equivalent to memmove. For negative stride, it's the other way
2268 // around. Copying forward in memory with positive stride may not be
2269 // same as memmove since we may be copying values that we just stored
2270 // in some previous iteration.
2271 Value *LA = Builder.CreatePtrToInt(LoadBasePtr, IntPtrTy);
2272 Value *SA = Builder.CreatePtrToInt(StoreBasePtr, IntPtrTy);
2273 Value *LowA = StridePos ? SA : LA;
2274 Value *HighA = StridePos ? LA : SA;
2275 Value *CmpA = Builder.CreateICmpULT(LowA, HighA);
2276 Value *Cond = CmpA;
2277
2278 // Check for distance between pointers. Since the case LowA < HighA
2279 // is checked for above, assume LowA >= HighA.
2280 Value *Dist = Builder.CreateSub(LowA, HighA);
2281 Value *CmpD = Builder.CreateICmpSLE(NumBytes, Dist);
2282 Value *CmpEither = Builder.CreateOr(Cond, CmpD);
2283 Cond = CmpEither;
2284
2285 if (Threshold != 0) {
2286 Type *Ty = NumBytes->getType();
2287 Value *Thr = ConstantInt::get(Ty, Threshold);
2288 Value *CmpB = Builder.CreateICmpULT(Thr, NumBytes);
2289 Value *CmpBoth = Builder.CreateAnd(Cond, CmpB);
2290 Cond = CmpBoth;
2291 }
2292 BasicBlock *MemmoveB = BasicBlock::Create(Ctx, Header->getName()+".rtli",
2293 Func, NewPreheader);
2294 if (ParentL)
2295 ParentL->addBasicBlockToLoop(MemmoveB, *LF);
2296 Instruction *OldT = Preheader->getTerminator();
2297 Builder.CreateCondBr(Cond, MemmoveB, NewPreheader);
2298 OldT->eraseFromParent();
2299 Preheader->setName(Preheader->getName()+".old");
2300 DT->addNewBlock(MemmoveB, Preheader);
2301 // Find the new immediate dominator of the exit block.
2302 BasicBlock *ExitD = Preheader;
2303 for (BasicBlock *PB : predecessors(ExitB)) {
2304 ExitD = DT->findNearestCommonDominator(ExitD, PB);
2305 if (!ExitD)
2306 break;
2307 }
2308 // If the prior immediate dominator of ExitB was dominated by the
2309 // old preheader, then the old preheader becomes the new immediate
2310 // dominator. Otherwise don't change anything (because the newly
2311 // added blocks are dominated by the old preheader).
2312 if (ExitD && DT->dominates(Preheader, ExitD)) {
2313 DomTreeNode *BN = DT->getNode(ExitB);
2314 DomTreeNode *DN = DT->getNode(ExitD);
2315 BN->setIDom(DN);
2316 }
2317
2318 // Add a call to memmove to the conditional block.
2319 IRBuilder<> CondBuilder(MemmoveB);
2320 CondBuilder.CreateBr(ExitB);
2321 CondBuilder.SetInsertPoint(MemmoveB->getTerminator());
2322
2323 if (DestVolatile) {
2324 Type *Int32Ty = Type::getInt32Ty(Ctx);
2325 Type *PtrTy = PointerType::get(Ctx, 0);
2326 Type *VoidTy = Type::getVoidTy(Ctx);
2327 Module *M = Func->getParent();
2328
2329 // FIXME: This should check if the call is supported
2330 StringRef HexagonVolatileMemcpyName =
2332 RTLIB::impl_hexagon_memcpy_forward_vp4cp4n2);
2333 FunctionCallee Fn = M->getOrInsertFunction(
2334 HexagonVolatileMemcpyName, VoidTy, PtrTy, PtrTy, Int32Ty);
2335
2336 const SCEV *OneS = SE->getConstant(Int32Ty, 1);
2337 const SCEV *BECount32 = SE->getTruncateOrZeroExtend(BECount, Int32Ty);
2338 const SCEV *NumWordsS = SE->getAddExpr(BECount32, OneS, SCEV::FlagNUW);
2339 Value *NumWords = Expander.expandCodeFor(NumWordsS, Int32Ty,
2340 MemmoveB->getTerminator());
2341 if (Instruction *In = dyn_cast<Instruction>(NumWords))
2342 if (Value *Simp = simplifyInstruction(In, {*DL, TLI, DT}))
2343 NumWords = Simp;
2344
2345 NewCall = CondBuilder.CreateCall(Fn,
2346 {StoreBasePtr, LoadBasePtr, NumWords});
2347 } else {
2348 NewCall = CondBuilder.CreateMemMove(
2349 StoreBasePtr, SI->getAlign(), LoadBasePtr, LI->getAlign(), NumBytes);
2350 }
2351 } else {
2352 NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlign(), LoadBasePtr,
2353 LI->getAlign(), NumBytes);
2354 // Okay, the memcpy has been formed. Zap the original store and
2355 // anything that feeds into it.
2357 }
2358
2359 NewCall->setDebugLoc(DLoc);
2360
2361 LLVM_DEBUG(dbgs() << " Formed " << (Overlap ? "memmove: " : "memcpy: ")
2362 << *NewCall << "\n"
2363 << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
2364 << " from store ptr=" << *StoreEv << " at: " << *SI
2365 << "\n");
2366
2367 if (Overlap) {
2368 ORE.emit([&]() {
2369 return OptimizationRemark(DEBUG_TYPE, "LoopToMemmove", DLoc,
2370 CurLoop->getHeader())
2371 << "converted loop to memmove";
2372 });
2373 } else {
2374 ORE.emit([&]() {
2375 return OptimizationRemark(DEBUG_TYPE, "LoopToMemcpy", DLoc,
2376 CurLoop->getHeader())
2377 << "converted loop to memcpy";
2378 });
2379 }
2380
2381 return true;
2382}
2383
2384// Check if the instructions in Insts, together with their dependencies
2385// cover the loop in the sense that the loop could be safely eliminated once
2386// the instructions in Insts are removed.
2387bool HexagonLoopIdiomRecognize::coverLoop(Loop *L,
2388 SmallVectorImpl<Instruction*> &Insts) const {
2389 SmallPtrSet<BasicBlock *, 8> LoopBlocks;
2390 LoopBlocks.insert_range(L->blocks());
2391
2392 SetVector<Instruction *> Worklist(llvm::from_range, Insts);
2393
2394 // Collect all instructions from the loop that the instructions in Insts
2395 // depend on (plus their dependencies, etc.). These instructions will
2396 // constitute the expression trees that feed those in Insts, but the trees
2397 // will be limited only to instructions contained in the loop.
2398 for (unsigned i = 0; i < Worklist.size(); ++i) {
2399 Instruction *In = Worklist[i];
2400 for (auto I = In->op_begin(), E = In->op_end(); I != E; ++I) {
2402 if (!OpI)
2403 continue;
2404 BasicBlock *PB = OpI->getParent();
2405 if (!LoopBlocks.count(PB))
2406 continue;
2407 Worklist.insert(OpI);
2408 }
2409 }
2410
2411 // Scan all instructions in the loop, if any of them have a user outside
2412 // of the loop, or outside of the expressions collected above, then either
2413 // the loop has a side-effect visible outside of it, or there are
2414 // instructions in it that are not involved in the original set Insts.
2415 for (auto *B : L->blocks()) {
2416 for (auto &In : *B) {
2418 continue;
2419 if (!Worklist.count(&In) && In.mayHaveSideEffects())
2420 return false;
2421 for (auto *K : In.users()) {
2423 if (!UseI)
2424 continue;
2425 BasicBlock *UseB = UseI->getParent();
2426 if (LF->getLoopFor(UseB) != L)
2427 return false;
2428 }
2429 }
2430 }
2431
2432 return true;
2433}
2434
2435/// runOnLoopBlock - Process the specified block, which lives in a counted loop
2436/// with the specified backedge count. This block is known to be in the current
2437/// loop and not in any subloops.
2438bool HexagonLoopIdiomRecognize::runOnLoopBlock(Loop *CurLoop, BasicBlock *BB,
2439 const SCEV *BECount, SmallVectorImpl<BasicBlock*> &ExitBlocks) {
2440 // We can only promote stores in this block if they are unconditionally
2441 // executed in the loop. For a block to be unconditionally executed, it has
2442 // to dominate all the exit blocks of the loop. Verify this now.
2443 auto DominatedByBB = [this,BB] (BasicBlock *EB) -> bool {
2444 return DT->dominates(BB, EB);
2445 };
2446 if (!all_of(ExitBlocks, DominatedByBB))
2447 return false;
2448
2449 bool MadeChange = false;
2450 // Look for store instructions, which may be optimized to memset/memcpy.
2451 SmallVector<StoreInst*,8> Stores;
2452 collectStores(CurLoop, BB, Stores);
2453
2454 // Optimize the store into a memcpy, if it feeds an similarly strided load.
2455 for (auto &SI : Stores)
2456 MadeChange |= processCopyingStore(CurLoop, SI, BECount);
2457
2458 return MadeChange;
2459}
2460
2461bool HexagonLoopIdiomRecognize::runOnCountableLoop(Loop *L) {
2462 PolynomialMultiplyRecognize PMR(L, *DL, *DT, *TLI, *SE);
2463 if (PMR.recognize()) {
2464 ORE.emit([&]() {
2465 return OptimizationRemark(DEBUG_TYPE, "PolynomialMultiply",
2466 L->getStartLoc(), L->getHeader())
2467 << "recognized polynomial multiply idiom";
2468 });
2469 return true;
2470 }
2471
2472 if (!HasMemcpy && !HasMemmove)
2473 return false;
2474
2475 const SCEV *BECount = SE->getBackedgeTakenCount(L);
2476 assert(!isa<SCEVCouldNotCompute>(BECount) &&
2477 "runOnCountableLoop() called on a loop without a predictable"
2478 "backedge-taken count");
2479
2480 SmallVector<BasicBlock *, 8> ExitBlocks;
2481 L->getUniqueExitBlocks(ExitBlocks);
2482
2483 bool Changed = false;
2484
2485 // Scan all the blocks in the loop that are not in subloops.
2486 for (auto *BB : L->getBlocks()) {
2487 // Ignore blocks in subloops.
2488 if (LF->getLoopFor(BB) != L)
2489 continue;
2490 Changed |= runOnLoopBlock(L, BB, BECount, ExitBlocks);
2491 }
2492
2493 return Changed;
2494}
2495
2496bool HexagonLoopIdiomRecognize::run(Loop *L) {
2497 const Module &M = *L->getHeader()->getParent()->getParent();
2498 if (M.getTargetTriple().getArch() != Triple::hexagon)
2499 return false;
2500
2501 // If the loop could not be converted to canonical form, it must have an
2502 // indirectbr in it, just give up.
2503 if (!L->getLoopPreheader()) {
2504 ORE.emit([&]() {
2505 return OptimizationRemarkMissed(DEBUG_TYPE, "NoPreheader",
2506 L->getStartLoc(), L->getHeader())
2507 << "loop not in canonical form (no preheader)";
2508 });
2509 return false;
2510 }
2511
2512 // Disable loop idiom recognition if the function's name is a common idiom.
2513 StringRef Name = L->getHeader()->getParent()->getName();
2514 if (Name == "memset" || Name == "memcpy" || Name == "memmove")
2515 return false;
2516
2517 DL = &L->getHeader()->getDataLayout();
2518
2519 HasMemcpy = TLI->has(LibFunc_memcpy);
2520 HasMemmove = TLI->has(LibFunc_memmove);
2521
2522 if (SE->hasLoopInvariantBackedgeTakenCount(L))
2523 return runOnCountableLoop(L);
2524
2525 ORE.emit([&]() {
2526 return OptimizationRemarkMissed(DEBUG_TYPE, "NonCountableLoop",
2527 L->getStartLoc(), L->getHeader())
2528 << "backedge-taken count is not loop-invariant";
2529 });
2530 return false;
2531}
2532
2533bool HexagonLoopIdiomRecognizeLegacyPass::runOnLoop(Loop *L,
2534 LPPassManager &LPM) {
2535 if (skipLoop(L))
2536 return false;
2537
2538 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2539 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2540 auto *LF = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2541 auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
2542 *L->getHeader()->getParent());
2543 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2544 auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2545 return HexagonLoopIdiomRecognize(AA, DT, LF, TLI, SE, ORE).run(L);
2546}
2547
2549 return new HexagonLoopIdiomRecognizeLegacyPass();
2550}
2551
2555 LPMUpdater &U) {
2556 OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
2557 return HexagonLoopIdiomRecognize(&AR.AA, &AR.DT, &AR.LI, &AR.TLI, &AR.SE, ORE)
2558 .run(&L)
2561}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static void cleanup(BlockFrequencyInfoImplBase &BFI)
Clear all memory not needed downstream.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_ATTRIBUTE_USED
Definition Compiler.h:236
This file contains the declarations for the subclasses of Constant, which represent the different fla...
DXIL Resource Access
This file defines the DenseMap class.
#define DEBUG_TYPE
hexagon bit simplify
static cl::opt< unsigned > SimplifyLimit("hlir-simplify-limit", cl::init(10000), cl::Hidden, cl::desc("Maximum number of simplification steps in HLIR"))
static cl::opt< bool > DisableMemcpyIdiom("disable-memcpy-idiom", cl::Hidden, cl::init(false), cl::desc("Disable generation of memcpy in loop idiom recognition"))
static void replaceAllUsesOfWithIn(Value *I, Value *J, BasicBlock *BB)
static cl::opt< unsigned > RuntimeMemSizeThreshold("runtime-mem-idiom-threshold", cl::Hidden, cl::init(0), cl::desc("Threshold (in bytes) for the runtime " "check guarding the memmove."))
static cl::opt< bool > HexagonVolatileMemcpy("disable-hexagon-volatile-memcpy", cl::Hidden, cl::init(false), cl::desc("Enable Hexagon-specific memcpy for volatile destination."))
static cl::opt< bool > DisableMemmoveIdiom("disable-memmove-idiom", cl::Hidden, cl::init(false), cl::desc("Disable generation of memmove in loop idiom recognition"))
static cl::opt< unsigned > CompileTimeMemSizeThreshold("compile-time-mem-idiom-threshold", cl::Hidden, cl::init(64), cl::desc("Threshold (in bytes) to perform the transformation, if the " "runtime loop count (mem transfer size) is known at compile-time."))
static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, const SCEV *BECount, unsigned StoreSize, AliasAnalysis &AA, SmallPtrSetImpl< Instruction * > &Ignored)
mayLoopAccessLocation - Return true if the specified loop might access the specified pointer location...
static bool hasZeroSignBit(const Value *V)
static cl::opt< bool > OnlyNonNestedMemmove("only-nonnested-memmove-idiom", cl::Hidden, cl::init(true), cl::desc("Only enable generating memmove in non-nested loops"))
Module.h This file contains the declarations for the Module class.
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
static Value * getOpcode(Value &V, Type &Ty, InstrumentationConfig &IConf, InstrumentorIRBuilderTy &IIRB)
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
Move duplicate certain instructions close to their use
Definition Localizer.cpp:33
This header provides classes for managing per-loop analyses.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define H(x, y, z)
Definition MD5.cpp:56
Machine Check Debug Module
This file provides utility analysis objects describing memory locations.
#define T
uint64_t IntrinsicInst * II
#define P(N)
PassBuilder PB(Machine, PassOpts->PTO, std::nullopt, &PIC)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
const SmallVectorImpl< MachineOperand > & Cond
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:119
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, const llvm::StringTable &StandardNames, VectorLibrary VecLib)
Initialize the set of available library functions based on the specified target triple.
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
LLVM_ABI AnalysisUsage & addRequiredID(const void *ID)
Definition Pass.cpp:284
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:474
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:461
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:530
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
BinaryOps getOpcode() const
Definition InstrTypes.h:409
@ ICMP_NE
not equal
Definition InstrTypes.h:762
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
void setIDom(DomTreeNodeBase *NewIDom)
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:310
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:155
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:350
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getPointerOperand()
bool isSimple() const
Align getAlign() const
Return the alignment of the access that is being performed.
static LocationSize precise(uint64_t Value)
static constexpr LocationSize afterPointer()
Any location after the base pointer (but still within the underlying object).
BlockT * getHeader() const
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
void getUniqueExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all unique successor blocks of this loop.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
The legacy pass manager's analysis pass to compute loop information.
Definition LoopInfo.h:612
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Representation for a specific memory location.
OptimizationRemarkEmitter legacy analysis pass.
The optimization diagnostic interface.
void setIncomingBlock(unsigned i, BasicBlock *BB)
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
This class represents a constant integer value.
SCEVUse getOperand(unsigned i) const
This class represents an analyzed expression in the program.
static constexpr auto FlagNUW
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const Value * getFalseValue() const
const Value * getCondition() const
const Value * getTrueValue() const
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
void push_back(const T &Elt)
Provides information about what library functions are available for the current target.
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
op_range operands()
Definition User.h:267
Value * getOperand(unsigned i) const
Definition User.h:207
unsigned getNumOperands() const
Definition User.h:229
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:393
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
const ParentTy * getParent() const
Definition ilist_node.h:34
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
CmpClass_match< LHS, RHS, ICmpInst, true > m_c_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
bool empty() const
Definition BasicBlock.h:101
Context & getContext() const
Definition BasicBlock.h:99
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1764
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:535
auto pred_end(const MachineBasicBlock *BB)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:328
constexpr from_range_t from_range
Pass * createHexagonLoopIdiomPass()
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
LLVM_ABI char & LCSSAID
Definition LCSSA.cpp:526
LLVM_ABI char & LoopSimplifyID
LLVM_ABI Value * simplifyInstruction(Instruction *I, const SimplifyQuery &Q)
See if we can compute a simplified version of this instruction.
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:94
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
CycleInfo::CycleT Cycle
Definition CycleInfo.h:26
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
bool isModOrRefSet(const ModRefInfo MRI)
Definition ModRef.h:43
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition ModRef.h:28
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
void replace(R &&Range, const T &OldValue, const T &NewValue)
Provide wrappers to std::replace which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1909
@ Xor
Bitwise or logical XOR of integers.
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
DWARFExpression::Operation Op
PredIterator< BasicBlock, Value::user_iterator > pred_iterator
Definition CFG.h:93
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
auto predecessors(const MachineBasicBlock *BB)
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:368
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2145
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:863
#define N
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
static StringRef getLibcallImplName(RTLIB::LibcallImpl CallImpl)
Get the libcall routine name for the specified libcall implementation.