LLVM 23.0.0git
MemProfContextDisambiguation.cpp
Go to the documentation of this file.
1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
22
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/Module.h"
40#include "llvm/Pass.h"
44#include "llvm/Support/SHA1.h"
46#include "llvm/Transforms/IPO.h"
50#include <deque>
51#include <sstream>
52#include <unordered_map>
53#include <vector>
54using namespace llvm;
55using namespace llvm::memprof;
56
57#define DEBUG_TYPE "memprof-context-disambiguation"
58
59STATISTIC(FunctionClonesAnalysis,
60 "Number of function clones created during whole program analysis");
61STATISTIC(FunctionClonesThinBackend,
62 "Number of function clones created during ThinLTO backend");
63STATISTIC(FunctionsClonedThinBackend,
64 "Number of functions that had clones created during ThinLTO backend");
66 FunctionCloneDuplicatesThinBackend,
67 "Number of function clone duplicates detected during ThinLTO backend");
68STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
69 "cloned) during whole program analysis");
70STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
71 "during whole program analysis");
72STATISTIC(AllocTypeNotColdThinBackend,
73 "Number of not cold static allocations (possibly cloned) during "
74 "ThinLTO backend");
75STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
76 "(possibly cloned) during ThinLTO backend");
77STATISTIC(OrigAllocsThinBackend,
78 "Number of original (not cloned) allocations with memprof profiles "
79 "during ThinLTO backend");
81 AllocVersionsThinBackend,
82 "Number of allocation versions (including clones) during ThinLTO backend");
83STATISTIC(MaxAllocVersionsThinBackend,
84 "Maximum number of allocation versions created for an original "
85 "allocation during ThinLTO backend");
86STATISTIC(UnclonableAllocsThinBackend,
87 "Number of unclonable ambigous allocations during ThinLTO backend");
88STATISTIC(RemovedEdgesWithMismatchedCallees,
89 "Number of edges removed due to mismatched callees (profiled vs IR)");
90STATISTIC(FoundProfiledCalleeCount,
91 "Number of profiled callees found via tail calls");
92STATISTIC(FoundProfiledCalleeDepth,
93 "Aggregate depth of profiled callees found via tail calls");
94STATISTIC(FoundProfiledCalleeMaxDepth,
95 "Maximum depth of profiled callees found via tail calls");
96STATISTIC(FoundProfiledCalleeNonUniquelyCount,
97 "Number of profiled callees found via multiple tail call chains");
98STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
99STATISTIC(NewMergedNodes, "Number of new nodes created during merging");
100STATISTIC(NonNewMergedNodes, "Number of non new nodes used during merging");
101STATISTIC(MissingAllocForContextId,
102 "Number of missing alloc nodes for context ids");
103STATISTIC(SkippedCallsCloning,
104 "Number of calls skipped during cloning due to unexpected operand");
105STATISTIC(MismatchedCloneAssignments,
106 "Number of callsites assigned to call multiple non-matching clones");
107STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes");
108STATISTIC(TotalMergeIters, "Number of merge iterations for nodes");
109STATISTIC(MaxMergeIters, "Max merge iterations for nodes");
110STATISTIC(NumImportantContextIds, "Number of important context ids");
111STATISTIC(NumFixupEdgeIdsInserted, "Number of fixup edge ids inserted");
112STATISTIC(NumFixupEdgesAdded, "Number of fixup edges added");
113STATISTIC(NumFixedContexts, "Number of contexts with fixed edges");
114STATISTIC(AliaseesPrevailingInDiffModuleFromAlias,
115 "Number of aliasees prevailing in a different module than its alias");
116
118 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
119 cl::value_desc("filename"),
120 cl::desc("Specify the path prefix of the MemProf dot files."));
121
122static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
124 cl::desc("Export graph to dot files."));
125
126// TODO: Remove this option once new handling is validated more widely.
128 "memprof-merge-iteration", cl::init(true), cl::Hidden,
129 cl::desc("Iteratively apply merging on a node to catch new callers"));
130
131// How much of the graph to export to dot.
133 All, // The full CCG graph.
134 Alloc, // Only contexts for the specified allocation.
135 Context, // Only the specified context.
136};
137
139 "memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
142 clEnumValN(DotScope::All, "all", "Export full callsite graph"),
144 "Export only nodes with contexts feeding given "
145 "-memprof-dot-alloc-id"),
146 clEnumValN(DotScope::Context, "context",
147 "Export only nodes with given -memprof-dot-context-id")));
148
150 AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden,
151 cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
152 "or to highlight if -memprof-dot-scope=all"));
153
155 "memprof-dot-context-id", cl::init(0), cl::Hidden,
156 cl::desc("Id of context to export if -memprof-dot-scope=context or to "
157 "highlight otherwise"));
158
159static cl::opt<bool>
160 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
161 cl::desc("Dump CallingContextGraph to stdout after each stage."));
162
163static cl::opt<bool>
164 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
165 cl::desc("Perform verification checks on CallingContextGraph."));
166
167static cl::opt<bool>
168 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
169 cl::desc("Perform frequent verification checks on nodes."));
170
172 "memprof-import-summary",
173 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
174 cl::Hidden);
175
177 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
179 cl::desc("Max depth to recursively search for missing "
180 "frames through tail calls."));
181
182// Optionally enable cloning of callsites involved with recursive cycles
184 "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
185 cl::desc("Allow cloning of callsites involved in recursive cycles"));
186
188 "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden,
189 cl::desc("Allow cloning of contexts through recursive cycles"));
190
191// Generally this is needed for correct assignment of allocation clones to
192// function clones, however, allow it to be disabled for debugging while the
193// functionality is new and being tested more widely.
194static cl::opt<bool>
195 MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden,
196 cl::desc("Merge clones before assigning functions"));
197
198// When disabled, try to detect and prevent cloning of recursive contexts.
199// This is only necessary until we support cloning through recursive cycles.
200// Leave on by default for now, as disabling requires a little bit of compile
201// time overhead and doesn't affect correctness, it will just inflate the cold
202// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
204 "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
205 cl::desc("Allow cloning of contexts having recursive cycles"));
206
207// Set the minimum absolute count threshold for allowing inlining of indirect
208// calls promoted during cloning.
210 "memprof-icp-noinline-threshold", cl::init(2), cl::Hidden,
211 cl::desc("Minimum absolute count for promoted target to be inlinable"));
212
213namespace llvm {
215 "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
216 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
217
218// Indicate we are linking with an allocator that supports hot/cold operator
219// new interfaces.
221 "supports-hot-cold-new", cl::init(false), cl::Hidden,
222 cl::desc("Linking with hot/cold operator new interfaces"));
223
225 "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
226 cl::desc(
227 "Require target function definition when promoting indirect calls"));
228
231
233 "memprof-top-n-important", cl::init(10), cl::Hidden,
234 cl::desc("Number of largest cold contexts to consider important"));
235
237 "memprof-fixup-important", cl::init(true), cl::Hidden,
238 cl::desc("Enables edge fixup for important contexts"));
239
241
242} // namespace llvm
243
244namespace {
245
246/// CRTP base for graphs built from either IR or ThinLTO summary index.
247///
248/// The graph represents the call contexts in all memprof metadata on allocation
249/// calls, with nodes for the allocations themselves, as well as for the calls
250/// in each context. The graph is initially built from the allocation memprof
251/// metadata (or summary) MIBs. It is then updated to match calls with callsite
252/// metadata onto the nodes, updating it to reflect any inlining performed on
253/// those calls.
254///
255/// Each MIB (representing an allocation's call context with allocation
256/// behavior) is assigned a unique context id during the graph build. The edges
257/// and nodes in the graph are decorated with the context ids they carry. This
258/// is used to correctly update the graph when cloning is performed so that we
259/// can uniquify the context for a single (possibly cloned) allocation.
260template <typename DerivedCCG, typename FuncTy, typename CallTy>
261class CallsiteContextGraph {
262public:
263 CallsiteContextGraph() = default;
264 CallsiteContextGraph(const CallsiteContextGraph &) = default;
265 CallsiteContextGraph(CallsiteContextGraph &&) = default;
266
267 /// Main entry point to perform analysis and transformations on graph.
268 bool process(function_ref<void(StringRef, StringRef, const Twine &)>
269 EmitRemark = nullptr,
270 bool AllowExtraAnalysis = false);
271
272 /// Perform cloning on the graph necessary to uniquely identify the allocation
273 /// behavior of an allocation based on its context.
274 void identifyClones();
275
276 /// Assign callsite clones to functions, cloning functions as needed to
277 /// accommodate the combinations of their callsite clones reached by callers.
278 /// For regular LTO this clones functions and callsites in the IR, but for
279 /// ThinLTO the cloning decisions are noted in the summaries and later applied
280 /// in applyImport.
281 bool assignFunctions();
282
283 void dump() const;
284 void print(raw_ostream &OS) const;
285 void printTotalSizes(raw_ostream &OS,
286 function_ref<void(StringRef, StringRef, const Twine &)>
287 EmitRemark = nullptr) const;
288
290 const CallsiteContextGraph &CCG) {
291 CCG.print(OS);
292 return OS;
293 }
294
295 friend struct GraphTraits<
296 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
297 friend struct DOTGraphTraits<
298 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
299
300 void exportToDot(std::string Label) const;
301
302 /// Represents a function clone via FuncTy pointer and clone number pair.
303 struct FuncInfo final
304 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
305 using Base = std::pair<FuncTy *, unsigned>;
306 FuncInfo(const Base &B) : Base(B) {}
307 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
308 explicit operator bool() const { return this->first != nullptr; }
309 FuncTy *func() const { return this->first; }
310 unsigned cloneNo() const { return this->second; }
311 };
312
313 /// Represents a callsite clone via CallTy and clone number pair.
314 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
315 using Base = std::pair<CallTy, unsigned>;
316 CallInfo(const Base &B) : Base(B) {}
317 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
318 : Base(Call, CloneNo) {}
319 explicit operator bool() const { return (bool)this->first; }
320 CallTy call() const { return this->first; }
321 unsigned cloneNo() const { return this->second; }
322 void setCloneNo(unsigned N) { this->second = N; }
323 void print(raw_ostream &OS) const {
324 if (!operator bool()) {
325 assert(!cloneNo());
326 OS << "null Call";
327 return;
328 }
329 call()->print(OS);
330 OS << "\t(clone " << cloneNo() << ")";
331 }
332 void dump() const {
333 print(dbgs());
334 dbgs() << "\n";
335 }
336 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
337 Call.print(OS);
338 return OS;
339 }
340 };
341
342 struct ContextEdge;
343
344 /// Node in the Callsite Context Graph
345 struct ContextNode {
346 // Assigned to nodes as they are created, useful for debugging.
347 unsigned NodeId = 0;
348
349 // Keep this for now since in the IR case where we have an Instruction* it
350 // is not as immediately discoverable. Used for printing richer information
351 // when dumping graph.
352 bool IsAllocation;
353
354 // Keeps track of when the Call was reset to null because there was
355 // recursion.
356 bool Recursive = false;
357
358 // This will be formed by ORing together the AllocationType enum values
359 // for contexts including this node.
360 uint8_t AllocTypes = 0;
361
362 // The corresponding allocation or interior call. This is the primary call
363 // for which we have created this node.
364 CallInfo Call;
365
366 // List of other calls that can be treated the same as the primary call
367 // through cloning. I.e. located in the same function and have the same
368 // (possibly pruned) stack ids. They will be updated the same way as the
369 // primary call when assigning to function clones.
370 SmallVector<CallInfo, 0> MatchingCalls;
371
372 // For alloc nodes this is a unique id assigned when constructed, and for
373 // callsite stack nodes it is the original stack id when the node is
374 // constructed from the memprof MIB metadata on the alloc nodes. Note that
375 // this is only used when matching callsite metadata onto the stack nodes
376 // created when processing the allocation memprof MIBs, and for labeling
377 // nodes in the dot graph. Therefore we don't bother to assign a value for
378 // clones.
379 uint64_t OrigStackOrAllocId = 0;
380
381 // Edges to all callees in the profiled call stacks.
382 // TODO: Should this be a map (from Callee node) for more efficient lookup?
383 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
384
385 // Edges to all callers in the profiled call stacks.
386 // TODO: Should this be a map (from Caller node) for more efficient lookup?
387 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
388
389 // Returns true if we need to look at the callee edges for determining the
390 // node context ids and allocation type.
391 bool useCallerEdgesForContextInfo() const {
392 // Typically if the callee edges are empty either the caller edges are
393 // also empty, or this is an allocation (leaf node). However, if we are
394 // allowing recursive callsites and contexts this will be violated for
395 // incompletely cloned recursive cycles.
396 assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
398 // When cloning for a recursive context, during cloning we might be in the
399 // midst of cloning for a recurrence and have moved context ids off of a
400 // caller edge onto the clone but not yet off of the incoming caller
401 // (back) edge. If we don't look at those we miss the fact that this node
402 // still has context ids of interest.
403 return IsAllocation || CloneRecursiveContexts;
404 }
405
406 // Compute the context ids for this node from the union of its edge context
407 // ids.
408 DenseSet<uint32_t> getContextIds() const {
409 unsigned Count = 0;
410 // Compute the number of ids for reserve below. In general we only need to
411 // look at one set of edges, typically the callee edges, since other than
412 // allocations and in some cases during recursion cloning, all the context
413 // ids on the callers should also flow out via callee edges.
414 for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
415 Count += Edge->getContextIds().size();
416 DenseSet<uint32_t> ContextIds;
417 ContextIds.reserve(Count);
419 CalleeEdges, useCallerEdgesForContextInfo()
420 ? CallerEdges
421 : std::vector<std::shared_ptr<ContextEdge>>());
422 for (const auto &Edge : Edges)
423 ContextIds.insert_range(Edge->getContextIds());
424 return ContextIds;
425 }
426
427 // Compute the allocation type for this node from the OR of its edge
428 // allocation types.
429 uint8_t computeAllocType() const {
430 uint8_t BothTypes =
434 CalleeEdges, useCallerEdgesForContextInfo()
435 ? CallerEdges
436 : std::vector<std::shared_ptr<ContextEdge>>());
437 for (const auto &Edge : Edges) {
438 AllocType |= Edge->AllocTypes;
439 // Bail early if alloc type reached both, no further refinement.
440 if (AllocType == BothTypes)
441 return AllocType;
442 }
443 return AllocType;
444 }
445
446 // The context ids set for this node is empty if its edge context ids are
447 // also all empty.
448 bool emptyContextIds() const {
450 CalleeEdges, useCallerEdgesForContextInfo()
451 ? CallerEdges
452 : std::vector<std::shared_ptr<ContextEdge>>());
453 for (const auto &Edge : Edges) {
454 if (!Edge->getContextIds().empty())
455 return false;
456 }
457 return true;
458 }
459
460 // List of clones of this ContextNode, initially empty.
461 std::vector<ContextNode *> Clones;
462
463 // If a clone, points to the original uncloned node.
464 ContextNode *CloneOf = nullptr;
465
466 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
467
468 ContextNode(bool IsAllocation, CallInfo C)
469 : IsAllocation(IsAllocation), Call(C) {}
470
471 void addClone(ContextNode *Clone) {
472 if (CloneOf) {
473 CloneOf->Clones.push_back(Clone);
474 Clone->CloneOf = CloneOf;
475 } else {
476 Clones.push_back(Clone);
477 assert(!Clone->CloneOf);
478 Clone->CloneOf = this;
479 }
480 }
481
482 ContextNode *getOrigNode() {
483 if (!CloneOf)
484 return this;
485 return CloneOf;
486 }
487
488 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
489 unsigned int ContextId);
490
491 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
492 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
493 void eraseCalleeEdge(const ContextEdge *Edge);
494 void eraseCallerEdge(const ContextEdge *Edge);
495
496 void setCall(CallInfo C) { Call = std::move(C); }
497
498 bool hasCall() const { return (bool)Call.call(); }
499
500 void printCall(raw_ostream &OS) const { Call.print(OS); }
501
502 // True if this node was effectively removed from the graph, in which case
503 // it should have an allocation type of None and empty context ids.
504 bool isRemoved() const {
505 // Typically if the callee edges are empty either the caller edges are
506 // also empty, or this is an allocation (leaf node). However, if we are
507 // allowing recursive callsites and contexts this will be violated for
508 // incompletely cloned recursive cycles.
510 (AllocTypes == (uint8_t)AllocationType::None) ==
511 emptyContextIds());
512 return AllocTypes == (uint8_t)AllocationType::None;
513 }
514
515 void dump() const;
516 void print(raw_ostream &OS) const;
517
518 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
519 Node.print(OS);
520 return OS;
521 }
522 };
523
524 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
525 /// callee.
526 struct ContextEdge {
527 ContextNode *Callee;
528 ContextNode *Caller;
529
530 // This will be formed by ORing together the AllocationType enum values
531 // for contexts including this edge.
532 uint8_t AllocTypes = 0;
533
534 // Set just before initiating cloning when cloning of recursive contexts is
535 // enabled. Used to defer cloning of backedges until we have done cloning of
536 // the callee node for non-backedge caller edges. This exposes cloning
537 // opportunities through the backedge of the cycle.
538 // TODO: Note that this is not updated during cloning, and it is unclear
539 // whether that would be needed.
540 bool IsBackedge = false;
541
542 // The set of IDs for contexts including this edge.
543 DenseSet<uint32_t> ContextIds;
544
545 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
546 DenseSet<uint32_t> ContextIds)
547 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
548 ContextIds(std::move(ContextIds)) {}
549
550 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
551
552 // Helper to clear the fields of this edge when we are removing it from the
553 // graph.
554 inline void clear() {
555 ContextIds.clear();
556 AllocTypes = (uint8_t)AllocationType::None;
557 Caller = nullptr;
558 Callee = nullptr;
559 }
560
561 // Check if edge was removed from the graph. This is useful while iterating
562 // over a copy of edge lists when performing operations that mutate the
563 // graph in ways that might remove one of the edges.
564 inline bool isRemoved() const {
565 if (Callee || Caller)
566 return false;
567 // Any edges that have been removed from the graph but are still in a
568 // shared_ptr somewhere should have all fields null'ed out by clear()
569 // above.
570 assert(AllocTypes == (uint8_t)AllocationType::None);
571 assert(ContextIds.empty());
572 return true;
573 }
574
575 void dump() const;
576 void print(raw_ostream &OS) const;
577
578 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
579 Edge.print(OS);
580 return OS;
581 }
582 };
583
584 /// Helpers to remove edges that have allocation type None (due to not
585 /// carrying any context ids) after transformations.
586 void removeNoneTypeCalleeEdges(ContextNode *Node);
587 void removeNoneTypeCallerEdges(ContextNode *Node);
588 void
589 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
591
592protected:
593 /// Get a list of nodes corresponding to the stack ids in the given callsite
594 /// context.
595 template <class NodeT, class IteratorT>
596 std::vector<uint64_t>
597 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
598
599 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
600 /// metadata (or summary).
601 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
602
603 /// Adds nodes for the given MIB stack ids.
604 template <class NodeT, class IteratorT>
605 void addStackNodesForMIB(
606 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
608 ArrayRef<ContextTotalSize> ContextSizeInfo,
609 std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold);
610
611 /// Matches all callsite metadata (or summary) to the nodes created for
612 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
613 /// inlining performed on those callsite instructions.
614 void updateStackNodes();
615
616 /// Optionally fixup edges for the N largest cold contexts to better enable
617 /// cloning. This is particularly helpful if the context includes recursion
618 /// as well as inlining, resulting in a single stack node for multiple stack
619 /// ids in the context. With recursion it is particularly difficult to get the
620 /// edge updates correct as in the general case we have lost the original
621 /// stack id ordering for the context. Do more expensive fixup for the largest
622 /// contexts, controlled by MemProfTopNImportant and MemProfFixupImportant.
623 void fixupImportantContexts();
624
625 /// Update graph to conservatively handle any callsite stack nodes that target
626 /// multiple different callee target functions.
627 void handleCallsitesWithMultipleTargets();
628
629 /// Mark backedges via the standard DFS based backedge algorithm.
630 void markBackedges();
631
632 /// Merge clones generated during cloning for different allocations but that
633 /// are called by the same caller node, to ensure proper function assignment.
634 void mergeClones();
635
636 // Try to partition calls on the given node (already placed into the AllCalls
637 // array) by callee function, creating new copies of Node as needed to hold
638 // calls with different callees, and moving the callee edges appropriately.
639 // Returns true if partitioning was successful.
640 bool partitionCallsByCallee(
641 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
642 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
643
644 /// Save lists of calls with MemProf metadata in each function, for faster
645 /// iteration.
646 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
647
648 /// Map from callsite node to the enclosing caller function.
649 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
650
651 // When exporting to dot, and an allocation id is specified, contains the
652 // context ids on that allocation.
653 DenseSet<uint32_t> DotAllocContextIds;
654
655private:
656 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
657
658 // Structure to keep track of information for each call as we are matching
659 // non-allocation callsites onto context nodes created from the allocation
660 // call metadata / summary contexts.
661 struct CallContextInfo {
662 // The callsite we're trying to match.
663 CallTy Call;
664 // The callsites stack ids that have a context node in the graph.
665 std::vector<uint64_t> StackIds;
666 // The function containing this callsite.
667 const FuncTy *Func;
668 // Initially empty, if needed this will be updated to contain the context
669 // ids for use in a new context node created for this callsite.
670 DenseSet<uint32_t> ContextIds;
671 };
672
673 /// Helper to remove edge from graph, updating edge iterator if it is provided
674 /// (in which case CalleeIter indicates which edge list is being iterated).
675 /// This will also perform the necessary clearing of the ContextEdge members
676 /// to enable later checking if the edge has been removed (since we may have
677 /// other copies of the shared_ptr in existence, and in fact rely on this to
678 /// enable removal while iterating over a copy of a node's edge list).
679 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
680 bool CalleeIter = true);
681
682 /// Assigns the given Node to calls at or inlined into the location with
683 /// the Node's stack id, after post order traversing and processing its
684 /// caller nodes. Uses the call information recorded in the given
685 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
686 /// as needed. Called by updateStackNodes which sets up the given
687 /// StackIdToMatchingCalls map.
688 void assignStackNodesPostOrder(
689 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
690 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
691 DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
692 const DenseSet<uint32_t> &ImportantContextIds);
693
694 /// Duplicates the given set of context ids, updating the provided
695 /// map from each original id with the newly generated context ids,
696 /// and returning the new duplicated id set.
697 DenseSet<uint32_t> duplicateContextIds(
698 const DenseSet<uint32_t> &StackSequenceContextIds,
699 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
700
701 /// Propagates all duplicated context ids across the graph.
702 void propagateDuplicateContextIds(
703 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
704
705 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
706 /// else to its callers. Also updates OrigNode's edges to remove any context
707 /// ids moved to the newly created edge.
708 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
709 bool TowardsCallee,
710 DenseSet<uint32_t> RemainingContextIds);
711
712 /// Get the stack id corresponding to the given Id or Index (for IR this will
713 /// return itself, for a summary index this will return the id recorded in the
714 /// index for that stack id index value).
715 uint64_t getStackId(uint64_t IdOrIndex) const {
716 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
717 }
718
719 /// Returns true if the given call targets the callee of the given edge, or if
720 /// we were able to identify the call chain through intermediate tail calls.
721 /// In the latter case new context nodes are added to the graph for the
722 /// identified tail calls, and their synthesized nodes are added to
723 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
724 /// the updated edges and to prepare it for an increment in the caller.
725 bool
726 calleesMatch(CallTy Call, EdgeIter &EI,
727 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
728
729 // Return the callee function of the given call, or nullptr if it can't be
730 // determined
731 const FuncTy *getCalleeFunc(CallTy Call) {
732 return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
733 }
734
735 /// Returns true if the given call targets the given function, or if we were
736 /// able to identify the call chain through intermediate tail calls (in which
737 /// case FoundCalleeChain will be populated).
738 bool calleeMatchesFunc(
739 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
740 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
741 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
742 Call, Func, CallerFunc, FoundCalleeChain);
743 }
744
745 /// Returns true if both call instructions have the same callee.
746 bool sameCallee(CallTy Call1, CallTy Call2) {
747 return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
748 }
749
750 /// Get a list of nodes corresponding to the stack ids in the given
751 /// callsite's context.
752 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
753 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
754 Call);
755 }
756
757 /// Get the last stack id in the context for callsite.
758 uint64_t getLastStackId(CallTy Call) {
759 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
760 }
761
762 /// Update the allocation call to record type of allocated memory.
763 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
764 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
765 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
766 }
767
768 /// Get the AllocationType assigned to the given allocation instruction clone.
769 AllocationType getAllocationCallType(const CallInfo &Call) const {
770 return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
771 }
772
773 /// Update non-allocation call to invoke (possibly cloned) function
774 /// CalleeFunc.
775 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
776 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
777 }
778
779 /// Clone the given function for the given callsite, recording mapping of all
780 /// of the functions tracked calls to their new versions in the CallMap.
781 /// Assigns new clones to clone number CloneNo.
782 FuncInfo cloneFunctionForCallsite(
783 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
784 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
785 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
786 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
787 }
788
789 /// Gets a label to use in the dot graph for the given call clone in the given
790 /// function.
791 std::string getLabel(const FuncTy *Func, const CallTy Call,
792 unsigned CloneNo) const {
793 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
794 }
795
796 // Create and return a new ContextNode.
797 ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
798 CallInfo C = CallInfo()) {
799 NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
800 auto *NewNode = NodeOwner.back().get();
801 if (F)
802 NodeToCallingFunc[NewNode] = F;
803 NewNode->NodeId = NodeOwner.size();
804 return NewNode;
805 }
806
  /// Helpers to find the node corresponding to the given call or stackid.
  /// Each returns nullptr when no corresponding node exists.
  ContextNode *getNodeForInst(const CallInfo &C);
  ContextNode *getNodeForAlloc(const CallInfo &C);
  ContextNode *getNodeForStackId(uint64_t StackId);

  /// Computes the alloc type corresponding to the given context ids, by
  /// unioning their recorded alloc types.
  uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets), optimized for the case
  /// when Node1Ids is smaller than Node2Ids.
  uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
                                  const DenseSet<uint32_t> &Node2Ids) const;

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets).
  uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
                              const DenseSet<uint32_t> &Node2Ids) const;

  /// Create a clone of Edge's callee and move Edge to that new callee node,
  /// performing the necessary context id and allocation type updates.
  /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
  /// moved to an edge to the new callee.
  ContextNode *
  moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                           DenseSet<uint32_t> ContextIdsToMove = {});

  /// Change the callee of Edge to existing callee clone NewCallee, performing
  /// the necessary context id and allocation type updates.
  /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
  /// moved to an edge to the new callee.
  void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                                     ContextNode *NewCallee,
                                     bool NewClone = false,
                                     DenseSet<uint32_t> ContextIdsToMove = {});

  /// Change the caller of the edge at the given callee edge iterator to be
  /// NewCaller, performing the necessary context id and allocation type
  /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
  /// a simplified version of it as we always move the given edge and all of its
  /// context ids.
  void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
                                 ContextNode *NewCaller);

  /// Recursive helper for marking backedges via DFS.
  void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
                     DenseSet<const ContextNode *> &CurrentStack);

  /// Recursive helper for merging clones.
  void
  mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
              DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
  /// Main worker for merging callee clones for a given node.
  void mergeNodeCalleeClones(
      ContextNode *Node, DenseSet<const ContextNode *> &Visited,
      DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
  /// Helper to find other callers of the given set of callee edges that can
  /// share the same callee merge node.
  void findOtherCallersToShareMerge(
      ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
      DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
      DenseSet<ContextNode *> &OtherCallersToShareMerge);

  /// Recursively perform cloning on the graph for the given Node and its
  /// callers, in order to uniquely identify the allocation behavior of an
  /// allocation given its context. The context ids of the allocation being
  /// processed are given in AllocContextIds.
  void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
                      const DenseSet<uint32_t> &AllocContextIds);

  /// Map from each context ID to the AllocationType assigned to that context.
  DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;

  /// Map from each contextID to the profiled full contexts and their total
  /// sizes (there may be more than one due to context trimming),
  /// optionally populated when requested (via MemProfReportHintedSizes or
  /// MinClonedColdBytePercent).
  DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;

  /// Identifies the context node created for a stack id when adding the MIB
  /// contexts to the graph. This is used to locate the context nodes when
  /// trying to assign the corresponding callsites with those stack ids to these
  /// nodes.
  DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;

  /// Saves information for the contexts identified as important (the largest
  /// cold contexts up to MemProfTopNImportant).
  struct ImportantContextInfo {
    // The original list of leaf first stack ids corresponding to this context.
    std::vector<uint64_t> StackIds;
    // Max length of stack ids corresponding to a single stack ContextNode for
    // this context (i.e. the max length of a key in StackIdsToNode below).
    unsigned MaxLength = 0;
    // Mapping of slices of the stack ids to the corresponding ContextNode
    // (there can be multiple stack ids due to inlining). Populated when
    // updating stack nodes while matching them to the IR or summary.
    // Keyed by the stack id slice vector (ordered map for deterministic
    // iteration).
    std::map<std::vector<uint64_t>, ContextNode *> StackIdsToNode;
  };

  // Map of important full context ids to information about each.
  DenseMap<uint32_t, ImportantContextInfo> ImportantContextIdInfo;
909
910 // For each important context id found in Node (if any), records the list of
911 // stack ids that corresponded to the given callsite Node. There can be more
912 // than one in the case of inlining.
913 void recordStackNode(std::vector<uint64_t> &StackIds, ContextNode *Node,
914 // We pass in the Node's context ids to avoid the
915 // overhead of computing them as the caller already has
916 // them in some cases.
917 const DenseSet<uint32_t> &NodeContextIds,
918 const DenseSet<uint32_t> &ImportantContextIds) {
920 assert(ImportantContextIds.empty());
921 return;
922 }
924 set_intersection(NodeContextIds, ImportantContextIds);
925 if (Ids.empty())
926 return;
927 auto Size = StackIds.size();
928 for (auto Id : Ids) {
929 auto &Entry = ImportantContextIdInfo[Id];
930 Entry.StackIdsToNode[StackIds] = Node;
931 // Keep track of the max to simplify later analysis.
932 if (Size > Entry.MaxLength)
933 Entry.MaxLength = Size;
934 }
935 }
936
  /// Maps to track the calls to their corresponding nodes in the graph.
  /// MapVector preserves insertion order for deterministic iteration.
  MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
  MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;

  /// Owner of all ContextNode unique_ptrs. Nodes are never erased, so raw
  /// ContextNode pointers handed out elsewhere remain stable.
  std::vector<std::unique_ptr<ContextNode>> NodeOwner;

  /// Perform sanity checks on graph when requested.
  void check() const;

  /// Keeps track of the last unique context id assigned.
  unsigned int LastContextId = 0;
};
949};
950
// Convenience alias templates to shorten references to the nested graph types
// in the free functions and out-of-line member definitions below.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextNode =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextEdge =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using FuncInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using CallInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
963
/// CRTP derived class for graphs built from IR (regular LTO).
class ModuleCallsiteContextGraph
    : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                                  Instruction *> {
public:
  ModuleCallsiteContextGraph(
      Module &M,
      llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);

private:
  // The CRTP base dispatches to the private implementations below.
  friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                              Instruction *>;

  uint64_t getStackId(uint64_t IdOrIndex) const;
  const Function *getCalleeFunc(Instruction *Call);
  bool calleeMatchesFunc(
      Instruction *Call, const Function *Func, const Function *CallerFunc,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
  bool sameCallee(Instruction *Call1, Instruction *Call2);
  bool findProfiledCalleeThroughTailCalls(
      const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(Instruction *Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  AllocationType getAllocationCallType(const CallInfo &Call) const;
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                       Instruction *>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           DenseMap<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const Function *Func, const Instruction *Call,
                       unsigned CloneNo) const;

  const Module &Mod;
  // Callback used to obtain a remark emitter for a given function.
  llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
};
1004
1005/// Represents a call in the summary index graph, which can either be an
1006/// allocation or an interior callsite node in an allocation's context.
1007/// Holds a pointer to the corresponding data structure in the index.
1008struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
1009 IndexCall() : PointerUnion() {}
1010 IndexCall(std::nullptr_t) : IndexCall() {}
1011 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
1012 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
1013 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
1014
1015 IndexCall *operator->() { return this; }
1016
1017 void print(raw_ostream &OS) const {
1018 PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
1020 OS << *AI;
1021 } else {
1023 assert(CI);
1024 OS << *CI;
1025 }
1026 }
1027};
1028} // namespace
1029
1030namespace llvm {
1031template <> struct simplify_type<IndexCall> {
1033 static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
1034};
1035template <> struct simplify_type<const IndexCall> {
1037 static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
1038};
1039} // namespace llvm
1040
1041namespace {
/// CRTP derived class for graphs built from summary index (ThinLTO).
class IndexCallsiteContextGraph
    : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                                  IndexCall> {
public:
  IndexCallsiteContextGraph(
      ModuleSummaryIndex &Index,
      llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
          isPrevailing);

  ~IndexCallsiteContextGraph() {
    // Now that we are done with the graph it is safe to add the new
    // CallsiteInfo structs to the function summary vectors. The graph nodes
    // point into locations within these vectors, so we don't want to add them
    // any earlier.
    for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
      auto *FS = I.first;
      for (auto &Callsite : I.second)
        FS->addCallsite(*Callsite.second);
    }
  }

private:
  // The CRTP base dispatches to the private implementations below.
  friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                              IndexCall>;

  uint64_t getStackId(uint64_t IdOrIndex) const;
  const FunctionSummary *getCalleeFunc(IndexCall &Call);
  bool calleeMatchesFunc(
      IndexCall &Call, const FunctionSummary *Func,
      const FunctionSummary *CallerFunc,
      std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
  bool sameCallee(IndexCall &Call1, IndexCall &Call2);
  bool findProfiledCalleeThroughTailCalls(
      ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
      std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(IndexCall &Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  AllocationType getAllocationCallType(const CallInfo &Call) const;
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                       IndexCall>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           DenseMap<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
                       unsigned CloneNo) const;
  DenseSet<GlobalValue::GUID> findAliaseeGUIDsPrevailingInDifferentModule();

  // Saves mapping from function summaries containing memprof records back to
  // its VI, for use in checking and debugging.
  std::map<const FunctionSummary *, ValueInfo> FSToVIMap;

  const ModuleSummaryIndex &Index;
  llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
      isPrevailing;

  // Saves/owns the callsite info structures synthesized for missing tail call
  // frames that we discover while building the graph.
  // It maps from the summary of the function making the tail call, to a map
  // of callee ValueInfo to corresponding synthesized callsite info.
  std::unordered_map<FunctionSummary *,
                     std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
      FunctionCalleesToSynthesizedCallsiteInfos;
};
1110} // namespace
1111
1112template <>
1113struct llvm::DenseMapInfo<CallsiteContextGraph<
1114 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
1116template <>
1117struct llvm::DenseMapInfo<CallsiteContextGraph<
1118 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
1119 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
1120template <>
1121struct llvm::DenseMapInfo<IndexCall>
1122 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
1123
1124namespace {
1125
1126// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
1127// type we should actually use on the corresponding allocation.
1128// If we can't clone a node that has NotCold+Cold alloc type, we will fall
1129// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
1130// from NotCold.
1131AllocationType allocTypeToUse(uint8_t AllocTypes) {
1132 assert(AllocTypes != (uint8_t)AllocationType::None);
1133 if (AllocTypes ==
1136 else
1137 return (AllocationType)AllocTypes;
1138}
1139
1140// Helper to check if the alloc types for all edges recorded in the
1141// InAllocTypes vector match the alloc types for all edges in the Edges
1142// vector.
1143template <typename DerivedCCG, typename FuncTy, typename CallTy>
1144bool allocTypesMatch(
1145 const std::vector<uint8_t> &InAllocTypes,
1146 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
1147 &Edges) {
1148 // This should be called only when the InAllocTypes vector was computed for
1149 // this set of Edges. Make sure the sizes are the same.
1150 assert(InAllocTypes.size() == Edges.size());
1151 return std::equal(
1152 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
1153 [](const uint8_t &l,
1154 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
1155 // Can share if one of the edges is None type - don't
1156 // care about the type along that edge as it doesn't
1157 // exist for those context ids.
1158 if (l == (uint8_t)AllocationType::None ||
1159 r->AllocTypes == (uint8_t)AllocationType::None)
1160 return true;
1161 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
1162 });
1163}
1164
1165// Helper to check if the alloc types for all edges recorded in the
1166// InAllocTypes vector match the alloc types for callee edges in the given
1167// clone. Because the InAllocTypes were computed from the original node's callee
1168// edges, and other cloning could have happened after this clone was created, we
1169// need to find the matching clone callee edge, which may or may not exist.
1170template <typename DerivedCCG, typename FuncTy, typename CallTy>
1171bool allocTypesMatchClone(
1172 const std::vector<uint8_t> &InAllocTypes,
1173 const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
1174 const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
1175 assert(Node);
1176 // InAllocTypes should have been computed for the original node's callee
1177 // edges.
1178 assert(InAllocTypes.size() == Node->CalleeEdges.size());
1179 // First create a map of the clone callee edge callees to the edge alloc type.
1181 EdgeCalleeMap;
1182 for (const auto &E : Clone->CalleeEdges) {
1183 assert(!EdgeCalleeMap.contains(E->Callee));
1184 EdgeCalleeMap[E->Callee] = E->AllocTypes;
1185 }
1186 // Next, walk the original node's callees, and look for the corresponding
1187 // clone edge to that callee.
1188 for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
1189 auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
1190 // Not found is ok, we will simply add an edge if we use this clone.
1191 if (Iter == EdgeCalleeMap.end())
1192 continue;
1193 // Can share if one of the edges is None type - don't
1194 // care about the type along that edge as it doesn't
1195 // exist for those context ids.
1196 if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
1197 Iter->second == (uint8_t)AllocationType::None)
1198 continue;
1199 if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
1200 return false;
1201 }
1202 return true;
1203}
1204
1205} // end anonymous namespace
1206
1207template <typename DerivedCCG, typename FuncTy, typename CallTy>
1208typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1209CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1210 const CallInfo &C) {
1211 ContextNode *Node = getNodeForAlloc(C);
1212 if (Node)
1213 return Node;
1214
1215 return NonAllocationCallToContextNodeMap.lookup(C);
1216}
1217
1218template <typename DerivedCCG, typename FuncTy, typename CallTy>
1219typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1220CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
1221 const CallInfo &C) {
1222 return AllocationCallToContextNodeMap.lookup(C);
1223}
1224
1225template <typename DerivedCCG, typename FuncTy, typename CallTy>
1226typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1227CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1228 uint64_t StackId) {
1229 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1230 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1231 return StackEntryNode->second;
1232 return nullptr;
1233}
1234
1235template <typename DerivedCCG, typename FuncTy, typename CallTy>
1236void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1237 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1238 unsigned int ContextId) {
1239 for (auto &Edge : CallerEdges) {
1240 if (Edge->Caller == Caller) {
1241 Edge->AllocTypes |= (uint8_t)AllocType;
1242 Edge->getContextIds().insert(ContextId);
1243 return;
1244 }
1245 }
1246 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1247 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1248 CallerEdges.push_back(Edge);
1249 Caller->CalleeEdges.push_back(Edge);
1250}
1251
/// Remove Edge from the graph, unlinking it from both its caller's and
/// callee's edge lists. If EI is provided it must be an iterator referencing
/// Edge in either the caller's CalleeEdges (CalleeIter=true) or the callee's
/// CallerEdges (CalleeIter=false); it is advanced past the erased element so
/// callers can continue iterating safely.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
    ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
  assert(!EI || (*EI)->get() == Edge);
  assert(!Edge->isRemoved());
  // Save the Caller and Callee pointers so we can erase Edge from their edge
  // lists after clearing Edge below. We do the clearing first in case it is
  // destructed after removing from the edge lists (if those were the last
  // shared_ptr references to Edge).
  auto *Callee = Edge->Callee;
  auto *Caller = Edge->Caller;

  // Make sure the edge fields are cleared out so we can properly detect
  // removed edges if Edge is not destructed because there is still a shared_ptr
  // reference.
  Edge->clear();

#ifndef NDEBUG
  // Snapshot the list sizes so we can assert below that exactly one edge was
  // removed from each endpoint (debug builds only).
  auto CalleeCallerCount = Callee->CallerEdges.size();
  auto CallerCalleeCount = Caller->CalleeEdges.size();
#endif
  if (!EI) {
    Callee->eraseCallerEdge(Edge);
    Caller->eraseCalleeEdge(Edge);
  } else if (CalleeIter) {
    // EI iterates the caller's callee list; erase via the iterator there and
    // remove the mirror entry from the callee's caller list directly.
    Callee->eraseCallerEdge(Edge);
    *EI = Caller->CalleeEdges.erase(*EI);
  } else {
    // EI iterates the callee's caller list; symmetric to the above.
    Caller->eraseCalleeEdge(Edge);
    *EI = Callee->CallerEdges.erase(*EI);
  }
  assert(Callee->CallerEdges.size() < CalleeCallerCount);
  assert(Caller->CalleeEdges.size() < CallerCalleeCount);
}
1286
1287template <typename DerivedCCG, typename FuncTy, typename CallTy>
1288void CallsiteContextGraph<
1289 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1290 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1291 auto Edge = *EI;
1292 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1293 assert(Edge->ContextIds.empty());
1294 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
1295 } else
1296 ++EI;
1297 }
1298}
1299
1300template <typename DerivedCCG, typename FuncTy, typename CallTy>
1301void CallsiteContextGraph<
1302 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1303 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1304 auto Edge = *EI;
1305 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1306 assert(Edge->ContextIds.empty());
1307 Edge->Caller->eraseCalleeEdge(Edge.get());
1308 EI = Node->CallerEdges.erase(EI);
1309 } else
1310 ++EI;
1311 }
1312}
1313
1314template <typename DerivedCCG, typename FuncTy, typename CallTy>
1315typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1316CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1317 findEdgeFromCallee(const ContextNode *Callee) {
1318 for (const auto &Edge : CalleeEdges)
1319 if (Edge->Callee == Callee)
1320 return Edge.get();
1321 return nullptr;
1322}
1323
1324template <typename DerivedCCG, typename FuncTy, typename CallTy>
1325typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1326CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1327 findEdgeFromCaller(const ContextNode *Caller) {
1328 for (const auto &Edge : CallerEdges)
1329 if (Edge->Caller == Caller)
1330 return Edge.get();
1331 return nullptr;
1332}
1333
1334template <typename DerivedCCG, typename FuncTy, typename CallTy>
1335void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1336 eraseCalleeEdge(const ContextEdge *Edge) {
1337 auto EI = llvm::find_if(
1338 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1339 return CalleeEdge.get() == Edge;
1340 });
1341 assert(EI != CalleeEdges.end());
1342 CalleeEdges.erase(EI);
1343}
1344
1345template <typename DerivedCCG, typename FuncTy, typename CallTy>
1346void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1347 eraseCallerEdge(const ContextEdge *Edge) {
1348 auto EI = llvm::find_if(
1349 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1350 return CallerEdge.get() == Edge;
1351 });
1352 assert(EI != CallerEdges.end());
1353 CallerEdges.erase(EI);
1354}
1355
1356template <typename DerivedCCG, typename FuncTy, typename CallTy>
1357uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1358 DenseSet<uint32_t> &ContextIds) const {
1359 uint8_t BothTypes =
1360 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1361 uint8_t AllocType = (uint8_t)AllocationType::None;
1362 for (auto Id : ContextIds) {
1363 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1364 // Bail early if alloc type reached both, no further refinement.
1365 if (AllocType == BothTypes)
1366 return AllocType;
1367 }
1368 return AllocType;
1369}
1370
1371template <typename DerivedCCG, typename FuncTy, typename CallTy>
1372uint8_t
1373CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1374 const DenseSet<uint32_t> &Node1Ids,
1375 const DenseSet<uint32_t> &Node2Ids) const {
1376 uint8_t BothTypes =
1377 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1378 uint8_t AllocType = (uint8_t)AllocationType::None;
1379 for (auto Id : Node1Ids) {
1380 if (!Node2Ids.count(Id))
1381 continue;
1382 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1383 // Bail early if alloc type reached both, no further refinement.
1384 if (AllocType == BothTypes)
1385 return AllocType;
1386 }
1387 return AllocType;
1388}
1389
1390template <typename DerivedCCG, typename FuncTy, typename CallTy>
1391uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1392 const DenseSet<uint32_t> &Node1Ids,
1393 const DenseSet<uint32_t> &Node2Ids) const {
1394 if (Node1Ids.size() < Node2Ids.size())
1395 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1396 else
1397 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1398}
1399
1400template <typename DerivedCCG, typename FuncTy, typename CallTy>
1401typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1402CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1403 CallInfo Call, const FuncTy *F) {
1404 assert(!getNodeForAlloc(Call));
1405 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
1406 AllocationCallToContextNodeMap[Call] = AllocNode;
1407 // Use LastContextId as a uniq id for MIB allocation nodes.
1408 AllocNode->OrigStackOrAllocId = LastContextId;
1409 // Alloc type should be updated as we add in the MIBs. We should assert
1410 // afterwards that it is not still None.
1411 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1412
1413 return AllocNode;
1414}
1415
1416static std::string getAllocTypeString(uint8_t AllocTypes) {
1417 if (!AllocTypes)
1418 return "None";
1419 std::string Str;
1420 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1421 Str += "NotCold";
1422 if (AllocTypes & (uint8_t)AllocationType::Cold)
1423 Str += "Cold";
1424 return Str;
1425}
1426
1427template <typename DerivedCCG, typename FuncTy, typename CallTy>
1428template <class NodeT, class IteratorT>
1429void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1430 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1431 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1432 ArrayRef<ContextTotalSize> ContextSizeInfo,
1433 std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold) {
1434 // Treating the hot alloc type as NotCold before the disambiguation for "hot"
1435 // is done.
1436 if (AllocType == AllocationType::Hot)
1437 AllocType = AllocationType::NotCold;
1438
1439 ContextIdToAllocationType[++LastContextId] = AllocType;
1440
1441 bool IsImportant = false;
1442 if (!ContextSizeInfo.empty()) {
1443 auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1444 // If this is a cold allocation, and we are collecting non-zero largest
1445 // contexts, see if this is a candidate.
1446 if (AllocType == AllocationType::Cold && MemProfTopNImportant > 0) {
1447 uint64_t TotalCold = 0;
1448 for (auto &CSI : ContextSizeInfo)
1449 TotalCold += CSI.TotalSize;
1450 // Record this context if either we haven't found the first top-n largest
1451 // yet, or if it is larger than the smallest already recorded.
1452 if (TotalSizeToContextIdTopNCold.size() < MemProfTopNImportant ||
1453 // Since TotalSizeToContextIdTopNCold is a std::map, it is implicitly
1454 // sorted in ascending size of its key which is the size.
1455 TotalCold > TotalSizeToContextIdTopNCold.begin()->first) {
1456 if (TotalSizeToContextIdTopNCold.size() == MemProfTopNImportant) {
1457 // Remove old one and its associated entries.
1458 auto IdToRemove = TotalSizeToContextIdTopNCold.begin()->second;
1459 TotalSizeToContextIdTopNCold.erase(
1460 TotalSizeToContextIdTopNCold.begin());
1461 assert(ImportantContextIdInfo.count(IdToRemove));
1462 ImportantContextIdInfo.erase(IdToRemove);
1463 }
1464 TotalSizeToContextIdTopNCold[TotalCold] = LastContextId;
1465 IsImportant = true;
1466 }
1467 }
1468 Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
1469 }
1470
1471 // Update alloc type and context ids for this MIB.
1472 AllocNode->AllocTypes |= (uint8_t)AllocType;
1473
1474 // Now add or update nodes for each stack id in alloc's context.
1475 // Later when processing the stack ids on non-alloc callsites we will adjust
1476 // for any inlining in the context.
1477 ContextNode *PrevNode = AllocNode;
1478 // Look for recursion (direct recursion should have been collapsed by
1479 // module summary analysis, here we should just be detecting mutual
1480 // recursion). Mark these nodes so we don't try to clone.
1481 SmallSet<uint64_t, 8> StackIdSet;
1482 // Skip any on the allocation call (inlining).
1483 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1484 ContextIter != StackContext.end(); ++ContextIter) {
1485 auto StackId = getStackId(*ContextIter);
1486 if (IsImportant)
1487 ImportantContextIdInfo[LastContextId].StackIds.push_back(StackId);
1488 ContextNode *StackNode = getNodeForStackId(StackId);
1489 if (!StackNode) {
1490 StackNode = createNewNode(/*IsAllocation=*/false);
1491 StackEntryIdToContextNodeMap[StackId] = StackNode;
1492 StackNode->OrigStackOrAllocId = StackId;
1493 }
1494 // Marking a node recursive will prevent its cloning completely, even for
1495 // non-recursive contexts flowing through it.
1497 auto Ins = StackIdSet.insert(StackId);
1498 if (!Ins.second)
1499 StackNode->Recursive = true;
1500 }
1501 StackNode->AllocTypes |= (uint8_t)AllocType;
1502 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1503 PrevNode = StackNode;
1504 }
1505}
1506
1507template <typename DerivedCCG, typename FuncTy, typename CallTy>
1508DenseSet<uint32_t>
1509CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1510 const DenseSet<uint32_t> &StackSequenceContextIds,
1511 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1512 DenseSet<uint32_t> NewContextIds;
1513 for (auto OldId : StackSequenceContextIds) {
1514 NewContextIds.insert(++LastContextId);
1515 OldToNewContextIds[OldId].insert(LastContextId);
1516 assert(ContextIdToAllocationType.count(OldId));
1517 // The new context has the same allocation type and size info as original.
1518 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1519 auto CSI = ContextIdToContextSizeInfos.find(OldId);
1520 if (CSI != ContextIdToContextSizeInfos.end())
1521 ContextIdToContextSizeInfos[LastContextId] = CSI->second;
1522 if (DotAllocContextIds.contains(OldId))
1523 DotAllocContextIds.insert(LastContextId);
1524 }
1525 return NewContextIds;
1526}
1527
1528template <typename DerivedCCG, typename FuncTy, typename CallTy>
1529void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1530 propagateDuplicateContextIds(
1531 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1532 // Build a set of duplicated context ids corresponding to the input id set.
1533 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1534 DenseSet<uint32_t> NewIds;
1535 for (auto Id : ContextIds)
1536 if (auto NewId = OldToNewContextIds.find(Id);
1537 NewId != OldToNewContextIds.end())
1538 NewIds.insert_range(NewId->second);
1539 return NewIds;
1540 };
1541
1542 // Recursively update context ids sets along caller edges.
1543 auto UpdateCallers = [&](ContextNode *Node,
1544 DenseSet<const ContextEdge *> &Visited,
1545 auto &&UpdateCallers) -> void {
1546 for (const auto &Edge : Node->CallerEdges) {
1547 auto Inserted = Visited.insert(Edge.get());
1548 if (!Inserted.second)
1549 continue;
1550 ContextNode *NextNode = Edge->Caller;
1551 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1552 // Only need to recursively iterate to NextNode via this caller edge if
1553 // it resulted in any added ids to NextNode.
1554 if (!NewIdsToAdd.empty()) {
1555 Edge->getContextIds().insert_range(NewIdsToAdd);
1556 UpdateCallers(NextNode, Visited, UpdateCallers);
1557 }
1558 }
1559 };
1560
1561 DenseSet<const ContextEdge *> Visited;
1562 for (auto &Entry : AllocationCallToContextNodeMap) {
1563 auto *Node = Entry.second;
1564 UpdateCallers(Node, Visited, UpdateCallers);
1565 }
1566}
1567
1568template <typename DerivedCCG, typename FuncTy, typename CallTy>
1569void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1570 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1571 // This must be passed by value to make a copy since it will be adjusted
1572 // as ids are moved.
1573 DenseSet<uint32_t> RemainingContextIds) {
1574 auto &OrigEdges =
1575 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1576 DenseSet<uint32_t> RecursiveContextIds;
1577 DenseSet<uint32_t> AllCallerContextIds;
1579 // Identify which context ids are recursive which is needed to properly
1580 // update the RemainingContextIds set. The relevant recursive context ids
1581 // are those that are in multiple edges.
1582 for (auto &CE : OrigEdges) {
1583 AllCallerContextIds.reserve(CE->getContextIds().size());
1584 for (auto Id : CE->getContextIds())
1585 if (!AllCallerContextIds.insert(Id).second)
1586 RecursiveContextIds.insert(Id);
1587 }
1588 }
1589 // Increment iterator in loop so that we can remove edges as needed.
1590 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1591 auto Edge = *EI;
1592 DenseSet<uint32_t> NewEdgeContextIds;
1593 DenseSet<uint32_t> NotFoundContextIds;
1594 // Remove any matching context ids from Edge, return set that were found and
1595 // removed, these are the new edge's context ids. Also update the remaining
1596 // (not found ids).
1597 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1598 NotFoundContextIds);
1599 // Update the remaining context ids set for the later edges. This is a
1600 // compile time optimization.
1601 if (RecursiveContextIds.empty()) {
1602 // No recursive ids, so all of the previously remaining context ids that
1603 // were not seen on this edge are the new remaining set.
1604 RemainingContextIds.swap(NotFoundContextIds);
1605 } else {
1606 // Keep the recursive ids in the remaining set as we expect to see those
1607 // on another edge. We can remove the non-recursive remaining ids that
1608 // were seen on this edge, however. We already have the set of remaining
1609 // ids that were on this edge (in NewEdgeContextIds). Figure out which are
1610 // non-recursive and only remove those. Note that despite the higher
1611 // overhead of updating the remaining context ids set when recursion
1612 // handling is enabled, it was found to be at worst performance neutral
1613 // and in one case a clear win.
1614 DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
1615 set_difference(NewEdgeContextIds, RecursiveContextIds);
1616 set_subtract(RemainingContextIds, NonRecursiveRemainingCurEdgeIds);
1617 }
1618 // If no matching context ids for this edge, skip it.
1619 if (NewEdgeContextIds.empty()) {
1620 ++EI;
1621 continue;
1622 }
1623 if (TowardsCallee) {
1624 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1625 auto NewEdge = std::make_shared<ContextEdge>(
1626 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1627 NewNode->CalleeEdges.push_back(NewEdge);
1628 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1629 } else {
1630 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1631 auto NewEdge = std::make_shared<ContextEdge>(
1632 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1633 NewNode->CallerEdges.push_back(NewEdge);
1634 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1635 }
1636 // Remove old edge if context ids empty.
1637 if (Edge->getContextIds().empty()) {
1638 removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
1639 continue;
1640 }
1641 ++EI;
1642 }
1643}
1644
1645template <typename DerivedCCG, typename FuncTy, typename CallTy>
1646static void checkEdge(
1647 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1648 // Confirm that alloc type is not None and that we have at least one context
1649 // id.
1650 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1651 assert(!Edge->ContextIds.empty());
1652}
1653
1654template <typename DerivedCCG, typename FuncTy, typename CallTy>
1655static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1656 bool CheckEdges = true) {
1657 if (Node->isRemoved())
1658 return;
1659#ifndef NDEBUG
1660 // Compute node's context ids once for use in asserts.
1661 auto NodeContextIds = Node->getContextIds();
1662#endif
1663 // Node's context ids should be the union of both its callee and caller edge
1664 // context ids.
1665 if (Node->CallerEdges.size()) {
1666 DenseSet<uint32_t> CallerEdgeContextIds(
1667 Node->CallerEdges.front()->ContextIds);
1668 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1669 if (CheckEdges)
1671 set_union(CallerEdgeContextIds, Edge->ContextIds);
1672 }
1673 // Node can have more context ids than callers if some contexts terminate at
1674 // node and some are longer. If we are allowing recursive callsites and
1675 // contexts this will be violated for incompletely cloned recursive cycles,
1676 // so skip the checking in that case.
1678 NodeContextIds == CallerEdgeContextIds ||
1679 set_is_subset(CallerEdgeContextIds, NodeContextIds));
1680 }
1681 if (Node->CalleeEdges.size()) {
1682 DenseSet<uint32_t> CalleeEdgeContextIds(
1683 Node->CalleeEdges.front()->ContextIds);
1684 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1685 if (CheckEdges)
1687 set_union(CalleeEdgeContextIds, Edge->getContextIds());
1688 }
1689 // If we are allowing recursive callsites and contexts this will be violated
1690 // for incompletely cloned recursive cycles, so skip the checking in that
1691 // case.
1693 NodeContextIds == CalleeEdgeContextIds);
1694 }
1695 // FIXME: Since this checking is only invoked under an option, we should
1696 // change the error checking from using assert to something that will trigger
1697 // an error on a release build.
1698#ifndef NDEBUG
1699 // Make sure we don't end up with duplicate edges between the same caller and
1700 // callee.
1702 for (const auto &E : Node->CalleeEdges)
1703 NodeSet.insert(E->Callee);
1704 assert(NodeSet.size() == Node->CalleeEdges.size());
1705#endif
1706}
1707
// Post-order traversal from allocation nodes upwards (callers processed after
// their callees' recursion completes). For each non-allocation node whose
// stack id has recorded matching calls, creates new context nodes
// representing inlined callsite sequences, moving the affected context ids
// onto the new nodes and fixing up the surrounding edges.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    assignStackNodesPostOrder(ContextNode *Node,
                              DenseSet<const ContextNode *> &Visited,
                              DenseMap<uint64_t, std::vector<CallContextInfo>>
                                  &StackIdToMatchingCalls,
                              DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
                              const DenseSet<uint32_t> &ImportantContextIds) {
  auto Inserted = Visited.insert(Node);
  if (!Inserted.second)
    return;
  // Post order traversal. Iterate over a copy since we may add nodes and
  // therefore new callers during the recursive call, invalidating any
  // iterator over the original edge vector. We don't need to process these
  // new nodes as they were already processed on creation.
  auto CallerEdges = Node->CallerEdges;
  for (auto &Edge : CallerEdges) {
    // Skip any that have been removed during the recursion.
    if (Edge->isRemoved()) {
      assert(!is_contained(Node->CallerEdges, Edge));
      continue;
    }
    assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
                              CallToMatchingCall, ImportantContextIds);
  }

  // If this node's stack id is in the map, update the graph to contain new
  // nodes representing any inlining at interior callsites. Note we move the
  // associated context ids over to the new nodes.

  // Ignore this node if it is for an allocation or we didn't record any
  // stack id lists ending at it.
  if (Node->IsAllocation ||
      !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
    return;

  auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
  // Handle the simple case first. A single call with a single stack id.
  // In this case there is no need to create any new context nodes, simply
  // assign the context node for stack id to this Call.
  if (Calls.size() == 1) {
    auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
    if (Ids.size() == 1) {
      assert(SavedContextIds.empty());
      // It should be this Node
      assert(Node == getNodeForStackId(Ids[0]));
      // Recursive nodes are never cloned, so don't assign a call to them.
      if (Node->Recursive)
        return;
      Node->setCall(Call);
      NonAllocationCallToContextNodeMap[Call] = Node;
      NodeToCallingFunc[Node] = Func;
      recordStackNode(Ids, Node, Node->getContextIds(), ImportantContextIds);
      return;
    }
  }

#ifndef NDEBUG
  // Find the node for the last stack id, which should be the same
  // across all calls recorded for this id, and is this node's id.
  uint64_t LastId = Node->OrigStackOrAllocId;
  ContextNode *LastNode = getNodeForStackId(LastId);
  // We should only have kept stack ids that had nodes.
  assert(LastNode);
  assert(LastNode == Node);
#else
  ContextNode *LastNode = Node;
#endif

  // Compute the last node's context ids once, as it is shared by all calls in
  // this entry.
  DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();

  [[maybe_unused]] bool PrevIterCreatedNode = false;
  bool CreatedNode = false;
  for (unsigned I = 0; I < Calls.size();
       I++, PrevIterCreatedNode = CreatedNode) {
    CreatedNode = false;
    auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
    // Skip any for which we didn't assign any ids, these don't get a node in
    // the graph.
    if (SavedContextIds.empty()) {
      // If this call has a matching call (located in the same function and
      // having the same stack ids), simply add it to the context node created
      // for its matching call earlier. These can be treated the same through
      // cloning and get updated at the same time.
      if (!CallToMatchingCall.contains(Call))
        continue;
      auto MatchingCall = CallToMatchingCall[Call];
      if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
        // This should only happen if we had a prior iteration, and it didn't
        // create a node because of the below recomputation of context ids
        // finding none remaining and continuing early.
        assert(I > 0 && !PrevIterCreatedNode);
        continue;
      }
      NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
          Call);
      continue;
    }

    assert(LastId == Ids.back());

    // Recompute the context ids for this stack id sequence (the
    // intersection of the context ids of the corresponding nodes).
    // Start with the ids we saved in the map for this call, which could be
    // duplicated context ids. We have to recompute as we might have overlap
    // overlap between the saved context ids for different last nodes, and
    // removed them already during the post order traversal.
    set_intersect(SavedContextIds, LastNodeContextIds);
    ContextNode *PrevNode = LastNode;
    bool Skip = false;
    // Iterate backwards through the stack Ids, starting after the last Id
    // in the list, which was handled once outside for all Calls.
    for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
      auto Id = *IdIter;
      ContextNode *CurNode = getNodeForStackId(Id);
      // We should only have kept stack ids that had nodes and weren't
      // recursive.
      assert(CurNode);
      assert(!CurNode->Recursive);

      auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
      if (!Edge) {
        Skip = true;
        break;
      }
      PrevNode = CurNode;

      // Update the context ids, which is the intersection of the ids along
      // all edges in the sequence.
      set_intersect(SavedContextIds, Edge->getContextIds());

      // If we now have no context ids for clone, skip this call.
      if (SavedContextIds.empty()) {
        Skip = true;
        break;
      }
    }
    if (Skip)
      continue;

    // Create new context node.
    ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
    NonAllocationCallToContextNodeMap[Call] = NewNode;
    CreatedNode = true;
    NewNode->AllocTypes = computeAllocType(SavedContextIds);

    ContextNode *FirstNode = getNodeForStackId(Ids[0]);
    assert(FirstNode);

    // Connect to callees of innermost stack frame in inlined call chain.
    // This updates context ids for FirstNode's callee's to reflect those
    // moved to NewNode.
    connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);

    // Connect to callers of outermost stack frame in inlined call chain.
    // This updates context ids for FirstNode's caller's to reflect those
    // moved to NewNode.
    connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);

    // Now we need to remove context ids from edges/nodes between First and
    // Last Node.
    PrevNode = nullptr;
    for (auto Id : Ids) {
      ContextNode *CurNode = getNodeForStackId(Id);
      // We should only have kept stack ids that had nodes.
      assert(CurNode);

      // Remove the context ids moved to NewNode from CurNode, and the
      // edge from the prior node.
      if (PrevNode) {
        auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
        // If the sequence contained recursion, we might have already removed
        // some edges during the connectNewNode calls above.
        if (!PrevEdge) {
          PrevNode = CurNode;
          continue;
        }
        set_subtract(PrevEdge->getContextIds(), SavedContextIds);
        if (PrevEdge->getContextIds().empty())
          removeEdgeFromGraph(PrevEdge);
      }
      // Since we update the edges from leaf to tail, only look at the callee
      // edges. This isn't an alloc node, so if there are no callee edges, the
      // alloc type is None.
      CurNode->AllocTypes = CurNode->CalleeEdges.empty()
                                ? (uint8_t)AllocationType::None
                                : CurNode->computeAllocType();
      PrevNode = CurNode;
    }

    recordStackNode(Ids, NewNode, SavedContextIds, ImportantContextIds);

    // Optional (flag-guarded) verification of the new node and every node
    // along the inlined sequence we just updated.
    if (VerifyNodes) {
      checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
      for (auto Id : Ids) {
        ContextNode *CurNode = getNodeForStackId(Id);
        // We should only have kept stack ids that had nodes.
        assert(CurNode);
        checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
      }
    }
  }
}
1912
1913template <typename DerivedCCG, typename FuncTy, typename CallTy>
1914void CallsiteContextGraph<DerivedCCG, FuncTy,
1915 CallTy>::fixupImportantContexts() {
1916 if (ImportantContextIdInfo.empty())
1917 return;
1918
1919 // Update statistics as we are done building this map at this point.
1920 NumImportantContextIds = ImportantContextIdInfo.size();
1921
1923 return;
1924
1925 if (ExportToDot)
1926 exportToDot("beforestackfixup");
1927
1928 // For each context we identified as important, walk through the saved context
1929 // stack ids in order from leaf upwards, and make sure all edges are correct.
1930 // These can be difficult to get right when updating the graph while mapping
1931 // nodes onto summary or IR, especially when there is recursion. In
1932 // particular, when we have created new nodes to reflect inlining, it is
1933 // sometimes impossible to know exactly how to update the edges in the face of
1934 // recursion, as we have lost the original ordering of the stack ids in the
1935 // contexts.
1936 // TODO: Consider only doing this if we detect the context has recursive
1937 // cycles.
1938 //
1939 // I.e. assume we have a context with stack ids like: {A B A C A D E}
1940 // and let's say A was inlined into B, C, and D. The original graph will have
1941 // multiple recursive cycles through A. When we match the original context
1942 // nodes onto the IR or summary, we will merge {A B} into one context node,
1943 // {A C} onto another, and {A D} onto another. Looking at the stack sequence
1944 // above, we should end up with a non-cyclic set of edges like:
1945 // {AB} <- {AC} <- {AD} <- E. However, because we normally have lost the
1946 // original ordering, we won't get the edges correct initially (it's
1947 // impossible without the original ordering). Here we do the fixup (add and
1948 // removing edges where necessary) for this context. In the
1949 // ImportantContextInfo struct in this case we should have a MaxLength = 2,
1950 // and map entries for {A B}, {A C}, {A D}, and {E}.
1951 for (auto &[CurContextId, Info] : ImportantContextIdInfo) {
1952 if (Info.StackIdsToNode.empty())
1953 continue;
1954 bool Changed = false;
1955 ContextNode *PrevNode = nullptr;
1956 ContextNode *CurNode = nullptr;
1957 DenseSet<const ContextEdge *> VisitedEdges;
1958 ArrayRef<uint64_t> AllStackIds(Info.StackIds);
1959 // Try to identify what callsite ContextNode maps to which slice of the
1960 // context's ordered stack ids.
1961 for (unsigned I = 0; I < AllStackIds.size(); I++, PrevNode = CurNode) {
1962 // We will do this greedily, trying up to MaxLength stack ids in a row, to
1963 // see if we recorded a context node for that sequence.
1964 auto Len = Info.MaxLength;
1965 auto LenToEnd = AllStackIds.size() - I;
1966 if (Len > LenToEnd)
1967 Len = LenToEnd;
1968 CurNode = nullptr;
1969 // Try to find a recorded context node starting with the longest length
1970 // recorded, and on down until we check for just a single stack node.
1971 for (; Len > 0; Len--) {
1972 // Get the slice of the original stack id sequence to check.
1973 auto CheckStackIds = AllStackIds.slice(I, Len);
1974 auto EntryIt = Info.StackIdsToNode.find(CheckStackIds);
1975 if (EntryIt == Info.StackIdsToNode.end())
1976 continue;
1977 CurNode = EntryIt->second;
1978 // Skip forward so we don't try to look for the ones we just matched.
1979 // We increment by Len - 1, because the outer for loop will increment I.
1980 I += Len - 1;
1981 break;
1982 }
1983 // Give up if we couldn't find a node. Since we need to clone from the
1984 // leaf allocation upwards, no sense in doing anymore fixup further up
1985 // the context if we couldn't match part of the original stack context
1986 // onto a callsite node.
1987 if (!CurNode)
1988 break;
1989 // No edges to fix up until we have a pair of nodes that should be
1990 // adjacent in the graph.
1991 if (!PrevNode)
1992 continue;
1993 // See if we already have a call edge from CurNode to PrevNode.
1994 auto *CurEdge = PrevNode->findEdgeFromCaller(CurNode);
1995 if (CurEdge) {
1996 // We already have an edge. Make sure it contains this context id.
1997 if (CurEdge->getContextIds().insert(CurContextId).second) {
1998 NumFixupEdgeIdsInserted++;
1999 Changed = true;
2000 }
2001 } else {
2002 // No edge exists - add one.
2003 NumFixupEdgesAdded++;
2004 DenseSet<uint32_t> ContextIds({CurContextId});
2005 auto AllocType = computeAllocType(ContextIds);
2006 auto NewEdge = std::make_shared<ContextEdge>(
2007 PrevNode, CurNode, AllocType, std::move(ContextIds));
2008 PrevNode->CallerEdges.push_back(NewEdge);
2009 CurNode->CalleeEdges.push_back(NewEdge);
2010 // Save the new edge for the below handling.
2011 CurEdge = NewEdge.get();
2012 Changed = true;
2013 }
2014 VisitedEdges.insert(CurEdge);
2015 // Now remove this context id from any other caller edges calling
2016 // PrevNode.
2017 for (auto &Edge : PrevNode->CallerEdges) {
2018 // Skip the edge updating/created above and edges we have already
2019 // visited (due to recursion).
2020 if (Edge.get() != CurEdge && !VisitedEdges.contains(Edge.get()))
2021 Edge->getContextIds().erase(CurContextId);
2022 }
2023 }
2024 if (Changed)
2025 NumFixedContexts++;
2026 }
2027}
2028
2029template <typename DerivedCCG, typename FuncTy, typename CallTy>
2030void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
2031 // Map of stack id to all calls with that as the last (outermost caller)
2032 // callsite id that has a context node (some might not due to pruning
2033 // performed during matching of the allocation profile contexts).
2034 // The CallContextInfo contains the Call and a list of its stack ids with
2035 // ContextNodes, the function containing Call, and the set of context ids
2036 // the analysis will eventually identify for use in any new node created
2037 // for that callsite.
2038 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
2039 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2040 for (auto &Call : CallsWithMetadata) {
2041 // Ignore allocations, already handled.
2042 if (AllocationCallToContextNodeMap.count(Call))
2043 continue;
2044 auto StackIdsWithContextNodes =
2045 getStackIdsWithContextNodesForCall(Call.call());
2046 // If there were no nodes created for MIBs on allocs (maybe this was in
2047 // the unambiguous part of the MIB stack that was pruned), ignore.
2048 if (StackIdsWithContextNodes.empty())
2049 continue;
2050 // Otherwise, record this Call along with the list of ids for the last
2051 // (outermost caller) stack id with a node.
2052 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
2053 {Call.call(), StackIdsWithContextNodes, Func, {}});
2054 }
2055 }
2056
2057 // First make a pass through all stack ids that correspond to a call,
2058 // as identified in the above loop. Compute the context ids corresponding to
2059 // each of these calls when they correspond to multiple stack ids due to
2060 // due to inlining. Perform any duplication of context ids required when
2061 // there is more than one call with the same stack ids. Their (possibly newly
2062 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
2063 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
2064 // Save a map from each call to any that are found to match it. I.e. located
2065 // in the same function and have the same (possibly pruned) stack ids. We use
2066 // this to avoid creating extra graph nodes as they can be treated the same.
2067 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
2068 for (auto &It : StackIdToMatchingCalls) {
2069 auto &Calls = It.getSecond();
2070 // Skip single calls with a single stack id. These don't need a new node.
2071 if (Calls.size() == 1) {
2072 auto &Ids = Calls[0].StackIds;
2073 if (Ids.size() == 1)
2074 continue;
2075 }
2076 // In order to do the best and maximal matching of inlined calls to context
2077 // node sequences we will sort the vectors of stack ids in descending order
2078 // of length, and within each length, lexicographically by stack id. The
2079 // latter is so that we can specially handle calls that have identical stack
2080 // id sequences (either due to cloning or artificially because of the MIB
2081 // context pruning). Those with the same Ids are then sorted by function to
2082 // facilitate efficiently mapping them to the same context node.
2083 // Because the functions are pointers, to ensure a stable sort first assign
2084 // each function pointer to its first index in the Calls array, and then use
2085 // that to sort by.
2086 DenseMap<const FuncTy *, unsigned> FuncToIndex;
2087 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
2088 FuncToIndex.insert({CallCtxInfo.Func, Idx});
2090 Calls,
2091 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
2092 return A.StackIds.size() > B.StackIds.size() ||
2093 (A.StackIds.size() == B.StackIds.size() &&
2094 (A.StackIds < B.StackIds ||
2095 (A.StackIds == B.StackIds &&
2096 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
2097 });
2098
2099 // Find the node for the last stack id, which should be the same
2100 // across all calls recorded for this id, and is the id for this
2101 // entry in the StackIdToMatchingCalls map.
2102 uint64_t LastId = It.getFirst();
2103 ContextNode *LastNode = getNodeForStackId(LastId);
2104 // We should only have kept stack ids that had nodes.
2105 assert(LastNode);
2106
2107 if (LastNode->Recursive)
2108 continue;
2109
2110 // Initialize the context ids with the last node's. We will subsequently
2111 // refine the context ids by computing the intersection along all edges.
2112 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
2113 assert(!LastNodeContextIds.empty());
2114
2115#ifndef NDEBUG
2116 // Save the set of functions seen for a particular set of the same stack
2117 // ids. This is used to ensure that they have been correctly sorted to be
2118 // adjacent in the Calls list, since we rely on that to efficiently place
2119 // all such matching calls onto the same context node.
2120 DenseSet<const FuncTy *> MatchingIdsFuncSet;
2121#endif
2122
2123 for (unsigned I = 0; I < Calls.size(); I++) {
2124 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
2125 assert(SavedContextIds.empty());
2126 assert(LastId == Ids.back());
2127
2128#ifndef NDEBUG
2129 // If this call has a different set of ids than the last one, clear the
2130 // set used to ensure they are sorted properly.
2131 if (I > 0 && Ids != Calls[I - 1].StackIds)
2132 MatchingIdsFuncSet.clear();
2133#endif
2134
2135 // First compute the context ids for this stack id sequence (the
2136 // intersection of the context ids of the corresponding nodes).
2137 // Start with the remaining saved ids for the last node.
2138 assert(!LastNodeContextIds.empty());
2139 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
2140
2141 ContextNode *PrevNode = LastNode;
2142 ContextNode *CurNode = LastNode;
2143 bool Skip = false;
2144
2145 // Iterate backwards through the stack Ids, starting after the last Id
2146 // in the list, which was handled once outside for all Calls.
2147 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
2148 auto Id = *IdIter;
2149 CurNode = getNodeForStackId(Id);
2150 // We should only have kept stack ids that had nodes.
2151 assert(CurNode);
2152
2153 if (CurNode->Recursive) {
2154 Skip = true;
2155 break;
2156 }
2157
2158 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
2159 // If there is no edge then the nodes belong to different MIB contexts,
2160 // and we should skip this inlined context sequence. For example, this
2161 // particular inlined context may include stack ids A->B, and we may
2162 // indeed have nodes for both A and B, but it is possible that they were
2163 // never profiled in sequence in a single MIB for any allocation (i.e.
2164 // we might have profiled an allocation that involves the callsite A,
2165 // but through a different one of its callee callsites, and we might
2166 // have profiled an allocation that involves callsite B, but reached
2167 // from a different caller callsite).
2168 if (!Edge) {
2169 Skip = true;
2170 break;
2171 }
2172 PrevNode = CurNode;
2173
2174 // Update the context ids, which is the intersection of the ids along
2175 // all edges in the sequence.
2176 set_intersect(StackSequenceContextIds, Edge->getContextIds());
2177
2178 // If we now have no context ids for clone, skip this call.
2179 if (StackSequenceContextIds.empty()) {
2180 Skip = true;
2181 break;
2182 }
2183 }
2184 if (Skip)
2185 continue;
2186
2187 // If some of this call's stack ids did not have corresponding nodes (due
2188 // to pruning), don't include any context ids for contexts that extend
2189 // beyond these nodes. Otherwise we would be matching part of unrelated /
2190 // not fully matching stack contexts. To do this, subtract any context ids
2191 // found in caller nodes of the last node found above.
2192 if (Ids.back() != getLastStackId(Call)) {
2193 for (const auto &PE : LastNode->CallerEdges) {
2194 set_subtract(StackSequenceContextIds, PE->getContextIds());
2195 if (StackSequenceContextIds.empty())
2196 break;
2197 }
2198 // If we now have no context ids for clone, skip this call.
2199 if (StackSequenceContextIds.empty())
2200 continue;
2201 }
2202
2203#ifndef NDEBUG
2204 // If the prior call had the same stack ids this set would not be empty.
2205 // Check if we already have a call that "matches" because it is located
2206 // in the same function. If the Calls list was sorted properly we should
2207 // not encounter this situation as all such entries should be adjacent
2208 // and processed in bulk further below.
2209 assert(!MatchingIdsFuncSet.contains(Func));
2210
2211 MatchingIdsFuncSet.insert(Func);
2212#endif
2213
2214 // Check if the next set of stack ids is the same (since the Calls vector
2215 // of tuples is sorted by the stack ids we can just look at the next one).
2216 // If so, save them in the CallToMatchingCall map so that they get
2217 // assigned to the same context node, and skip them.
2218 bool DuplicateContextIds = false;
2219 for (unsigned J = I + 1; J < Calls.size(); J++) {
2220 auto &CallCtxInfo = Calls[J];
2221 auto &NextIds = CallCtxInfo.StackIds;
2222 if (NextIds != Ids)
2223 break;
2224 auto *NextFunc = CallCtxInfo.Func;
2225 if (NextFunc != Func) {
2226 // We have another Call with the same ids but that cannot share this
2227 // node, must duplicate ids for it.
2228 DuplicateContextIds = true;
2229 break;
2230 }
2231 auto &NextCall = CallCtxInfo.Call;
2232 CallToMatchingCall[NextCall] = Call;
2233 // Update I so that it gets incremented correctly to skip this call.
2234 I = J;
2235 }
2236
2237 // If we don't have duplicate context ids, then we can assign all the
2238 // context ids computed for the original node sequence to this call.
2239 // If there are duplicate calls with the same stack ids then we synthesize
2240 // new context ids that are duplicates of the originals. These are
2241 // assigned to SavedContextIds, which is a reference into the map entry
2242 // for this call, allowing us to access these ids later on.
2243 OldToNewContextIds.reserve(OldToNewContextIds.size() +
2244 StackSequenceContextIds.size());
2245 SavedContextIds =
2246 DuplicateContextIds
2247 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
2248 : StackSequenceContextIds;
2249 assert(!SavedContextIds.empty());
2250
2251 if (!DuplicateContextIds) {
2252 // Update saved last node's context ids to remove those that are
2253 // assigned to other calls, so that it is ready for the next call at
2254 // this stack id.
2255 set_subtract(LastNodeContextIds, StackSequenceContextIds);
2256 if (LastNodeContextIds.empty())
2257 break;
2258 }
2259 }
2260 }
2261
2262 // Propagate the duplicate context ids over the graph.
2263 propagateDuplicateContextIds(OldToNewContextIds);
2264
2265 if (VerifyCCG)
2266 check();
2267
2268 // Now perform a post-order traversal over the graph, starting with the
2269 // allocation nodes, essentially processing nodes from callers to callees.
2270 // For any that contains an id in the map, update the graph to contain new
2271 // nodes representing any inlining at interior callsites. Note we move the
2272 // associated context ids over to the new nodes.
2273 DenseSet<const ContextNode *> Visited;
2274 DenseSet<uint32_t> ImportantContextIds(llvm::from_range,
2275 ImportantContextIdInfo.keys());
2276 for (auto &Entry : AllocationCallToContextNodeMap)
2277 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
2278 CallToMatchingCall, ImportantContextIds);
2279
2280 fixupImportantContexts();
2281
2282 if (VerifyCCG)
2283 check();
2284}
2285
2286uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
2287 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2288 Call->getMetadata(LLVMContext::MD_callsite));
2289 return CallsiteContext.back();
2290}
2291
// Return the last (innermost) stack id for the given summary-based callsite.
// The summary stores indices into the index's stack id list, so the value
// must be converted to an actual stack id.
uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
  CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
      CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
  // Need to convert index into stack id.
  return Index.getStackIdAtIndex(CallsiteContext.back());
}
2299
// Suffix appended (followed by the clone number) to the names of memprof
// function clones. Use a constexpr char array rather than a static
// std::string to avoid a dynamic initializer (global constructor), per the
// LLVM coding standards.
static constexpr char MemProfCloneSuffix[] = ".memprof.";
2301
2302static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
2303 // We use CloneNo == 0 to refer to the original version, which doesn't get
2304 // renamed with a suffix.
2305 if (!CloneNo)
2306 return Base.str();
2307 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
2308}
2309
2310static bool isMemProfClone(const Function &F) {
2311 return F.getName().contains(MemProfCloneSuffix);
2312}
2313
// Return the clone number of the given function by extracting it from the
// memprof suffix. Assumes the caller has already confirmed it is a memprof
// clone.
static unsigned getMemProfCloneNum(const Function &F) {
  // The clone number is the decimal text after the final '.' of the
  // ".memprof.N" suffix.
  auto Pos = F.getName().find_last_of('.');
  assert(Pos > 0);
  unsigned CloneNo;
  // Parse everything after the final '.' as a base-10 clone number; this
  // must succeed for a well-formed clone name.
  bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo);
  assert(!Err);
  (void)Err;
  return CloneNo;
}
2327
2328std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
2329 const Instruction *Call,
2330 unsigned CloneNo) const {
2331 return (Twine(Call->getFunction()->getName()) + " -> " +
2332 cast<CallBase>(Call)->getCalledFunction()->getName())
2333 .str();
2334}
2335
2336std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
2337 const IndexCall &Call,
2338 unsigned CloneNo) const {
2339 auto VI = FSToVIMap.find(Func);
2340 assert(VI != FSToVIMap.end());
2341 std::string CallerName = getMemProfFuncName(VI->second.name(), CloneNo);
2343 return CallerName + " -> alloc";
2344 else {
2345 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
2346 return CallerName + " -> " +
2347 getMemProfFuncName(Callsite->Callee.name(),
2348 Callsite->Clones[CloneNo]);
2349 }
2350}
2351
2352std::vector<uint64_t>
2353ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
2354 Instruction *Call) {
2355 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2356 Call->getMetadata(LLVMContext::MD_callsite));
2357 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
2358 CallsiteContext);
2359}
2360
// Collect the stack ids from the summary callsite record that have a
// context node created in the graph.
std::vector<uint64_t>
IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
  CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
      CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
  return getStackIdsWithContextNodes<CallsiteInfo,
                                     SmallVector<unsigned>::const_iterator>(
      CallsiteContext);
}
2370
2371template <typename DerivedCCG, typename FuncTy, typename CallTy>
2372template <class NodeT, class IteratorT>
2373std::vector<uint64_t>
2374CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
2375 CallStack<NodeT, IteratorT> &CallsiteContext) {
2376 std::vector<uint64_t> StackIds;
2377 for (auto IdOrIndex : CallsiteContext) {
2378 auto StackId = getStackId(IdOrIndex);
2379 ContextNode *Node = getNodeForStackId(StackId);
2380 if (!Node)
2381 break;
2382 StackIds.push_back(StackId);
2383 }
2384 return StackIds;
2385}
2386
// Build the callsite context graph from the IR for regular LTO: walk all
// calls in the module, creating allocation nodes from memprof metadata and
// recording callsite-metadata calls for the later stack node update passes.
ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
    Module &M,
    llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
    : Mod(M), OREGetter(OREGetter) {
  // Map for keeping track of the largest cold contexts up to the number given
  // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
  // must be sorted.
  std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
  for (auto &F : M) {
    std::vector<CallInfo> CallsWithMetadata;
    for (auto &BB : F) {
      for (auto &I : BB) {
        if (!isa<CallBase>(I))
          continue;
        if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
          CallsWithMetadata.push_back(&I);
          auto *AllocNode = addAllocNode(&I, &F);
          auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
          assert(CallsiteMD);
          CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
          // Add all of the MIBs and their stack nodes.
          for (auto &MDOp : MemProfMD->operands()) {
            auto *MIBMD = cast<const MDNode>(MDOp);
            std::vector<ContextTotalSize> ContextSizeInfo;
            // Collect the context size information if it exists.
            if (MIBMD->getNumOperands() > 2) {
              for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
                MDNode *ContextSizePair =
                    dyn_cast<MDNode>(MIBMD->getOperand(I));
                assert(ContextSizePair->getNumOperands() == 2);
                // NOTE(review): the declarations of FullStackId and TotalSize
                // (extracting the ConstantInt from each pair operand) appear
                // to have been dropped from this excerpt — confirm against
                // the original source.
                    ContextSizePair->getOperand(0))
                    ->getZExtValue();
                    ContextSizePair->getOperand(1))
                    ->getZExtValue();
                ContextSizeInfo.push_back({FullStackId, TotalSize});
              }
            }
            // NOTE(review): the construction of StackContext from the MIB's
            // stack metadata node appears to have been dropped from this
            // excerpt — confirm against the original source.
            addStackNodesForMIB<MDNode, MDNode::op_iterator>(
                AllocNode, StackContext, CallsiteContext,
                getMIBAllocType(MIBMD), ContextSizeInfo,
                TotalSizeToContextIdTopNCold);
          }
          // If exporting the graph to dot and an allocation id of interest was
          // specified, record all the context ids for this allocation node.
          if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
            DotAllocContextIds = AllocNode->getContextIds();
          assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
          // Memprof and callsite metadata on memory allocations no longer
          // needed.
          I.setMetadata(LLVMContext::MD_memprof, nullptr);
          I.setMetadata(LLVMContext::MD_callsite, nullptr);
        }
        // For callsite metadata, add to list for this function for later use.
        else if (I.getMetadata(LLVMContext::MD_callsite)) {
          CallsWithMetadata.push_back(&I);
        }
      }
    }
    if (!CallsWithMetadata.empty())
      FuncToCallsWithMetadata[&F] = CallsWithMetadata;
  }

  if (DumpCCG) {
    dbgs() << "CCG before updating call stack chains:\n";
    dbgs() << *this;
  }

  if (ExportToDot)
    exportToDot("prestackupdate");

  updateStackNodes();

  if (ExportToDot)
    exportToDot("poststackupdate");

  handleCallsitesWithMultipleTargets();

  markBackedges();

  // Strip off remaining callsite metadata, no longer needed.
  for (auto &FuncEntry : FuncToCallsWithMetadata)
    for (auto &Call : FuncEntry.second)
      Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
}
2476
2477// Finds the set of GUIDs for weak aliasees that are prevailing in different
2478// modules than any of their aliases. We need to handle these specially.
2480IndexCallsiteContextGraph::findAliaseeGUIDsPrevailingInDifferentModule() {
2481 DenseSet<GlobalValue::GUID> AliaseeGUIDs;
2482 for (auto &I : Index) {
2483 auto VI = Index.getValueInfo(I);
2484 for (auto &S : VI.getSummaryList()) {
2485 // We only care about aliases to functions.
2486 auto *AS = dyn_cast<AliasSummary>(S.get());
2487 if (!AS)
2488 continue;
2489 auto *AliaseeSummary = &AS->getAliasee();
2490 auto *AliaseeFS = dyn_cast<FunctionSummary>(AliaseeSummary);
2491 if (!AliaseeFS)
2492 continue;
2493 // Skip this summary if it is not for the prevailing symbol for this GUID.
2494 // The linker doesn't resolve local linkage values so don't check whether
2495 // those are prevailing.
2496 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2497 !isPrevailing(VI.getGUID(), S.get()))
2498 continue;
2499 // Prevailing aliasee could be in a different module only if it is weak.
2500 if (!GlobalValue::isWeakForLinker(AliaseeSummary->linkage()))
2501 continue;
2502 auto AliaseeGUID = AS->getAliaseeGUID();
2503 // If the aliasee copy in this module is not prevailing, record it.
2504 if (!isPrevailing(AliaseeGUID, AliaseeSummary))
2505 AliaseeGUIDs.insert(AliaseeGUID);
2506 }
2507 }
2508 AliaseesPrevailingInDiffModuleFromAlias += AliaseeGUIDs.size();
2509 return AliaseeGUIDs;
2510}
2511
// Build the callsite context graph from the summary index for ThinLTO: walk
// all prevailing function summaries, creating allocation nodes from the
// alloc records and recording callsite records for the later stack node
// update passes.
IndexCallsiteContextGraph::IndexCallsiteContextGraph(
    ModuleSummaryIndex &Index,
    llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
        isPrevailing)
    : Index(Index), isPrevailing(isPrevailing) {
  // Since we use the aliasee summary info to create the necessary clones for
  // its aliases, conservatively skip recording the aliasee function's callsites
  // in the CCG for any that are prevailing in a different module than one of
  // its aliases. We could record the necessary information to do this in the
  // summary, but this case should not be common.
  DenseSet<GlobalValue::GUID> GUIDsToSkip =
      findAliaseeGUIDsPrevailingInDifferentModule();
  // Map for keeping track of the largest cold contexts up to the number given
  // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
  // must be sorted.
  std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
  for (auto &I : Index) {
    auto VI = Index.getValueInfo(I);
    if (GUIDsToSkip.contains(VI.getGUID()))
      continue;
    for (auto &S : VI.getSummaryList()) {
      // We should only add the prevailing nodes. Otherwise we may try to clone
      // in a weak copy that won't be linked (and may be different than the
      // prevailing version).
      // We only keep the memprof summary on the prevailing copy now when
      // building the combined index, as a space optimization, however don't
      // rely on this optimization. The linker doesn't resolve local linkage
      // values so don't check whether those are prevailing.
      if (!GlobalValue::isLocalLinkage(S->linkage()) &&
          !isPrevailing(VI.getGUID(), S.get()))
        continue;
      auto *FS = dyn_cast<FunctionSummary>(S.get());
      if (!FS)
        continue;
      std::vector<CallInfo> CallsWithMetadata;
      if (!FS->allocs().empty()) {
        for (auto &AN : FS->mutableAllocs()) {
          // This can happen because of recursion elimination handling that
          // currently exists in ModuleSummaryAnalysis. Skip these for now.
          // We still added them to the summary because we need to be able to
          // correlate properly in applyImport in the backends.
          if (AN.MIBs.empty())
            continue;
          IndexCall AllocCall(&AN);
          CallsWithMetadata.push_back(AllocCall);
          auto *AllocNode = addAllocNode(AllocCall, FS);
          // Pass an empty CallStack to the CallsiteContext (second)
          // parameter, since for ThinLTO we already collapsed out the inlined
          // stack ids on the allocation call during ModuleSummaryAnalysis.
          // NOTE(review): the CallStack type on this declaration appears to
          // have been dropped from this excerpt — confirm against the
          // original source.
              EmptyContext;
          unsigned I = 0;
          // NOTE(review): the start of this assertion (checking that
          // ContextSizeInfos, if present, parallels MIBs) appears to have
          // been dropped from this excerpt.
                 AN.ContextSizeInfos.size() == AN.MIBs.size());
          // Now add all of the MIBs and their stack nodes.
          for (auto &MIB : AN.MIBs) {
            // NOTE(review): the CallStack type on this declaration appears
            // to have been dropped from this excerpt.
                StackContext(&MIB);
            std::vector<ContextTotalSize> ContextSizeInfo;
            if (!AN.ContextSizeInfos.empty()) {
              for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
                ContextSizeInfo.push_back({FullStackId, TotalSize});
            }
            addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
                AllocNode, StackContext, EmptyContext, MIB.AllocType,
                ContextSizeInfo, TotalSizeToContextIdTopNCold);
            I++;
          }
          // If exporting the graph to dot and an allocation id of interest was
          // specified, record all the context ids for this allocation node.
          if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
            DotAllocContextIds = AllocNode->getContextIds();
          assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
          // Initialize version 0 on the summary alloc node to the current alloc
          // type, unless it has both types in which case make it default, so
          // that in the case where we aren't able to clone the original version
          // always ends up with the default allocation behavior.
          AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
        }
      }
      // For callsite metadata, add to list for this function for later use.
      if (!FS->callsites().empty())
        for (auto &SN : FS->mutableCallsites()) {
          IndexCall StackNodeCall(&SN);
          CallsWithMetadata.push_back(StackNodeCall);
        }

      if (!CallsWithMetadata.empty())
        FuncToCallsWithMetadata[FS] = CallsWithMetadata;

      if (!FS->allocs().empty() || !FS->callsites().empty())
        FSToVIMap[FS] = VI;
    }
  }

  if (DumpCCG) {
    dbgs() << "CCG before updating call stack chains:\n";
    dbgs() << *this;
  }

  if (ExportToDot)
    exportToDot("prestackupdate");

  updateStackNodes();

  if (ExportToDot)
    exportToDot("poststackupdate");

  handleCallsitesWithMultipleTargets();

  markBackedges();
}
2624
// Find stack nodes whose recorded calls do not all target the same callees
// and repair them: partition by callee, prune mismatches, and synthesize
// nodes for tail-call chains discovered between profiled callers/callees.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy,
                          CallTy>::handleCallsitesWithMultipleTargets() {
  // Look for and workaround callsites that call multiple functions.
  // This can happen for indirect calls, which needs better handling, and in
  // more rare cases (e.g. macro expansion).
  // TODO: To fix this for indirect calls we will want to perform speculative
  // devirtualization using either the normal PGO info with ICP, or using the
  // information in the profiled MemProf contexts. We can do this prior to
  // this transformation for regular LTO, and for ThinLTO we can simulate that
  // effect in the summary and perform the actual speculative devirtualization
  // while cloning in the ThinLTO backend.

  // Keep track of the new nodes synthesized for discovered tail calls missing
  // from the profiled contexts.
  MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;

  // Calls whose node's primary call gets updated below; the map entries are
  // added after iteration over the map completes.
  std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
  for (auto &Entry : NonAllocationCallToContextNodeMap) {
    auto *Node = Entry.second;
    assert(Node->Clones.empty());
    // Check all node callees and see if in the same function.
    // We need to check all of the calls recorded in this Node, because in some
    // cases we may have had multiple calls with the same debug info calling
    // different callees. This can happen, for example, when an object is
    // constructed in the parameter list - the destructor call of the object has
    // the same debug info (line/col) as the call the object was passed to.
    // Here we will prune any that don't match all callee nodes.
    std::vector<CallInfo> AllCalls;
    AllCalls.reserve(Node->MatchingCalls.size() + 1);
    AllCalls.push_back(Node->Call);
    llvm::append_range(AllCalls, Node->MatchingCalls);

    // First see if we can partition the calls by callee function, creating new
    // nodes to host each set of calls calling the same callees. This is
    // necessary for support indirect calls with ThinLTO, for which we
    // synthesized CallsiteInfo records for each target. They will all have the
    // same callsite stack ids and would be sharing a context node at this
    // point. We need to perform separate cloning for each, which will be
    // applied along with speculative devirtualization in the ThinLTO backends
    // as needed. Note this does not currently support looking through tail
    // calls, it is unclear if we need that for indirect call targets.
    // First partition calls by callee func. Map indexed by func, value is
    // struct with list of matching calls, assigned node.
    if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
      continue;

    auto It = AllCalls.begin();
    // Iterate through the calls until we find the first that matches.
    for (; It != AllCalls.end(); ++It) {
      auto ThisCall = *It;
      bool Match = true;
      for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
           ++EI) {
        auto Edge = *EI;
        if (!Edge->Callee->hasCall())
          continue;
        assert(NodeToCallingFunc.count(Edge->Callee));
        // Check if the called function matches that of the callee node.
        if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
          Match = false;
          break;
        }
      }
      // Found a call that matches the callee nodes, we can quit now.
      if (Match) {
        // If the first match is not the primary call on the Node, update it
        // now. We will update the list of matching calls further below.
        if (Node->Call != ThisCall) {
          Node->setCall(ThisCall);
          // We need to update the NonAllocationCallToContextNodeMap, but don't
          // want to do this during iteration over that map, so save the calls
          // that need updated entries.
          NewCallToNode.push_back({ThisCall, Node});
        }
        break;
      }
    }
    // We will update this list below (or leave it cleared if there was no
    // match found above).
    Node->MatchingCalls.clear();
    // If we hit the end of the AllCalls vector, no call matching the callee
    // nodes was found, clear the call information in the node.
    if (It == AllCalls.end()) {
      RemovedEdgesWithMismatchedCallees++;
      // Work around by setting Node to have a null call, so it gets
      // skipped during cloning. Otherwise assignFunctions will assert
      // because its data structures are not designed to handle this case.
      Node->setCall(CallInfo());
      continue;
    }
    // Now add back any matching calls that call the same function as the
    // matching primary call on Node.
    for (++It; It != AllCalls.end(); ++It) {
      auto ThisCall = *It;
      if (!sameCallee(Node->Call.call(), ThisCall.call()))
        continue;
      Node->MatchingCalls.push_back(ThisCall);
    }
  }

  // Remove all mismatched nodes identified in the above loop from the node map
  // (checking whether they have a null call which is set above). For a
  // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
  // to do the removal via remove_if than by individually erasing entries above.
  // Also remove any entries if we updated the node's primary call above.
  NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
    return !it.second->hasCall() || it.second->Call != it.first;
  });

  // Add entries for any new primary calls recorded above.
  for (auto &[Call, Node] : NewCallToNode)
    NonAllocationCallToContextNodeMap[Call] = Node;

  // Add the new nodes after the above loop so that the iteration is not
  // invalidated.
  for (auto &[Call, Node] : TailCallToContextNodeMap)
    NonAllocationCallToContextNodeMap[Call] = Node;
}
2744
// Partition the given calls (all recorded on Node) by their callee function,
// giving each group its own context node (reusing Node for the first group)
// and moving the matching callee edges onto it. Returns false if no call had
// a matching callee, so the caller can fall back to tail-call handling.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
    ContextNode *Node, ArrayRef<CallInfo> AllCalls,
    std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
  // Struct to keep track of all the calls having the same callee function,
  // and the node we eventually assign to them. Eventually we will record the
  // context node assigned to this group of calls.
  struct CallsWithSameCallee {
    std::vector<CallInfo> Calls;
    ContextNode *Node = nullptr;
  };

  // First partition calls by callee function. Build map from each function
  // to the list of matching calls.
  // NOTE(review): the declaration of the CalleeFuncToCallInfo map appears to
  // have been dropped from this excerpt — confirm against the original
  // source.
  for (auto ThisCall : AllCalls) {
    auto *F = getCalleeFunc(ThisCall.call());
    if (F)
      CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
  }

  // Next, walk through all callee edges. For each callee node, get its
  // containing function and see if it was recorded in the above map (meaning we
  // have at least one matching call). Build another map from each callee node
  // with a matching call to the structure instance created above containing all
  // the calls.
  // NOTE(review): the declaration of the CalleeNodeToCallInfo map appears to
  // have been dropped from this excerpt — confirm against the original
  // source.
  for (const auto &Edge : Node->CalleeEdges) {
    if (!Edge->Callee->hasCall())
      continue;
    const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
    if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
      CalleeNodeToCallInfo[Edge->Callee] =
          &CalleeFuncToCallInfo[ProfiledCalleeFunc];
  }

  // If there are no entries in the second map, then there were no matching
  // calls/callees; nothing to do here. Return so we can go to the handling
  // that looks through tail calls.
  if (CalleeNodeToCallInfo.empty())
    return false;

  // Walk through all callee edges again. Any and all callee edges that didn't
  // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
  // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
  // ignored during cloning. If it is in the map, then we use the node recorded
  // in that entry (creating it if needed), and move the callee edge to it.
  // The first callee will use the original node instead of creating a new one.
  // Note that any of the original calls on this node (in AllCalls) that didn't
  // have a callee function automatically get dropped from the node as part of
  // this process.
  ContextNode *UnmatchedCalleesNode = nullptr;
  // Track whether we already assigned original node to a callee.
  bool UsedOrigNode = false;
  assert(NodeToCallingFunc[Node]);
  // Iterate over a copy of Node's callee edges, since we may need to remove
  // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
  // makes it less error-prone.
  auto CalleeEdges = Node->CalleeEdges;
  for (auto &Edge : CalleeEdges) {
    if (!Edge->Callee->hasCall())
      continue;

    // Will be updated below to point to whatever (caller) node this callee edge
    // should be moved to.
    ContextNode *CallerNodeToUse = nullptr;

    // Handle the case where there were no matching calls first. Move this
    // callee edge to the UnmatchedCalleesNode, creating it if needed.
    if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
      if (!UnmatchedCalleesNode)
        UnmatchedCalleesNode =
            createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
      CallerNodeToUse = UnmatchedCalleesNode;
    } else {
      // Look up the information recorded for this callee node, and use the
      // recorded caller node (creating it if needed).
      auto *Info = CalleeNodeToCallInfo[Edge->Callee];
      if (!Info->Node) {
        // If we haven't assigned any callees to the original node use it.
        if (!UsedOrigNode) {
          Info->Node = Node;
          // Clear the set of matching calls which will be updated below.
          Node->MatchingCalls.clear();
          UsedOrigNode = true;
        } else
          Info->Node =
              createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
        assert(!Info->Calls.empty());
        // The first call becomes the primary call for this caller node, and the
        // rest go in the matching calls list.
        Info->Node->setCall(Info->Calls.front());
        llvm::append_range(Info->Node->MatchingCalls,
                           llvm::drop_begin(Info->Calls));
        // Save the primary call to node correspondence so that we can update
        // the NonAllocationCallToContextNodeMap, which is being iterated in the
        // caller of this function.
        NewCallToNode.push_back({Info->Node->Call, Info->Node});
      }
      CallerNodeToUse = Info->Node;
    }

    // Don't need to move edge if we are using the original node.
    if (CallerNodeToUse == Node)
      continue;

    moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse);
  }
  // Now that we are done moving edges, clean up any caller edges that ended
  // up with no type or context ids. During moveCalleeEdgeToNewCaller all
  // caller edges from Node are replicated onto the new callers, and it
  // simplifies the handling to leave them until we have moved all
  // edges/context ids.
  for (auto &I : CalleeNodeToCallInfo)
    removeNoneTypeCallerEdges(I.second->Node);
  if (UnmatchedCalleesNode)
    removeNoneTypeCallerEdges(UnmatchedCalleesNode);
  removeNoneTypeCallerEdges(Node);

  return true;
}
2866
// Convert the opaque id-or-index value stored on stack nodes to a stack id.
uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
  // In the Module (IR) case this is already the Id.
  return IdOrIndex;
}
2871
// Convert the opaque id-or-index value stored on stack nodes to a stack id.
uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
  // In the Index case this is an index into the stack id list in the summary
  // index, convert it to an Id.
  return Index.getStackIdAtIndex(IdOrIndex);
}
2877
// Check that the callsite Call actually calls the function of the callee
// node of edge *EI, possibly through a chain of tail calls not present in
// the profiled contexts. If a unique tail call chain is found, synthesize
// nodes/edges for it, splice them in between the profiled caller and callee,
// and remove the original edge. EI is carefully kept valid so the caller's
// iteration over the callee edge list can continue after this returns.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
    CallTy Call, EdgeIter &EI,
    MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
  auto Edge = *EI;
  const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
  const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
  // Will be populated in order of callee to caller if we find a chain of tail
  // calls between the profiled caller and callee.
  std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
  if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
                         FoundCalleeChain))
    return false;

  // The usual case where the profiled callee matches that of the IR/summary.
  if (FoundCalleeChain.empty())
    return true;

  // Helper that connects Caller->Callee with an edge carrying the original
  // edge's context ids and alloc types, merging into an existing edge if one
  // is already present.
  auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
    auto *CurEdge = Callee->findEdgeFromCaller(Caller);
    // If there is already an edge between these nodes, simply update it and
    // return.
    if (CurEdge) {
      CurEdge->ContextIds.insert_range(Edge->ContextIds);
      CurEdge->AllocTypes |= Edge->AllocTypes;
      return;
    }
    // Otherwise, create a new edge and insert it into the caller and callee
    // lists.
    auto NewEdge = std::make_shared<ContextEdge>(
        Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
    Callee->CallerEdges.push_back(NewEdge);
    if (Caller == Edge->Caller) {
      // If we are inserting the new edge into the current edge's caller, insert
      // the new edge before the current iterator position, and then increment
      // back to the current edge.
      EI = Caller->CalleeEdges.insert(EI, NewEdge);
      ++EI;
      assert(*EI == Edge &&
             "Iterator position not restored after insert and increment");
    } else
      Caller->CalleeEdges.push_back(NewEdge);
  };

  // Create new nodes for each found callee and connect in between the profiled
  // caller and callee.
  auto *CurCalleeNode = Edge->Callee;
  for (auto &[NewCall, Func] : FoundCalleeChain) {
    ContextNode *NewNode = nullptr;
    // First check if we have already synthesized a node for this tail call.
    if (TailCallToContextNodeMap.count(NewCall)) {
      NewNode = TailCallToContextNodeMap[NewCall];
      NewNode->AllocTypes |= Edge->AllocTypes;
    } else {
      FuncToCallsWithMetadata[Func].push_back({NewCall});
      // Create Node and record node info.
      NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
      TailCallToContextNodeMap[NewCall] = NewNode;
      NewNode->AllocTypes = Edge->AllocTypes;
    }

    // Hook up node to its callee node
    AddEdge(NewNode, CurCalleeNode);

    CurCalleeNode = NewNode;
  }

  // Hook up edge's original caller to new callee node.
  AddEdge(Edge->Caller, CurCalleeNode);

#ifndef NDEBUG
  // Save this because Edge's fields get cleared below when removed.
  auto *Caller = Edge->Caller;
#endif

  // Remove old edge
  removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);

  // To simplify the increment of EI in the caller, subtract one from EI.
  // In the final AddEdge call we would have either added a new callee edge,
  // to Edge->Caller, or found an existing one. Either way we are guaranteed
  // that there is at least one callee edge.
  assert(!Caller->CalleeEdges.empty());
  --EI;

  return true;
}
2965
2966bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2967 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
2968 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
2969 bool &FoundMultipleCalleeChains) {
2970 // Stop recursive search if we have already explored the maximum specified
2971 // depth.
2973 return false;
2974
2975 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
2976 FoundCalleeChain.push_back({Callsite, F});
2977 };
2978
2979 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2980 if (!CalleeFunc) {
2981 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2982 assert(Alias);
2983 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2984 assert(CalleeFunc);
2985 }
2986
2987 // Look for tail calls in this function, and check if they either call the
2988 // profiled callee directly, or indirectly (via a recursive search).
2989 // Only succeed if there is a single unique tail call chain found between the
2990 // profiled caller and callee, otherwise we could perform incorrect cloning.
2991 bool FoundSingleCalleeChain = false;
2992 for (auto &BB : *CalleeFunc) {
2993 for (auto &I : BB) {
2994 auto *CB = dyn_cast<CallBase>(&I);
2995 if (!CB || !CB->isTailCall())
2996 continue;
2997 auto *CalledValue = CB->getCalledOperand();
2998 auto *CalledFunction = CB->getCalledFunction();
2999 if (CalledValue && !CalledFunction) {
3000 CalledValue = CalledValue->stripPointerCasts();
3001 // Stripping pointer casts can reveal a called function.
3002 CalledFunction = dyn_cast<Function>(CalledValue);
3003 }
3004 // Check if this is an alias to a function. If so, get the
3005 // called aliasee for the checks below.
3006 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
3007 assert(!CalledFunction &&
3008 "Expected null called function in callsite for alias");
3009 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
3010 }
3011 if (!CalledFunction)
3012 continue;
3013 if (CalledFunction == ProfiledCallee) {
3014 if (FoundSingleCalleeChain) {
3015 FoundMultipleCalleeChains = true;
3016 return false;
3017 }
3018 FoundSingleCalleeChain = true;
3019 FoundProfiledCalleeCount++;
3020 FoundProfiledCalleeDepth += Depth;
3021 if (Depth > FoundProfiledCalleeMaxDepth)
3022 FoundProfiledCalleeMaxDepth = Depth;
3023 SaveCallsiteInfo(&I, CalleeFunc);
3024 } else if (findProfiledCalleeThroughTailCalls(
3025 ProfiledCallee, CalledFunction, Depth + 1,
3026 FoundCalleeChain, FoundMultipleCalleeChains)) {
3027 // findProfiledCalleeThroughTailCalls should not have returned
3028 // true if FoundMultipleCalleeChains.
3029 assert(!FoundMultipleCalleeChains);
3030 if (FoundSingleCalleeChain) {
3031 FoundMultipleCalleeChains = true;
3032 return false;
3033 }
3034 FoundSingleCalleeChain = true;
3035 SaveCallsiteInfo(&I, CalleeFunc);
3036 } else if (FoundMultipleCalleeChains)
3037 return false;
3038 }
3039 }
3040
3041 return FoundSingleCalleeChain;
3042}
3043
3044const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
3045 auto *CB = dyn_cast<CallBase>(Call);
3046 if (!CB->getCalledOperand() || CB->isIndirectCall())
3047 return nullptr;
3048 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
3049 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
3050 if (Alias)
3051 return dyn_cast<Function>(Alias->getAliasee());
3052 return dyn_cast<Function>(CalleeVal);
3053}
3054
3055bool ModuleCallsiteContextGraph::calleeMatchesFunc(
3056 Instruction *Call, const Function *Func, const Function *CallerFunc,
3057 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
3058 auto *CB = dyn_cast<CallBase>(Call);
3059 if (!CB->getCalledOperand() || CB->isIndirectCall())
3060 return false;
3061 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
3062 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
3063 if (CalleeFunc == Func)
3064 return true;
3065 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
3066 if (Alias && Alias->getAliasee() == Func)
3067 return true;
3068
3069 // Recursively search for the profiled callee through tail calls starting with
3070 // the actual Callee. The discovered tail call chain is saved in
3071 // FoundCalleeChain, and we will fixup the graph to include these callsites
3072 // after returning.
3073 // FIXME: We will currently redo the same recursive walk if we find the same
3074 // mismatched callee from another callsite. We can improve this with more
3075 // bookkeeping of the created chain of new nodes for each mismatch.
3076 unsigned Depth = 1;
3077 bool FoundMultipleCalleeChains = false;
3078 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
3079 FoundCalleeChain,
3080 FoundMultipleCalleeChains)) {
3081 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
3082 << Func->getName() << " from " << CallerFunc->getName()
3083 << " that actually called " << CalleeVal->getName()
3084 << (FoundMultipleCalleeChains
3085 ? " (found multiple possible chains)"
3086 : "")
3087 << "\n");
3088 if (FoundMultipleCalleeChains)
3089 FoundProfiledCalleeNonUniquelyCount++;
3090 return false;
3091 }
3092
3093 return true;
3094}
3095
3096bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
3097 Instruction *Call2) {
3098 auto *CB1 = cast<CallBase>(Call1);
3099 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
3100 return false;
3101 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
3102 auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
3103 auto *CB2 = cast<CallBase>(Call2);
3104 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
3105 return false;
3106 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
3107 auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
3108 return CalleeFunc1 == CalleeFunc2;
3109}
3110
3111bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
3112 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
3113 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
3114 bool &FoundMultipleCalleeChains) {
3115 // Stop recursive search if we have already explored the maximum specified
3116 // depth.
3118 return false;
3119
3120 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
3121 // Make a CallsiteInfo for each discovered callee, if one hasn't already
3122 // been synthesized.
3123 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
3124 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
3125 // StackIds is empty (we don't have debug info available in the index for
3126 // these callsites)
3127 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
3128 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
3129 CallsiteInfo *NewCallsiteInfo =
3130 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
3131 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
3132 };
3133
3134 // Look for tail calls in this function, and check if they either call the
3135 // profiled callee directly, or indirectly (via a recursive search).
3136 // Only succeed if there is a single unique tail call chain found between the
3137 // profiled caller and callee, otherwise we could perform incorrect cloning.
3138 bool FoundSingleCalleeChain = false;
3139 for (auto &S : CurCallee.getSummaryList()) {
3140 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
3141 !isPrevailing(CurCallee.getGUID(), S.get()))
3142 continue;
3143 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
3144 if (!FS)
3145 continue;
3146 auto FSVI = CurCallee;
3147 auto *AS = dyn_cast<AliasSummary>(S.get());
3148 if (AS)
3149 FSVI = AS->getAliaseeVI();
3150 for (auto &CallEdge : FS->calls()) {
3151 if (!CallEdge.second.hasTailCall())
3152 continue;
3153 if (CallEdge.first == ProfiledCallee) {
3154 if (FoundSingleCalleeChain) {
3155 FoundMultipleCalleeChains = true;
3156 return false;
3157 }
3158 FoundSingleCalleeChain = true;
3159 FoundProfiledCalleeCount++;
3160 FoundProfiledCalleeDepth += Depth;
3161 if (Depth > FoundProfiledCalleeMaxDepth)
3162 FoundProfiledCalleeMaxDepth = Depth;
3163 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
3164 // Add FS to FSToVIMap in case it isn't already there.
3165 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
3166 FSToVIMap[FS] = FSVI;
3167 } else if (findProfiledCalleeThroughTailCalls(
3168 ProfiledCallee, CallEdge.first, Depth + 1,
3169 FoundCalleeChain, FoundMultipleCalleeChains)) {
3170 // findProfiledCalleeThroughTailCalls should not have returned
3171 // true if FoundMultipleCalleeChains.
3172 assert(!FoundMultipleCalleeChains);
3173 if (FoundSingleCalleeChain) {
3174 FoundMultipleCalleeChains = true;
3175 return false;
3176 }
3177 FoundSingleCalleeChain = true;
3178 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
3179 // Add FS to FSToVIMap in case it isn't already there.
3180 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
3181 FSToVIMap[FS] = FSVI;
3182 } else if (FoundMultipleCalleeChains)
3183 return false;
3184 }
3185 }
3186
3187 return FoundSingleCalleeChain;
3188}
3189
3190const FunctionSummary *
3191IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
3192 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
3193 if (Callee.getSummaryList().empty())
3194 return nullptr;
3195 return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
3196}
3197
3198bool IndexCallsiteContextGraph::calleeMatchesFunc(
3199 IndexCall &Call, const FunctionSummary *Func,
3200 const FunctionSummary *CallerFunc,
3201 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
3202 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
3203 // If there is no summary list then this is a call to an externally defined
3204 // symbol.
3205 AliasSummary *Alias =
3206 Callee.getSummaryList().empty()
3207 ? nullptr
3208 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
3209 assert(FSToVIMap.count(Func));
3210 auto FuncVI = FSToVIMap[Func];
3211 if (Callee == FuncVI ||
3212 // If callee is an alias, check the aliasee, since only function
3213 // summary base objects will contain the stack node summaries and thus
3214 // get a context node.
3215 (Alias && Alias->getAliaseeVI() == FuncVI))
3216 return true;
3217
3218 // Recursively search for the profiled callee through tail calls starting with
3219 // the actual Callee. The discovered tail call chain is saved in
3220 // FoundCalleeChain, and we will fixup the graph to include these callsites
3221 // after returning.
3222 // FIXME: We will currently redo the same recursive walk if we find the same
3223 // mismatched callee from another callsite. We can improve this with more
3224 // bookkeeping of the created chain of new nodes for each mismatch.
3225 unsigned Depth = 1;
3226 bool FoundMultipleCalleeChains = false;
3227 if (!findProfiledCalleeThroughTailCalls(
3228 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
3229 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
3230 << " from " << FSToVIMap[CallerFunc]
3231 << " that actually called " << Callee
3232 << (FoundMultipleCalleeChains
3233 ? " (found multiple possible chains)"
3234 : "")
3235 << "\n");
3236 if (FoundMultipleCalleeChains)
3237 FoundProfiledCalleeNonUniquelyCount++;
3238 return false;
3239 }
3240
3241 return true;
3242}
3243
3244bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
3245 ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee;
3246 ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee;
3247 return Callee1 == Callee2;
3248}
3249
// Debug helper: print this node's full description to the debug stream,
// followed by a newline.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
    const {
  print(dbgs());
  dbgs() << "\n";
}
3256
3257template <typename DerivedCCG, typename FuncTy, typename CallTy>
3258void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
3259 raw_ostream &OS) const {
3260 OS << "Node " << this << "\n";
3261 OS << "\t";
3262 printCall(OS);
3263 if (Recursive)
3264 OS << " (recursive)";
3265 OS << "\n";
3266 if (!MatchingCalls.empty()) {
3267 OS << "\tMatchingCalls:\n";
3268 for (auto &MatchingCall : MatchingCalls) {
3269 OS << "\t";
3270 MatchingCall.print(OS);
3271 OS << "\n";
3272 }
3273 }
3274 OS << "\tNodeId: " << NodeId << "\n";
3275 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
3276 OS << "\tContextIds:";
3277 // Make a copy of the computed context ids that we can sort for stability.
3278 auto ContextIds = getContextIds();
3279 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3280 std::sort(SortedIds.begin(), SortedIds.end());
3281 for (auto Id : SortedIds)
3282 OS << " " << Id;
3283 OS << "\n";
3284 OS << "\tCalleeEdges:\n";
3285 for (auto &Edge : CalleeEdges)
3286 OS << "\t\t" << *Edge << " (Callee NodeId: " << Edge->Callee->NodeId
3287 << ")\n";
3288 OS << "\tCallerEdges:\n";
3289 for (auto &Edge : CallerEdges)
3290 OS << "\t\t" << *Edge << " (Caller NodeId: " << Edge->Caller->NodeId
3291 << ")\n";
3292 if (!Clones.empty()) {
3293 OS << "\tClones: ";
3294 ListSeparator LS;
3295 for (auto *C : Clones)
3296 OS << LS << C << " NodeId: " << C->NodeId;
3297 OS << "\n";
3298 } else if (CloneOf) {
3299 OS << "\tClone of " << CloneOf << " NodeId: " << CloneOf->NodeId << "\n";
3300 }
3301}
3302
// Debug helper: print this edge's description to the debug stream, followed
// by a newline.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
    const {
  print(dbgs());
  dbgs() << "\n";
}
3309
3310template <typename DerivedCCG, typename FuncTy, typename CallTy>
3311void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
3312 raw_ostream &OS) const {
3313 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
3314 << (IsBackedge ? " (BE)" : "")
3315 << " AllocTypes: " << getAllocTypeString(AllocTypes);
3316 OS << " ContextIds:";
3317 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3318 std::sort(SortedIds.begin(), SortedIds.end());
3319 for (auto Id : SortedIds)
3320 OS << " " << Id;
3321}
3322
// Debug helper: print the entire graph to the debug stream.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
  print(dbgs());
}
3327
3328template <typename DerivedCCG, typename FuncTy, typename CallTy>
3329void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
3330 raw_ostream &OS) const {
3331 OS << "Callsite Context Graph:\n";
3332 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3333 for (const auto Node : nodes<GraphType>(this)) {
3334 if (Node->isRemoved())
3335 continue;
3336 Node->print(OS);
3337 OS << "\n";
3338 }
3339}
3340
3341template <typename DerivedCCG, typename FuncTy, typename CallTy>
3342void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
3343 raw_ostream &OS,
3344 function_ref<void(StringRef, StringRef, const Twine &)> EmitRemark) const {
3345 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3346 for (const auto Node : nodes<GraphType>(this)) {
3347 if (Node->isRemoved())
3348 continue;
3349 if (!Node->IsAllocation)
3350 continue;
3351 DenseSet<uint32_t> ContextIds = Node->getContextIds();
3352 auto AllocTypeFromCall = getAllocationCallType(Node->Call);
3353 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3354 std::sort(SortedIds.begin(), SortedIds.end());
3355 for (auto Id : SortedIds) {
3356 auto TypeI = ContextIdToAllocationType.find(Id);
3357 assert(TypeI != ContextIdToAllocationType.end());
3358 auto CSI = ContextIdToContextSizeInfos.find(Id);
3359 if (CSI != ContextIdToContextSizeInfos.end()) {
3360 for (auto &Info : CSI->second) {
3361 std::string Msg =
3362 "MemProf hinting: " + getAllocTypeString((uint8_t)TypeI->second) +
3363 " full allocation context " + std::to_string(Info.FullStackId) +
3364 " with total size " + std::to_string(Info.TotalSize) + " is " +
3365 getAllocTypeString(Node->AllocTypes) + " after cloning";
3366 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
3367 Msg += " marked " + getAllocTypeString((uint8_t)AllocTypeFromCall) +
3368 " due to cold byte percent";
3369 // Print the internal context id to aid debugging and visualization.
3370 Msg += " (internal context id " + std::to_string(Id) + ")";
3372 OS << Msg << "\n";
3373 if (EmitRemark)
3374 EmitRemark(DEBUG_TYPE, "MemProfReport", Msg);
3375 }
3376 } else {
3377 // This is only emitted if the context size info is not present.
3378 std::string Msg =
3379 "MemProf hinting: " + getAllocTypeString((uint8_t)TypeI->second) +
3380 " is " + getAllocTypeString(Node->AllocTypes) + " after cloning";
3381 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
3382 Msg += " marked " + getAllocTypeString((uint8_t)AllocTypeFromCall) +
3383 " due to cold byte percent";
3384 // Print the internal context id to aid debugging and visualization.
3385 Msg += " (internal context id " + std::to_string(Id) + ")";
3387 OS << Msg << "\n";
3388 if (EmitRemark)
3389 EmitRemark(DEBUG_TYPE, "MemProfReport", Msg);
3390 }
3391 }
3392 }
3393}
3394
3395template <typename DerivedCCG, typename FuncTy, typename CallTy>
3396void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
3397 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3398 for (const auto Node : nodes<GraphType>(this)) {
3399 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3400 for (auto &Edge : Node->CallerEdges)
3402 }
3403}
3404
3405template <typename DerivedCCG, typename FuncTy, typename CallTy>
3406struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
3407 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3408 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
3409
3410 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
3411 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
3412
3415 decltype(&getNode)>;
3416
3418 return nodes_iterator(G->NodeOwner.begin(), &getNode);
3419 }
3420
3422 return nodes_iterator(G->NodeOwner.end(), &getNode);
3423 }
3424
3426 return G->NodeOwner.begin()->get();
3427 }
3428
3429 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
3430 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
3432 return P->Callee;
3433 }
3434
3436 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
3437 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
3438 decltype(&GetCallee)>;
3439
3441 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
3442 }
3443
3445 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
3446 }
3447};
3448
3449template <typename DerivedCCG, typename FuncTy, typename CallTy>
3450struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
3451 : public DefaultDOTGraphTraits {
3452 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {
3453 // If the user requested the full graph to be exported, but provided an
3454 // allocation id, or if the user gave a context id and requested more than
3455 // just a specific context to be exported, note that highlighting is
3456 // enabled.
3457 DoHighlight =
3458 (AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) ||
3459 (ContextIdForDot.getNumOccurrences() &&
3461 }
3462
3463 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3465 using NodeRef = typename GTraits::NodeRef;
3466 using ChildIteratorType = typename GTraits::ChildIteratorType;
3467
3468 static std::string getNodeLabel(NodeRef Node, GraphType G) {
3469 std::string LabelString =
3470 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
3471 Twine(Node->OrigStackOrAllocId) + " NodeId: " + Twine(Node->NodeId))
3472 .str();
3473 LabelString += "\n";
3474 if (Node->hasCall()) {
3475 auto Func = G->NodeToCallingFunc.find(Node);
3476 assert(Func != G->NodeToCallingFunc.end());
3477 LabelString +=
3478 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
3479 for (auto &MatchingCall : Node->MatchingCalls) {
3480 LabelString += "\n";
3481 LabelString += G->getLabel(Func->second, MatchingCall.call(),
3482 MatchingCall.cloneNo());
3483 }
3484 } else {
3485 LabelString += "null call";
3486 if (Node->Recursive)
3487 LabelString += " (recursive)";
3488 else
3489 LabelString += " (external)";
3490 }
3491 return LabelString;
3492 }
3493
3495 auto ContextIds = Node->getContextIds();
3496 // If highlighting enabled, see if this node contains any of the context ids
3497 // of interest. If so, it will use a different color and a larger fontsize
3498 // (which makes the node larger as well).
3499 bool Highlight = false;
3500 if (DoHighlight) {
3501 assert(ContextIdForDot.getNumOccurrences() ||
3502 AllocIdForDot.getNumOccurrences());
3503 if (ContextIdForDot.getNumOccurrences())
3504 Highlight = ContextIds.contains(ContextIdForDot);
3505 else
3506 Highlight = set_intersects(ContextIds, G->DotAllocContextIds);
3507 }
3508 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
3509 getContextIds(ContextIds) + "\"")
3510 .str();
3511 // Default fontsize is 14
3512 if (Highlight)
3513 AttributeString += ",fontsize=\"30\"";
3514 AttributeString +=
3515 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes, Highlight) + "\"")
3516 .str();
3517 if (Node->CloneOf) {
3518 AttributeString += ",color=\"blue\"";
3519 AttributeString += ",style=\"filled,bold,dashed\"";
3520 } else
3521 AttributeString += ",style=\"filled\"";
3522 return AttributeString;
3523 }
3524
3525 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
3526 GraphType G) {
3527 auto &Edge = *(ChildIter.getCurrent());
3528 // If highlighting enabled, see if this edge contains any of the context ids
3529 // of interest. If so, it will use a different color and a heavier arrow
3530 // size and weight (the larger weight makes the highlighted path
3531 // straighter).
3532 bool Highlight = false;
3533 if (DoHighlight) {
3534 assert(ContextIdForDot.getNumOccurrences() ||
3535 AllocIdForDot.getNumOccurrences());
3536 if (ContextIdForDot.getNumOccurrences())
3537 Highlight = Edge->ContextIds.contains(ContextIdForDot);
3538 else
3539 Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds);
3540 }
3541 auto Color = getColor(Edge->AllocTypes, Highlight);
3542 std::string AttributeString =
3543 (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
3544 // fillcolor is the arrow head and color is the line
3545 Twine(",fillcolor=\"") + Color + "\"" + Twine(",color=\"") + Color +
3546 "\"")
3547 .str();
3548 if (Edge->IsBackedge)
3549 AttributeString += ",style=\"dotted\"";
3550 // Default penwidth and weight are both 1.
3551 if (Highlight)
3552 AttributeString += ",penwidth=\"2.0\",weight=\"2\"";
3553 return AttributeString;
3554 }
3555
3556 // Since the NodeOwners list includes nodes that are no longer connected to
3557 // the graph, skip them here.
3559 if (Node->isRemoved())
3560 return true;
3561 // If a scope smaller than the full graph was requested, see if this node
3562 // contains any of the context ids of interest.
3564 return !set_intersects(Node->getContextIds(), G->DotAllocContextIds);
3566 return !Node->getContextIds().contains(ContextIdForDot);
3567 return false;
3568 }
3569
3570private:
3571 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
3572 std::string IdString = "ContextIds:";
3573 if (ContextIds.size() < 100) {
3574 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3575 std::sort(SortedIds.begin(), SortedIds.end());
3576 for (auto Id : SortedIds)
3577 IdString += (" " + Twine(Id)).str();
3578 } else {
3579 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
3580 }
3581 return IdString;
3582 }
3583
3584 static std::string getColor(uint8_t AllocTypes, bool Highlight) {
3585 // If DoHighlight is not enabled, we want to use the highlight colors for
3586 // NotCold and Cold, and the non-highlight color for NotCold+Cold. This is
3587 // both compatible with the color scheme before highlighting was supported,
3588 // and for the NotCold+Cold color the non-highlight color is a bit more
3589 // readable.
3590 if (AllocTypes == (uint8_t)AllocationType::NotCold)
3591 // Color "brown1" actually looks like a lighter red.
3592 return !DoHighlight || Highlight ? "brown1" : "lightpink";
3593 if (AllocTypes == (uint8_t)AllocationType::Cold)
3594 return !DoHighlight || Highlight ? "cyan" : "lightskyblue";
3595 if (AllocTypes ==
3596 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
3597 return Highlight ? "magenta" : "mediumorchid1";
3598 return "gray";
3599 }
3600
3601 static std::string getNodeId(NodeRef Node) {
3602 std::stringstream SStream;
3603 SStream << std::hex << "N0x" << (unsigned long long)Node;
3604 std::string Result = SStream.str();
3605 return Result;
3606 }
3607
3608 // True if we should highlight a specific context or allocation's contexts in
3609 // the emitted graph.
3610 static bool DoHighlight;
3611};
3612
// Out-of-line definition of the static DoHighlight flag; it defaults to
// false and is set by the DOTGraphTraits constructor based on the dot-export
// command line options.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool DOTGraphTraits<
    const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight =
    false;
3617
3618template <typename DerivedCCG, typename FuncTy, typename CallTy>
3619void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3620 std::string Label) const {
3621 WriteGraph(this, "", false, Label,
3622 DotFilePathPrefix + "ccg." + Label + ".dot");
3623}
3624
3625template <typename DerivedCCG, typename FuncTy, typename CallTy>
3626typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3627CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3628 const std::shared_ptr<ContextEdge> &Edge,
3629 DenseSet<uint32_t> ContextIdsToMove) {
3630 ContextNode *Node = Edge->Callee;
3631 assert(NodeToCallingFunc.count(Node));
3632 ContextNode *Clone =
3633 createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3634 Node->addClone(Clone);
3635 Clone->MatchingCalls = Node->MatchingCalls;
3636 moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true,
3637 ContextIdsToMove);
3638 return Clone;
3639}
3640
// Move Edge (or, if ContextIdsToMove is non-empty, just that subset of its
// context ids) from its current callee onto the clone NewCallee. Updates
// context ids and alloc types on all affected nodes and edges, and mirrors
// the move onto the old callee's own callee edges so the ids flow through the
// clone. NewClone indicates NewCallee was created fresh for this move.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                                  ContextNode *NewCallee, bool NewClone,
                                  DenseSet<uint32_t> ContextIdsToMove) {
  // NewCallee and Edge's current callee must be clones of the same original
  // node (Edge's current callee may be the original node too).
  assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());

  // Record recursion before Edge is potentially modified below.
  bool EdgeIsRecursive = Edge->Callee == Edge->Caller;

  ContextNode *OldCallee = Edge->Callee;

  // We might already have an edge to the new callee from earlier cloning for a
  // different allocation. If one exists we will reuse it.
  auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);

  // Callers will pass an empty ContextIdsToMove set when they want to move the
  // edge. Copy in Edge's ids for simplicity.
  if (ContextIdsToMove.empty())
    ContextIdsToMove = Edge->getContextIds();

  // If we are moving all of Edge's ids, then just move the whole Edge.
  // Otherwise only move the specified subset, to a new edge if needed.
  if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
    // First, update the alloc types on New Callee from Edge.
    // Do this before we potentially clear Edge's fields below!
    NewCallee->AllocTypes |= Edge->AllocTypes;
    // Moving the whole Edge.
    if (ExistingEdgeToNewCallee) {
      // Since we already have an edge to NewCallee, simply move the ids
      // onto it, and remove the existing Edge.
      ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
      ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
      assert(Edge->ContextIds == ContextIdsToMove);
      removeEdgeFromGraph(Edge.get());
    } else {
      // Otherwise just reconnect Edge to NewCallee.
      Edge->Callee = NewCallee;
      NewCallee->CallerEdges.push_back(Edge);
      // Remove it from callee where it was previously connected.
      OldCallee->eraseCallerEdge(Edge.get());
      // Don't need to update Edge's context ids since we are simply
      // reconnecting it.
    }
  } else {
    // Only moving a subset of Edge's ids.
    // Compute the alloc type of the subset of ids being moved.
    auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
    if (ExistingEdgeToNewCallee) {
      // Since we already have an edge to NewCallee, simply move the ids
      // onto it.
      ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
      ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
    } else {
      // Otherwise, create a new edge to NewCallee for the ids being moved.
      auto NewEdge = std::make_shared<ContextEdge>(
          NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
      Edge->Caller->CalleeEdges.push_back(NewEdge);
      NewCallee->CallerEdges.push_back(NewEdge);
    }
    // In either case, need to update the alloc types on NewCallee, and remove
    // those ids and update the alloc type on the original Edge.
    NewCallee->AllocTypes |= CallerEdgeAllocType;
    set_subtract(Edge->ContextIds, ContextIdsToMove);
    Edge->AllocTypes = computeAllocType(Edge->ContextIds);
  }
  // Now walk the old callee node's callee edges and move Edge's context ids
  // over to the corresponding edge into the clone (which is created here if
  // this is a newly created clone).
  for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
    ContextNode *CalleeToUse = OldCalleeEdge->Callee;
    // If this is a direct recursion edge, use NewCallee (the clone) as the
    // callee as well, so that any edge updated/created here is also direct
    // recursive.
    if (CalleeToUse == OldCallee) {
      // If this is a recursive edge, see if we already moved a recursive edge
      // (which would have to have been this one) - if we were only moving a
      // subset of context ids it would still be on OldCallee.
      if (EdgeIsRecursive) {
        assert(OldCalleeEdge == Edge);
        continue;
      }
      CalleeToUse = NewCallee;
    }
    // The context ids moving to the new callee are the subset of this edge's
    // context ids and the context ids on the caller edge being moved.
    DenseSet<uint32_t> EdgeContextIdsToMove =
        set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
    set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
    OldCalleeEdge->AllocTypes =
        computeAllocType(OldCalleeEdge->getContextIds());
    if (!NewClone) {
      // Update context ids / alloc type on corresponding edge to NewCallee.
      // There is a chance this may not exist if we are reusing an existing
      // clone, specifically during function assignment, where we would have
      // removed none type edges after creating the clone. If we can't find
      // a corresponding edge there, fall through to the cloning below.
      if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
        NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove);
        NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
        continue;
      }
    }
    auto NewEdge = std::make_shared<ContextEdge>(
        CalleeToUse, NewCallee, computeAllocType(EdgeContextIdsToMove),
        EdgeContextIdsToMove);
    NewCallee->CalleeEdges.push_back(NewEdge);
    NewEdge->Callee->CallerEdges.push_back(NewEdge);
  }
  // Recompute the node alloc type now that its callee edges have been
  // updated (since we will compute from those edges).
  OldCallee->AllocTypes = OldCallee->computeAllocType();
  // OldCallee alloc type should be None iff its context id set is now empty.
  assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
         OldCallee->emptyContextIds());
  if (VerifyCCG) {
    checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
    checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
    for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
      checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
                                            /*CheckEdges=*/false);
    for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
      checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
                                            /*CheckEdges=*/false);
  }
}
3768
// Move the callee edge Edge from its current caller node onto NewCaller,
// merging it into any pre-existing edge between NewCaller and the callee.
// Also migrates the affected context ids from the old caller's caller edges
// onto the corresponding edges into NewCaller (creating edges if needed).
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
                              ContextNode *NewCaller) {
  auto *OldCallee = Edge->Callee;
  auto *NewCallee = OldCallee;
  // If this edge was direct recursive, make any new/updated edge also direct
  // recursive to NewCaller.
  bool Recursive = Edge->Caller == Edge->Callee;
  if (Recursive)
    NewCallee = NewCaller;

  ContextNode *OldCaller = Edge->Caller;
  OldCaller->eraseCalleeEdge(Edge.get());

  // We might already have an edge to the new caller. If one exists we will
  // reuse it.
  auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);

  if (ExistingEdgeToNewCaller) {
    // Since we already have an edge to NewCaller, simply move the ids
    // onto it, and remove the existing Edge.
    ExistingEdgeToNewCaller->getContextIds().insert_range(
        Edge->getContextIds());
    ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
    Edge->ContextIds.clear();
    Edge->AllocTypes = (uint8_t)AllocationType::None;
    OldCallee->eraseCallerEdge(Edge.get());
  } else {
    // Otherwise just reconnect Edge to NewCaller.
    Edge->Caller = NewCaller;
    NewCaller->CalleeEdges.push_back(Edge);
    if (Recursive) {
      assert(NewCallee == NewCaller);
      // In the case of (direct) recursive edges, we update the callee as well
      // so that it becomes recursive on the new caller.
      Edge->Callee = NewCallee;
      NewCallee->CallerEdges.push_back(Edge);
      OldCallee->eraseCallerEdge(Edge.get());
    }
    // Don't need to update Edge's context ids since we are simply
    // reconnecting it.
  }
  // In either case, need to update the alloc types on New Caller.
  // (In the merged case above Edge->AllocTypes was zeroed, so this is a no-op
  // there.)
  NewCaller->AllocTypes |= Edge->AllocTypes;

  // Now walk the old caller node's caller edges and move Edge's context ids
  // over to the corresponding edge into the node (which is created here if
  // this is a newly created node). We can tell whether this is a newly created
  // node by seeing if it has any caller edges yet.
#ifndef NDEBUG
  bool IsNewNode = NewCaller->CallerEdges.empty();
#endif
  // If we just moved a direct recursive edge, presumably its context ids should
  // also flow out of OldCaller via some other non-recursive callee edge. We
  // don't want to remove the recursive context ids from other caller edges yet,
  // otherwise the context ids get into an inconsistent state on OldCaller.
  // We will update these context ids on the non-recursive caller edge when and
  // if they are updated on the non-recursive callee.
  if (!Recursive) {
    for (auto &OldCallerEdge : OldCaller->CallerEdges) {
      auto OldCallerCaller = OldCallerEdge->Caller;
      // The context ids moving to the new caller are the subset of this edge's
      // context ids and the context ids on the callee edge being moved.
      DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
          OldCallerEdge->getContextIds(), Edge->getContextIds());
      if (OldCaller == OldCallerCaller) {
        // NOTE(review): this assignment is dead — the continue below exits
        // this iteration before OldCallerCaller is read again.
        OldCallerCaller = NewCaller;
        // Don't actually move this one. The caller will move it directly via a
        // call to this function with this as the Edge if it is appropriate to
        // move to a diff node that has a matching callee (itself).
        continue;
      }
      set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
      OldCallerEdge->AllocTypes =
          computeAllocType(OldCallerEdge->getContextIds());
      // In this function we expect that any pre-existing node already has edges
      // from the same callers as the old node. That should be true in the
      // current use case, where we will remove None-type edges after copying
      // over all caller edges from the callee.
      auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
      // Since we would have skipped caller edges when moving a direct recursive
      // edge, this may not hold true when recursive handling enabled.
      assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
      if (ExistingCallerEdge) {
        ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove);
        ExistingCallerEdge->AllocTypes |=
            computeAllocType(EdgeContextIdsToMove);
        continue;
      }
      auto NewEdge = std::make_shared<ContextEdge>(
          NewCaller, OldCallerCaller, computeAllocType(EdgeContextIdsToMove),
          EdgeContextIdsToMove);
      NewCaller->CallerEdges.push_back(NewEdge);
      NewEdge->Caller->CalleeEdges.push_back(NewEdge);
    }
  }
  // Recompute the node alloc type now that its caller edges have been
  // updated (since we will compute from those edges).
  OldCaller->AllocTypes = OldCaller->computeAllocType();
  // OldCaller alloc type should be None iff its context id set is now empty.
  assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
         OldCaller->emptyContextIds());
  if (VerifyCCG) {
    checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
    checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
    for (const auto &OldCallerEdge : OldCaller->CallerEdges)
      checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
                                            /*CheckEdges=*/false);
    for (const auto &NewCallerEdge : NewCaller->CallerEdges)
      checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
                                            /*CheckEdges=*/false);
  }
}
3883
3884template <typename DerivedCCG, typename FuncTy, typename CallTy>
3885void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3886 recursivelyRemoveNoneTypeCalleeEdges(
3887 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3888 auto Inserted = Visited.insert(Node);
3889 if (!Inserted.second)
3890 return;
3891
3892 removeNoneTypeCalleeEdges(Node);
3893
3894 for (auto *Clone : Node->Clones)
3895 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3896
3897 // The recursive call may remove some of this Node's caller edges.
3898 // Iterate over a copy and skip any that were removed.
3899 auto CallerEdges = Node->CallerEdges;
3900 for (auto &Edge : CallerEdges) {
3901 // Skip any that have been removed by an earlier recursive call.
3902 if (Edge->isRemoved()) {
3903 assert(!is_contained(Node->CallerEdges, Edge));
3904 continue;
3905 }
3906 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3907 }
3908}
3909
3910// This is the standard DFS based backedge discovery algorithm.
3911template <typename DerivedCCG, typename FuncTy, typename CallTy>
3912void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() {
3913 // If we are cloning recursive contexts, find and mark backedges from all root
3914 // callers, using the typical DFS based backedge analysis.
3916 return;
3917 DenseSet<const ContextNode *> Visited;
3918 DenseSet<const ContextNode *> CurrentStack;
3919 for (auto &Entry : NonAllocationCallToContextNodeMap) {
3920 auto *Node = Entry.second;
3921 if (Node->isRemoved())
3922 continue;
3923 // It is a root if it doesn't have callers.
3924 if (!Node->CallerEdges.empty())
3925 continue;
3926 markBackedges(Node, Visited, CurrentStack);
3927 assert(CurrentStack.empty());
3928 }
3929}
3930
3931// Recursive helper for above markBackedges method.
3932template <typename DerivedCCG, typename FuncTy, typename CallTy>
3933void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
3934 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3935 DenseSet<const ContextNode *> &CurrentStack) {
3936 auto I = Visited.insert(Node);
3937 // We should only call this for unvisited nodes.
3938 assert(I.second);
3939 (void)I;
3940 for (auto &CalleeEdge : Node->CalleeEdges) {
3941 auto *Callee = CalleeEdge->Callee;
3942 if (Visited.count(Callee)) {
3943 // Since this was already visited we need to check if it is currently on
3944 // the recursive stack in which case it is a backedge.
3945 if (CurrentStack.count(Callee))
3946 CalleeEdge->IsBackedge = true;
3947 continue;
3948 }
3949 CurrentStack.insert(Callee);
3950 markBackedges(Callee, Visited, CurrentStack);
3951 CurrentStack.erase(Callee);
3952 }
3953}
3954
3955template <typename DerivedCCG, typename FuncTy, typename CallTy>
3956void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3957 DenseSet<const ContextNode *> Visited;
3958 for (auto &Entry : AllocationCallToContextNodeMap) {
3959 Visited.clear();
3960 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3961 }
3962 Visited.clear();
3963 for (auto &Entry : AllocationCallToContextNodeMap)
3964 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3965 if (VerifyCCG)
3966 check();
3967}
3968
// Helper function to check whether an AllocType is cold, notcold, or both.
3976
3977template <typename DerivedCCG, typename FuncTy, typename CallTy>
3978void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3979 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3980 const DenseSet<uint32_t> &AllocContextIds) {
3981 if (VerifyNodes)
3982 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3983 assert(!Node->CloneOf);
3984
3985 // If Node as a null call, then either it wasn't found in the module (regular
3986 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3987 // cloning (e.g. recursion, calls multiple targets, etc).
3988 // Do this here so that we don't try to recursively clone callers below, which
3989 // isn't useful at least for this node.
3990 if (!Node->hasCall())
3991 return;
3992
3993 // No need to look at any callers if allocation type already unambiguous.
3994 if (hasSingleAllocType(Node->AllocTypes))
3995 return;
3996
3997#ifndef NDEBUG
3998 auto Insert =
3999#endif
4000 Visited.insert(Node);
4001 // We should not have visited this node yet.
4002 assert(Insert.second);
4003 // The recursive call to identifyClones may delete the current edge from the
4004 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
4005 // in an iterator and having recursive call erase from it. Other edges may
4006 // also get removed during the recursion, which will have null Callee and
4007 // Caller pointers (and are deleted later), so we skip those below.
4008 {
4009 auto CallerEdges = Node->CallerEdges;
4010 for (auto &Edge : CallerEdges) {
4011 // Skip any that have been removed by an earlier recursive call.
4012 if (Edge->isRemoved()) {
4013 assert(!is_contained(Node->CallerEdges, Edge));
4014 continue;
4015 }
4016 // Defer backedges. See comments further below where these edges are
4017 // handled during the cloning of this Node.
4018 if (Edge->IsBackedge) {
4019 // We should only mark these if cloning recursive contexts, where we
4020 // need to do this deferral.
4022 continue;
4023 }
4024 // Ignore any caller we previously visited via another edge.
4025 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
4026 identifyClones(Edge->Caller, Visited, AllocContextIds);
4027 }
4028 }
4029 }
4030
4031 // Check if we reached an unambiguous call or have have only a single caller.
4032 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
4033 return;
4034
4035 // We need to clone.
4036
4037 // Try to keep the original version as alloc type NotCold. This will make
4038 // cases with indirect calls or any other situation with an unknown call to
4039 // the original function get the default behavior. We do this by sorting the
4040 // CallerEdges of the Node we will clone by alloc type.
4041 //
4042 // Give NotCold edge the lowest sort priority so those edges are at the end of
4043 // the caller edges vector, and stay on the original version (since the below
4044 // code clones greedily until it finds all remaining edges have the same type
4045 // and leaves the remaining ones on the original Node).
4046 //
4047 // We shouldn't actually have any None type edges, so the sorting priority for
4048 // that is arbitrary, and we assert in that case below.
4049 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
4050 /*Cold*/ 1,
4051 /*NotColdCold*/ 2};
4052 llvm::stable_sort(Node->CallerEdges,
4053 [&](const std::shared_ptr<ContextEdge> &A,
4054 const std::shared_ptr<ContextEdge> &B) {
4055 // Nodes with non-empty context ids should be sorted
4056 // before those with empty context ids.
4057 if (A->ContextIds.empty())
4058 // Either B ContextIds are non-empty (in which case we
4059 // should return false because B < A), or B ContextIds
4060 // are empty, in which case they are equal, and we
4061 // should maintain the original relative ordering.
4062 return false;
4063 if (B->ContextIds.empty())
4064 return true;
4065
4066 if (A->AllocTypes == B->AllocTypes)
4067 // Use the first context id for each edge as a
4068 // tie-breaker.
4069 return *A->ContextIds.begin() < *B->ContextIds.begin();
4070 return AllocTypeCloningPriority[A->AllocTypes] <
4071 AllocTypeCloningPriority[B->AllocTypes];
4072 });
4073
4074 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4075
4076 DenseSet<uint32_t> RecursiveContextIds;
4078 // If we are allowing recursive callsites, but have also disabled recursive
4079 // contexts, look for context ids that show up in multiple caller edges.
4081 DenseSet<uint32_t> AllCallerContextIds;
4082 for (auto &CE : Node->CallerEdges) {
4083 // Resize to the largest set of caller context ids, since we know the
4084 // final set will be at least that large.
4085 AllCallerContextIds.reserve(CE->getContextIds().size());
4086 for (auto Id : CE->getContextIds())
4087 if (!AllCallerContextIds.insert(Id).second)
4088 RecursiveContextIds.insert(Id);
4089 }
4090 }
4091
4092 // Iterate until we find no more opportunities for disambiguating the alloc
4093 // types via cloning. In most cases this loop will terminate once the Node
4094 // has a single allocation type, in which case no more cloning is needed.
4095 // Iterate over a copy of Node's caller edges, since we may need to remove
4096 // edges in the moveEdgeTo* methods, and this simplifies the handling and
4097 // makes it less error-prone.
4098 auto CallerEdges = Node->CallerEdges;
4099 for (auto &CallerEdge : CallerEdges) {
4100 // Skip any that have been removed by an earlier recursive call.
4101 if (CallerEdge->isRemoved()) {
4102 assert(!is_contained(Node->CallerEdges, CallerEdge));
4103 continue;
4104 }
4105 assert(CallerEdge->Callee == Node);
4106
4107 // See if cloning the prior caller edge left this node with a single alloc
4108 // type or a single caller. In that case no more cloning of Node is needed.
4109 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
4110 break;
4111
4112 // If the caller was not successfully matched to a call in the IR/summary,
4113 // there is no point in trying to clone for it as we can't update that call.
4114 if (!CallerEdge->Caller->hasCall())
4115 continue;
4116
4117 // Only need to process the ids along this edge pertaining to the given
4118 // allocation.
4119 auto CallerEdgeContextsForAlloc =
4120 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
4121 if (!RecursiveContextIds.empty())
4122 CallerEdgeContextsForAlloc =
4123 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
4124 if (CallerEdgeContextsForAlloc.empty())
4125 continue;
4126
4127 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
4128
4129 // Compute the node callee edge alloc types corresponding to the context ids
4130 // for this caller edge.
4131 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
4132 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
4133 for (auto &CalleeEdge : Node->CalleeEdges)
4134 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
4135 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
4136
4137 // Don't clone if doing so will not disambiguate any alloc types amongst
4138 // caller edges (including the callee edges that would be cloned).
4139 // Otherwise we will simply move all edges to the clone.
4140 //
4141 // First check if by cloning we will disambiguate the caller allocation
4142 // type from node's allocation type. Query allocTypeToUse so that we don't
4143 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
4144 // neither of these should be None type.
4145 //
4146 // Then check if by cloning node at least one of the callee edges will be
4147 // disambiguated by splitting out different context ids.
4148 //
4149 // However, always do the cloning if this is a backedge, in which case we
4150 // have not yet cloned along this caller edge.
4151 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
4152 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4153 if (!CallerEdge->IsBackedge &&
4154 allocTypeToUse(CallerAllocTypeForAlloc) ==
4155 allocTypeToUse(Node->AllocTypes) &&
4156 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
4157 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
4158 continue;
4159 }
4160
4161 if (CallerEdge->IsBackedge) {
4162 // We should only mark these if cloning recursive contexts, where we
4163 // need to do this deferral.
4165 DeferredBackedges++;
4166 }
4167
4168 // If this is a backedge, we now do recursive cloning starting from its
4169 // caller since we may have moved unambiguous caller contexts to a clone
4170 // of this Node in a previous iteration of the current loop, giving more
4171 // opportunity for cloning through the backedge. Because we sorted the
4172 // caller edges earlier so that cold caller edges are first, we would have
4173 // visited and cloned this node for any unamibiguously cold non-recursive
4174 // callers before any ambiguous backedge callers. Note that we don't do this
4175 // if the caller is already cloned or visited during cloning (e.g. via a
4176 // different context path from the allocation).
4177 // TODO: Can we do better in the case where the caller was already visited?
4178 if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
4179 !Visited.count(CallerEdge->Caller)) {
4180 const auto OrigIdCount = CallerEdge->getContextIds().size();
4181 // Now do the recursive cloning of this backedge's caller, which was
4182 // deferred earlier.
4183 identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
4184 removeNoneTypeCalleeEdges(CallerEdge->Caller);
4185 // See if the recursive call to identifyClones moved the context ids to a
4186 // new edge from this node to a clone of caller, and switch to looking at
4187 // that new edge so that we clone Node for the new caller clone.
4188 bool UpdatedEdge = false;
4189 if (OrigIdCount > CallerEdge->getContextIds().size()) {
4190 for (auto E : Node->CallerEdges) {
4191 // Only interested in clones of the current edges caller.
4192 if (E->Caller->CloneOf != CallerEdge->Caller)
4193 continue;
4194 // See if this edge contains any of the context ids originally on the
4195 // current caller edge.
4196 auto CallerEdgeContextsForAllocNew =
4197 set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
4198 if (CallerEdgeContextsForAllocNew.empty())
4199 continue;
4200 // Make sure we don't pick a previously existing caller edge of this
4201 // Node, which would be processed on a different iteration of the
4202 // outer loop over the saved CallerEdges.
4203 if (llvm::is_contained(CallerEdges, E))
4204 continue;
4205 // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
4206 // are updated further below for all cases where we just invoked
4207 // identifyClones recursively.
4208 CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
4209 CallerEdge = E;
4210 UpdatedEdge = true;
4211 break;
4212 }
4213 }
4214 // If cloning removed this edge (and we didn't update it to a new edge
4215 // above), we're done with this edge. It's possible we moved all of the
4216 // context ids to an existing clone, in which case there's no need to do
4217 // further processing for them.
4218 if (CallerEdge->isRemoved())
4219 continue;
4220
4221 // Now we need to update the information used for the cloning decisions
4222 // further below, as we may have modified edges and their context ids.
4223
4224 // Note if we changed the CallerEdge above we would have already updated
4225 // the context ids.
4226 if (!UpdatedEdge) {
4227 CallerEdgeContextsForAlloc = set_intersection(
4228 CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
4229 if (CallerEdgeContextsForAlloc.empty())
4230 continue;
4231 }
4232 // Update the other information that depends on the edges and on the now
4233 // updated CallerEdgeContextsForAlloc.
4234 CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
4235 CalleeEdgeAllocTypesForCallerEdge.clear();
4236 for (auto &CalleeEdge : Node->CalleeEdges) {
4237 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
4238 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
4239 }
4240 }
4241
4242 // First see if we can use an existing clone. Check each clone and its
4243 // callee edges for matching alloc types.
4244 ContextNode *Clone = nullptr;
4245 for (auto *CurClone : Node->Clones) {
4246 if (allocTypeToUse(CurClone->AllocTypes) !=
4247 allocTypeToUse(CallerAllocTypeForAlloc))
4248 continue;
4249
4250 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
4251 hasSingleAllocType(CallerAllocTypeForAlloc);
4252 // The above check should mean that if both have single alloc types that
4253 // they should be equal.
4254 assert(!BothSingleAlloc ||
4255 CurClone->AllocTypes == CallerAllocTypeForAlloc);
4256
4257 // If either both have a single alloc type (which are the same), or if the
4258 // clone's callee edges have the same alloc types as those for the current
4259 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
4260 // then we can reuse this clone.
4261 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
4262 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
4263 Clone = CurClone;
4264 break;
4265 }
4266 }
4267
4268 // The edge iterator is adjusted when we move the CallerEdge to the clone.
4269 if (Clone)
4270 moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false,
4271 CallerEdgeContextsForAlloc);
4272 else
4273 Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc);
4274
4275 // Sanity check that no alloc types on clone or its edges are None.
4276 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
4277 }
4278
4279 // We should still have some context ids on the original Node.
4280 assert(!Node->emptyContextIds());
4281
4282 // Sanity check that no alloc types on node or edges are None.
4283 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4284
4285 if (VerifyNodes)
4286 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
4287}
4288
// Annotate the IR allocation call with a "memprof" function attribute whose
// value encodes AllocType, and emit an optimization remark recording the
// annotation.
void ModuleCallsiteContextGraph::updateAllocationCall(
    CallInfo &Call, AllocationType AllocType) {
  std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
  auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
                                "memprof", AllocTypeString);
  cast<CallBase>(Call.call())->addFnAttr(A);
  OREGetter(Call.call()->getFunction())
      .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
            << ore::NV("AllocationCall", Call.call()) << " in clone "
            << ore::NV("Caller", Call.call()->getFunction())
            << " marked with memprof allocation attribute "
            << ore::NV("Attribute", AllocTypeString));
}
4303
4304void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
4306 auto *AI = cast<AllocInfo *>(Call.call());
4307 assert(AI);
4308 assert(AI->Versions.size() > Call.cloneNo());
4309 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
4310}
4311
4313ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4314 const auto *CB = cast<CallBase>(Call.call());
4315 if (!CB->getAttributes().hasFnAttr("memprof"))
4316 return AllocationType::None;
4317 return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
4318 ? AllocationType::Cold
4319 : AllocationType::NotCold;
4320}
4321
4323IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4324 const auto *AI = cast<AllocInfo *>(Call.call());
4325 assert(AI->Versions.size() > Call.cloneNo());
4326 return (AllocationType)AI->Versions[Call.cloneNo()];
4327}
4328
4329void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4330 FuncInfo CalleeFunc) {
4331 auto *CurF = getCalleeFunc(CallerCall.call());
4332 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4333 if (isMemProfClone(*CurF)) {
4334 // If we already assigned this callsite to call a specific non-default
4335 // clone (i.e. not the original function which is clone 0), ensure that we
4336 // aren't trying to now update it to call a different clone, which is
4337 // indicative of a bug in the graph or function assignment.
4338 auto CurCalleeCloneNo = getMemProfCloneNum(*CurF);
4339 if (CurCalleeCloneNo != NewCalleeCloneNo) {
4340 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4341 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4342 << "\n");
4343 MismatchedCloneAssignments++;
4344 }
4345 }
4346 if (NewCalleeCloneNo > 0)
4347 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
4348 OREGetter(CallerCall.call()->getFunction())
4349 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
4350 << ore::NV("Call", CallerCall.call()) << " in clone "
4351 << ore::NV("Caller", CallerCall.call()->getFunction())
4352 << " assigned to call function clone "
4353 << ore::NV("Callee", CalleeFunc.func()));
4354}
4355
4356void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4357 FuncInfo CalleeFunc) {
4358 auto *CI = cast<CallsiteInfo *>(CallerCall.call());
4359 assert(CI &&
4360 "Caller cannot be an allocation which should not have profiled calls");
4361 assert(CI->Clones.size() > CallerCall.cloneNo());
4362 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4363 auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()];
4364 // If we already assigned this callsite to call a specific non-default
4365 // clone (i.e. not the original function which is clone 0), ensure that we
4366 // aren't trying to now update it to call a different clone, which is
4367 // indicative of a bug in the graph or function assignment.
4368 if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) {
4369 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4370 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4371 << "\n");
4372 MismatchedCloneAssignments++;
4373 }
4374 CurCalleeCloneNo = NewCalleeCloneNo;
4375}
4376
4377// Update the debug information attached to NewFunc to use the clone Name. Note
4378// this needs to be done for both any existing DISubprogram for the definition,
4379// as well as any separate declaration DISubprogram.
4381 assert(Name == NewFunc->getName());
4382 auto *SP = NewFunc->getSubprogram();
4383 if (!SP)
4384 return;
4385 auto *MDName = MDString::get(NewFunc->getParent()->getContext(), Name);
4386 SP->replaceLinkageName(MDName);
4387 DISubprogram *Decl = SP->getDeclaration();
4388 if (!Decl)
4389 return;
4390 TempDISubprogram NewDecl = Decl->clone();
4391 NewDecl->replaceLinkageName(MDName);
4392 SP->replaceDeclaration(MDNode::replaceWithUniqued(std::move(NewDecl)));
4393}
4394
4395CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
4396 Instruction *>::FuncInfo
4397ModuleCallsiteContextGraph::cloneFunctionForCallsite(
4398 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4399 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4400 // Use existing LLVM facilities for cloning and obtaining Call in clone
4401 ValueToValueMapTy VMap;
4402 auto *NewFunc = CloneFunction(Func.func(), VMap);
4403 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
4404 assert(!Func.func()->getParent()->getFunction(Name));
4405 NewFunc->setName(Name);
4406 updateSubprogramLinkageName(NewFunc, Name);
4407 for (auto &Inst : CallsWithMetadataInFunc) {
4408 // This map always has the initial version in it.
4409 assert(Inst.cloneNo() == 0);
4410 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
4411 }
4412 OREGetter(Func.func())
4413 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
4414 << "created clone " << ore::NV("NewFunction", NewFunc));
4415 return {NewFunc, CloneNo};
4416}
4417
4418CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
4419 IndexCall>::FuncInfo
4420IndexCallsiteContextGraph::cloneFunctionForCallsite(
4421 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4422 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4423 // Check how many clones we have of Call (and therefore function).
4424 // The next clone number is the current size of versions array.
4425 // Confirm this matches the CloneNo provided by the caller, which is based on
4426 // the number of function clones we have.
4427 assert(CloneNo == (isa<AllocInfo *>(Call.call())
4428 ? cast<AllocInfo *>(Call.call())->Versions.size()
4429 : cast<CallsiteInfo *>(Call.call())->Clones.size()));
4430 // Walk all the instructions in this function. Create a new version for
4431 // each (by adding an entry to the Versions/Clones summary array), and copy
4432 // over the version being called for the function clone being cloned here.
4433 // Additionally, add an entry to the CallMap for the new function clone,
4434 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
4435 // to the new call clone.
4436 for (auto &Inst : CallsWithMetadataInFunc) {
4437 // This map always has the initial version in it.
4438 assert(Inst.cloneNo() == 0);
4439 if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) {
4440 assert(AI->Versions.size() == CloneNo);
4441 // We assign the allocation type later (in updateAllocationCall), just add
4442 // an entry for it here.
4443 AI->Versions.push_back(0);
4444 } else {
4445 auto *CI = cast<CallsiteInfo *>(Inst.call());
4446 assert(CI && CI->Clones.size() == CloneNo);
4447 // We assign the clone number later (in updateCall), just add an entry for
4448 // it here.
4449 CI->Clones.push_back(0);
4450 }
4451 CallMap[Inst] = {Inst.call(), CloneNo};
4452 }
4453 return {Func.func(), CloneNo};
4454}
4455
4456// We perform cloning for each allocation node separately. However, this
4457// sometimes results in a situation where the same node calls multiple
4458// clones of the same callee, created for different allocations. This
4459// causes issues when assigning functions to these clones, as each node can
4460// in reality only call a single callee clone.
4461//
4462// To address this, before assigning functions, merge callee clone nodes as
4463// needed using a post order traversal from the allocations. We attempt to
4464// use existing clones as the merge node when legal, and to share them
4465// among callers with the same properties (callers calling the same set of
4466// callee clone nodes for the same allocations).
4467//
4468// Without this fix, in some cases incorrect function assignment will lead
4469// to calling the wrong allocation clone.
4470template <typename DerivedCCG, typename FuncTy, typename CallTy>
4471void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() {
4472 if (!MergeClones)
4473 return;
4474
4475 // Generate a map from context id to the associated allocation node for use
4476 // when merging clones.
4477 DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode;
4478 for (auto &Entry : AllocationCallToContextNodeMap) {
4479 auto *Node = Entry.second;
4480 for (auto Id : Node->getContextIds())
4481 ContextIdToAllocationNode[Id] = Node->getOrigNode();
4482 for (auto *Clone : Node->Clones) {
4483 for (auto Id : Clone->getContextIds())
4484 ContextIdToAllocationNode[Id] = Clone->getOrigNode();
4485 }
4486 }
4487
4488 // Post order traversal starting from allocations to ensure each callsite
4489 // calls a single clone of its callee. Callee nodes that are clones of each
4490 // other are merged (via new merge nodes if needed) to achieve this.
4491 DenseSet<const ContextNode *> Visited;
4492 for (auto &Entry : AllocationCallToContextNodeMap) {
4493 auto *Node = Entry.second;
4494
4495 mergeClones(Node, Visited, ContextIdToAllocationNode);
4496
4497 // Make a copy so the recursive post order traversal that may create new
4498 // clones doesn't mess up iteration. Note that the recursive traversal
4499 // itself does not call mergeClones on any of these nodes, which are all
4500 // (clones of) allocations.
4501 auto Clones = Node->Clones;
4502 for (auto *Clone : Clones)
4503 mergeClones(Clone, Visited, ContextIdToAllocationNode);
4504 }
4505
4506 if (DumpCCG) {
4507 dbgs() << "CCG after merging:\n";
4508 dbgs() << *this;
4509 }
4510 if (ExportToDot)
4511 exportToDot("aftermerge");
4512
4513 if (VerifyCCG) {
4514 check();
4515 }
4516}
4517
4518// Recursive helper for above mergeClones method.
4519template <typename DerivedCCG, typename FuncTy, typename CallTy>
4520void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones(
4521 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4522 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4523 auto Inserted = Visited.insert(Node);
4524 if (!Inserted.second)
4525 return;
4526
4527 // Iteratively perform merging on this node to handle new caller nodes created
4528 // during the recursive traversal. We could do something more elegant such as
4529 // maintain a worklist, but this is a simple approach that doesn't cause a
4530 // measureable compile time effect, as most nodes don't have many caller
4531 // edges to check.
4532 bool FoundUnvisited = true;
4533 unsigned Iters = 0;
4534 while (FoundUnvisited) {
4535 Iters++;
4536 FoundUnvisited = false;
4537 // Make a copy since the recursive call may move a caller edge to a new
4538 // callee, messing up the iterator.
4539 auto CallerEdges = Node->CallerEdges;
4540 for (auto CallerEdge : CallerEdges) {
4541 // Skip any caller edge moved onto a different callee during recursion.
4542 if (CallerEdge->Callee != Node)
4543 continue;
4544 // If we found an unvisited caller, note that we should check the caller
4545 // edges again as mergeClones may add or change caller nodes.
4546 if (DoMergeIteration && !Visited.contains(CallerEdge->Caller))
4547 FoundUnvisited = true;
4548 mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
4549 }
4550 }
4551
4552 TotalMergeInvokes++;
4553 TotalMergeIters += Iters;
4554 if (Iters > MaxMergeIters)
4555 MaxMergeIters = Iters;
4556
4557 // Merge for this node after we handle its callers.
4558 mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode);
4559}
4560
// Merge the groups of callee clones called by Node so that, after merging,
// Node calls at most a single clone of each original callee. Invoked by the
// recursive mergeClones helper after Node's callers have been processed.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones(
    ContextNode *Node, DenseSet<const ContextNode *> &Visited,
    DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
  // Ignore Node if we moved all of its contexts to clones.
  if (Node->emptyContextIds())
    return;

  // First identify groups of clones among Node's callee edges, by building
  // a map from each callee base node to the associated callee edges from Node.
  MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>>
      OrigNodeToCloneEdges;
  for (const auto &E : Node->CalleeEdges) {
    auto *Callee = E->Callee;
    // Skip callees that were never cloned (neither a clone themselves nor
    // the original of any clones) - nothing to merge for those.
    if (!Callee->CloneOf && Callee->Clones.empty())
      continue;
    ContextNode *Base = Callee->getOrigNode();
    OrigNodeToCloneEdges[Base].push_back(E);
  }

  // Helper for callee edge sorting below. Return true if A's callee has fewer
  // caller edges than B, or if A is a clone and B is not, or if A's first
  // context id is smaller than B's.
  auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A,
                                     const std::shared_ptr<ContextEdge> &B) {
    if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
      return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size();
    if (A->Callee->CloneOf && !B->Callee->CloneOf)
      return true;
    else if (!A->Callee->CloneOf && B->Callee->CloneOf)
      return false;
    // Use the first context id for each edge as a
    // tie-breaker.
    return *A->ContextIds.begin() < *B->ContextIds.begin();
  };

  // Process each set of callee clones called by Node, performing the needed
  // merging.
  // NOTE(review): Entry is deliberately iterated by value, so CalleeEdges
  // below refers to a copy of the stored edge vector; presumably this guards
  // against the edge moves below disturbing the map - confirm before changing.
  for (auto Entry : OrigNodeToCloneEdges) {
    // CalleeEdges is the set of edges from Node reaching callees that are
    // mutual clones of each other.
    auto &CalleeEdges = Entry.second;
    auto NumCalleeClones = CalleeEdges.size();
    // A single edge means there is no merging needed.
    if (NumCalleeClones == 1)
      continue;
    // Sort the CalleeEdges calling this group of clones in ascending order of
    // their caller edge counts, putting the original non-clone node first in
    // cases of a tie. This simplifies finding an existing node to use as the
    // merge node.
    llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan);

    /// Find other callers of the given set of callee edges that can
    /// share the same callee merge node. See the comments at this method
    /// definition for details.
    DenseSet<ContextNode *> OtherCallersToShareMerge;
    findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode,
                                 OtherCallersToShareMerge);

    // Now do the actual merging. Identify existing or create a new MergeNode
    // during the first iteration. Move each callee over, along with edges from
    // other callers we've determined above can share the same merge node.
    ContextNode *MergeNode = nullptr;
    // Tracks how many edges we moved per other caller node; not read within
    // this function - presumably consumed elsewhere (e.g. for debugging or
    // remarks) - TODO confirm.
    DenseMap<ContextNode *, unsigned> CallerToMoveCount;
    for (auto CalleeEdge : CalleeEdges) {
      auto *OrigCallee = CalleeEdge->Callee;
      // If we don't have a MergeNode yet (only happens on the first iteration,
      // as a new one will be created when we go to move the first callee edge
      // over as needed), see if we can use this callee.
      if (!MergeNode) {
        // If there are no other callers, simply use this callee.
        if (CalleeEdge->Callee->CallerEdges.size() == 1) {
          MergeNode = OrigCallee;
          NonNewMergedNodes++;
          continue;
        }
        // Otherwise, if we have identified other caller nodes that can share
        // the merge node with Node, see if all of OrigCallee's callers are
        // going to share the same merge node. In that case we can use callee
        // (since all of its callers would move to the new merge node).
        if (!OtherCallersToShareMerge.empty()) {
          bool MoveAllCallerEdges = true;
          for (auto CalleeCallerE : OrigCallee->CallerEdges) {
            if (CalleeCallerE == CalleeEdge)
              continue;
            if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) {
              MoveAllCallerEdges = false;
              break;
            }
          }
          // If we are going to move all callers over, we can use this callee as
          // the MergeNode.
          if (MoveAllCallerEdges) {
            MergeNode = OrigCallee;
            NonNewMergedNodes++;
            continue;
          }
        }
      }
      // Move this callee edge, creating a new merge node if necessary.
      if (MergeNode) {
        assert(MergeNode != OrigCallee);
        moveEdgeToExistingCalleeClone(CalleeEdge, MergeNode,
                                      /*NewClone*/ false);
      } else {
        MergeNode = moveEdgeToNewCalleeClone(CalleeEdge);
        NewMergedNodes++;
      }
      // Now move all identified edges from other callers over to the merge node
      // as well.
      if (!OtherCallersToShareMerge.empty()) {
        // Make and iterate over a copy of OrigCallee's caller edges because
        // some of these will be moved off of the OrigCallee and that would mess
        // up the iteration from OrigCallee.
        auto OrigCalleeCallerEdges = OrigCallee->CallerEdges;
        for (auto &CalleeCallerE : OrigCalleeCallerEdges) {
          if (CalleeCallerE == CalleeEdge)
            continue;
          if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller))
            continue;
          CallerToMoveCount[CalleeCallerE->Caller]++;
          moveEdgeToExistingCalleeClone(CalleeCallerE, MergeNode,
                                        /*NewClone*/ false);
        }
      }
      // Clean up any edges on either node that became None type as a result
      // of the moves above.
      removeNoneTypeCalleeEdges(OrigCallee);
      removeNoneTypeCalleeEdges(MergeNode);
    }
  }
}
4691
4692// Look for other nodes that have edges to the same set of callee
4693// clones as the current Node. Those can share the eventual merge node
4694// (reducing cloning and binary size overhead) iff:
4695// - they have edges to the same set of callee clones
4696// - each callee edge reaches a subset of the same allocations as Node's
4697// corresponding edge to the same callee clone.
4698// The second requirement is to ensure that we don't undo any of the
4699// necessary cloning to distinguish contexts with different allocation
4700// behavior.
4701// FIXME: This is somewhat conservative, as we really just need to ensure
4702// that they don't reach the same allocations as contexts on edges from Node
4703// going to any of the *other* callee clones being merged. However, that
4704// requires more tracking and checking to get right.
4705template <typename DerivedCCG, typename FuncTy, typename CallTy>
4706void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
4707 findOtherCallersToShareMerge(
4708 ContextNode *Node,
4709 std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
4710 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
4711 DenseSet<ContextNode *> &OtherCallersToShareMerge) {
4712 auto NumCalleeClones = CalleeEdges.size();
4713 // This map counts how many edges to the same callee clone exist for other
4714 // caller nodes of each callee clone.
4715 DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount;
4716 // Counts the number of other caller nodes that have edges to all callee
4717 // clones that don't violate the allocation context checking.
4718 unsigned PossibleOtherCallerNodes = 0;
4719
4720 // We only need to look at other Caller nodes if the first callee edge has
4721 // multiple callers (recall they are sorted in ascending order above).
4722 if (CalleeEdges[0]->Callee->CallerEdges.size() < 2)
4723 return;
4724
4725 // For each callee edge:
4726 // - Collect the count of other caller nodes calling the same callees.
4727 // - Collect the alloc nodes reached by contexts on each callee edge.
4728 DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes;
4729 for (auto CalleeEdge : CalleeEdges) {
4730 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4731 // For each other caller of the same callee, increment the count of
4732 // edges reaching the same callee clone.
4733 for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) {
4734 if (CalleeCallerEdges->Caller == Node) {
4735 assert(CalleeCallerEdges == CalleeEdge);
4736 continue;
4737 }
4738 OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++;
4739 // If this caller edge now reaches all of the same callee clones,
4740 // increment the count of candidate other caller nodes.
4741 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] ==
4742 NumCalleeClones)
4743 PossibleOtherCallerNodes++;
4744 }
4745 // Collect the alloc nodes reached by contexts on each callee edge, for
4746 // later analysis.
4747 for (auto Id : CalleeEdge->getContextIds()) {
4748 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4749 if (!Alloc) {
4750 // FIXME: unclear why this happens occasionally, presumably
4751 // imperfect graph updates possibly with recursion.
4752 MissingAllocForContextId++;
4753 continue;
4754 }
4755 CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc);
4756 }
4757 }
4758
4759 // Now walk the callee edges again, and make sure that for each candidate
4760 // caller node all of its edges to the callees reach the same allocs (or
4761 // a subset) as those along the corresponding callee edge from Node.
4762 for (auto CalleeEdge : CalleeEdges) {
4763 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4764 // Stop if we do not have any (more) candidate other caller nodes.
4765 if (!PossibleOtherCallerNodes)
4766 break;
4767 auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()];
4768 // Check each other caller of this callee clone.
4769 for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) {
4770 // Not interested in the callee edge from Node itself.
4771 if (CalleeCallerE == CalleeEdge)
4772 continue;
4773 // Skip any callers that didn't have callee edges to all the same
4774 // callee clones.
4775 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] !=
4776 NumCalleeClones)
4777 continue;
4778 // Make sure that each context along edge from candidate caller node
4779 // reaches an allocation also reached by this callee edge from Node.
4780 for (auto Id : CalleeCallerE->getContextIds()) {
4781 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4782 if (!Alloc)
4783 continue;
4784 // If not, simply reset the map entry to 0 so caller is ignored, and
4785 // reduce the count of candidate other caller nodes.
4786 if (!CurCalleeAllocNodes.contains(Alloc)) {
4787 OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0;
4788 PossibleOtherCallerNodes--;
4789 break;
4790 }
4791 }
4792 }
4793 }
4794
4795 if (!PossibleOtherCallerNodes)
4796 return;
4797
4798 // Build the set of other caller nodes that can use the same callee merge
4799 // node.
4800 for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) {
4801 if (Count != NumCalleeClones)
4802 continue;
4803 OtherCallersToShareMerge.insert(OtherCaller);
4804 }
4805}
4806
4807// This method assigns cloned callsites to functions, cloning the functions as
4808// needed. The assignment is greedy and proceeds roughly as follows:
4809//
4810// For each function Func:
4811// For each call with graph Node having clones:
4812// Initialize ClonesWorklist to Node and its clones
4813// Initialize NodeCloneCount to 0
4814// While ClonesWorklist is not empty:
4815// Clone = pop front ClonesWorklist
4816// NodeCloneCount++
4817// If Func has been cloned less than NodeCloneCount times:
4818// If NodeCloneCount is 1:
4819// Assign Clone to original Func
4820// Continue
4821// Create a new function clone
4822// If other callers not assigned to call a function clone yet:
4823// Assign them to call new function clone
4824// Continue
4825// Assign any other caller calling the cloned version to new clone
4826//
4827// For each caller of Clone:
4828// If caller is assigned to call a specific function clone:
4829// If we cannot assign Clone to that function clone:
4830// Create new callsite Clone NewClone
4831// Add NewClone to ClonesWorklist
4832// Continue
4833// Assign Clone to existing caller's called function clone
4834// Else:
4835// If Clone not already assigned to a function clone:
4836// Assign to first function clone without assignment
4837// Assign caller to selected function clone
4838// For each call with graph Node having clones:
4839// If number func clones > number call's callsite Node clones:
4840// Record func CallInfo clones without Node clone in UnassignedCallClones
4841// For callsite Nodes in DFS order from allocations:
4842// If IsAllocation:
4843// Update allocation with alloc type
4844// Else:
4845// For Call, all MatchingCalls, and associated UnassignedCallClones:
4846// Update call to call recorded callee clone
4847//
4848template <typename DerivedCCG, typename FuncTy, typename CallTy>
4849bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
4850 bool Changed = false;
4851
4852 mergeClones();
4853
4854 // Keep track of the assignment of nodes (callsites) to function clones they
4855 // call.
4856 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
4857
4858 // Update caller node to call function version CalleeFunc, by recording the
4859 // assignment in CallsiteToCalleeFuncCloneMap.
4860 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
4861 const FuncInfo &CalleeFunc) {
4862 assert(Caller->hasCall());
4863 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
4864 };
4865
4866 // Information for a single clone of this Func.
4867 struct FuncCloneInfo {
4868 // The function clone.
4869 FuncInfo FuncClone;
4870 // Remappings of each call of interest (from original uncloned call to the
4871 // corresponding cloned call in this function clone).
4872 DenseMap<CallInfo, CallInfo> CallMap;
4873 };
4874
4875 // Map to keep track of information needed to update calls in function clones
4876 // when their corresponding callsite node was not itself cloned for that
4877 // function clone. Because of call context pruning (i.e. we only keep as much
4878 // caller information as needed to distinguish hot vs cold), we may not have
4879 // caller edges coming to each callsite node from all possible function
4880 // callers. A function clone may get created for other callsites in the
4881 // function for which there are caller edges that were not pruned. Any other
4882// callsites in that function clone, which were not themselves cloned for
4883 // that function clone, should get updated the same way as the corresponding
4884 // callsite in the original function (which may call a clone of its callee).
4885 //
4886 // We build this map after completing function cloning for each function, so
4887 // that we can record the information from its call maps before they are
4888 // destructed. The map will be used as we update calls to update any still
4889 // unassigned call clones. Note that we may create new node clones as we clone
4890 // other functions, so later on we check which node clones were still not
4891 // created. To this end, the inner map is a map from function clone number to
4892 // the list of calls cloned for that function (can be more than one due to the
4893 // Node's MatchingCalls array).
4894 //
4895 // The alternative is creating new callsite clone nodes below as we clone the
4896// function, but that is trickier to get right and likely more overhead.
4897 //
4898 // Inner map is a std::map so sorted by key (clone number), in order to get
4899 // ordered remarks in the full LTO case.
4900 DenseMap<const ContextNode *, std::map<unsigned, SmallVector<CallInfo, 0>>>
4901 UnassignedCallClones;
4902
4903 // Walk all functions for which we saw calls with memprof metadata, and handle
4904 // cloning for each of its calls.
4905 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
4906 FuncInfo OrigFunc(Func);
4907 // Map from each clone number of OrigFunc to information about that function
4908 // clone (the function clone FuncInfo and call remappings). The index into
4909 // the vector is the clone number, as function clones are created and
4910 // numbered sequentially.
4911 std::vector<FuncCloneInfo> FuncCloneInfos;
4912 for (auto &Call : CallsWithMetadata) {
4913 ContextNode *Node = getNodeForInst(Call);
4914 // Skip call if we do not have a node for it (all uses of its stack ids
4915 // were either on inlined chains or pruned from the MIBs), or if we did
4916 // not create any clones for it.
4917 if (!Node || Node->Clones.empty())
4918 continue;
4919 assert(Node->hasCall() &&
4920 "Not having a call should have prevented cloning");
4921
4922 // Track the assignment of function clones to clones of the current
4923 // callsite Node being handled.
4924 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
4925
4926 // Assign callsite version CallsiteClone to function version FuncClone,
4927 // and also assign (possibly cloned) Call to CallsiteClone.
4928 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
4929 CallInfo &Call,
4930 ContextNode *CallsiteClone,
4931 bool IsAlloc) {
4932 // Record the clone of callsite node assigned to this function clone.
4933 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
4934
4935 assert(FuncCloneInfos.size() > FuncClone.cloneNo());
4936 DenseMap<CallInfo, CallInfo> &CallMap =
4937 FuncCloneInfos[FuncClone.cloneNo()].CallMap;
4938 CallInfo CallClone(Call);
4939 if (auto It = CallMap.find(Call); It != CallMap.end())
4940 CallClone = It->second;
4941 CallsiteClone->setCall(CallClone);
4942 // Need to do the same for all matching calls.
4943 for (auto &MatchingCall : Node->MatchingCalls) {
4944 CallInfo CallClone(MatchingCall);
4945 if (auto It = CallMap.find(MatchingCall); It != CallMap.end())
4946 CallClone = It->second;
4947 // Updates the call in the list.
4948 MatchingCall = CallClone;
4949 }
4950 };
4951
4952 // Invokes moveEdgeToNewCalleeClone which creates a new clone, and then
4953 // performs the necessary fixups (removing none type edges, and
4954 // importantly, propagating any function call assignment of the original
4955 // node to the new clone).
4956 auto MoveEdgeToNewCalleeCloneAndSetUp =
4957 [&](const std::shared_ptr<ContextEdge> &Edge) {
4958 ContextNode *OrigCallee = Edge->Callee;
4959 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
4960 removeNoneTypeCalleeEdges(NewClone);
4961 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4962 // If the original Callee was already assigned to call a specific
4963 // function version, make sure its new clone is assigned to call
4964 // that same function clone.
4965 if (CallsiteToCalleeFuncCloneMap.count(OrigCallee))
4966 RecordCalleeFuncOfCallsite(
4967 NewClone, CallsiteToCalleeFuncCloneMap[OrigCallee]);
4968 return NewClone;
4969 };
4970
4971 // Keep track of the clones of callsite Node that need to be assigned to
4972 // function clones. This list may be expanded in the loop body below if we
4973 // find additional cloning is required.
4974 std::deque<ContextNode *> ClonesWorklist;
4975 // Ignore original Node if we moved all of its contexts to clones.
4976 if (!Node->emptyContextIds())
4977 ClonesWorklist.push_back(Node);
4978 llvm::append_range(ClonesWorklist, Node->Clones);
4979
4980 // Now walk through all of the clones of this callsite Node that we need,
4981 // and determine the assignment to a corresponding clone of the current
4982 // function (creating new function clones as needed).
4983 unsigned NodeCloneCount = 0;
4984 while (!ClonesWorklist.empty()) {
4985 ContextNode *Clone = ClonesWorklist.front();
4986 ClonesWorklist.pop_front();
4987 NodeCloneCount++;
4988 if (VerifyNodes)
4990
4991 // Need to create a new function clone if we have more callsite clones
4992 // than existing function clones, which would have been assigned to an
4993 // earlier clone in the list (we assign callsite clones to function
4994 // clones greedily).
4995 if (FuncCloneInfos.size() < NodeCloneCount) {
4996 // If this is the first callsite copy, assign to original function.
4997 if (NodeCloneCount == 1) {
4998 // Since FuncCloneInfos is empty in this case, no clones have
4999 // been created for this function yet, and no callers should have
5000 // been assigned a function clone for this callee node yet.
5002 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
5003 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
5004 }));
5005 // Initialize with empty call map, assign Clone to original function
5006 // and its callers, and skip to the next clone.
5007 FuncCloneInfos.push_back(
5008 {OrigFunc, DenseMap<CallInfo, CallInfo>()});
5009 AssignCallsiteCloneToFuncClone(
5010 OrigFunc, Call, Clone,
5011 AllocationCallToContextNodeMap.count(Call));
5012 for (auto &CE : Clone->CallerEdges) {
5013 // Ignore any caller that does not have a recorded callsite Call.
5014 if (!CE->Caller->hasCall())
5015 continue;
5016 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
5017 }
5018 continue;
5019 }
5020
5021 // First locate which copy of OrigFunc to clone again. If a caller
5022 // of this callsite clone was already assigned to call a particular
5023 // function clone, we need to redirect all of those callers to the
5024 // new function clone, and update their other callees within this
5025 // function.
5026 FuncInfo PreviousAssignedFuncClone;
5027 auto EI = llvm::find_if(
5028 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
5029 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
5030 });
5031 bool CallerAssignedToCloneOfFunc = false;
5032 if (EI != Clone->CallerEdges.end()) {
5033 const std::shared_ptr<ContextEdge> &Edge = *EI;
5034 PreviousAssignedFuncClone =
5035 CallsiteToCalleeFuncCloneMap[Edge->Caller];
5036 CallerAssignedToCloneOfFunc = true;
5037 }
5038
5039 // Clone function and save it along with the CallInfo map created
5040 // during cloning in the FuncCloneInfos.
5041 DenseMap<CallInfo, CallInfo> NewCallMap;
5042 unsigned CloneNo = FuncCloneInfos.size();
5043 assert(CloneNo > 0 && "Clone 0 is the original function, which "
5044 "should already exist in the map");
5045 FuncInfo NewFuncClone = cloneFunctionForCallsite(
5046 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
5047 FuncCloneInfos.push_back({NewFuncClone, std::move(NewCallMap)});
5048 FunctionClonesAnalysis++;
5049 Changed = true;
5050
5051 // If no caller callsites were already assigned to a clone of this
5052 // function, we can simply assign this clone to the new func clone
5053 // and update all callers to it, then skip to the next clone.
5054 if (!CallerAssignedToCloneOfFunc) {
5055 AssignCallsiteCloneToFuncClone(
5056 NewFuncClone, Call, Clone,
5057 AllocationCallToContextNodeMap.count(Call));
5058 for (auto &CE : Clone->CallerEdges) {
5059 // Ignore any caller that does not have a recorded callsite Call.
5060 if (!CE->Caller->hasCall())
5061 continue;
5062 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
5063 }
5064 continue;
5065 }
5066
5067 // We may need to do additional node cloning in this case.
5068 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
5069 // that were previously assigned to call PreviousAssignedFuncClone,
5070 // to record that they now call NewFuncClone.
5071 // The none type edge removal may remove some of this Clone's caller
5072 // edges, if it is reached via another of its caller's callees.
5073 // Iterate over a copy and skip any that were removed.
5074 auto CallerEdges = Clone->CallerEdges;
5075 for (auto CE : CallerEdges) {
5076 // Skip any that have been removed on an earlier iteration.
5077 if (CE->isRemoved()) {
5078 assert(!is_contained(Clone->CallerEdges, CE));
5079 continue;
5080 }
5081 assert(CE);
5082 // Ignore any caller that does not have a recorded callsite Call.
5083 if (!CE->Caller->hasCall())
5084 continue;
5085
5086 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
5087 // We subsequently fall through to later handling that
5088 // will perform any additional cloning required for
5089 // callers that were calling other function clones.
5090 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
5091 PreviousAssignedFuncClone)
5092 continue;
5093
5094 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
5095
5096 // If we are cloning a function that was already assigned to some
5097 // callers, then essentially we are creating new callsite clones
5098 // of the other callsites in that function that are reached by those
5099 // callers. Clone the other callees of the current callsite's caller
5100 // that were already assigned to PreviousAssignedFuncClone
5101 // accordingly. This is important since we subsequently update the
5102 // calls from the nodes in the graph and their assignments to callee
5103 // functions recorded in CallsiteToCalleeFuncCloneMap.
5104 // The none type edge removal may remove some of this caller's
5105 // callee edges, if it is reached via another of its callees.
5106 // Iterate over a copy and skip any that were removed.
5107 auto CalleeEdges = CE->Caller->CalleeEdges;
5108 for (auto CalleeEdge : CalleeEdges) {
5109 // Skip any that have been removed on an earlier iteration when
5110 // cleaning up newly None type callee edges.
5111 if (CalleeEdge->isRemoved()) {
5112 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
5113 continue;
5114 }
5115 assert(CalleeEdge);
5116 ContextNode *Callee = CalleeEdge->Callee;
5117 // Skip the current callsite, we are looking for other
5118 // callsites Caller calls, as well as any that does not have a
5119 // recorded callsite Call.
5120 if (Callee == Clone || !Callee->hasCall())
5121 continue;
5122 // Skip direct recursive calls. We don't need/want to clone the
5123 // caller node again, and this loop will not behave as expected if
5124 // we tried.
5125 if (Callee == CalleeEdge->Caller)
5126 continue;
5127 ContextNode *NewClone =
5128 MoveEdgeToNewCalleeCloneAndSetUp(CalleeEdge);
5129 // Moving the edge may have resulted in some none type
5130 // callee edges on the original Callee.
5131 removeNoneTypeCalleeEdges(Callee);
5132 // Update NewClone with the new Call clone of this callsite's Call
5133 // created for the new function clone created earlier.
5134 // Recall that we have already ensured when building the graph
5135 // that each caller can only call callsites within the same
5136 // function, so we are guaranteed that Callee Call is in the
5137 // current OrigFunc.
5138 // CallMap is set up as indexed by original Call at clone 0.
5139 CallInfo OrigCall(Callee->getOrigNode()->Call);
5140 OrigCall.setCloneNo(0);
5141 DenseMap<CallInfo, CallInfo> &CallMap =
5142 FuncCloneInfos[NewFuncClone.cloneNo()].CallMap;
5143 assert(CallMap.count(OrigCall));
5144 CallInfo NewCall(CallMap[OrigCall]);
5145 assert(NewCall);
5146 NewClone->setCall(NewCall);
5147 // Need to do the same for all matching calls.
5148 for (auto &MatchingCall : NewClone->MatchingCalls) {
5149 CallInfo OrigMatchingCall(MatchingCall);
5150 OrigMatchingCall.setCloneNo(0);
5151 assert(CallMap.count(OrigMatchingCall));
5152 CallInfo NewCall(CallMap[OrigMatchingCall]);
5153 assert(NewCall);
5154 // Updates the call in the list.
5155 MatchingCall = NewCall;
5156 }
5157 }
5158 }
5159 // Fall through to handling below to perform the recording of the
5160 // function for this callsite clone. This enables handling of cases
5161 // where the callers were assigned to different clones of a function.
5162 }
5163
5164 auto FindFirstAvailFuncClone = [&]() {
5165 // Find first function in FuncCloneInfos without an assigned
5166 // clone of this callsite Node. We should always have one
5167 // available at this point due to the earlier cloning when the
5168 // FuncCloneInfos size was smaller than the clone number.
5169 for (auto &CF : FuncCloneInfos) {
5170 if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone))
5171 return CF.FuncClone;
5172 }
5174 "Expected an available func clone for this callsite clone");
5175 };
5176
5177 // See if we can use existing function clone. Walk through
5178 // all caller edges to see if any have already been assigned to
5179 // a clone of this callsite's function. If we can use it, do so. If not,
5180 // because that function clone is already assigned to a different clone
5181 // of this callsite, then we need to clone again.
5182 // Basically, this checking is needed to handle the case where different
5183 // caller functions/callsites may need versions of this function
5184 // containing different mixes of callsite clones across the different
5185 // callsites within the function. If that happens, we need to create
5186 // additional function clones to handle the various combinations.
5187 //
5188 // Keep track of any new clones of this callsite created by the
5189 // following loop, as well as any existing clone that we decided to
5190 // assign this clone to.
5191 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
5192 FuncInfo FuncCloneAssignedToCurCallsiteClone;
5193 // Iterate over a copy of Clone's caller edges, since we may need to
5194 // remove edges in the moveEdgeTo* methods, and this simplifies the
5195 // handling and makes it less error-prone.
5196 auto CloneCallerEdges = Clone->CallerEdges;
5197 for (auto &Edge : CloneCallerEdges) {
5198 // Skip removed edges (due to direct recursive edges updated when
5199 // updating callee edges when moving an edge and subsequently
5200 // removed by call to removeNoneTypeCalleeEdges on the Clone).
5201 if (Edge->isRemoved())
5202 continue;
5203 // Ignore any caller that does not have a recorded callsite Call.
5204 if (!Edge->Caller->hasCall())
5205 continue;
5206 // If this caller already assigned to call a version of OrigFunc, need
5207 // to ensure we can assign this callsite clone to that function clone.
5208 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
5209 FuncInfo FuncCloneCalledByCaller =
5210 CallsiteToCalleeFuncCloneMap[Edge->Caller];
5211 // First we need to confirm that this function clone is available
5212 // for use by this callsite node clone.
5213 //
5214 // While FuncCloneToCurNodeCloneMap is built only for this Node and
5215 // its callsite clones, one of those callsite clones X could have
5216 // been assigned to the same function clone called by Edge's caller
5217 // - if Edge's caller calls another callsite within Node's original
5218 // function, and that callsite has another caller reaching clone X.
5219 // We need to clone Node again in this case.
5220 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
5221 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
5222 Clone) ||
5223 // Detect when we have multiple callers of this callsite that
5224 // have already been assigned to specific, and different, clones
5225 // of OrigFunc (due to other unrelated callsites in Func they
5226 // reach via call contexts). Is this Clone of callsite Node
5227 // assigned to a different clone of OrigFunc? If so, clone Node
5228 // again.
5229 (FuncCloneAssignedToCurCallsiteClone &&
5230 FuncCloneAssignedToCurCallsiteClone !=
5231 FuncCloneCalledByCaller)) {
5232 // We need to use a different newly created callsite clone, in
5233 // order to assign it to another new function clone on a
5234 // subsequent iteration over the Clones array (adjusted below).
5235 // Note we specifically do not reset the
5236 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
5237 // when this new clone is processed later we know which version of
5238 // the function to copy (so that other callsite clones we have
5239 // assigned to that function clone are properly cloned over). See
5240 // comments in the function cloning handling earlier.
5241
5242 // Check if we already have cloned this callsite again while
5243 // walking through caller edges, for a caller calling the same
5244 // function clone. If so, we can move this edge to that new clone
5245 // rather than creating yet another new clone.
5246 if (FuncCloneToNewCallsiteCloneMap.count(
5247 FuncCloneCalledByCaller)) {
5248 ContextNode *NewClone =
5249 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
5250 moveEdgeToExistingCalleeClone(Edge, NewClone);
5251 // Cleanup any none type edges cloned over.
5252 removeNoneTypeCalleeEdges(NewClone);
5253 } else {
5254 // Create a new callsite clone.
5255 ContextNode *NewClone = MoveEdgeToNewCalleeCloneAndSetUp(Edge);
5256 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
5257 NewClone;
5258 // Add to list of clones and process later.
5259 ClonesWorklist.push_back(NewClone);
5260 }
5261 // Moving the caller edge may have resulted in some none type
5262 // callee edges.
5263 removeNoneTypeCalleeEdges(Clone);
5264 // We will handle the newly created callsite clone in a subsequent
5265 // iteration over this Node's Clones.
5266 continue;
5267 }
5268
5269 // Otherwise, we can use the function clone already assigned to this
5270 // caller.
5271 if (!FuncCloneAssignedToCurCallsiteClone) {
5272 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
5273 // Assign Clone to FuncCloneCalledByCaller
5274 AssignCallsiteCloneToFuncClone(
5275 FuncCloneCalledByCaller, Call, Clone,
5276 AllocationCallToContextNodeMap.count(Call));
5277 } else
5278 // Don't need to do anything - callsite is already calling this
5279 // function clone.
5280 assert(FuncCloneAssignedToCurCallsiteClone ==
5281 FuncCloneCalledByCaller);
5282
5283 } else {
5284 // We have not already assigned this caller to a version of
5285 // OrigFunc. Do the assignment now.
5286
5287 // First check if we have already assigned this callsite clone to a
5288 // clone of OrigFunc for another caller during this iteration over
5289 // its caller edges.
5290 if (!FuncCloneAssignedToCurCallsiteClone) {
5291 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5292 assert(FuncCloneAssignedToCurCallsiteClone);
5293 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
5294 AssignCallsiteCloneToFuncClone(
5295 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5296 AllocationCallToContextNodeMap.count(Call));
5297 } else
5298 assert(FuncCloneToCurNodeCloneMap
5299 [FuncCloneAssignedToCurCallsiteClone] == Clone);
5300 // Update callers to record function version called.
5301 RecordCalleeFuncOfCallsite(Edge->Caller,
5302 FuncCloneAssignedToCurCallsiteClone);
5303 }
5304 }
5305 // If we didn't assign a function clone to this callsite clone yet, e.g.
5306 // none of its callers has a non-null call, do the assignment here.
5307 // We want to ensure that every callsite clone is assigned to some
5308 // function clone, so that the call updates below work as expected.
5309 // In particular if this is the original callsite, we want to ensure it
5310 // is assigned to the original function, otherwise the original function
5311 // will appear available for assignment to other callsite clones,
5312 // leading to unintended effects. For one, the unknown and not updated
5313 // callers will call into cloned paths leading to the wrong hints,
5314 // because they still call the original function (clone 0). Also,
5315 // because all callsites start out as being clone 0 by default, we can't
5316 // easily distinguish between callsites explicitly assigned to clone 0
5317 // vs those never assigned, which can lead to multiple updates of the
5318 // calls when invoking updateCall below, with mismatched clone values.
5319 // TODO: Add a flag to the callsite nodes or some other mechanism to
5320 // better distinguish and identify callsite clones that are not getting
5321 // assigned to function clones as expected.
5322 if (!FuncCloneAssignedToCurCallsiteClone) {
5323 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5324 assert(FuncCloneAssignedToCurCallsiteClone &&
5325 "No available func clone for this callsite clone");
5326 AssignCallsiteCloneToFuncClone(
5327 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5328 /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call));
5329 }
5330 }
5331 if (VerifyCCG) {
5333 for (const auto &PE : Node->CalleeEdges)
5335 for (const auto &CE : Node->CallerEdges)
5337 for (auto *Clone : Node->Clones) {
5339 for (const auto &PE : Clone->CalleeEdges)
5341 for (const auto &CE : Clone->CallerEdges)
5343 }
5344 }
5345 }
5346
5347 if (FuncCloneInfos.size() < 2)
5348 continue;
5349
5350 // In this case there is more than just the original function copy.
5351 // Record call clones of any callsite nodes in the function that did not
5352 // themselves get cloned for all of the function clones.
5353 for (auto &Call : CallsWithMetadata) {
5354 ContextNode *Node = getNodeForInst(Call);
5355 if (!Node || !Node->hasCall() || Node->emptyContextIds())
5356 continue;
5357 // If Node has enough clones already to cover all function clones, we can
5358 // skip it. Need to add one for the original copy.
5359 // Use >= in case there were clones that were skipped due to having empty
5360 // context ids
5361 if (Node->Clones.size() + 1 >= FuncCloneInfos.size())
5362 continue;
5363 // First collect all function clones we cloned this callsite node for.
5364 // They may not be sequential due to empty clones e.g.
5365 DenseSet<unsigned> NodeCallClones;
5366 for (auto *C : Node->Clones)
5367 NodeCallClones.insert(C->Call.cloneNo());
5368 unsigned I = 0;
5369 // Now check all the function clones.
5370 for (auto &FC : FuncCloneInfos) {
5371 // Function clones should be sequential.
5372 assert(FC.FuncClone.cloneNo() == I);
5373 // Skip the first clone which got the original call.
5374 // Also skip any other clones created for this Node.
5375 if (++I == 1 || NodeCallClones.contains(I)) {
5376 continue;
5377 }
5378 // Record the call clones created for this callsite in this function
5379 // clone.
5380 auto &CallVector = UnassignedCallClones[Node][I];
5381 DenseMap<CallInfo, CallInfo> &CallMap = FC.CallMap;
5382 if (auto It = CallMap.find(Call); It != CallMap.end()) {
5383 CallInfo CallClone = It->second;
5384 CallVector.push_back(CallClone);
5385 } else {
5386 // All but the original clone (skipped earlier) should have an entry
5387 // for all calls.
5388 assert(false && "Expected to find call in CallMap");
5389 }
5390 // Need to do the same for all matching calls.
5391 for (auto &MatchingCall : Node->MatchingCalls) {
5392 if (auto It = CallMap.find(MatchingCall); It != CallMap.end()) {
5393 CallInfo CallClone = It->second;
5394 CallVector.push_back(CallClone);
5395 } else {
5396 // All but the original clone (skipped earlier) should have an entry
5397 // for all calls.
5398 assert(false && "Expected to find call in CallMap");
5399 }
5400 }
5401 }
5402 }
5403 }
5404
5405 uint8_t BothTypes =
5406 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
5407
5408 auto UpdateCalls = [&](ContextNode *Node,
5409 DenseSet<const ContextNode *> &Visited,
5410 auto &&UpdateCalls) {
5411 auto Inserted = Visited.insert(Node);
5412 if (!Inserted.second)
5413 return;
5414
5415 for (auto *Clone : Node->Clones)
5416 UpdateCalls(Clone, Visited, UpdateCalls);
5417
5418 for (auto &Edge : Node->CallerEdges)
5419 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
5420
5421 // Skip if either no call to update, or if we ended up with no context ids
5422 // (we moved all edges onto other clones).
5423 if (!Node->hasCall() || Node->emptyContextIds())
5424 return;
5425
5426 if (Node->IsAllocation) {
5427 auto AT = allocTypeToUse(Node->AllocTypes);
5428 // If the allocation type is ambiguous, and more aggressive hinting
5429 // has been enabled via the MinClonedColdBytePercent flag, see if this
5430 // allocation should be hinted cold anyway because its fraction cold bytes
5431 // allocated is at least the given threshold.
5432 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
5433 !ContextIdToContextSizeInfos.empty()) {
5434 uint64_t TotalCold = 0;
5435 uint64_t Total = 0;
5436 for (auto Id : Node->getContextIds()) {
5437 auto TypeI = ContextIdToAllocationType.find(Id);
5438 assert(TypeI != ContextIdToAllocationType.end());
5439 auto CSI = ContextIdToContextSizeInfos.find(Id);
5440 if (CSI != ContextIdToContextSizeInfos.end()) {
5441 for (auto &Info : CSI->second) {
5442 Total += Info.TotalSize;
5443 if (TypeI->second == AllocationType::Cold)
5444 TotalCold += Info.TotalSize;
5445 }
5446 }
5447 }
5448 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
5449 AT = AllocationType::Cold;
5450 }
5451 updateAllocationCall(Node->Call, AT);
5452 assert(Node->MatchingCalls.empty());
5453 return;
5454 }
5455
5456 if (!CallsiteToCalleeFuncCloneMap.count(Node))
5457 return;
5458
5459 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
5460 updateCall(Node->Call, CalleeFunc);
5461 // Update all the matching calls as well.
5462 for (auto &Call : Node->MatchingCalls)
5463 updateCall(Call, CalleeFunc);
5464
5465 // Now update all calls recorded earlier that are still in function clones
5466 // which don't have a clone of this callsite node.
5467 if (!UnassignedCallClones.contains(Node))
5468 return;
5469 DenseSet<unsigned> NodeCallClones;
5470 for (auto *C : Node->Clones)
5471 NodeCallClones.insert(C->Call.cloneNo());
5472 // Note that we already confirmed Node is in this map a few lines above.
5473 auto &ClonedCalls = UnassignedCallClones[Node];
5474 for (auto &[CloneNo, CallVector] : ClonedCalls) {
5475 // Should start at 1 as we never create an entry for original node.
5476 assert(CloneNo > 0);
5477 // If we subsequently created a clone, skip this one.
5478 if (NodeCallClones.contains(CloneNo))
5479 continue;
5480 // Use the original Node's CalleeFunc.
5481 for (auto &Call : CallVector)
5482 updateCall(Call, CalleeFunc);
5483 }
5484 };
5485
5486 // Performs DFS traversal starting from allocation nodes to update calls to
5487 // reflect cloning decisions recorded earlier. For regular LTO this will
5488 // update the actual calls in the IR to call the appropriate function clone
5489 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
5490 // are recorded in the summary entries.
5491 DenseSet<const ContextNode *> Visited;
5492 for (auto &Entry : AllocationCallToContextNodeMap)
5493 UpdateCalls(Entry.second, Visited, UpdateCalls);
5494
5495 return Changed;
5496}
5497
5498// Compute a SHA1 hash of the callsite and alloc version information of clone I
5499// in the summary, to use in detection of duplicate clones.
// NOTE(review): the signature line was lost in extraction (source line 5500);
// upstream this is a static helper taking the FunctionSummary *FS and clone
// index I and returning the 64-bit truncated hash -- confirm against the
// original source.
5501 SHA1 Hasher;
5502 // Update hash with any callsites that call non-default (non-zero) callee
5503 // versions.
5504 for (auto &SN : FS->callsites()) {
5505 // In theory all callsites and allocs in this function should have the same
5506 // number of clone entries, but handle any discrepancies gracefully below
5507 // for NDEBUG builds.
5508 assert(
5509 SN.Clones.size() > I &&
5510 "Callsite summary has fewer entries than other summaries in function");
// A zero entry means clone I calls the original (default) version of its
// callee; skip it so default entries do not perturb the hash.
5511 if (SN.Clones.size() <= I || !SN.Clones[I])
5512 continue;
// Serialize the callee version number as 4 little-endian bytes so the hash
// is stable across host endianness.
5513 uint8_t Data[sizeof(SN.Clones[I])];
5514 support::endian::write32le(Data, SN.Clones[I]);
5515 Hasher.update(Data);
5516 }
5517 // Update hash with any allocs that have non-default (non-None) hints.
5518 for (auto &AN : FS->allocs()) {
5519 // In theory all callsites and allocs in this function should have the same
5520 // number of clone entries, but handle any discrepancies gracefully below
5521 // for NDEBUG builds.
5522 assert(AN.Versions.size() > I &&
5523 "Alloc summary has fewer entries than other summaries in function");
5524 if (AN.Versions.size() <= I ||
5525 (AllocationType)AN.Versions[I] == AllocationType::None)
5526 continue;
// Fold the one-byte allocation type hint for this version into the hash.
5527 Hasher.update(ArrayRef<uint8_t>(&AN.Versions[I], 1));
5528 }
// Truncate the SHA1 digest to its leading 8 bytes, read as a little-endian
// 64-bit value, which is sufficient for duplicate detection.
5529 return support::endian::read64le(Hasher.result().data());
5530}
5531
// Creates NumClones - 1 clones of F (the original copy counts as "clone" 0),
// returning one ValueToValueMapTy per new clone. Clones whose callsite/alloc
// version info hashes identically to an already-instantiated clone are not
// given a body; they become aliases (or declarations for available_externally)
// and their returned VMap stays empty.
// NOTE(review): the return-type/signature line (source line 5532) and the
// FuncToAliasMap map type (5534) were lost in extraction -- confirm against
// the original source.
5533 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
5535 &FuncToAliasMap,
5536 FunctionSummary *FS) {
// Give NewGV the declaration's name and redirect all of the declaration's
// uses to NewGV, then remove the now-unreferenced declaration.
5537 auto TakeDeclNameAndReplace = [](GlobalValue *DeclGV, GlobalValue *NewGV) {
5538 // We might have created this when adjusting callsite in another
5539 // function. It should be a declaration.
5540 assert(DeclGV->isDeclaration());
5541 NewGV->takeName(DeclGV);
5542 DeclGV->replaceAllUsesWith(NewGV);
5543 DeclGV->eraseFromParent();
5544 };
5545
5546 // Handle aliases to this function, and create analogous alias clones to the
5547 // provided clone of this function.
5548 auto CloneFuncAliases = [&](Function *NewF, unsigned I) {
5549 if (!FuncToAliasMap.count(&F))
5550 return;
5551 for (auto *A : FuncToAliasMap[&F]) {
// The alias clone gets the memprof-suffixed name for clone number I.
5552 std::string AliasName = getMemProfFuncName(A->getName(), I);
5553 auto *PrevA = M.getNamedAlias(AliasName);
5554 auto *NewA = GlobalAlias::create(A->getValueType(),
5555 A->getType()->getPointerAddressSpace(),
5556 A->getLinkage(), AliasName, NewF);
5557 NewA->copyAttributesFrom(A);
// If a declaration with this name already existed (created when updating a
// callsite elsewhere), steal its name and redirect its uses.
5558 if (PrevA)
5559 TakeDeclNameAndReplace(PrevA, NewA);
5560 }
5561 };
5562
5563 // The first "clone" is the original copy, we should only call this if we
5564 // needed to create new clones.
5565 assert(NumClones > 1);
// NOTE(review): the VMaps declaration line (source line 5566) was lost in
// extraction -- confirm against the original source.
5567 VMaps.reserve(NumClones - 1);
5568 FunctionsClonedThinBackend++;
5569
5570 // Map of hash of callsite/alloc versions to the instantiated function clone
5571 // (possibly the original) implementing those calls. Used to avoid
5572 // instantiating duplicate function clones.
5573 // FIXME: Ideally the thin link would not generate such duplicate clones to
5574 // start with, but right now it happens due to phase ordering in the function
5575 // assignment and possible new clones that produces. We simply make each
5576 // duplicate an alias to the matching instantiated clone recorded in the map
5577 // (except for available_externally which are made declarations as they would
5578 // be aliases in the prevailing module, and available_externally aliases are
5579 // not well supported right now).
// NOTE(review): the HashToFunc declaration line (source line 5580) was lost
// in extraction -- confirm against the original source.
5581
5582 // Save the hash of the original function version.
5583 HashToFunc[ComputeHash(FS, 0)] = &F;
5584
5585 for (unsigned I = 1; I < NumClones; I++) {
// Each new clone gets its own value map; for duplicate clones the map is
// left empty (callers use an empty VMap to detect alias/decl clones).
5586 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
5587 std::string Name = getMemProfFuncName(F.getName(), I);
5588 auto Hash = ComputeHash(FS, I);
5589 // If this clone would duplicate a previously seen clone, don't generate the
5590 // duplicate clone body, just make an alias to satisfy any (potentially
5591 // cross-module) references.
5592 if (HashToFunc.contains(Hash)) {
5593 FunctionCloneDuplicatesThinBackend++;
5594 auto *Func = HashToFunc[Hash];
5595 if (Func->hasAvailableExternallyLinkage()) {
5596 // Skip these as EliminateAvailableExternallyPass does not handle
5597 // available_externally aliases correctly and we end up with an
5598 // available_externally alias to a declaration. Just create a
5599 // declaration for now as we know we will have a definition in another
5600 // module.
5601 auto Decl = M.getOrInsertFunction(Name, Func->getFunctionType());
5602 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5603 << "created clone decl " << ore::NV("Decl", Decl.getCallee()));
5604 continue;
5605 }
5606 auto *PrevF = M.getFunction(Name);
5607 auto *Alias = GlobalAlias::create(Name, Func);
5608 if (PrevF)
5609 TakeDeclNameAndReplace(PrevF, Alias);
5610 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5611 << "created clone alias " << ore::NV("Alias", Alias));
5612
5613 // Now handle aliases to this function, and clone those as well.
5614 CloneFuncAliases(Func, I);
5615 continue;
5616 }
// Not a duplicate: materialize a real clone body.
5617 auto *NewF = CloneFunction(&F, *VMaps.back());
5618 HashToFunc[Hash] = NewF;
5619 FunctionClonesThinBackend++;
5620 // Strip memprof and callsite metadata from clone as they are no longer
5621 // needed.
5622 for (auto &BB : *NewF) {
5623 for (auto &Inst : BB) {
5624 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
5625 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
5626 }
5627 }
// If a declaration with the clone's name already existed, take over its
// name and uses; otherwise simply rename the fresh clone.
5628 auto *PrevF = M.getFunction(Name);
5629 if (PrevF)
5630 TakeDeclNameAndReplace(PrevF, NewF);
5631 else
5632 NewF->setName(Name);
// Keep the debug-info subprogram linkage name in sync with the new name.
5633 updateSubprogramLinkageName(NewF, Name);
5634 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5635 << "created clone " << ore::NV("NewFunction", NewF));
5636
5637 // Now handle aliases to this function, and clone those as well.
5638 CloneFuncAliases(NewF, I);
5639 }
5640 return VMaps;
5641}
5642
5643// Locate the summary for F. This is complicated by the fact that it might
5644// have been internalized or promoted.
// NOTE(review): the signature line (source line 5645) was lost in extraction;
// the visible parameter lines below show it takes the Function, Module,
// ImportSummary, and an optional CallingFunc, and it returns a ValueInfo --
// confirm the exact signature against the original source.
5646 const ModuleSummaryIndex *ImportSummary,
5647 const Function *CallingFunc = nullptr) {
5648 // FIXME: Ideally we would retain the original GUID in some fashion on the
5649 // function (e.g. as metadata), but for now do our best to locate the
5650 // summary without that information.
// First attempt: look up by the function's current GUID.
5651 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
5652 if (!TheFnVI)
5653 // See if theFn was internalized, by checking index directly with
5654 // original name (this avoids the name adjustment done by getGUID() for
5655 // internal symbols).
5656 TheFnVI = ImportSummary->getValueInfo(
// NOTE(review): the GUID argument expression on the call above (source line
// 5657) was lost in extraction -- confirm against the original source.
5658 if (TheFnVI)
5659 return TheFnVI;
5660 // Now query with the original name before any promotion was performed.
5661 StringRef OrigName =
// NOTE(review): the initializer for OrigName (source line 5662, which strips
// any promotion suffix from F's name) was lost in extraction -- confirm.
5663 // When this pass is enabled, we always add thinlto_src_file provenance
5664 // metadata to imported function definitions, which allows us to recreate the
5665 // original internal symbol's GUID.
5666 auto SrcFileMD = F.getMetadata("thinlto_src_file");
5667 // If this is a call to an imported/promoted local for which we didn't import
5668 // the definition, the metadata will not exist on the declaration. However,
5669 // since we are doing this early, before any inlining in the LTO backend, we
5670 // can simply look at the metadata on the calling function which must have
5671 // been from the same module if F was an internal symbol originally.
5672 if (!SrcFileMD && F.isDeclaration()) {
5673 // We would only call this for a declaration for a direct callsite, in which
5674 // case the caller would have provided the calling function pointer.
5675 assert(CallingFunc);
5676 SrcFileMD = CallingFunc->getMetadata("thinlto_src_file");
5677 // If this is a promoted local (OrigName != F.getName()), since this is a
5678 // declaration, it must be imported from a different module and therefore we
5679 // should always find the metadata on its calling function. Any call to a
5680 // promoted local that came from this module should still be a definition.
5681 assert(SrcFileMD || OrigName == F.getName());
5682 }
// Default to this module's source file; prefer the provenance metadata when
// present since the symbol may have been imported from another module.
5683 StringRef SrcFile = M.getSourceFileName();
5684 if (SrcFileMD)
5685 SrcFile = dyn_cast<MDString>(SrcFileMD->getOperand(0))->getString();
// Rebuild the original internal symbol's global identifier (name + source
// file) and retry the index lookup with it.
5686 std::string OrigId = GlobalValue::getGlobalIdentifier(
5687 OrigName, GlobalValue::InternalLinkage, SrcFile);
5688 TheFnVI = ImportSummary->getValueInfo(
// NOTE(review): the GUID argument expression on the call above (source line
// 5689) was lost in extraction -- confirm against the original source.
5690 // Internal func in original module may have gotten a numbered suffix if we
5691 // imported an external function with the same name. This happens
5692 // automatically during IR linking for naming conflicts. It would have to
5693 // still be internal in that case (otherwise it would have been renamed on
5694 // promotion in which case we wouldn't have a naming conflict).
5695 if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
5696 F.getName().contains('.')) {
// Retry with the suffix-stripped root name.
5697 OrigName = F.getName().rsplit('.').first;
// NOTE(review): the statement head rebuilding OrigId (source line 5698) was
// lost in extraction; the argument line below belongs to it -- confirm.
5699 OrigName, GlobalValue::InternalLinkage, SrcFile);
5700 TheFnVI = ImportSummary->getValueInfo(
// NOTE(review): the GUID argument expression on the call above (source line
// 5701) was lost in extraction -- confirm against the original source.
5702 }
5703 // The only way we may not have a VI is if this is a declaration created for
5704 // an imported reference. For distributed ThinLTO we may not have a VI for
5705 // such declarations in the distributed summary.
5706 assert(TheFnVI || F.isDeclaration());
5707 return TheFnVI;
5708}
5709
5710bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
5711 Module &M) {
5712 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
5713 Symtab = std::make_unique<InstrProfSymtab>();
5714 // Don't add canonical names, to avoid multiple functions to the symtab
5715 // when they both have the same root name with "." suffixes stripped.
5716 // If we pick the wrong one then this could lead to incorrect ICP and calling
5717 // a memprof clone that we don't actually create (resulting in linker unsats).
5718 // What this means is that the GUID of the function (or its PGOFuncName
5719 // metadata) *must* match that in the VP metadata to allow promotion.
5720 // In practice this should not be a limitation, since local functions should
5721 // have PGOFuncName metadata and global function names shouldn't need any
5722 // special handling (they should not get the ".llvm.*" suffix that the
5723 // canonicalization handling is attempting to strip).
5724 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
5725 std::string SymtabFailure = toString(std::move(E));
5726 M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
5727 return false;
5728 }
5729 return true;
5730}
5731
5732#ifndef NDEBUG
5733// Sanity check that the MIB stack ids match between the summary and
5734// instruction metadata.
// NOTE(review): the signature line (source line 5735) was lost in extraction;
// the parameter lines below show it takes the summary AllocInfo, the memprof
// MDNode, the callsite context, and the import summary -- confirm against the
// original source.
5736 const AllocInfo &AllocNode, const MDNode *MemProfMD,
5737 const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext,
5738 const ModuleSummaryIndex *ImportSummary) {
// Walk the summary MIBs in parallel with the metadata MIB operands; they are
// expected to be in the same order.
5739 auto MIBIter = AllocNode.MIBs.begin();
5740 for (auto &MDOp : MemProfMD->operands()) {
5741 assert(MIBIter != AllocNode.MIBs.end());
5742 auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
5743 auto *MIBMD = cast<const MDNode>(MDOp);
5744 MDNode *StackMDNode = getMIBStackNode(MIBMD);
5745 assert(StackMDNode);
5746 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
// Frames shared with the callsite context are not recorded in the summary's
// stack id indices, so start the comparison after the shared prefix.
5747 auto ContextIterBegin =
5748 StackContext.beginAfterSharedPrefix(CallsiteContext);
5749 // Skip the checking on the first iteration.
// Seed LastStackContextId with a value guaranteed to differ from the first
// frame's id (1 if the first id is 0, else 0), so the direct-recursion
// dedup below can never fire on the first frame.
5750 uint64_t LastStackContextId =
5751 (ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1
5752 : 0;
5753 for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end();
5754 ++ContextIter) {
5755 // If this is a direct recursion, simply skip the duplicate
5756 // entries, to be consistent with how the summary ids were
5757 // generated during ModuleSummaryAnalysis.
5758 if (LastStackContextId == *ContextIter)
5759 continue;
5760 LastStackContextId = *ContextIter;
5761 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
// The summary stores indices into the index's stack id table; resolve and
// compare against the metadata's frame id.
5762 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
5763 *ContextIter);
5764 StackIdIndexIter++;
5765 }
5766 MIBIter++;
5767 }
5768}
5769#endif
5770
5771bool MemProfContextDisambiguation::applyImport(Module &M) {
5772 assert(ImportSummary);
5773 bool Changed = false;
5774
5775 // We also need to clone any aliases that reference cloned functions, because
5776 // the modified callsites may invoke via the alias. Keep track of the aliases
5777 // for each function.
5778 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5779 FuncToAliasMap;
5780 for (auto &A : M.aliases()) {
5781 auto *Aliasee = A.getAliaseeObject();
5782 if (auto *F = dyn_cast<Function>(Aliasee))
5783 FuncToAliasMap[F].insert(&A);
5784 }
5785
5786 if (!initializeIndirectCallPromotionInfo(M))
5787 return false;
5788
5789 for (auto &F : M) {
5790 if (F.isDeclaration() || isMemProfClone(F))
5791 continue;
5792
5793 OptimizationRemarkEmitter ORE(&F);
5794
5796 bool ClonesCreated = false;
5797 unsigned NumClonesCreated = 0;
5798 auto CloneFuncIfNeeded = [&](unsigned NumClones, FunctionSummary *FS) {
5799 // We should at least have version 0 which is the original copy.
5800 assert(NumClones > 0);
5801 // If only one copy needed use original.
5802 if (NumClones == 1)
5803 return;
5804 // If we already performed cloning of this function, confirm that the
5805 // requested number of clones matches (the thin link should ensure the
5806 // number of clones for each constituent callsite is consistent within
5807 // each function), before returning.
5808 if (ClonesCreated) {
5809 assert(NumClonesCreated == NumClones);
5810 return;
5811 }
5812 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap, FS);
5813 // The first "clone" is the original copy, which doesn't have a VMap.
5814 assert(VMaps.size() == NumClones - 1);
5815 Changed = true;
5816 ClonesCreated = true;
5817 NumClonesCreated = NumClones;
5818 };
5819
5820 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
5821 Function *CalledFunction, FunctionSummary *FS) {
5822 // Perform cloning if not yet done.
5823 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size(), FS);
5824
5825 assert(!isMemProfClone(*CalledFunction));
5826
5827 // Because we update the cloned calls by calling setCalledOperand (see
5828 // comment below), out of an abundance of caution make sure the called
5829 // function was actually the called operand (or its aliasee). We also
5830 // strip pointer casts when looking for calls (to match behavior during
5831 // summary generation), however, with opaque pointers in theory this
5832 // should not be an issue. Note we still clone the current function
5833 // (containing this call) above, as that could be needed for its callers.
5834 auto *GA = dyn_cast_or_null<GlobalAlias>(CB->getCalledOperand());
5835 if (CalledFunction != CB->getCalledOperand() &&
5836 (!GA || CalledFunction != GA->getAliaseeObject())) {
5837 SkippedCallsCloning++;
5838 return;
5839 }
5840 // Update the calls per the summary info.
5841 // Save orig name since it gets updated in the first iteration
5842 // below.
5843 auto CalleeOrigName = CalledFunction->getName();
5844 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
5845 // If the VMap is empty, this clone was a duplicate of another and was
5846 // created as an alias or a declaration.
5847 if (J > 0 && VMaps[J - 1]->empty())
5848 continue;
5849 // Do nothing if this version calls the original version of its
5850 // callee.
5851 if (!StackNode.Clones[J])
5852 continue;
5853 auto NewF = M.getOrInsertFunction(
5854 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
5855 CalledFunction->getFunctionType());
5856 CallBase *CBClone;
5857 // Copy 0 is the original function.
5858 if (!J)
5859 CBClone = CB;
5860 else
5861 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5862 // Set the called operand directly instead of calling setCalledFunction,
5863 // as the latter mutates the function type on the call. In rare cases
5864 // we may have a slightly different type on a callee function
5865 // declaration due to it being imported from a different module with
5866 // incomplete types. We really just want to change the name of the
5867 // function to the clone, and not make any type changes.
5868 CBClone->setCalledOperand(NewF.getCallee());
5869 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5870 << ore::NV("Call", CBClone) << " in clone "
5871 << ore::NV("Caller", CBClone->getFunction())
5872 << " assigned to call function clone "
5873 << ore::NV("Callee", NewF.getCallee()));
5874 }
5875 };
5876
5877 // Locate the summary for F.
5878 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
5879 // If not found, this could be an imported local (see comment in
5880 // findValueInfoForFunc). Skip for now as it will be cloned in its original
5881 // module (where it would have been promoted to global scope so should
5882 // satisfy any reference in this module).
5883 if (!TheFnVI)
5884 continue;
5885
5886 auto *GVSummary =
5887 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
5888 if (!GVSummary) {
5889 // Must have been imported, use the summary which matches the definition
5890 // (might be multiple if this was a linkonce_odr).
5891 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
5892 assert(SrcModuleMD &&
5893 "enable-import-metadata is needed to emit thinlto_src_module");
5894 StringRef SrcModule =
5895 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
5896 for (auto &GVS : TheFnVI.getSummaryList()) {
5897 if (GVS->modulePath() == SrcModule) {
5898 GVSummary = GVS.get();
5899 break;
5900 }
5901 }
5902 assert(GVSummary && GVSummary->modulePath() == SrcModule);
5903 }
5904
5905 // If this was an imported alias skip it as we won't have the function
5906 // summary, and it should be cloned in the original module.
5907 if (isa<AliasSummary>(GVSummary))
5908 continue;
5909
5910 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
5911
5912 if (FS->allocs().empty() && FS->callsites().empty())
5913 continue;
5914
5915 auto SI = FS->callsites().begin();
5916 auto AI = FS->allocs().begin();
5917
5918 // To handle callsite infos synthesized for tail calls which have missing
5919 // frames in the profiled context, map callee VI to the synthesized callsite
5920 // info.
5921 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
5922 // Iterate the callsites for this function in reverse, since we place all
5923 // those synthesized for tail calls at the end.
5924 for (auto CallsiteIt = FS->callsites().rbegin();
5925 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
5926 auto &Callsite = *CallsiteIt;
5927 // Stop as soon as we see a non-synthesized callsite info (see comment
5928 // above loop). All the entries added for discovered tail calls have empty
5929 // stack ids.
5930 if (!Callsite.StackIdIndices.empty())
5931 break;
5932 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
5933 }
5934
5935 // Keeps track of needed ICP for the function.
5936 SmallVector<ICallAnalysisData> ICallAnalysisInfo;
5937
5938 // Assume for now that the instructions are in the exact same order
5939 // as when the summary was created, but confirm this is correct by
5940 // matching the stack ids.
5941 for (auto &BB : F) {
5942 for (auto &I : BB) {
5943 auto *CB = dyn_cast<CallBase>(&I);
5944 // Same handling as when creating module summary.
5945 if (!mayHaveMemprofSummary(CB))
5946 continue;
5947
5948 auto *CalledValue = CB->getCalledOperand();
5949 auto *CalledFunction = CB->getCalledFunction();
5950 if (CalledValue && !CalledFunction) {
5951 CalledValue = CalledValue->stripPointerCasts();
5952 // Stripping pointer casts can reveal a called function.
5953 CalledFunction = dyn_cast<Function>(CalledValue);
5954 }
5955 // Check if this is an alias to a function. If so, get the
5956 // called aliasee for the checks below.
5957 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
5958 assert(!CalledFunction &&
5959 "Expected null called function in callsite for alias");
5960 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
5961 }
5962
5963 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
5964 I.getMetadata(LLVMContext::MD_callsite));
5965 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
5966
5967 // Include allocs that were already assigned a memprof function
5968 // attribute in the statistics. Only do this for those that do not have
5969 // memprof metadata, since we add an "ambiguous" memprof attribute by
5970 // default.
5971 if (CB->getAttributes().hasFnAttr("memprof") && !MemProfMD) {
5972 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
5973 ? AllocTypeColdThinBackend++
5974 : AllocTypeNotColdThinBackend++;
5975 OrigAllocsThinBackend++;
5976 AllocVersionsThinBackend++;
5977 if (!MaxAllocVersionsThinBackend)
5978 MaxAllocVersionsThinBackend = 1;
5979 continue;
5980 }
5981
5982 if (MemProfMD) {
5983 // Consult the next alloc node.
5984 assert(AI != FS->allocs().end());
5985 auto &AllocNode = *(AI++);
5986
5987#ifndef NDEBUG
5988 checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext,
5989 ImportSummary);
5990#endif
5991
5992 // Perform cloning if not yet done.
5993 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size(), FS);
5994
5995 OrigAllocsThinBackend++;
5996 AllocVersionsThinBackend += AllocNode.Versions.size();
5997 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
5998 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
5999
6000 // If there is only one version that means we didn't end up
6001 // considering this function for cloning, and in that case the alloc
6002 // will still be none type or should have gotten the default NotCold.
6003 // Skip that after calling clone helper since that does some sanity
6004 // checks that confirm we haven't decided yet that we need cloning.
6005 // We might have a single version that is cold due to the
6006 // MinClonedColdBytePercent heuristic, make sure we don't skip in that
6007 // case.
6008 if (AllocNode.Versions.size() == 1 &&
6009 (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
6010 assert((AllocationType)AllocNode.Versions[0] ==
6011 AllocationType::NotCold ||
6012 (AllocationType)AllocNode.Versions[0] ==
6013 AllocationType::None);
6014 UnclonableAllocsThinBackend++;
6015 continue;
6016 }
6017
6018 // All versions should have a singular allocation type.
6019 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
6020 return Type == ((uint8_t)AllocationType::NotCold |
6021 (uint8_t)AllocationType::Cold);
6022 }));
6023
6024 // Update the allocation types per the summary info.
6025 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
6026 // If the VMap is empty, this clone was a duplicate of another and
6027 // was created as an alias or a declaration.
6028 if (J > 0 && VMaps[J - 1]->empty())
6029 continue;
6030 // Ignore any that didn't get an assigned allocation type.
6031 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
6032 continue;
6033 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
6034 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
6035 : AllocTypeNotColdThinBackend++;
6036 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
6037 auto A = llvm::Attribute::get(F.getContext(), "memprof",
6038 AllocTypeString);
6039 CallBase *CBClone;
6040 // Copy 0 is the original function.
6041 if (!J)
6042 CBClone = CB;
6043 else
6044 // Since VMaps are only created for new clones, we index with
6045 // clone J-1 (J==0 is the original clone and does not have a VMaps
6046 // entry).
6047 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
6049 CBClone->addFnAttr(A);
6050 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
6051 << ore::NV("AllocationCall", CBClone) << " in clone "
6052 << ore::NV("Caller", CBClone->getFunction())
6053 << " marked with memprof allocation attribute "
6054 << ore::NV("Attribute", AllocTypeString));
6055 }
6056 } else if (!CallsiteContext.empty()) {
6057 if (!CalledFunction) {
6058#ifndef NDEBUG
6059 // We should have skipped inline assembly calls.
6060 auto *CI = dyn_cast<CallInst>(CB);
6061 assert(!CI || !CI->isInlineAsm());
6062#endif
6063 // We should have skipped direct calls via a Constant.
6064 assert(CalledValue && !isa<Constant>(CalledValue));
6065
6066 // This is an indirect call, see if we have profile information and
6067 // whether any clones were recorded for the profiled targets (that
6068 // we synthesized CallsiteInfo summary records for when building the
6069 // index).
6070 auto NumClones =
6071 recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
6072
6073 // Perform cloning if not yet done. This is done here in case
6074 // we don't need to do ICP, but might need to clone this
6075 // function as it is the target of other cloned calls.
6076 if (NumClones)
6077 CloneFuncIfNeeded(NumClones, FS);
6078 }
6079
6080 else {
6081 // Consult the next callsite node.
6082 assert(SI != FS->callsites().end());
6083 auto &StackNode = *(SI++);
6084
6085#ifndef NDEBUG
6086 // Sanity check that the stack ids match between the summary and
6087 // instruction metadata.
6088 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
6089 for (auto StackId : CallsiteContext) {
6090 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
6091 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
6092 StackId);
6093 StackIdIndexIter++;
6094 }
6095#endif
6096
6097 CloneCallsite(StackNode, CB, CalledFunction, FS);
6098 }
6099 } else if (CB->isTailCall() && CalledFunction) {
6100 // Locate the synthesized callsite info for the callee VI, if any was
6101 // created, and use that for cloning.
6102 ValueInfo CalleeVI =
6103 findValueInfoForFunc(*CalledFunction, M, ImportSummary, &F);
6104 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
6105 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
6106 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
6107 CloneCallsite(Callsite->second, CB, CalledFunction, FS);
6108 }
6109 }
6110 }
6111 }
6112
6113 // Now do any promotion required for cloning.
6114 performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
6115 }
6116
6117 // We skip some of the functions and instructions above, so remove all the
6118 // metadata in a single sweep here.
6119 for (auto &F : M) {
6120 // We can skip memprof clones because createFunctionClones already strips
6121 // the metadata from the newly created clones.
6122 if (F.isDeclaration() || isMemProfClone(F))
6123 continue;
6124 for (auto &BB : F) {
6125 for (auto &I : BB) {
6126 if (!isa<CallBase>(I))
6127 continue;
6128 I.setMetadata(LLVMContext::MD_memprof, nullptr);
6129 I.setMetadata(LLVMContext::MD_callsite, nullptr);
6130 }
6131 }
6132 }
6133
6134 return Changed;
6135}
6136
// Looks up value-profile promotion candidates for indirect call CB and
// correlates them, in order, with the CallsiteInfo summary records that were
// synthesized for the profiled targets when building the index (consumed via
// the callsite summary iterator SI). If any callsite clone should call a
// cloned target, the information needed to perform ICP later is appended to
// ICallAnalysisInfo. Returns the number of clones recorded for this
// function's callsites (0 if there was no profile data for CB).
unsigned MemProfContextDisambiguation::recordICPInfo(
    CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
    SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
  // First see if we have profile information for this indirect call.
  uint32_t NumCandidates;
  uint64_t TotalCount;
  auto CandidateProfileData =
      ICallAnalysis->getPromotionCandidatesForInstruction(
          CB, TotalCount, NumCandidates, MaxSummaryIndirectEdges);
  if (CandidateProfileData.empty())
    return 0;

  // Iterate through all of the candidate profiled targets along with the
  // CallsiteInfo summary records synthesized for them when building the index,
  // and see if any are cloned and/or refer to clones.
  bool ICPNeeded = false;
  unsigned NumClones = 0;
  // Remember where this call's summary records begin, so that performICP can
  // later walk the same records in AllCallsites.
  size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
  for (const auto &Candidate : CandidateProfileData) {
#ifndef NDEBUG
    auto CalleeValueInfo =
#endif
        ImportSummary->getValueInfo(Candidate.Value);
    // We might not have a ValueInfo if this is a distributed
    // ThinLTO backend and decided not to import that function.
    assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
    assert(SI != AllCallsites.end());
    auto &StackNode = *(SI++);
    // See if any of the clones of the indirect callsite for this
    // profiled target should call a cloned version of the profiled
    // target. We only need to do the ICP here if so.
    ICPNeeded |= llvm::any_of(StackNode.Clones,
                              [](unsigned CloneNo) { return CloneNo != 0; });
    // Every callsite in the same function should have been cloned the same
    // number of times.
    assert(!NumClones || NumClones == StackNode.Clones.size());
    NumClones = StackNode.Clones.size();
  }
  if (!ICPNeeded)
    return NumClones;
  // Save information for ICP, which is performed later to avoid messing up the
  // current function traversal.
  ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
                               TotalCount, CallsiteInfoStartIndex});
  return NumClones;
}
6184
// Performs the indirect call promotions recorded earlier by recordICPInfo.
// For each saved candidate, each clone of the indirect callsite is promoted
// to a guarded direct call against the original target (the name present in
// the vtable), and the resulting direct call is then redirected to the
// appropriate clone of the target per the summary. Finally the value profile
// (MD_prof) metadata on each clone is rewritten to drop the promoted records.
void MemProfContextDisambiguation::performICP(
    Module &M, ArrayRef<CallsiteInfo> AllCallsites,
    ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
    ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
    OptimizationRemarkEmitter &ORE) {
  // Now do any promotion required for cloning. Specifically, for each
  // recorded ICP candidate (which was only recorded because one clone of that
  // candidate should call a cloned target), we perform ICP (speculative
  // devirtualization) for each clone of the callsite, and update its callee
  // to the appropriate clone. Note that the ICP compares against the original
  // version of the target, which is what is in the vtable.
  for (auto &Info : ICallAnalysisInfo) {
    auto *CB = Info.CB;
    auto CallsiteIndex = Info.CallsiteInfoStartIndex;
    auto TotalCount = Info.TotalCount;
    unsigned NumPromoted = 0;
    unsigned NumClones = 0;

    for (auto &Candidate : Info.CandidateProfileData) {
      // Walk the synthesized summary records in lockstep with the profiled
      // candidate targets (same order as recordICPInfo consumed them).
      auto &StackNode = AllCallsites[CallsiteIndex++];

      // All calls in the same function must have the same number of clones.
      assert(!NumClones || NumClones == StackNode.Clones.size());
      NumClones = StackNode.Clones.size();

      // See if the target is in the module. If it wasn't imported, it is
      // possible that this profile could have been collected on a different
      // target (or version of the code), and we need to be conservative
      // (similar to what is done in the ICP pass).
      Function *TargetFunction = Symtab->getFunction(Candidate.Value);
      if (TargetFunction == nullptr ||
          // Any ThinLTO global dead symbol removal should have already
          // occurred, so it should be safe to promote when the target is a
          // declaration.
          // TODO: Remove internal option once more fully tested.
          TargetFunction->isDeclaration())) {
        ORE.emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
                 << "Memprof cannot promote indirect call: target with md5sum "
                 << ore::NV("target md5sum", Candidate.Value) << " not found";
        });
        // FIXME: See if we can use the new declaration importing support to
        // at least get the declarations imported for this case. Hot indirect
        // targets should have been imported normally, however.
        continue;
      }

      // Check if legal to promote
      const char *Reason = nullptr;
      if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
        ORE.emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
                 << "Memprof cannot promote indirect call to "
                 << ore::NV("TargetFunction", TargetFunction)
                 << " with count of " << ore::NV("TotalCount", TotalCount)
                 << ": " << Reason;
        });
        continue;
      }

      // We should never be promoting to a memprof clone directly; the clone
      // is selected below via getMemProfFuncName.
      assert(!isMemProfClone(*TargetFunction));

      // Handle each call clone, applying ICP so that each clone directly
      // calls the specified callee clone, guarded by the appropriate ICP
      // check.
      CallBase *CBClone = CB;
      for (unsigned J = 0; J < NumClones; J++) {
        // If the VMap is empty, this clone was a duplicate of another and was
        // created as an alias or a declaration.
        if (J > 0 && VMaps[J - 1]->empty())
          continue;
        // Copy 0 is the original function.
        if (J > 0)
          CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
        // We do the promotion using the original name, so that the comparison
        // is against the name in the vtable. Then just below, change the new
        // direct call to call the cloned function.
        auto &DirectCall =
            pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
                                     TotalCount, isSamplePGO, &ORE);
        auto *TargetToUse = TargetFunction;
        // Call original if this version calls the original version of its
        // callee.
        if (StackNode.Clones[J]) {
          TargetToUse =
              cast<Function>(M.getOrInsertFunction(
                                  getMemProfFuncName(TargetFunction->getName(),
                                                     StackNode.Clones[J]),
                                  TargetFunction->getFunctionType())
                                 .getCallee());
        }
        DirectCall.setCalledFunction(TargetToUse);
        // During matching we generate synthetic VP metadata for indirect calls
        // not already having any, from the memprof profile's callee GUIDs. If
        // we subsequently promote and inline those callees, we currently lose
        // the ability to generate this synthetic VP metadata. Optionally apply
        // a noinline attribute to promoted direct calls, where the threshold is
        // set to capture synthetic VP metadata targets which get a count of 1.
            Candidate.Count < MemProfICPNoInlineThreshold)
          DirectCall.setIsNoInline();
        ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
                 << ore::NV("Call", CBClone) << " in clone "
                 << ore::NV("Caller", CBClone->getFunction())
                 << " promoted and assigned to call function clone "
                 << ore::NV("Callee", TargetToUse));
      }

      // Update TotalCount (all clones should get same count above)
      TotalCount -= Candidate.Count;
      NumPromoted++;
    }
    // Adjust the MD.prof metadata for all clones, now that we have the new
    // TotalCount and the number promoted.
    CallBase *CBClone = CB;
    for (unsigned J = 0; J < NumClones; J++) {
      // If the VMap is empty, this clone was a duplicate of another and was
      // created as an alias or a declaration.
      if (J > 0 && VMaps[J - 1]->empty())
        continue;
      // Copy 0 is the original function.
      if (J > 0)
        CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
      // First delete the old one.
      CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
      // If all promoted, we don't need the MD.prof metadata.
      // Otherwise we need update with the un-promoted records back.
      if (TotalCount != 0)
            M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
            TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
    }
  }
}
6320
6321template <typename DerivedCCG, typename FuncTy, typename CallTy>
6322bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process(
6323 function_ref<void(StringRef, StringRef, const Twine &)> EmitRemark,
6324 bool AllowExtraAnalysis) {
6325 if (DumpCCG) {
6326 dbgs() << "CCG before cloning:\n";
6327 dbgs() << *this;
6328 }
6329 if (ExportToDot)
6330 exportToDot("postbuild");
6331
6332 if (VerifyCCG) {
6333 check();
6334 }
6335
6336 identifyClones();
6337
6338 if (VerifyCCG) {
6339 check();
6340 }
6341
6342 if (DumpCCG) {
6343 dbgs() << "CCG after cloning:\n";
6344 dbgs() << *this;
6345 }
6346 if (ExportToDot)
6347 exportToDot("cloned");
6348
6349 bool Changed = assignFunctions();
6350
6351 if (DumpCCG) {
6352 dbgs() << "CCG after assigning function clones:\n";
6353 dbgs() << *this;
6354 }
6355 if (ExportToDot)
6356 exportToDot("clonefuncassign");
6357
6358 if (MemProfReportHintedSizes || AllowExtraAnalysis)
6359 printTotalSizes(errs(), EmitRemark);
6360
6361 return Changed;
6362}
6363
6364bool MemProfContextDisambiguation::processModule(
6365 Module &M,
6366 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
6367
6368 // If we have an import summary, then the cloning decisions were made during
6369 // the thin link on the index. Apply them and return.
6370 if (ImportSummary)
6371 return applyImport(M);
6372
6373 // TODO: If/when other types of memprof cloning are enabled beyond just for
6374 // hot and cold, we will need to change this to individually control the
6375 // AllocationType passed to addStackNodesForMIB during CCG construction.
6376 // Note that we specifically check this after applying imports above, so that
6377 // the option isn't needed to be passed to distributed ThinLTO backend
6378 // clang processes, which won't necessarily have visibility into the linker
6379 // dependences. Instead the information is communicated from the LTO link to
6380 // the backends via the combined summary index.
6381 if (!SupportsHotColdNew)
6382 return false;
6383
6384 ModuleCallsiteContextGraph CCG(M, OREGetter);
6385 // TODO: Set up remarks for regular LTO. We need to decide what function to
6386 // use in the callback.
6387 return CCG.process();
6388}
6389
    const ModuleSummaryIndex *Summary, bool isSamplePGO)
    : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
  // Check the dot graph printing options once here, to make sure we have valid
  // and expected combinations.
  if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences())
        "-memprof-dot-scope=alloc requires -memprof-dot-alloc-id");
      !ContextIdForDot.getNumOccurrences())
        "-memprof-dot-scope=context requires -memprof-dot-context-id");
  if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
      ContextIdForDot.getNumOccurrences())
        "-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
        "-memprof-dot-context-id");
  // A summary passed in directly (from the pass pipeline) takes precedence.
  if (ImportSummary) {
    // The MemProfImportSummary should only be used for testing ThinLTO
    // distributed backend handling via opt, in which case we don't have a
    // summary from the pass pipeline.
    return;
  }
  // No summary file given for testing; nothing more to set up.
  if (MemProfImportSummary.empty())
    return;

  // Testing-only path: load and parse the import summary named by the
  // -memprof-import-summary option, logging (not failing) on errors.
  auto ReadSummaryFile =
  if (!ReadSummaryFile) {
    logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
                          "Error loading file '" + MemProfImportSummary +
                              "': ");
    return;
  }
  auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
  if (!ImportSummaryForTestingOrErr) {
    logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
                          "Error parsing file '" + MemProfImportSummary +
                              "': ");
    return;
  }
  // Keep ownership of the parsed index in the member unique_ptr and point
  // ImportSummary at it for the rest of the pass's lifetime.
  ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
  ImportSummary = ImportSummaryForTesting.get();
}
6435
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  // Provide processModule with a lazy way to obtain the per-function
  // OptimizationRemarkEmitter for remark emission.
  auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
  };
  // Preserve all analyses when the module was left untouched.
  if (!processModule(M, OREGetter))
    return PreservedAnalyses::all();
  return PreservedAnalyses::none();
}
6446
    ModuleSummaryIndex &Index,
    isPrevailing,
    LLVMContext &Ctx,
    function_ref<void(StringRef, StringRef, const Twine &)> EmitRemark) {
  // TODO: If/when other types of memprof cloning are enabled beyond just for
  // hot and cold, we will need to change this to individually control the
  // AllocationType passed to addStackNodesForMIB during CCG construction.
  // The index was set from the option, so these should be in sync.
  assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
  if (!SupportsHotColdNew)
    return;

  bool AllowExtraAnalysis =

  // Build the summary-based callsite context graph and run the whole
  // identify/clone/assign pipeline on the index.
  IndexCallsiteContextGraph CCG(Index, isPrevailing);
  CCG.process(EmitRemark, AllowExtraAnalysis);
}
6467
// Strips MemProf attributes and metadata. Can be invoked by the pass pipeline
// when we don't have an index that has recorded that we are linking with
// allocation libraries containing the necessary APIs for downstream
// transformations.
  // The profile matcher applies hotness attributes directly for allocations,
  // and those will cause us to generate calls to the hot/cold interfaces
  // unconditionally. If supports-hot-cold-new was not enabled in the LTO
  // link then assume we don't want these calls (e.g. not linking with
  // the appropriate library, or otherwise trying to disable this behavior).
  bool Changed = false;
  // Walk every call instruction in the module, removing the "memprof" fn
  // attribute and the memprof/callsite metadata.
  for (auto &F : M) {
    for (auto &BB : F) {
      for (auto &I : BB) {
        auto *CI = dyn_cast<CallBase>(&I);
        if (!CI)
          continue;
        if (CI->hasFnAttr("memprof")) {
          CI->removeFnAttr("memprof");
          Changed = true;
        }
        // Calls without callsite metadata are expected to have no memprof
        // metadata either, so there is nothing further to strip on them.
        if (!CI->hasMetadata(LLVMContext::MD_callsite)) {
          assert(!CI->hasMetadata(LLVMContext::MD_memprof));
          continue;
        }
        // Strip off all memprof metadata as it is no longer needed.
        // Importantly, this avoids the addition of new memprof attributes
        // after inlining propagation.
        CI->setMetadata(LLVMContext::MD_memprof, nullptr);
        CI->setMetadata(LLVMContext::MD_callsite, nullptr);
        Changed = true;
      }
    }
  }
  // Only invalidate analyses if something actually changed.
  if (!Changed)
    return PreservedAnalyses::all();
  return PreservedAnalyses::none();
}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
Unify divergent function exit nodes
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
#define DEBUG_TYPE
Module.h This file contains the declarations for the Module class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
Machine Check Debug Module
This file implements a map that provides insertion order iteration.
static cl::opt< unsigned > TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5), cl::Hidden, cl::desc("Max depth to recursively search for missing " "frames through tail calls."))
uint64_t ComputeHash(const FunctionSummary *FS, unsigned I)
static cl::opt< DotScope > DotGraphScope("memprof-dot-scope", cl::desc("Scope of graph to export to dot"), cl::Hidden, cl::init(DotScope::All), cl::values(clEnumValN(DotScope::All, "all", "Export full callsite graph"), clEnumValN(DotScope::Alloc, "alloc", "Export only nodes with contexts feeding given " "-memprof-dot-alloc-id"), clEnumValN(DotScope::Context, "context", "Export only nodes with given -memprof-dot-context-id")))
static cl::opt< bool > DoMergeIteration("memprof-merge-iteration", cl::init(true), cl::Hidden, cl::desc("Iteratively apply merging on a node to catch new callers"))
static bool isMemProfClone(const Function &F)
static cl::opt< unsigned > AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden, cl::desc("Id of alloc to export if -memprof-dot-scope=alloc " "or to highlight if -memprof-dot-scope=all"))
static cl::opt< unsigned > ContextIdForDot("memprof-dot-context-id", cl::init(0), cl::Hidden, cl::desc("Id of context to export if -memprof-dot-scope=context or to " "highlight otherwise"))
static cl::opt< bool > ExportToDot("memprof-export-to-dot", cl::init(false), cl::Hidden, cl::desc("Export graph to dot files."))
static void checkEdge(const std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > &Edge)
static cl::opt< bool > AllowRecursiveCallsites("memprof-allow-recursive-callsites", cl::init(true), cl::Hidden, cl::desc("Allow cloning of callsites involved in recursive cycles"))
bool checkColdOrNotCold(uint8_t AllocType)
static ValueInfo findValueInfoForFunc(const Function &F, const Module &M, const ModuleSummaryIndex *ImportSummary, const Function *CallingFunc=nullptr)
static cl::opt< bool > CloneRecursiveContexts("memprof-clone-recursive-contexts", cl::init(true), cl::Hidden, cl::desc("Allow cloning of contexts through recursive cycles"))
static std::string getAllocTypeString(uint8_t AllocTypes)
static cl::opt< unsigned > MemProfICPNoInlineThreshold("memprof-icp-noinline-threshold", cl::init(2), cl::Hidden, cl::desc("Minimum absolute count for promoted target to be inlinable"))
bool DOTGraphTraits< const CallsiteContextGraph< DerivedCCG, FuncTy, CallTy > * >::DoHighlight
static unsigned getMemProfCloneNum(const Function &F)
static SmallVector< std::unique_ptr< ValueToValueMapTy >, 4 > createFunctionClones(Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE, std::map< const Function *, SmallPtrSet< const GlobalAlias *, 1 > > &FuncToAliasMap, FunctionSummary *FS)
static cl::opt< bool > VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden, cl::desc("Perform verification checks on CallingContextGraph."))
static void checkNode(const ContextNode< DerivedCCG, FuncTy, CallTy > *Node, bool CheckEdges=true)
static cl::opt< bool > MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden, cl::desc("Merge clones before assigning functions"))
static std::string getMemProfFuncName(Twine Base, unsigned CloneNo)
static cl::opt< std::string > MemProfImportSummary("memprof-import-summary", cl::desc("Import summary to use for testing the ThinLTO backend via opt"), cl::Hidden)
static const std::string MemProfCloneSuffix
static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name)
static cl::opt< bool > AllowRecursiveContexts("memprof-allow-recursive-contexts", cl::init(true), cl::Hidden, cl::desc("Allow cloning of contexts having recursive cycles"))
static cl::opt< std::string > DotFilePathPrefix("memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, cl::value_desc("filename"), cl::desc("Specify the path prefix of the MemProf dot files."))
static cl::opt< bool > VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden, cl::desc("Perform frequent verification checks on nodes."))
static void checkAllocContextIds(const AllocInfo &AllocNode, const MDNode *MemProfMD, const CallStack< MDNode, MDNode::op_iterator > &CallsiteContext, const ModuleSummaryIndex *ImportSummary)
static cl::opt< bool > DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden, cl::desc("Dump CallingContextGraph to stdout after each stage."))
AllocType
This is the interface to build a ModuleSummaryIndex for a module.
ModuleSummaryIndex.h This file contains the declarations the classes that hold the module index and s...
#define P(N)
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
FunctionAnalysisManager FAM
if(PassOpts->AAPipeline)
std::pair< BasicBlock *, BasicBlock * > Edge
This file defines generic set operations that may be used on set's of different types,...
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
void print(OutputBuffer &OB) const
ValueInfo getAliaseeVI() const
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
iterator end() const
Definition ArrayRef.h:131
const_pointer iterator
Definition ArrayRef.h:47
iterator begin() const
Definition ArrayRef.h:130
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
void addFnAttr(Attribute::AttrKind Kind)
Adds the attribute to the function.
void setCalledOperand(Value *V)
Subprogram description. Uses SubclassData1.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:174
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition DenseMap.h:114
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Function summary information to aid decisions and implementation of importing.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
DISubprogram * getSubprogram() const
Get the attached subprogram.
const Function & getFunction() const
Definition Function.h:166
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
static LLVM_ABI GlobalAlias * create(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage, const Twine &Name, Constant *Aliasee, Module *Parent)
If a parent module is specified, the alias is automatically inserted into the end of the specified mo...
Definition Globals.cpp:613
Function and variable summary information to aid decisions and implementation of importing.
static LLVM_ABI GUID getGUIDAssumingExternalLinkage(StringRef GlobalName)
Return a 64-bit global unique ID constructed from the name of a global symbol.
Definition Globals.cpp:78
static bool isLocalLinkage(LinkageTypes Linkage)
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition Globals.cpp:329
uint64_t GUID
Declare a type to represent a global unique identifier for a global value.
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI void eraseFromParent()
This method unlinks 'this' from the containing module and deletes it.
Definition Globals.cpp:94
static LLVM_ABI std::string getGlobalIdentifier(StringRef Name, GlobalValue::LinkageTypes Linkage, StringRef FileName)
Return the modified name for a global value suitable to be used as the key for a global lookup (e....
Definition Globals.cpp:162
bool isWeakForLinker() const
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Metadata node.
Definition Metadata.h:1080
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1444
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1442
unsigned getNumOperands() const
Return number of MDNode operands.
Definition Metadata.h:1450
LLVM_ABI TempMDNode clone() const
Create a (temporary) clone of this.
Definition Metadata.cpp:683
static std::enable_if_t< std::is_base_of< MDNode, T >::value, T * > replaceWithUniqued(std::unique_ptr< T, TempMDNodeDeleter > N)
Replace a temporary node with a uniqued one.
Definition Metadata.h:1319
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:614
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type count(const KeyT &Key) const
Definition MapVector.h:150
MemProfContextDisambiguation(const ModuleSummaryIndex *Summary=nullptr, bool isSamplePGO=false)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Class to hold module path string table and global value map, and encapsulate methods for operating on...
static StringRef getOriginalNameBeforePromote(StringRef Name)
Helper to obtain the unpromoted name for a global value (or the original name if not promoted).
ValueInfo getValueInfo(const GlobalValueSummaryMapTy::value_type &R) const
Return a ValueInfo for the index value_type (convenient when iterating index).
uint64_t getStackIdAtIndex(unsigned Index) const
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
unsigned size() const
bool insert(SUnit *SU)
The optimization diagnostic interface.
bool allowExtraAnalysis(StringRef PassName) const
Whether we allow for extra compile-time budget to perform more analysis to produce fewer false positi...
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition Analysis.h:115
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
A class that wrap the SHA1 algorithm.
Definition SHA1.h:27
LLVM_ABI void update(ArrayRef< uint8_t > Data)
Digest more data.
Definition SHA1.cpp:208
LLVM_ABI std::array< uint8_t, 20 > result()
Return the current raw 160-bits SHA1 for the digested data since the last call to init().
Definition SHA1.cpp:288
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
void reserve(size_t Size)
Grow the DenseSet so that it can contain at least NumEntries items before resizing again.
Definition DenseSet.h:96
void insert_range(Range &&R)
Definition DenseSet.h:228
size_type size() const
Definition DenseSet.h:87
void swap(DenseSetImpl &RHS)
Definition DenseSet.h:102
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
bool erase(const ValueT &V)
Definition DenseSet.h:100
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
An efficient, type-erasing, non-owning reference to a callable.
Helper class to iterate through stack ids in both metadata (memprof MIB and callsite) and the corresp...
CallStackIterator beginAfterSharedPrefix(const CallStack &Other)
CallStackIterator end() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
CallInst * Call
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition COFF.h:862
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ CE
Windows NT (Windows on ARM)
Definition MCAsmInfo.h:48
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > dyn_extract(Y &&MD)
Extract a Value from Metadata, if any.
Definition Metadata.h:696
LLVM_ABI AllocationType getMIBAllocType(const MDNode *MIB)
Returns the allocation type from an MIB metadata node.
LLVM_ABI bool metadataMayIncludeContextSizeInfo()
Whether the alloc memprof metadata may include context size info for some MIBs (but possibly not all)...
LLVM_ABI bool hasSingleAllocType(uint8_t AllocTypes)
True if the AllocTypes bitmask contains just a single type.
LLVM_ABI std::string getAllocTypeAttributeString(AllocationType Type)
Returns the string to use in attributes with the given type.
LLVM_ABI MDNode * getMIBStackNode(const MDNode *MIB)
Returns the stack node from an MIB metadata node.
LLVM_ABI void removeAnyExistingAmbiguousAttribute(CallBase *CB)
Removes any existing "ambiguous" memprof attribute.
DiagnosticInfoOptimizationBase::Argument NV
LLVM_ABI CallBase & promoteIndirectCall(CallBase &CB, Function *F, uint64_t Count, uint64_t TotalCount, bool AttachProfToDirectCall, OptimizationRemarkEmitter *ORE)
uint32_t NodeId
Definition RDFGraph.h:262
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
bool empty() const
Definition BasicBlock.h:101
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
uint64_t read64le(const void *P)
Definition Endian.h:435
void write32le(void *P, uint32_t V)
Definition Endian.h:475
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
cl::opt< unsigned > MinClonedColdBytePercent("memprof-cloning-cold-threshold", cl::init(100), cl::Hidden, cl::desc("Min percent of cold bytes to hint alloc cold during cloning"))
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI void logAllUnhandledErrors(Error E, raw_ostream &OS, Twine ErrorBanner={})
Log all errors (if any) in E to OS.
Definition Error.cpp:61
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2116
cl::opt< bool > MemProfReportHintedSizes("memprof-report-hinted-sizes", cl::init(false), cl::Hidden, cl::desc("Report total allocation sizes of hinted allocations"))
LLVM_ABI bool isLegalToPromote(const CallBase &CB, Function *Callee, const char **FailureReason=nullptr)
Return true if the given indirect call site can be made to call Callee.
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool mayHaveMemprofSummary(const CallBase *CB)
Returns true if the instruction could have memprof metadata, used to ensure consistency between summa...
constexpr from_range_t from_range
static cl::opt< bool > MemProfRequireDefinitionForPromotion("memprof-require-definition-for-promotion", cl::init(false), cl::Hidden, cl::desc("Require target function definition when promoting indirect calls"))
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
cl::opt< unsigned > MemProfTopNImportant("memprof-top-n-important", cl::init(10), cl::Hidden, cl::desc("Number of largest cold contexts to consider important"))
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
void set_subtract(S1Ty &S1, const S2Ty &S2)
set_subtract(A, B) - Compute A := A - B
InnerAnalysisManagerProxy< FunctionAnalysisManager, Module > FunctionAnalysisManagerModuleProxy
Provide the FunctionAnalysisManager to Module proxy.
raw_ostream & WriteGraph(raw_ostream &O, const GraphType &G, bool ShortNames=false, const Twine &Title="")
bool set_intersects(const S1Ty &S1, const S2Ty &S2)
set_intersects(A, B) - Return true iff A ^ B is non empty
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1152
LLVM_ABI Expected< std::unique_ptr< ModuleSummaryIndex > > getModuleSummaryIndex(MemoryBufferRef Buffer)
Parse the specified bitcode buffer, returning the module summary index.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI void annotateValueSite(Module &M, Instruction &Inst, const InstrProfRecord &InstrProfR, InstrProfValueKind ValueKind, uint32_t SiteIndx, uint32_t MaxMDCount=3)
Get the value profile data for value site SiteIdx from InstrProfR and annotate the instruction Inst w...
cl::opt< unsigned > MaxSummaryIndirectEdges("module-summary-max-indirect-edges", cl::init(0), cl::Hidden, cl::desc("Max number of summary edges added from " "indirect call profile metadata"))
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
bool set_union(S1Ty &S1, const S2Ty &S2)
set_union(A, B) - Compute A := A u B, return whether A changed.
cl::opt< bool > SupportsHotColdNew
Indicate we are linking with an allocator that supports hot/cold operator new interfaces.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
S1Ty set_intersection(const S1Ty &S1, const S2Ty &S2)
set_intersection(A, B) - Return A ^ B
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
cl::opt< bool > EnableMemProfContextDisambiguation
Enable MemProf context disambiguation for thin link.
S1Ty set_difference(const S1Ty &S1, const S2Ty &S2)
set_difference(A, B) - Return A - B
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Expected< T > errorOrToExpected(ErrorOr< T > &&EO)
Convert an ErrorOr<T> to an Expected<T>.
Definition Error.h:1245
ArrayRef(const T &OneElt) -> ArrayRef< T >
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
LLVM_ABI Function * CloneFunction(Function *F, ValueToValueMapTy &VMap, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified function and add it to that function's module.
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
Definition MIRParser.h:39
cl::opt< bool > MemProfFixupImportant("memprof-fixup-important", cl::init(true), cl::Hidden, cl::desc("Enables edge fixup for important contexts"))
#define N
static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, GraphType G)
static const ContextNode< DerivedCCG, FuncTy, CallTy > * GetCallee(const EdgePtrTy &P)
std::unique_ptr< ContextNode< DerivedCCG, FuncTy, CallTy > > NodePtrTy
mapped_iterator< typename std::vector< std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > >::const_iterator, decltype(&GetCallee)> ChildIteratorType
mapped_iterator< typename std::vector< NodePtrTy >::const_iterator, decltype(&getNode)> nodes_iterator
std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > EdgePtrTy
Summary of memprof metadata on allocations.
std::vector< MIBInfo > MIBs
SmallVector< unsigned > StackIdIndices
SmallVector< unsigned > Clones
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
An information struct used to provide DenseMap with the various necessary components for a given valu...
typename GraphType::UnknownGraphTypeError NodeRef
Definition GraphTraits.h:95
Struct that holds a reference to a particular GUID in a global value summary.
ArrayRef< std::unique_ptr< GlobalValueSummary > > getSummaryList() const
GlobalValue::GUID getGUID() const
PointerUnion< CallsiteInfo *, AllocInfo * > SimpleType
static SimpleType getSimplifiedValue(IndexCall &Val)
const PointerUnion< CallsiteInfo *, AllocInfo * > SimpleType
static SimpleType getSimplifiedValue(const IndexCall &Val)
Define a template that can be specialized by smart pointers to reflect the fact that they are automat...
Definition Casting.h:34