LLVM 23.0.0git
OMPIRBuilder.cpp
Go to the documentation of this file.
1//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9///
10/// This file implements the OpenMPIRBuilder class, which is used as a
11/// convenient way to create LLVM instructions for OpenMP directives.
12///
13//===----------------------------------------------------------------------===//
14
17#include "llvm/ADT/SmallSet.h"
19#include "llvm/ADT/StringRef.h"
30#include "llvm/IR/Attributes.h"
31#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/CFG.h"
33#include "llvm/IR/CallingConv.h"
34#include "llvm/IR/Constant.h"
35#include "llvm/IR/Constants.h"
36#include "llvm/IR/DIBuilder.h"
39#include "llvm/IR/Function.h"
41#include "llvm/IR/IRBuilder.h"
44#include "llvm/IR/LLVMContext.h"
45#include "llvm/IR/MDBuilder.h"
46#include "llvm/IR/Metadata.h"
48#include "llvm/IR/PassManager.h"
50#include "llvm/IR/Value.h"
53#include "llvm/Support/Error.h"
64
65#include <cstdint>
66#include <optional>
67
68#define DEBUG_TYPE "openmp-ir-builder"
69
70using namespace llvm;
71using namespace omp;
72
// Hidden command-line switch (default: off). When enabled, OpenMP runtime
// call declarations are annotated with optimistic attributes describing
// their "as-if" properties, per the cl::desc text below.
// NOTE(review): the consumer of this flag is not visible in this chunk —
// confirm where it is read before relying on this description.
static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));
78
80 "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
81 cl::desc("Factor for the unroll threshold to account for code "
82 "simplifications still taking place"),
83 cl::init(1.5));
84
85#ifndef NDEBUG
86/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
87/// at position IP1 may change the meaning of IP2 or vice-versa. This is because
88/// an InsertPoint stores the instruction before something is inserted. For
89/// instance, if both point to the same instruction, two IRBuilders alternating
90/// creating instruction will cause the instructions to be interleaved.
93 if (!IP1.isSet() || !IP2.isSet())
94 return false;
95 return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
96}
97
99 // Valid ordered/unordered and base algorithm combinations.
100 switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
101 case OMPScheduleType::UnorderedStaticChunked:
102 case OMPScheduleType::UnorderedStatic:
103 case OMPScheduleType::UnorderedDynamicChunked:
104 case OMPScheduleType::UnorderedGuidedChunked:
105 case OMPScheduleType::UnorderedRuntime:
106 case OMPScheduleType::UnorderedAuto:
107 case OMPScheduleType::UnorderedTrapezoidal:
108 case OMPScheduleType::UnorderedGreedy:
109 case OMPScheduleType::UnorderedBalanced:
110 case OMPScheduleType::UnorderedGuidedIterativeChunked:
111 case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
112 case OMPScheduleType::UnorderedSteal:
113 case OMPScheduleType::UnorderedStaticBalancedChunked:
114 case OMPScheduleType::UnorderedGuidedSimd:
115 case OMPScheduleType::UnorderedRuntimeSimd:
116 case OMPScheduleType::OrderedStaticChunked:
117 case OMPScheduleType::OrderedStatic:
118 case OMPScheduleType::OrderedDynamicChunked:
119 case OMPScheduleType::OrderedGuidedChunked:
120 case OMPScheduleType::OrderedRuntime:
121 case OMPScheduleType::OrderedAuto:
122 case OMPScheduleType::OrderdTrapezoidal:
123 case OMPScheduleType::NomergeUnorderedStaticChunked:
124 case OMPScheduleType::NomergeUnorderedStatic:
125 case OMPScheduleType::NomergeUnorderedDynamicChunked:
126 case OMPScheduleType::NomergeUnorderedGuidedChunked:
127 case OMPScheduleType::NomergeUnorderedRuntime:
128 case OMPScheduleType::NomergeUnorderedAuto:
129 case OMPScheduleType::NomergeUnorderedTrapezoidal:
130 case OMPScheduleType::NomergeUnorderedGreedy:
131 case OMPScheduleType::NomergeUnorderedBalanced:
132 case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
133 case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
134 case OMPScheduleType::NomergeUnorderedSteal:
135 case OMPScheduleType::NomergeOrderedStaticChunked:
136 case OMPScheduleType::NomergeOrderedStatic:
137 case OMPScheduleType::NomergeOrderedDynamicChunked:
138 case OMPScheduleType::NomergeOrderedGuidedChunked:
139 case OMPScheduleType::NomergeOrderedRuntime:
140 case OMPScheduleType::NomergeOrderedAuto:
141 case OMPScheduleType::NomergeOrderedTrapezoidal:
142 case OMPScheduleType::OrderedDistributeChunked:
143 case OMPScheduleType::OrderedDistribute:
144 break;
145 default:
146 return false;
147 }
148
149 // Must not set both monotonicity modifiers at the same time.
150 OMPScheduleType MonotonicityFlags =
151 SchedType & OMPScheduleType::MonotonicityMask;
152 if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
153 return false;
154
155 return true;
156}
157#endif
158
159/// This is wrapper over IRBuilderBase::restoreIP that also restores the current
160/// debug location to the last instruction in the specified basic block if the
161/// insert point points to the end of the block.
164 Builder.restoreIP(IP);
165 llvm::BasicBlock *BB = Builder.GetInsertBlock();
166 llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
167 if (!BB->empty() && I == BB->end())
168 Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
169}
170
171static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
172 if (T.isAMDGPU()) {
173 StringRef Features =
174 Kernel->getFnAttribute("target-features").getValueAsString();
175 if (Features.count("+wavefrontsize64"))
178 }
179 if (T.isNVPTX())
181 if (T.isSPIRV())
183 llvm_unreachable("No grid value available for this architecture!");
184}
185
186/// Determine which scheduling algorithm to use, determined from schedule clause
187/// arguments.
188static OMPScheduleType
189getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
190 bool HasSimdModifier, bool HasDistScheduleChunks) {
  // Currently, the default schedule is static.
192 switch (ClauseKind) {
193 case OMP_SCHEDULE_Default:
194 case OMP_SCHEDULE_Static:
195 return HasChunks ? OMPScheduleType::BaseStaticChunked
196 : OMPScheduleType::BaseStatic;
197 case OMP_SCHEDULE_Dynamic:
198 return OMPScheduleType::BaseDynamicChunked;
199 case OMP_SCHEDULE_Guided:
200 return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
201 : OMPScheduleType::BaseGuidedChunked;
202 case OMP_SCHEDULE_Auto:
204 case OMP_SCHEDULE_Runtime:
205 return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
206 : OMPScheduleType::BaseRuntime;
207 case OMP_SCHEDULE_Distribute:
208 return HasDistScheduleChunks ? OMPScheduleType::BaseDistributeChunked
209 : OMPScheduleType::BaseDistribute;
210 }
211 llvm_unreachable("unhandled schedule clause argument");
212}
213
214/// Adds ordering modifier flags to schedule type.
215static OMPScheduleType
217 bool HasOrderedClause) {
218 assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
219 OMPScheduleType::None &&
220 "Must not have ordering nor monotonicity flags already set");
221
222 OMPScheduleType OrderingModifier = HasOrderedClause
223 ? OMPScheduleType::ModifierOrdered
224 : OMPScheduleType::ModifierUnordered;
225 OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;
226
227 // Unsupported combinations
228 if (OrderingScheduleType ==
229 (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
230 return OMPScheduleType::OrderedGuidedChunked;
231 else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
232 OMPScheduleType::ModifierOrdered))
233 return OMPScheduleType::OrderedRuntime;
234
235 return OrderingScheduleType;
236}
237
238/// Adds monotonicity modifier flags to schedule type.
239static OMPScheduleType
241 bool HasSimdModifier, bool HasMonotonic,
242 bool HasNonmonotonic, bool HasOrderedClause) {
243 assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
244 OMPScheduleType::None &&
245 "Must not have monotonicity flags already set");
246 assert((!HasMonotonic || !HasNonmonotonic) &&
247 "Monotonic and Nonmonotonic are contradicting each other");
248
249 if (HasMonotonic) {
250 return ScheduleType | OMPScheduleType::ModifierMonotonic;
251 } else if (HasNonmonotonic) {
252 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
253 } else {
254 // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
255 // If the static schedule kind is specified or if the ordered clause is
256 // specified, and if the nonmonotonic modifier is not specified, the
257 // effect is as if the monotonic modifier is specified. Otherwise, unless
258 // the monotonic modifier is specified, the effect is as if the
259 // nonmonotonic modifier is specified.
260 OMPScheduleType BaseScheduleType =
261 ScheduleType & ~OMPScheduleType::ModifierMask;
262 if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
263 (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
264 HasOrderedClause) {
265 // The monotonic is used by default in openmp runtime library, so no need
266 // to set it.
267 return ScheduleType;
268 } else {
269 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
270 }
271 }
272}
273
274/// Determine the schedule type using schedule and ordering clause arguments.
275static OMPScheduleType
276computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
277 bool HasSimdModifier, bool HasMonotonicModifier,
278 bool HasNonmonotonicModifier, bool HasOrderedClause,
279 bool HasDistScheduleChunks) {
281 ClauseKind, HasChunks, HasSimdModifier, HasDistScheduleChunks);
282 OMPScheduleType OrderedSchedule =
283 getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
285 OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
286 HasNonmonotonicModifier, HasOrderedClause);
287
289 return Result;
290}
291
292/// Make \p Source branch to \p Target.
293///
294/// Handles two situations:
295/// * \p Source already has an unconditional branch.
296/// * \p Source is a degenerate block (no terminator because the BB is
297/// the current head of the IR construction).
299 if (Instruction *Term = Source->getTerminator()) {
300 auto *Br = cast<BranchInst>(Term);
301 assert(!Br->isConditional() &&
302 "BB's terminator must be an unconditional branch (or degenerate)");
303 BasicBlock *Succ = Br->getSuccessor(0);
304 Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
305 Br->setSuccessor(0, Target);
306 return;
307 }
308
309 auto *NewBr = BranchInst::Create(Target, Source);
310 NewBr->setDebugLoc(DL);
311}
312
314 bool CreateBranch, DebugLoc DL) {
315 assert(New->getFirstInsertionPt() == New->begin() &&
316 "Target BB must not have PHI nodes");
317
318 // Move instructions to new block.
319 BasicBlock *Old = IP.getBlock();
320 // If the `Old` block is empty then there are no instructions to move. But in
321 // the new debug scheme, it could have trailing debug records which will be
322 // moved to `New` in `spliceDebugInfoEmptyBlock`. We dont want that for 2
323 // reasons:
324 // 1. If `New` is also empty, `BasicBlock::splice` crashes.
325 // 2. Even if `New` is not empty, the rationale to move those records to `New`
326 // (in `spliceDebugInfoEmptyBlock`) does not apply here. That function
327 // assumes that `Old` is optimized out and is going away. This is not the case
328 // here. The `Old` block is still being used e.g. a branch instruction is
329 // added to it later in this function.
330 // So we call `BasicBlock::splice` only when `Old` is not empty.
331 if (!Old->empty())
332 New->splice(New->begin(), Old, IP.getPoint(), Old->end());
333
334 if (CreateBranch) {
335 auto *NewBr = BranchInst::Create(New, Old);
336 NewBr->setDebugLoc(DL);
337 }
338}
339
340void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
341 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
342 BasicBlock *Old = Builder.GetInsertBlock();
343
344 spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
345 if (CreateBranch)
346 Builder.SetInsertPoint(Old->getTerminator());
347 else
348 Builder.SetInsertPoint(Old);
349
350 // SetInsertPoint also updates the Builder's debug location, but we want to
351 // keep the one the Builder was configured to use.
352 Builder.SetCurrentDebugLocation(DebugLoc);
353}
354
356 DebugLoc DL, llvm::Twine Name) {
357 BasicBlock *Old = IP.getBlock();
359 Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
360 Old->getParent(), Old->getNextNode());
361 spliceBB(IP, New, CreateBranch, DL);
362 New->replaceSuccessorsPhiUsesWith(Old, New);
363 return New;
364}
365
366BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
367 llvm::Twine Name) {
368 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
369 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
370 if (CreateBranch)
371 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
372 else
373 Builder.SetInsertPoint(Builder.GetInsertBlock());
374 // SetInsertPoint also updates the Builder's debug location, but we want to
375 // keep the one the Builder was configured to use.
376 Builder.SetCurrentDebugLocation(DebugLoc);
377 return New;
378}
379
380BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
381 llvm::Twine Name) {
382 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
383 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
384 if (CreateBranch)
385 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
386 else
387 Builder.SetInsertPoint(Builder.GetInsertBlock());
388 // SetInsertPoint also updates the Builder's debug location, but we want to
389 // keep the one the Builder was configured to use.
390 Builder.SetCurrentDebugLocation(DebugLoc);
391 return New;
392}
393
395 llvm::Twine Suffix) {
396 BasicBlock *Old = Builder.GetInsertBlock();
397 return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
398}
399
400// This function creates a fake integer value and a fake use for the integer
401// value. It returns the fake value created. This is useful in modeling the
402// extra arguments to the outlined functions.
404 OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
406 OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
407 const Twine &Name = "", bool AsPtr = true,
408 bool Is64Bit = false) {
409 Builder.restoreIP(OuterAllocaIP);
410 IntegerType *IntTy = Is64Bit ? Builder.getInt64Ty() : Builder.getInt32Ty();
411 Instruction *FakeVal;
412 AllocaInst *FakeValAddr =
413 Builder.CreateAlloca(IntTy, nullptr, Name + ".addr");
414 ToBeDeleted.push_back(FakeValAddr);
415
416 if (AsPtr) {
417 FakeVal = FakeValAddr;
418 } else {
419 FakeVal = Builder.CreateLoad(IntTy, FakeValAddr, Name + ".val");
420 ToBeDeleted.push_back(FakeVal);
421 }
422
423 // Generate a fake use of this value
424 Builder.restoreIP(InnerAllocaIP);
425 Instruction *UseFakeVal;
426 if (AsPtr) {
427 UseFakeVal = Builder.CreateLoad(IntTy, FakeVal, Name + ".use");
428 } else {
429 UseFakeVal = cast<BinaryOperator>(Builder.CreateAdd(
430 FakeVal, Is64Bit ? Builder.getInt64(10) : Builder.getInt32(10)));
431 }
432 ToBeDeleted.push_back(UseFakeVal);
433 return FakeVal;
434}
435
436//===----------------------------------------------------------------------===//
437// OpenMPIRBuilderConfig
438//===----------------------------------------------------------------------===//
439
440namespace {
/// Bit flags marking which OpenMP `requires` clauses have been used. The
/// flags are OR-ed together into OpenMPIRBuilderConfig::RequiresFlags.
enum OpenMPOffloadingRequiresDirFlags {
  /// Flag undefined; no `requires` information has been recorded yet.
  OMP_REQ_UNDEFINED = 0x000,
  /// No requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  // Enables the bitwise operators (|, &, ~) on this enum; the argument names
  // the largest value so the generated mask covers all flag bits.
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};
458
459} // anonymous namespace
460
462 : RequiresFlags(OMP_REQ_UNDEFINED) {}
463
466 bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
467 bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
470 RequiresFlags(OMP_REQ_UNDEFINED) {
471 if (HasRequiresReverseOffload)
472 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
473 if (HasRequiresUnifiedAddress)
474 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
475 if (HasRequiresUnifiedSharedMemory)
476 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
477 if (HasRequiresDynamicAllocators)
478 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
479}
480
482 return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
483}
484
486 return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
487}
488
490 return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
491}
492
494 return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
495}
496
498 return hasRequiresFlags() ? RequiresFlags
499 : static_cast<int64_t>(OMP_REQ_NONE);
500}
501
503 if (Value)
504 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
505 else
506 RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
507}
508
510 if (Value)
511 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
512 else
513 RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
514}
515
517 if (Value)
518 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
519 else
520 RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
521}
522
524 if (Value)
525 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
526 else
527 RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
528}
529
530//===----------------------------------------------------------------------===//
531// OpenMPIRBuilder
532//===----------------------------------------------------------------------===//
533
536 SmallVector<Value *> &ArgsVector) {
538 Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
539 auto Int32Ty = Type::getInt32Ty(Builder.getContext());
540 constexpr size_t MaxDim = 3;
541 Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
542
543 Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait);
544
545 Value *DynCGroupMemFallbackFlag =
546 Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback));
547 DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2);
548 Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag);
549
550 assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());
551
552 Value *NumTeams3D =
553 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
554 Value *NumThreads3D =
555 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
556 for (unsigned I :
557 seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
558 NumTeams3D =
559 Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
560 for (unsigned I :
561 seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
562 NumThreads3D =
563 Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});
564
565 ArgsVector = {Version,
566 PointerNum,
567 KernelArgs.RTArgs.BasePointersArray,
568 KernelArgs.RTArgs.PointersArray,
569 KernelArgs.RTArgs.SizesArray,
570 KernelArgs.RTArgs.MapTypesArray,
571 KernelArgs.RTArgs.MapNamesArray,
572 KernelArgs.RTArgs.MappersArray,
573 KernelArgs.NumIterations,
574 Flags,
575 NumTeams3D,
576 NumThreads3D,
577 KernelArgs.DynCGroupMem};
578}
579
581 LLVMContext &Ctx = Fn.getContext();
582
583 // Get the function's current attributes.
584 auto Attrs = Fn.getAttributes();
585 auto FnAttrs = Attrs.getFnAttrs();
586 auto RetAttrs = Attrs.getRetAttrs();
588 for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
589 ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));
590
591 // Add AS to FnAS while taking special care with integer extensions.
592 auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
593 bool Param = true) -> void {
594 bool HasSignExt = AS.hasAttribute(Attribute::SExt);
595 bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
596 if (HasSignExt || HasZeroExt) {
597 assert(AS.getNumAttributes() == 1 &&
598 "Currently not handling extension attr combined with others.");
599 if (Param) {
600 if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
601 FnAS = FnAS.addAttribute(Ctx, AK);
602 } else if (auto AK =
603 TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
604 FnAS = FnAS.addAttribute(Ctx, AK);
605 } else {
606 FnAS = FnAS.addAttributes(Ctx, AS);
607 }
608 };
609
610#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
611#include "llvm/Frontend/OpenMP/OMPKinds.def"
612
613 // Add attributes to the function declaration.
614 switch (FnID) {
615#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets) \
616 case Enum: \
617 FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet); \
618 addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false); \
619 for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo) \
620 addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]); \
621 Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs)); \
622 break;
623#include "llvm/Frontend/OpenMP/OMPKinds.def"
624 default:
625 // Attributes are optional.
626 break;
627 }
628}
629
632 FunctionType *FnTy = nullptr;
633 Function *Fn = nullptr;
634
635 // Try to find the declation in the module first.
636 switch (FnID) {
637#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...) \
638 case Enum: \
639 FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__}, \
640 IsVarArg); \
641 Fn = M.getFunction(Str); \
642 break;
643#include "llvm/Frontend/OpenMP/OMPKinds.def"
644 }
645
646 if (!Fn) {
647 // Create a new declaration if we need one.
648 switch (FnID) {
649#define OMP_RTL(Enum, Str, ...) \
650 case Enum: \
651 Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M); \
652 break;
653#include "llvm/Frontend/OpenMP/OMPKinds.def"
654 }
655 Fn->setCallingConv(Config.getRuntimeCC());
656 // Add information if the runtime function takes a callback function
657 if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
658 if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
659 LLVMContext &Ctx = Fn->getContext();
660 MDBuilder MDB(Ctx);
661 // Annotate the callback behavior of the runtime function:
662 // - The callback callee is argument number 2 (microtask).
663 // - The first two arguments of the callback callee are unknown (-1).
664 // - All variadic arguments to the runtime function are passed to the
665 // callback callee.
666 Fn->addMetadata(
667 LLVMContext::MD_callback,
669 2, {-1, -1}, /* VarArgsArePassed */ true)}));
670 }
671 }
672
673 LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
674 << " with type " << *Fn->getFunctionType() << "\n");
675 addAttributes(FnID, *Fn);
676
677 } else {
678 LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
679 << " with type " << *Fn->getFunctionType() << "\n");
680 }
681
682 assert(Fn && "Failed to create OpenMP runtime function");
683
684 return {FnTy, Fn};
685}
686
689 if (!FiniBB) {
690 Function *ParentFunc = Builder.GetInsertBlock()->getParent();
692 FiniBB = BasicBlock::Create(Builder.getContext(), ".fini", ParentFunc);
693 Builder.SetInsertPoint(FiniBB);
694 // FiniCB adds the branch to the exit stub.
695 if (Error Err = FiniCB(Builder.saveIP()))
696 return Err;
697 }
698 return FiniBB;
699}
700
702 BasicBlock *OtherFiniBB) {
703 // Simple case: FiniBB does not exist yet: re-use OtherFiniBB.
704 if (!FiniBB) {
705 FiniBB = OtherFiniBB;
706
707 Builder.SetInsertPoint(FiniBB->getFirstNonPHIIt());
708 if (Error Err = FiniCB(Builder.saveIP()))
709 return Err;
710
711 return Error::success();
712 }
713
714 // Move instructions from FiniBB to the start of OtherFiniBB.
715 auto EndIt = FiniBB->end();
716 if (FiniBB->size() >= 1)
717 if (auto Prev = std::prev(EndIt); Prev->isTerminator())
718 EndIt = Prev;
719 OtherFiniBB->splice(OtherFiniBB->getFirstNonPHIIt(), FiniBB, FiniBB->begin(),
720 EndIt);
721
722 FiniBB->replaceAllUsesWith(OtherFiniBB);
723 FiniBB->eraseFromParent();
724 FiniBB = OtherFiniBB;
725 return Error::success();
726}
727
730 auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
731 assert(Fn && "Failed to create OpenMP runtime function pointer");
732 return Fn;
733}
734
737 StringRef Name) {
738 CallInst *Call = Builder.CreateCall(Callee, Args, Name);
739 Call->setCallingConv(Config.getRuntimeCC());
740 return Call;
741}
742
743void OpenMPIRBuilder::initialize() { initializeTypes(M); }
744
747 BasicBlock &EntryBlock = Function->getEntryBlock();
748 BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();
749
750 // Loop over blocks looking for constant allocas, skipping the entry block
751 // as any allocas there are already in the desired location.
752 for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
753 Block++) {
754 for (auto Inst = Block->getReverseIterator()->begin();
755 Inst != Block->getReverseIterator()->end();) {
757 Inst++;
759 continue;
760 AllocaInst->moveBeforePreserving(MoveLocInst);
761 } else {
762 Inst++;
763 }
764 }
765 }
766}
767
770
771 auto ShouldHoistAlloca = [](const llvm::AllocaInst &AllocaInst) {
772 // TODO: For now, we support simple static allocations, we might need to
773 // move non-static ones as well. However, this will need further analysis to
    // move the length arguments as well.
776 };
777
778 for (llvm::Instruction &Inst : Block)
780 if (ShouldHoistAlloca(*AllocaInst))
781 AllocasToMove.push_back(AllocaInst);
782
783 auto InsertPoint =
784 Block.getParent()->getEntryBlock().getTerminator()->getIterator();
785
786 for (llvm::Instruction *AllocaInst : AllocasToMove)
788}
789
791 PostDominatorTree PostDomTree(*Func);
792 for (llvm::BasicBlock &BB : *Func)
793 if (PostDomTree.properlyDominates(&BB, &Func->getEntryBlock()))
795}
796
798 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
800 SmallVector<OutlineInfo, 16> DeferredOutlines;
801 for (OutlineInfo &OI : OutlineInfos) {
802 // Skip functions that have not finalized yet; may happen with nested
803 // function generation.
804 if (Fn && OI.getFunction() != Fn) {
805 DeferredOutlines.push_back(OI);
806 continue;
807 }
808
809 ParallelRegionBlockSet.clear();
810 Blocks.clear();
811 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
812
813 Function *OuterFn = OI.getFunction();
814 CodeExtractorAnalysisCache CEAC(*OuterFn);
815 // If we generate code for the target device, we need to allocate
816 // struct for aggregate params in the device default alloca address space.
817 // OpenMP runtime requires that the params of the extracted functions are
818 // passed as zero address space pointers. This flag ensures that
819 // CodeExtractor generates correct code for extracted functions
820 // which are used by OpenMP runtime.
821 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
822 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
823 /* AggregateArgs */ true,
824 /* BlockFrequencyInfo */ nullptr,
825 /* BranchProbabilityInfo */ nullptr,
826 /* AssumptionCache */ nullptr,
827 /* AllowVarArgs */ true,
828 /* AllowAlloca */ true,
829 /* AllocaBlock*/ OI.OuterAllocaBB,
830 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
831
832 LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
833 LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
834 << " Exit: " << OI.ExitBB->getName() << "\n");
835 assert(Extractor.isEligible() &&
836 "Expected OpenMP outlining to be possible!");
837
838 for (auto *V : OI.ExcludeArgsFromAggregate)
839 Extractor.excludeArgFromAggregate(V);
840
841 Function *OutlinedFn =
842 Extractor.extractCodeRegion(CEAC, OI.Inputs, OI.Outputs);
843
844 // Forward target-cpu, target-features attributes to the outlined function.
845 auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
846 if (TargetCpuAttr.isStringAttribute())
847 OutlinedFn->addFnAttr(TargetCpuAttr);
848
849 auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
850 if (TargetFeaturesAttr.isStringAttribute())
851 OutlinedFn->addFnAttr(TargetFeaturesAttr);
852
853 LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
854 LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
855 assert(OutlinedFn->getReturnType()->isVoidTy() &&
856 "OpenMP outlined functions should not return a value!");
857
    // For compatibility with the clang CG we move the outlined function after the
859 // one with the parallel region.
860 OutlinedFn->removeFromParent();
861 M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);
862
863 // Remove the artificial entry introduced by the extractor right away, we
864 // made our own entry block after all.
865 {
866 BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
867 assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
868 assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
869 // Move instructions from the to-be-deleted ArtificialEntry to the entry
870 // basic block of the parallel region. CodeExtractor generates
871 // instructions to unwrap the aggregate argument and may sink
872 // allocas/bitcasts for values that are solely used in the outlined region
873 // and do not escape.
874 assert(!ArtificialEntry.empty() &&
875 "Expected instructions to add in the outlined region entry");
876 for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
877 End = ArtificialEntry.rend();
878 It != End;) {
879 Instruction &I = *It;
880 It++;
881
882 if (I.isTerminator()) {
883 // Absorb any debug value that terminator may have
884 if (OI.EntryBB->getTerminator())
885 OI.EntryBB->getTerminator()->adoptDbgRecords(
886 &ArtificialEntry, I.getIterator(), false);
887 continue;
888 }
889
890 I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
891 }
892
893 OI.EntryBB->moveBefore(&ArtificialEntry);
894 ArtificialEntry.eraseFromParent();
895 }
896 assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
897 assert(OutlinedFn && OutlinedFn->hasNUses(1));
898
899 // Run a user callback, e.g. to add attributes.
900 if (OI.PostOutlineCB)
901 OI.PostOutlineCB(*OutlinedFn);
902
903 if (OI.FixUpNonEntryAllocas)
905 }
906
907 // Remove work items that have been completed.
908 OutlineInfos = std::move(DeferredOutlines);
909
910 // The createTarget functions embeds user written code into
911 // the target region which may inject allocas which need to
912 // be moved to the entry block of our target or risk malformed
913 // optimisations by later passes, this is only relevant for
914 // the device pass which appears to be a little more delicate
915 // when it comes to optimisations (however, we do not block on
916 // that here, it's up to the inserter to the list to do so).
  // This notably has to occur after the OutlinedInfo candidates
918 // have been extracted so we have an end product that will not
919 // be implicitly adversely affected by any raises unless
920 // intentionally appended to the list.
921 // NOTE: This only does so for ConstantData, it could be extended
922 // to ConstantExpr's with further effort, however, they should
923 // largely be folded when they get here. Extending it to runtime
924 // defined/read+writeable allocation sizes would be non-trivial
925 // (need to factor in movement of any stores to variables the
926 // allocation size depends on, as well as the usual loads,
927 // otherwise it'll yield the wrong result after movement) and
928 // likely be more suitable as an LLVM optimisation pass.
931
932 EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
933 [](EmitMetadataErrorKind Kind,
934 const TargetRegionEntryInfo &EntryInfo) -> void {
935 errs() << "Error of kind: " << Kind
936 << " when emitting offload entries and metadata during "
937 "OMPIRBuilder finalization \n";
938 };
939
940 if (!OffloadInfoManager.empty())
942
943 if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
944 std::vector<WeakTrackingVH> LLVMCompilerUsed = {
945 M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
946 emitUsed("llvm.compiler.used", LLVMCompilerUsed);
947 }
948
949 IsFinalized = true;
950}
951
952bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }
953
955 assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
956}
957
  // Create a constant i32 global named Name holding Value.
  IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
  // WeakODR linkage allows multiple TUs to emit the identical flag and have
  // the linker deduplicate it.
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  // Hidden visibility: the flag is not exported from the resulting DSO.
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}
968
  // Nothing to record for an empty list.
  if (List.empty())
    return;

  // Convert List to what ConstantArray needs.
  UsedArray.resize(List.size());
  for (unsigned I = 0, E = List.size(); I != E; ++I)
        cast<Constant>(&*List[I]), Builder.getPtrTy());

  if (UsedArray.empty())
    return;
  ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());

  // Appending linkage lets the linker concatenate llvm.*used arrays coming
  // from different modules into a single array.
  auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                ConstantArray::get(ATy, UsedArray), Name);

  // The "llvm.metadata" section marks this global as compiler metadata.
  GV->setSection("llvm.metadata");
}
989
                                         OMPTgtExecModeFlags Mode) {
  // Emit a constant i8 global "<KernelName>_exec_mode" holding the kernel's
  // execution-mode flags; presumably consumed by the offload runtime/plugin
  // when launching the kernel — TODO confirm against the runtime side.
  auto *Int8Ty = Builder.getInt8Ty();
  auto *GVMode = new GlobalVariable(
      M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
      ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
  // Protected visibility: the symbol is visible outside the module but not
  // preemptible.
  GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
  return GVMode;
}
1000
                                           uint32_t SrcLocStrSize,
                                           IdentFlag LocFlags,
                                           unsigned Reserve2Flags) {
  // Build (or return a cached) pointer to an ident_t struct describing a
  // source location plus flags, as expected by the KMPC runtime entry points.
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  // Cache key combines the location string with flags/reserved bits.
  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    // ident_t layout: {reserved, flags, reserved, srclocstr-size, srclocstr}.
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};

    // Cast the location string to the ident_t field's address space if they
    // differ (relevant for targets with non-zero globals address spaces).
    size_t SrcLocStrArgIdx = 4;
    if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
            IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
      IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
          SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    // No reusable global found; emit a fresh private constant.
    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  // Always hand back a pointer of the canonical IdentPtr type.
  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}
1047
                                                uint32_t &SrcLocStrSize) {
  // Return (and cache) a constant string global holding LocStr; the string's
  // length is reported back through SrcLocStrSize.
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    // No match found; emit a fresh global string in the default globals
    // address space.
    SrcLocStr = Builder.CreateGlobalString(
        LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
        &M);
  }
  return SrcLocStr;
}
1069
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  // Compose the libomp-style location string ";file;function;line;column;;"
  // and intern it via the StringRef overload.
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
1087
Constant *
  // Fallback location string used when no real source location is available;
  // same ";file;function;line;column;;" shape with unknown/zero fields.
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}
1093
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  // Derive the location string from a DebugLoc; fall back to the default
  // "unknown" string when no DILocation is attached.
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  // Prefer the DIFile's source (when present) over the module name.
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  // Prefer the subprogram name; fall back to the IR function's name.
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}
1110
                                                uint32_t &SrcLocStrSize) {
  // Convenience overload: take debug location and parent function from the
  // LocationDescription's insertion point.
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}
1116
  // Emit a call to __kmpc_global_thread_num(Ident) to obtain the current
  // OpenMP global thread id ("gtid").
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}
1122
                                         bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  // __kmpc_barrier(loc, thread_id);

  // Encode in the ident flags which construct this (implicit) barrier belongs
  // to.
  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

      getOrCreateRuntimeFunctionPtr(UseCancelBarrier
                                        ? OMPRTL___kmpc_cancel_barrier
                                        : OMPRTL___kmpc_barrier),
      Args);

  // A cancel barrier returns a flag; branch into the finalization path when
  // it signals cancellation.
  if (UseCancelBarrier && CheckCancelFlag)
    if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
      return Err;

  return Builder.saveIP();
}
1175
                                       Value *IfCondition,
                                       omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  // With an if-clause, only the "then" side performs the actual cancel; the
  // "else" side still acts as a cancellation point.
  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition) {
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);

    // Even if the if condition evaluates to false, this should count as a
    // cancellation point
    Builder.SetInsertPoint(ElseTI);
    auto ElseIP = Builder.saveIP();

        LocationDescription{ElseIP, Loc.DL}, CanceledDirective);
    if (!IPOrErr)
      return IPOrErr;
  }

  Builder.SetInsertPoint(ThenTI);

  // Map the canceled directive to the runtime's cancel-kind constant via the
  // table in OMPKinds.def.
  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}
1231
                                       omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();
  Builder.SetInsertPoint(UI);

  // Map the canceled directive to the runtime's cancel-kind constant via the
  // table in OMPKinds.def.
  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}
1270
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Materialize the kernel-args struct in the alloca block so the storage
  // dominates all uses.
  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");

  // Store each kernel argument into its struct slot, using the preferred
  // alignment for the argument's type.
  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident, DeviceID, NumTeams,
                                      NumThreads, HostPtr, KernelArgsPtr};

  // Launch the kernel through the offload runtime entry point.
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}
1300
    const LocationDescription &Loc, Value *OutlinedFnID,
    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  // A non-zero return value means the offload attempt failed; fall back to
  // the host version emitted by the callback.
  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}
1363
    Value *CancelFlag, omp::Directive CanceledDirective) {
  // Branch on CancelFlag: zero continues normally, non-zero jumps to the
  // enclosing region's finalization block.
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    // Split at the insertion point so the remainder becomes the continuation.
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  auto &FI = FinalizationStack.back();
  Expected<BasicBlock *> FiniBBOrErr = FI.getFiniBB(Builder);
  if (!FiniBBOrErr)
    return FiniBBOrErr.takeError();
  Builder.SetInsertPoint(CancellationBlock);
  Builder.CreateBr(*FiniBBOrErr);

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
  return Error::success();
}
1403
// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the device.
// We need to use this callback to replace call to the OutlinedFn in OuterFn
// by the call to the OpenMP DeviceRTL runtime function (kmpc_parallel_60)
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  // The only use of the outlined function at this point is the call that is
  // about to be replaced by the runtime call.
  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add alloca for kernel args
  OpenMPIRBuilder ::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add address space cast if array for storing arguments is not allocated
  // in address space 0
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_60
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    // Captured variables start after the tid and bounded-tid arguments.
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  // Without an if-clause the region is unconditionally parallel (cond = 1).
  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build kmpc_parallel_60 call
  Value *Parallel60CallArgs[] = {
      /* identifier*/ Ident,
      /* global thread num*/ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* Proc bind */ Builder.getInt32(-1),
      /* outlined function */ &OutlinedFn,
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function*/ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars),
      /* strict for number of threads */ Builder.getInt32(0)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_60);

  OMPIRBuilder->createRuntimeFunctionCall(RTLFn, Parallel60CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_60 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  // Drop the helper instructions that were only needed for modeling.
  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
1491
// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the host.
// We need to use this callback to replace call to the OutlinedFn in OuterFn
// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if])
static void
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  // Select the runtime entry: the _if variant takes the evaluated if-clause
  // condition as an extra argument.
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      // - The callback callee is argument number 2 (microtask).
      // - The first two arguments of the callback callee are unknown (-1).
      // - All variadic arguments to the __kmpc_fork_call are passed to the
      //   callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                         2, {-1, -1},
                         /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  // The only use of the outlined function at this point is the call that is
  // about to be replaced by the runtime call.
  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
                           &OutlinedFn};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }

  OMPIRBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  // Drop the helper instructions that were only needed for modeling.
  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
1576
1578 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1579 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1580 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1581 omp::ProcBindKind ProcBind, bool IsCancellable) {
1582 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1583
1584 if (!updateToLocation(Loc))
1585 return Loc.IP;
1586
1587 uint32_t SrcLocStrSize;
1588 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1589 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1590 const bool NeedThreadID = NumThreads || Config.isTargetDevice() ||
1591 (ProcBind != OMP_PROC_BIND_default);
1592 Value *ThreadID = NeedThreadID ? getOrCreateThreadID(Ident) : nullptr;
1593 // If we generate code for the target device, we need to allocate
1594 // struct for aggregate params in the device default alloca address space.
1595 // OpenMP runtime requires that the params of the extracted functions are
1596 // passed as zero address space pointers. This flag ensures that extracted
1597 // function arguments are declared in zero address space
1598 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1599
1600 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1601 // only if we compile for host side.
1602 if (NumThreads && !Config.isTargetDevice()) {
1603 Value *Args[] = {
1604 Ident, ThreadID,
1605 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1607 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1608 }
1609
1610 if (ProcBind != OMP_PROC_BIND_default) {
1611 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1612 Value *Args[] = {
1613 Ident, ThreadID,
1614 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1616 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1617 }
1618
1619 BasicBlock *InsertBB = Builder.GetInsertBlock();
1620 Function *OuterFn = InsertBB->getParent();
1621
1622 // Save the outer alloca block because the insertion iterator may get
1623 // invalidated and we still need this later.
1624 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1625
1626 // Vector to remember instructions we used only during the modeling but which
1627 // we want to delete at the end.
1629
1630 // Change the location to the outer alloca insertion point to create and
1631 // initialize the allocas we pass into the parallel region.
1632 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1633 Builder.restoreIP(NewOuter);
1634 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1635 AllocaInst *ZeroAddrAlloca =
1636 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1637 Instruction *TIDAddr = TIDAddrAlloca;
1638 Instruction *ZeroAddr = ZeroAddrAlloca;
1639 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1640 // Add additional casts to enforce pointers in zero address space
1641 TIDAddr = new AddrSpaceCastInst(
1642 TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast");
1643 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1644 ToBeDeleted.push_back(TIDAddr);
1645 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1646 PointerType ::get(M.getContext(), 0),
1647 "zero.addr.ascast");
1648 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1649 ToBeDeleted.push_back(ZeroAddr);
1650 }
1651
1652 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1653 // associated arguments in the outlined function, so we delete them later.
1654 ToBeDeleted.push_back(TIDAddrAlloca);
1655 ToBeDeleted.push_back(ZeroAddrAlloca);
1656
1657 // Create an artificial insertion point that will also ensure the blocks we
1658 // are about to split are not degenerated.
1659 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1660
1661 BasicBlock *EntryBB = UI->getParent();
1662 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1663 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1664 BasicBlock *PRegPreFiniBB =
1665 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1666 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1667
1668 auto FiniCBWrapper = [&](InsertPointTy IP) {
1669 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1670 // target to the region exit block.
1671 if (IP.getBlock()->end() == IP.getPoint()) {
1673 Builder.restoreIP(IP);
1674 Instruction *I = Builder.CreateBr(PRegExitBB);
1675 IP = InsertPointTy(I->getParent(), I->getIterator());
1676 }
1677 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1678 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1679 "Unexpected insertion point for finalization call!");
1680 return FiniCB(IP);
1681 };
1682
1683 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1684
1685 // Generate the privatization allocas in the block that will become the entry
1686 // of the outlined function.
1687 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1688 InsertPointTy InnerAllocaIP = Builder.saveIP();
1689
1690 AllocaInst *PrivTIDAddr =
1691 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1692 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1693
1694 // Add some fake uses for OpenMP provided arguments.
1695 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1696 Instruction *ZeroAddrUse =
1697 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1698 ToBeDeleted.push_back(ZeroAddrUse);
1699
1700 // EntryBB
1701 // |
1702 // V
1703 // PRegionEntryBB <- Privatization allocas are placed here.
1704 // |
1705 // V
1706 // PRegionBodyBB <- BodeGen is invoked here.
1707 // |
1708 // V
1709 // PRegPreFiniBB <- The block we will start finalization from.
1710 // |
1711 // V
1712 // PRegionExitBB <- A common exit to simplify block collection.
1713 //
1714
1715 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1716
1717 // Let the caller create the body.
1718 assert(BodyGenCB && "Expected body generation callback!");
1719 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1720 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1721 return Err;
1722
1723 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1724
1725 OutlineInfo OI;
1726 if (Config.isTargetDevice()) {
1727 // Generate OpenMP target specific runtime call
1728 OI.PostOutlineCB = [=, ToBeDeletedVec =
1729 std::move(ToBeDeleted)](Function &OutlinedFn) {
1730 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1731 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1732 ThreadID, ToBeDeletedVec);
1733 };
1734 OI.FixUpNonEntryAllocas = true;
1735 } else {
1736 // Generate OpenMP host runtime call
1737 OI.PostOutlineCB = [=, ToBeDeletedVec =
1738 std::move(ToBeDeleted)](Function &OutlinedFn) {
1739 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1740 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1741 };
1742 OI.FixUpNonEntryAllocas = true;
1743 }
1744
1745 OI.OuterAllocaBB = OuterAllocaBlock;
1746 OI.EntryBB = PRegEntryBB;
1747 OI.ExitBB = PRegExitBB;
1748
1749 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1751 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1752
1753 CodeExtractorAnalysisCache CEAC(*OuterFn);
1754 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1755 /* AggregateArgs */ false,
1756 /* BlockFrequencyInfo */ nullptr,
1757 /* BranchProbabilityInfo */ nullptr,
1758 /* AssumptionCache */ nullptr,
1759 /* AllowVarArgs */ true,
1760 /* AllowAlloca */ true,
1761 /* AllocationBlock */ OuterAllocaBlock,
1762 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1763
1764 // Find inputs to, outputs from the code region.
1765 BasicBlock *CommonExit = nullptr;
1766 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1767 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1768
1769 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1770 /*CollectGlobalInputs=*/true);
1771
1772 Inputs.remove_if([&](Value *I) {
1774 return GV->getValueType() == OpenMPIRBuilder::Ident;
1775
1776 return false;
1777 });
1778
1779 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1780
1781 FunctionCallee TIDRTLFn =
1782 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1783
1784 auto PrivHelper = [&](Value &V) -> Error {
1785 if (&V == TIDAddr || &V == ZeroAddr) {
1787 return Error::success();
1788 }
1789
1791 for (Use &U : V.uses())
1792 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1793 if (ParallelRegionBlockSet.count(UserI->getParent()))
1794 Uses.insert(&U);
1795
1796 // __kmpc_fork_call expects extra arguments as pointers. If the input
1797 // already has a pointer type, everything is fine. Otherwise, store the
1798 // value onto stack and load it back inside the to-be-outlined region. This
1799 // will ensure only the pointer will be passed to the function.
1800 // FIXME: if there are more than 15 trailing arguments, they must be
1801 // additionally packed in a struct.
1802 Value *Inner = &V;
1803 if (!V.getType()->isPointerTy()) {
1805 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1806
1807 Builder.restoreIP(OuterAllocaIP);
1808 Value *Ptr =
1809 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1810
1811 // Store to stack at end of the block that currently branches to the entry
1812 // block of the to-be-outlined region.
1813 Builder.SetInsertPoint(InsertBB,
1814 InsertBB->getTerminator()->getIterator());
1815 Builder.CreateStore(&V, Ptr);
1816
1817 // Load back next to allocations in the to-be-outlined region.
1818 Builder.restoreIP(InnerAllocaIP);
1819 Inner = Builder.CreateLoad(V.getType(), Ptr);
1820 }
1821
1822 Value *ReplacementValue = nullptr;
1823 CallInst *CI = dyn_cast<CallInst>(&V);
1824 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1825 ReplacementValue = PrivTID;
1826 } else {
1827 InsertPointOrErrorTy AfterIP =
1828 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1829 if (!AfterIP)
1830 return AfterIP.takeError();
1831 Builder.restoreIP(*AfterIP);
1832 InnerAllocaIP = {
1833 InnerAllocaIP.getBlock(),
1834 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1835
1836 assert(ReplacementValue &&
1837 "Expected copy/create callback to set replacement value!");
1838 if (ReplacementValue == &V)
1839 return Error::success();
1840 }
1841
1842 for (Use *UPtr : Uses)
1843 UPtr->set(ReplacementValue);
1844
1845 return Error::success();
1846 };
1847
1848 // Reset the inner alloca insertion as it will be used for loading the values
1849 // wrapped into pointers before passing them into the to-be-outlined region.
1850 // Configure it to insert immediately after the fake use of zero address so
1851 // that they are available in the generated body and so that the
1852 // OpenMP-related values (thread ID and zero address pointers) remain leading
1853 // in the argument list.
1854 InnerAllocaIP = IRBuilder<>::InsertPoint(
1855 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1856
1857 // Reset the outer alloca insertion point to the entry of the relevant block
1858 // in case it was invalidated.
1859 OuterAllocaIP = IRBuilder<>::InsertPoint(
1860 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1861
1862 for (Value *Input : Inputs) {
1863 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1864 if (Error Err = PrivHelper(*Input))
1865 return Err;
1866 }
1867 LLVM_DEBUG({
1868 for (Value *Output : Outputs)
1869 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1870 });
1871 assert(Outputs.empty() &&
1872 "OpenMP outlining should not produce live-out values!");
1873
1874 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1875 LLVM_DEBUG({
1876 for (auto *BB : Blocks)
1877 dbgs() << " PBR: " << BB->getName() << "\n";
1878 });
1879
1880 // Adjust the finalization stack, verify the adjustment, and call the
1881 // finalize function a last time to finalize values between the pre-fini
1882 // block and the exit block if we left the parallel "the normal way".
1883 auto FiniInfo = FinalizationStack.pop_back_val();
1884 (void)FiniInfo;
1885 assert(FiniInfo.DK == OMPD_parallel &&
1886 "Unexpected finalization stack state!");
1887
1888 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1889
1890 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1891 Expected<BasicBlock *> FiniBBOrErr = FiniInfo.getFiniBB(Builder);
1892 if (!FiniBBOrErr)
1893 return FiniBBOrErr.takeError();
1894 {
1896 Builder.restoreIP(PreFiniIP);
1897 Builder.CreateBr(*FiniBBOrErr);
1898 // There's currently a branch to omp.par.exit. Delete it. We will get there
1899 // via the fini block
1900 if (Instruction *Term = Builder.GetInsertBlock()->getTerminator())
1901 Term->eraseFromParent();
1902 }
1903
1904 // Register the outlined info.
1905 addOutlineInfo(std::move(OI));
1906
1907 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1908 UI->eraseFromParent();
1909
1910 return AfterIP;
1911}
1912
// Emits the __kmpc_flush runtime call implementing an OpenMP `flush`
// construct; the ident (source-location) struct is its only argument.
// NOTE(review): the enclosing emitFlush(...) signature line (original line
// 1913) is missing from this rendering — confirm against upstream.
1914 // Build call void __kmpc_flush(ident_t *loc)
1915 uint32_t SrcLocStrSize;
1916 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1917 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1918
// NOTE(review): original line 1919 (the statement that emits the call whose
// argument list continues below) is missing from this rendering.
1920 Args);
1921}
1922
// Public entry point for the `flush` directive: bails out when the location
// is unusable (updateToLocation positions the builder at Loc and reports
// validity), then delegates the actual runtime call to emitFlush.
// NOTE(review): the enclosing createFlush(...) signature line (original line
// 1923) is missing from this rendering.
1924 if (!updateToLocation(Loc))
1925 return;
1926 emitFlush(Loc);
1927}
1928
// Emits a __kmpc_omp_taskwait runtime call with the source-location ident
// and the current thread id as arguments.
// NOTE(review): the enclosing function's signature (original line 1929) is
// missing from this rendering.
1930 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1931 // global_tid);
1932 uint32_t SrcLocStrSize;
1933 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1934 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1935 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1936
1937 // Ignore return result until untied tasks are supported.
// NOTE(review): original line 1938 (the statement emitting the call whose
// argument list continues below) is missing from this rendering.
1939 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait), Args);
1940}
1941
1947
// Emits a __kmpc_omp_taskyield runtime call (ident, thread id, and a zero
// third argument — presumably the runtime's `end_part` parameter; verify
// against kmp.h).
// NOTE(review): the enclosing function's signature (original line 1948) is
// missing from this rendering.
1949 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1950 uint32_t SrcLocStrSize;
1951 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1952 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1953 Constant *I32Null = ConstantInt::getNullValue(Int32);
1954 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1955
// NOTE(review): original line 1956 (the statement emitting the call whose
// argument list continues below) is missing from this rendering.
1957 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield), Args);
1958}
1959
1965
1966// Processes the dependencies in Dependencies and does the following
1967// - Allocates space on the stack of an array of DependInfo objects
1968// - Populates each DependInfo object with relevant information of
1969// the corresponding dependence.
1970// - All code is inserted in the entry block of the current function.
// Returns the stack array holding the populated kmp_depend_info records,
// or nullptr when there are no dependencies.
// NOTE(review): the static function's signature is split across original
// lines 1971-1973; lines 1971 and 1973 (return type / dependence-list
// parameter) are missing from this rendering.
1972 OpenMPIRBuilder &OMPBuilder,
1974 // Early return if we have no dependencies to process
1975 if (Dependencies.empty())
1976 return nullptr;
1977
1978 // Given a vector of DependData objects, in this function we create an
1979 // array on the stack that holds kmp_dep_info objects corresponding
1980 // to each dependency. This is then passed to the OpenMP runtime.
1981 // For example, if there are 'n' dependencies then the following psedo
1982 // code is generated. Assume the first dependence is on a variable 'a'
1983 //
1984 // \code{c}
1985 // DepArray = alloc(n x sizeof(kmp_depend_info);
1986 // idx = 0;
1987 // DepArray[idx].base_addr = ptrtoint(&a);
1988 // DepArray[idx].len = 8;
1989 // DepArray[idx].flags = Dep.DepKind; /*(See OMPContants.h for DepKind)*/
1990 // ++idx;
1991 // DepArray[idx].base_addr = ...;
1992 // \endcode
1993
1994 IRBuilderBase &Builder = OMPBuilder.Builder;
1995 Type *DependInfo = OMPBuilder.DependInfo;
1996 Module &M = OMPBuilder.M;
1997
// Temporarily move the builder so the array alloca lands in the entry
// block (see the function-level comment above), then restore it.
1998 Value *DepArray = nullptr;
1999 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
2000 Builder.SetInsertPoint(
// NOTE(review): original line 2001 (the insert-point argument — per the
// comment above, the current function's entry block) is missing from this
// rendering.
2002
2003 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
2004 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
2005
2006 Builder.restoreIP(OldIP);
2007
// Fill in one kmp_depend_info record (base_addr, len, flags) per
// dependence at the caller's current insertion point.
2008 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
2009 Value *Base =
2010 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
2011 // Store the pointer to the variable
2012 Value *Addr = Builder.CreateStructGEP(
2013 DependInfo, Base,
2014 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
2015 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
2016 Builder.CreateStore(DepValPtr, Addr);
2017 // Store the size of the variable
2018 Value *Size = Builder.CreateStructGEP(
2019 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
2020 Builder.CreateStore(
2021 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
2022 Size);
2023 // Store the dependency kind
2024 Value *Flags = Builder.CreateStructGEP(
2025 DependInfo, Base,
2026 static_cast<unsigned int>(RTLDependInfoFields::Flags));
2027 Builder.CreateStore(
2028 ConstantInt::get(Builder.getInt8Ty(),
2029 static_cast<unsigned int>(Dep.DepKind)),
2030 Flags);
2031 }
2032 return DepArray;
2033}
2034
2035/// Create the task duplication function passed to kmpc_taskloop.
2036Expected<Value *> OpenMPIRBuilder::createTaskDuplicationFunction(
2037 Type *PrivatesTy, int32_t PrivatesIndex, TaskDupCallbackTy DupCB) {
2038 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
2039 if (!DupCB)
2041 PointerType::get(Builder.getContext(), ProgramAddressSpace));
2042
2043 // From OpenMP Runtime p_task_dup_t:
2044 // Routine optionally generated by the compiler for setting the lastprivate
2045 // flag and calling needed constructors for private/firstprivate objects (used
2046 // to form taskloop tasks from pattern task) Parameters: dest task, src task,
2047 // lastprivate flag.
2048 // typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
2049
2050 auto *VoidPtrTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
2051
2052 FunctionType *DupFuncTy = FunctionType::get(
2053 Builder.getVoidTy(), {VoidPtrTy, VoidPtrTy, Builder.getInt32Ty()},
2054 /*isVarArg=*/false);
2055
2056 Function *DupFunction = Function::Create(DupFuncTy, Function::InternalLinkage,
2057 "omp_taskloop_dup", M);
2058 Value *DestTaskArg = DupFunction->getArg(0);
2059 Value *SrcTaskArg = DupFunction->getArg(1);
2060 Value *LastprivateFlagArg = DupFunction->getArg(2);
2061 DestTaskArg->setName("dest_task");
2062 SrcTaskArg->setName("src_task");
2063 LastprivateFlagArg->setName("lastprivate_flag");
2064
2065 IRBuilderBase::InsertPointGuard Guard(Builder);
2066 Builder.SetInsertPoint(
2067 BasicBlock::Create(Builder.getContext(), "entry", DupFunction));
2068
2069 auto GetTaskContextPtrFromArg = [&](Value *Arg) -> Value * {
2070 Type *TaskWithPrivatesTy =
2071 StructType::get(Builder.getContext(), {Task, PrivatesTy});
2072 Value *TaskPrivates = Builder.CreateGEP(
2073 TaskWithPrivatesTy, Arg, {Builder.getInt32(0), Builder.getInt32(1)});
2074 Value *ContextPtr = Builder.CreateGEP(
2075 PrivatesTy, TaskPrivates,
2076 {Builder.getInt32(0), Builder.getInt32(PrivatesIndex)});
2077 return ContextPtr;
2078 };
2079
2080 Value *DestTaskContextPtr = GetTaskContextPtrFromArg(DestTaskArg);
2081 Value *SrcTaskContextPtr = GetTaskContextPtrFromArg(SrcTaskArg);
2082
2083 DestTaskContextPtr->setName("destPtr");
2084 SrcTaskContextPtr->setName("srcPtr");
2085
2086 InsertPointTy AllocaIP(&DupFunction->getEntryBlock(),
2087 DupFunction->getEntryBlock().begin());
2088 InsertPointTy CodeGenIP = Builder.saveIP();
2089 Expected<IRBuilderBase::InsertPoint> AfterIPOrError =
2090 DupCB(AllocaIP, CodeGenIP, DestTaskContextPtr, SrcTaskContextPtr);
2091 if (!AfterIPOrError)
2092 return AfterIPOrError.takeError();
2093 Builder.restoreIP(*AfterIPOrError);
2094
2095 Builder.CreateRetVoid();
2096
2097 return DupFunction;
2098}
2099
// Generates an OpenMP `taskloop`: splits the current block into
// exit/body/alloca sections, runs the body callback, registers the region
// for outlining, and — in PostOutlineCB, after CodeExtractor has run —
// replaces the stale outlined call with __kmpc_omp_task_alloc /
// __kmpc_taskloop runtime calls, wiring the task's lb/ub/step into the
// canonical loop's trip count and index computation.
2100OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
2101 const LocationDescription &Loc, InsertPointTy AllocaIP,
2102 BodyGenCallbackTy BodyGenCB,
2103 llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
2104 Value *LBVal, Value *UBVal, Value *StepVal, bool Untied, Value *IfCond,
2105 Value *GrainSize, bool NoGroup, int Sched, Value *Final, bool Mergeable,
2106 Value *Priority, uint64_t NumOfCollapseLoops, TaskDupCallbackTy DupCB,
2107 Value *TaskContextStructPtrVal) {
2108
2109 if (!updateToLocation(Loc))
2110 return InsertPointTy();
2111
2112 uint32_t SrcLocStrSize;
2113 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2114 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2115
// Split the current block into alloca -> body -> exit, the region that will
// be outlined into the task function.
2116 BasicBlock *TaskloopExitBB =
2117 splitBB(Builder, /*CreateBranch=*/true, "taskloop.exit");
2118 BasicBlock *TaskloopBodyBB =
2119 splitBB(Builder, /*CreateBranch=*/true, "taskloop.body");
2120 BasicBlock *TaskloopAllocaBB =
2121 splitBB(Builder, /*CreateBranch=*/true, "taskloop.alloca");
2122
2123 InsertPointTy TaskloopAllocaIP =
2124 InsertPointTy(TaskloopAllocaBB, TaskloopAllocaBB->begin());
2125 InsertPointTy TaskloopBodyIP =
2126 InsertPointTy(TaskloopBodyBB, TaskloopBodyBB->begin());
2127
2128 if (Error Err = BodyGenCB(TaskloopAllocaIP, TaskloopBodyIP))
2129 return Err;
2130
// The caller materializes the canonical loop only after the body callback
// ran; failure propagates unchanged.
2131 llvm::Expected<llvm::CanonicalLoopInfo *> result = LoopInfo();
2132 if (!result) {
2133 return result.takeError();
2134 }
2135
2136 llvm::CanonicalLoopInfo *CLI = result.get();
2137 OutlineInfo OI;
2138 OI.EntryBB = TaskloopAllocaBB;
2139 OI.OuterAllocaBB = AllocaIP.getBlock();
2140 OI.ExitBB = TaskloopExitBB;
2141
2142 // Add the thread ID argument.
2143 SmallVector<Instruction *> ToBeDeleted;
2144 // dummy instruction to be used as a fake argument
2145 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
2146 Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, "global.tid", false));
// Fake lb/ub/step placeholders stand in for the real bounds until
// PostOutlineCB replaces them with casted values (see end of the lambda).
2147 Value *FakeLB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2148 TaskloopAllocaIP, "lb", false, true);
2149 Value *FakeUB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2150 TaskloopAllocaIP, "ub", false, true);
2151 Value *FakeStep = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2152 TaskloopAllocaIP, "step", false, true);
2153 // For Taskloop, we want to force the bounds being the first 3 inputs in the
2154 // aggregate struct
2155 OI.Inputs.insert(FakeLB);
2156 OI.Inputs.insert(FakeUB);
2157 OI.Inputs.insert(FakeStep);
2158 if (TaskContextStructPtrVal)
2159 OI.Inputs.insert(TaskContextStructPtrVal);
2160 assert(((TaskContextStructPtrVal && DupCB) ||
2161 (!TaskContextStructPtrVal && !DupCB)) &&
2162 "Task context struct ptr and duplication callback must be both set "
2163 "or both null");
2164
2165 // It isn't safe to run the duplication bodygen callback inside the post
2166 // outlining callback so this has to be run now before we know the real task
2167 // shareds structure type.
2168 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
2169 Type *PointerTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
2170 Type *FakeSharedsTy = StructType::get(
2171 Builder.getContext(),
2172 {FakeLB->getType(), FakeUB->getType(), FakeStep->getType(), PointerTy});
2173 Expected<Value *> TaskDupFnOrErr = createTaskDuplicationFunction(
2174 FakeSharedsTy,
2175 /*PrivatesIndex: the pointer after the three indices above*/ 3, DupCB);
2176 if (!TaskDupFnOrErr) {
2177 return TaskDupFnOrErr.takeError();
2178 }
2179 Value *TaskDupFn = *TaskDupFnOrErr;
2180
// Runs after CodeExtractor replaced the region with a call to OutlinedFn;
// rewrites that stale call into the runtime task-alloc/taskloop sequence.
2181 OI.PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Untied,
2182 TaskloopAllocaBB, CLI, Loc, TaskDupFn, ToBeDeleted,
2183 IfCond, GrainSize, NoGroup, Sched, FakeLB, FakeUB,
2184 FakeStep, FakeSharedsTy, Final, Mergeable, Priority,
2185 NumOfCollapseLoops](Function &OutlinedFn) mutable {
2186 // Replace the Stale CI by appropriate RTL function call.
2187 assert(OutlinedFn.hasOneUse() &&
2188 "there must be a single user for the outlined function");
2189 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2190
2191 /* Create the casting for the Bounds Values that can be used when outlining
2192 * to replace the uses of the fakes with real values */
2193 BasicBlock *CodeReplBB = StaleCI->getParent();
2194 IRBuilderBase::InsertPoint CurrentIp = Builder.saveIP();
2195 Builder.SetInsertPoint(CodeReplBB->getFirstInsertionPt());
2196 Value *CastedLBVal =
2197 Builder.CreateIntCast(LBVal, Builder.getInt64Ty(), true, "lb64");
2198 Value *CastedUBVal =
2199 Builder.CreateIntCast(UBVal, Builder.getInt64Ty(), true, "ub64");
2200 Value *CastedStepVal =
2201 Builder.CreateIntCast(StepVal, Builder.getInt64Ty(), true, "step64");
2202 Builder.restoreIP(CurrentIp);
2203
2204 Builder.SetInsertPoint(StaleCI);
2205
2206 // Gather the arguments for emitting the runtime call for
2207 // @__kmpc_omp_task_alloc
2208 Function *TaskAllocFn =
2209 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2210
2211 Value *ThreadID = getOrCreateThreadID(Ident);
2212
2213 if (!NoGroup) {
2214 // Emit runtime call for @__kmpc_taskgroup
2215 Function *TaskgroupFn =
2216 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2217 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2218 }
2219
2220 // `flags` Argument Configuration
2221 // Task is tied if (Flags & 1) == 1.
2222 // Task is untied if (Flags & 1) == 0.
2223 // Task is final if (Flags & 2) == 2.
2224 // Task is not final if (Flags & 2) == 0.
2225 // Task is mergeable if (Flags & 4) == 4.
2226 // Task is not mergeable if (Flags & 4) == 0.
2227 // Task is priority if (Flags & 32) == 32.
2228 // Task is not priority if (Flags & 32) == 0.
2229 Value *Flags = Builder.getInt32(Untied ? 0 : 1);
2230 if (Final)
2231 Flags = Builder.CreateOr(Builder.getInt32(2), Flags);
2232 if (Mergeable)
2233 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2234 if (Priority)
2235 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2236
2237 Value *TaskSize = Builder.getInt64(
2238 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2239
2240 AllocaInst *ArgStructAlloca =
// NOTE(review): original line 2241 (the initializer of ArgStructAlloca —
// presumably a dyn_cast<AllocaInst> of the stale call's shareds operand,
// mirroring createTask) is missing from this rendering.
2242 assert(ArgStructAlloca &&
2243 "Unable to find the alloca instruction corresponding to arguments "
2244 "for extracted function");
2245 std::optional<TypeSize> ArgAllocSize =
2246 ArgStructAlloca->getAllocationSize(M.getDataLayout());
2247 assert(ArgAllocSize &&
2248 "Unable to determine size of arguments for extracted function");
2249 Value *SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
2250
2251 // Emit the @__kmpc_omp_task_alloc runtime call
2252 // The runtime call returns a pointer to an area where the task captured
2253 // variables must be copied before the task is run (TaskData)
2254 CallInst *TaskData = Builder.CreateCall(
2255 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2256 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2257 /*task_func=*/&OutlinedFn});
2258
// Copy the captured shareds into the runtime-allocated task storage.
2259 Value *Shareds = StaleCI->getArgOperand(1);
2260 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2261 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2262 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2263 SharedsSize);
2264 // Get the pointer to loop lb, ub, step from task ptr
2265 // and set up the lowerbound,upperbound and step values
2266 llvm::Value *Lb = Builder.CreateGEP(
2267 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(0)});
2268
2269 llvm::Value *Ub = Builder.CreateGEP(
2270 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(1)});
2271
2272 llvm::Value *Step = Builder.CreateGEP(
2273 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(2)});
2274 llvm::Value *Loadstep = Builder.CreateLoad(Builder.getInt64Ty(), Step);
2275
2276 // set up the arguments for emitting kmpc_taskloop runtime call
2277 // setting values for ifval, nogroup, sched, grainsize, task_dup
2278 Value *IfCondVal =
2279 IfCond ? Builder.CreateIntCast(IfCond, Builder.getInt32Ty(), true)
2280 : Builder.getInt32(1);
2281 // As __kmpc_taskgroup is called manually in OMPIRBuilder, NoGroupVal should
2282 // always be 1 when calling __kmpc_taskloop to ensure it is not called again
2283 Value *NoGroupVal = Builder.getInt32(1);
2284 Value *SchedVal = Builder.getInt32(Sched);
2285 Value *GrainSizeVal =
2286 GrainSize ? Builder.CreateIntCast(GrainSize, Builder.getInt64Ty(), true)
2287 : Builder.getInt64(0);
2288 Value *TaskDup = TaskDupFn;
2289
2290 Value *Args[] = {Ident, ThreadID, TaskData, IfCondVal, Lb, Ub,
2291 Loadstep, NoGroupVal, SchedVal, GrainSizeVal, TaskDup};
2292
2293 // taskloop runtime call
2294 Function *TaskloopFn =
2295 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskloop);
2296 Builder.CreateCall(TaskloopFn, Args);
2297
2298 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup if
2299 // nogroup is not defined
2300 if (!NoGroup) {
2301 Function *EndTaskgroupFn =
2302 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2303 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2304 }
2305
2306 StaleCI->eraseFromParent();
2307
// Inside the outlined function: replace uses of the raw shareds argument
// with the loaded shareds pointer (except in the load itself).
2308 Builder.SetInsertPoint(TaskloopAllocaBB, TaskloopAllocaBB->begin());
2309
2310 LoadInst *SharedsOutlined =
2311 Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2312 OutlinedFn.getArg(1)->replaceUsesWithIf(
2313 SharedsOutlined,
2314 [SharedsOutlined](Use &U) { return U.getUser() != SharedsOutlined; });
2315
2316 Value *IV = CLI->getIndVar();
2317 Type *IVTy = IV->getType();
2318 Constant *One = ConstantInt::get(Builder.getInt64Ty(), 1);
2319
2320 // When outlining, CodeExtractor will create GEP's to the LowerBound and
2321 // UpperBound. These GEP's can be reused for loading the tasks respective
2322 // bounds.
2323 Value *TaskLB = nullptr;
2324 Value *TaskUB = nullptr;
2325 Value *LoadTaskLB = nullptr;
2326 Value *LoadTaskUB = nullptr;
2327 for (Instruction &I : *TaskloopAllocaBB) {
2328 if (I.getOpcode() == Instruction::GetElementPtr) {
2329 GetElementPtrInst &Gep = cast<GetElementPtrInst>(I);
2330 if (ConstantInt *CI = dyn_cast<ConstantInt>(Gep.getOperand(2))) {
2331 switch (CI->getZExtValue()) {
2332 case 0:
2333 TaskLB = &I;
2334 break;
2335 case 1:
2336 TaskUB = &I;
2337 break;
2338 }
2339 }
2340 } else if (I.getOpcode() == Instruction::Load) {
2341 LoadInst &Load = cast<LoadInst>(I);
2342 if (Load.getPointerOperand() == TaskLB) {
2343 assert(TaskLB != nullptr && "Expected value for TaskLB");
2344 LoadTaskLB = &I;
2345 } else if (Load.getPointerOperand() == TaskUB) {
2346 assert(TaskUB != nullptr && "Expected value for TaskUB");
2347 LoadTaskUB = &I;
2348 }
2349 }
2350 }
2351
2352 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
2353
// Recompute the canonical loop's trip count from the per-task bounds the
// runtime stored into the shareds: (ub - lb) / step + 1.
2354 assert(LoadTaskLB != nullptr && "Expected value for LoadTaskLB");
2355 assert(LoadTaskUB != nullptr && "Expected value for LoadTaskUB");
2356 Value *TripCountMinusOne =
2357 Builder.CreateSDiv(Builder.CreateSub(LoadTaskUB, LoadTaskLB), FakeStep);
2358 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One, "trip_cnt");
2359 Value *CastedTripCount = Builder.CreateIntCast(TripCount, IVTy, true);
2360 Value *CastedTaskLB = Builder.CreateIntCast(LoadTaskLB, IVTy, true);
2361 // set the trip count in the CLI
2362 CLI->setTripCount(CastedTripCount);
2363
2364 Builder.SetInsertPoint(CLI->getBody(),
2365 CLI->getBody()->getFirstInsertionPt());
2366
2367 if (NumOfCollapseLoops > 1) {
2368 llvm::SmallVector<User *> UsersToReplace;
2369 // When using the collapse clause, the bounds of the loop have to be
2370 // adjusted to properly represent the iterator of the outer loop.
2371 Value *IVPlusTaskLB = Builder.CreateAdd(
2372 CLI->getIndVar(),
2373 Builder.CreateSub(CastedTaskLB, ConstantInt::get(IVTy, 1)));
2374 // To ensure every Use is correctly captured, we first want to record
2375 // which users to replace the value in, and then replace the value.
2376 for (auto IVUse = CLI->getIndVar()->uses().begin();
2377 IVUse != CLI->getIndVar()->uses().end(); IVUse++) {
2378 User *IVUser = IVUse->getUser();
2379 if (auto *Op = dyn_cast<BinaryOperator>(IVUser)) {
2380 if (Op->getOpcode() == Instruction::URem ||
2381 Op->getOpcode() == Instruction::UDiv) {
2382 UsersToReplace.push_back(IVUser);
2383 }
2384 }
2385 }
2386 for (User *User : UsersToReplace) {
2387 User->replaceUsesOfWith(CLI->getIndVar(), IVPlusTaskLB);
2388 }
2389 } else {
2390 // The canonical loop is generated with a fixed lower bound. We need to
2391 // update the index calculation code to use the task's lower bound. The
2392 // generated code looks like this:
2393 // %omp_loop.iv = phi ...
2394 // ...
2395 // %tmp = mul [type] %omp_loop.iv, step
2396 // %user_index = add [type] tmp, lb
2397 // OpenMPIRBuilder constructs canonical loops to have exactly three uses
2398 // of the normalised induction variable:
2399 // 1. This one: converting the normalised IV to the user IV
2400 // 2. The increment (add)
2401 // 3. The comparison against the trip count (icmp)
2402 // (1) is the only use that is a mul followed by an add so this cannot
2403 // match other IR.
2404 assert(CLI->getIndVar()->getNumUses() == 3 &&
2405 "Canonical loop should have exactly three uses of the ind var");
2406 for (User *IVUser : CLI->getIndVar()->users()) {
2407 if (auto *Mul = dyn_cast<BinaryOperator>(IVUser)) {
2408 if (Mul->getOpcode() == Instruction::Mul) {
2409 for (User *MulUser : Mul->users()) {
2410 if (auto *Add = dyn_cast<BinaryOperator>(MulUser)) {
2411 if (Add->getOpcode() == Instruction::Add) {
2412 Add->setOperand(1, CastedTaskLB);
2413 }
2414 }
2415 }
2416 }
2417 }
2418 }
2419 }
2420
// Swap the fake bound placeholders for the real (casted) values, then drop
// the fake instructions in reverse creation order.
2421 FakeLB->replaceAllUsesWith(CastedLBVal);
2422 FakeUB->replaceAllUsesWith(CastedUBVal);
2423 FakeStep->replaceAllUsesWith(CastedStepVal);
2424 for (Instruction *I : llvm::reverse(ToBeDeleted)) {
2425 I->eraseFromParent();
2426 }
2427 };
2428
2429 addOutlineInfo(std::move(OI));
2430 Builder.SetInsertPoint(TaskloopExitBB, TaskloopExitBB->begin());
2431 return Builder.saveIP();
2432}
2433
2435 const LocationDescription &Loc, InsertPointTy AllocaIP,
2436 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
2437 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle,
2438 Value *Priority) {
2439
2440 if (!updateToLocation(Loc))
2441 return InsertPointTy();
2442
2443 uint32_t SrcLocStrSize;
2444 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2445 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2446 // The current basic block is split into four basic blocks. After outlining,
2447 // they will be mapped as follows:
2448 // ```
2449 // def current_fn() {
2450 // current_basic_block:
2451 // br label %task.exit
2452 // task.exit:
2453 // ; instructions after task
2454 // }
2455 // def outlined_fn() {
2456 // task.alloca:
2457 // br label %task.body
2458 // task.body:
2459 // ret void
2460 // }
2461 // ```
2462 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
2463 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
2464 BasicBlock *TaskAllocaBB =
2465 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
2466
2467 InsertPointTy TaskAllocaIP =
2468 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
2469 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
2470 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
2471 return Err;
2472
2473 OutlineInfo OI;
2474 OI.EntryBB = TaskAllocaBB;
2475 OI.OuterAllocaBB = AllocaIP.getBlock();
2476 OI.ExitBB = TaskExitBB;
2477
2478 // Add the thread ID argument.
2481 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
2482
2483 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
2484 Mergeable, Priority, EventHandle, TaskAllocaBB,
2485 ToBeDeleted](Function &OutlinedFn) mutable {
2486 // Replace the Stale CI by appropriate RTL function call.
2487 assert(OutlinedFn.hasOneUse() &&
2488 "there must be a single user for the outlined function");
2489 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2490
2491 // HasShareds is true if any variables are captured in the outlined region,
2492 // false otherwise.
2493 bool HasShareds = StaleCI->arg_size() > 1;
2494 Builder.SetInsertPoint(StaleCI);
2495
2496 // Gather the arguments for emitting the runtime call for
2497 // @__kmpc_omp_task_alloc
2498 Function *TaskAllocFn =
2499 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2500
2501 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
2502 // call.
2503 Value *ThreadID = getOrCreateThreadID(Ident);
2504
2505 // Argument - `flags`
2506 // Task is tied iff (Flags & 1) == 1.
2507 // Task is untied iff (Flags & 1) == 0.
2508 // Task is final iff (Flags & 2) == 2.
2509 // Task is not final iff (Flags & 2) == 0.
2510 // Task is mergeable iff (Flags & 4) == 4.
2511 // Task is not mergeable iff (Flags & 4) == 0.
2512 // Task is priority iff (Flags & 32) == 32.
2513 // Task is not priority iff (Flags & 32) == 0.
2514 // TODO: Handle the other flags.
2515 Value *Flags = Builder.getInt32(Tied);
2516 if (Final) {
2517 Value *FinalFlag =
2518 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2519 Flags = Builder.CreateOr(FinalFlag, Flags);
2520 }
2521
2522 if (Mergeable)
2523 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2524 if (Priority)
2525 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2526
2527 // Argument - `sizeof_kmp_task_t` (TaskSize)
2528 // Tasksize refers to the size in bytes of kmp_task_t data structure
2529 // including private vars accessed in task.
2530 // TODO: add kmp_task_t_with_privates (privates)
2531 Value *TaskSize = Builder.getInt64(
2532 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2533
2534 // Argument - `sizeof_shareds` (SharedsSize)
2535 // SharedsSize refers to the shareds array size in the kmp_task_t data
2536 // structure.
2537 Value *SharedsSize = Builder.getInt64(0);
2538 if (HasShareds) {
2539 AllocaInst *ArgStructAlloca =
2541 assert(ArgStructAlloca &&
2542 "Unable to find the alloca instruction corresponding to arguments "
2543 "for extracted function");
2544 std::optional<TypeSize> ArgAllocSize =
2545 ArgStructAlloca->getAllocationSize(M.getDataLayout());
2546 assert(ArgAllocSize &&
2547 "Unable to determine size of arguments for extracted function");
2548 SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
2549 }
2550 // Emit the @__kmpc_omp_task_alloc runtime call
2551 // The runtime call returns a pointer to an area where the task captured
2552 // variables must be copied before the task is run (TaskData)
2554 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2555 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2556 /*task_func=*/&OutlinedFn});
2557
2558 // Emit detach clause initialization.
2559 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2560 // task_descriptor);
2561 if (EventHandle) {
2563 OMPRTL___kmpc_task_allow_completion_event);
2564 llvm::Value *EventVal =
2565 createRuntimeFunctionCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2566 llvm::Value *EventHandleAddr =
2567 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2568 Builder.getPtrTy(0));
2569 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2570 Builder.CreateStore(EventVal, EventHandleAddr);
2571 }
2572 // Copy the arguments for outlined function
2573 if (HasShareds) {
2574 Value *Shareds = StaleCI->getArgOperand(1);
2575 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2576 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2577 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2578 SharedsSize);
2579 }
2580
2581 if (Priority) {
2582 //
2583 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2584 // we populate the priority information into the "kmp_task_t" here
2585 //
2586 // The struct "kmp_task_t" definition is available in kmp.h
2587 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2588 // data2 is used for priority
2589 //
2590 Type *Int32Ty = Builder.getInt32Ty();
2591 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2592 // kmp_task_t* => { ptr }
2593 Type *TaskPtr = StructType::get(VoidPtr);
2594 Value *TaskGEP =
2595 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2596 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2597 Type *TaskStructType = StructType::get(
2598 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2599 Value *PriorityData = Builder.CreateInBoundsGEP(
2600 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2601 // kmp_cmplrdata_t => { ptr, ptr }
2602 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2603 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2604 PriorityData, {Zero, Zero});
2605 Builder.CreateStore(Priority, CmplrData);
2606 }
2607
2608 Value *DepArray = emitTaskDependencies(*this, Dependencies);
2609
2610 // In the presence of the `if` clause, the following IR is generated:
2611 // ...
2612 // %data = call @__kmpc_omp_task_alloc(...)
2613 // br i1 %if_condition, label %then, label %else
2614 // then:
2615 // call @__kmpc_omp_task(...)
2616 // br label %exit
2617 // else:
2618 // ;; Wait for resolution of dependencies, if any, before
2619 // ;; beginning the task
2620 // call @__kmpc_omp_wait_deps(...)
2621 // call @__kmpc_omp_task_begin_if0(...)
2622 // call @outlined_fn(...)
2623 // call @__kmpc_omp_task_complete_if0(...)
2624 // br label %exit
2625 // exit:
2626 // ...
2627 if (IfCondition) {
2628 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2629 // terminator.
2630 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2631 Instruction *IfTerminator =
2632 Builder.GetInsertPoint()->getParent()->getTerminator();
2633 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2634 Builder.SetInsertPoint(IfTerminator);
2635 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2636 &ElseTI);
2637 Builder.SetInsertPoint(ElseTI);
2638
2639 if (Dependencies.size()) {
2640 Function *TaskWaitFn =
2641 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2643 TaskWaitFn,
2644 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2645 ConstantInt::get(Builder.getInt32Ty(), 0),
2647 }
2648 Function *TaskBeginFn =
2649 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2650 Function *TaskCompleteFn =
2651 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2652 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2653 CallInst *CI = nullptr;
2654 if (HasShareds)
2655 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID, TaskData});
2656 else
2657 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID});
2658 CI->setDebugLoc(StaleCI->getDebugLoc());
2659 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2660 Builder.SetInsertPoint(ThenTI);
2661 }
2662
2663 if (Dependencies.size()) {
2664 Function *TaskFn =
2665 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2667 TaskFn,
2668 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2669 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2671
2672 } else {
2673 // Emit the @__kmpc_omp_task runtime call to spawn the task
2674 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2675 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
2676 }
2677
2678 StaleCI->eraseFromParent();
2679
2680 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2681 if (HasShareds) {
2682 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2683 OutlinedFn.getArg(1)->replaceUsesWithIf(
2684 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2685 }
2686
2687 for (Instruction *I : llvm::reverse(ToBeDeleted))
2688 I->eraseFromParent();
2689 };
2690
2691 addOutlineInfo(std::move(OI));
2692 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2693
2694 return Builder.saveIP();
2695}
2696
2699 InsertPointTy AllocaIP,
2700 BodyGenCallbackTy BodyGenCB) {
  // Bail out early if the caller supplied no valid code-gen location.
2701 if (!updateToLocation(Loc))
2702 return InsertPointTy();
2703
  // Materialize the source-location ident and thread id arguments that are
  // shared by the begin/end taskgroup runtime calls below.
2704 uint32_t SrcLocStrSize;
2705 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2706 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2707 Value *ThreadID = getOrCreateThreadID(Ident);
2708
2709 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2710 Function *TaskgroupFn =
2711 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2712 createRuntimeFunctionCall(TaskgroupFn, {Ident, ThreadID});
2713
  // Split off a dedicated exit block so the user-generated body sits between
  // the __kmpc_taskgroup and __kmpc_end_taskgroup calls.
2714 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2715 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2716 return Err;
2717
2718 Builder.SetInsertPoint(TaskgroupExitBB);
2719 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2720 Function *EndTaskgroupFn =
2721 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2722 createRuntimeFunctionCall(EndTaskgroupFn, {Ident, ThreadID});
2723
2724 return Builder.saveIP();
2725}
2726
2728 const LocationDescription &Loc, InsertPointTy AllocaIP,
2730 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2731 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2732
2733 if (!updateToLocation(Loc))
2734 return Loc.IP;
2735
  // Record the finalization callback so nested/cancel constructs can find it;
  // it is popped and applied after the workshare loop is emitted below.
2736 FinalizationStack.push_back({FiniCB, OMPD_sections, IsCancellable});
2737
2738 // Each section is emitted as a switch case
2739 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2740 // -> OMP.createSection() which generates the IR for each section
2741 // Iterate through all sections and emit a switch construct:
2742 // switch (IV) {
2743 // case 0:
2744 // <SectionStmt[0]>;
2745 // break;
2746 // ...
2747 // case <NumSection> - 1:
2748 // <SectionStmt[<NumSection> - 1]>;
2749 // break;
2750 // }
2751 // ...
2752 // section_loop.after:
2753 // <FiniCB>;
2754 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2755 Builder.restoreIP(CodeGenIP);
2757 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2758 Function *CurFn = Continue->getParent();
  // Dispatch on the loop induction variable; unmatched values fall through
  // to the continuation block.
2759 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2760
2761 unsigned CaseNumber = 0;
2762 for (auto SectionCB : SectionCBs) {
2764 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2765 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2766 Builder.SetInsertPoint(CaseBB);
  // Pre-create the branch back to the continuation so the section body is
  // generated in front of an existing terminator.
2767 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2768 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2769 CaseEndBr->getIterator()}))
2770 return Err;
2771 CaseNumber++;
2772 }
2773 // remove the existing terminator from body BB since there can be no
2774 // terminators after switch/case
2775 return Error::success();
2776 };
2777 // Loop body ends here
2778 // LowerBound, UpperBound, and STride for createCanonicalLoop
2779 Type *I32Ty = Type::getInt32Ty(M.getContext());
2780 Value *LB = ConstantInt::get(I32Ty, 0);
2781 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2782 Value *ST = ConstantInt::get(I32Ty, 1);
2784 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2785 if (!LoopInfo)
2786 return LoopInfo.takeError();
2787
  // Lower the canonical loop as a statically-scheduled workshare loop; the
  // implicit barrier is dropped when IsNowait is set.
2788 InsertPointOrErrorTy WsloopIP =
2789 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
2790 WorksharingLoopType::ForStaticLoop, !IsNowait);
2791 if (!WsloopIP)
2792 return WsloopIP.takeError();
2793 InsertPointTy AfterIP = *WsloopIP;
2794
2795 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
2796 assert(LoopFini && "Bad structure of static workshare loop finalization");
2797
2798 // Apply the finalization callback in LoopAfterBB
2799 auto FiniInfo = FinalizationStack.pop_back_val();
2800 assert(FiniInfo.DK == OMPD_sections &&
2801 "Unexpected finalization stack state!");
2802 if (Error Err = FiniInfo.mergeFiniBB(Builder, LoopFini))
2803 return Err;
2804
2805 return AfterIP;
2806}
2807
2810 BodyGenCallbackTy BodyGenCB,
2811 FinalizeCallbackTy FiniCB) {
2812 if (!updateToLocation(Loc))
2813 return Loc.IP;
2814
  // Wrap the user finalization callback so it also works when invoked at the
  // end of a block that currently has no terminator (the cancellation path).
2815 auto FiniCBWrapper = [&](InsertPointTy IP) {
  // If the IP is not at the block's end there is a terminator; call through.
2816 if (IP.getBlock()->end() != IP.getPoint())
2817 return FiniCB(IP);
2818 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2819 // will fail because that function requires the Finalization Basic Block to
2820 // have a terminator, which is already removed by EmitOMPRegionBody.
2821 // IP is currently at cancelation block.
2822 // We need to backtrack to the condition block to fetch
2823 // the exit block and create a branch from cancelation
2824 // to exit block.
2826 Builder.restoreIP(IP);
2827 auto *CaseBB = Loc.IP.getBlock();
  // NOTE(review): assumes both single-predecessor links exist and that the
  // terminator's successor 1 is the exit block — relies on the CFG shape
  // produced by the sections switch emission; confirm before changing it.
2828 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2829 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2830 Instruction *I = Builder.CreateBr(ExitBB);
2831 IP = InsertPointTy(I->getParent(), I->getIterator());
2832 return FiniCB(IP);
2833 };
2834
2835 Directive OMPD = Directive::OMPD_sections;
2836 // Since we are using Finalization Callback here, HasFinalize
2837 // and IsCancellable have to be true
2838 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2839 /*Conditional*/ false, /*hasFinalize*/ true,
2840 /*IsCancellable*/ true);
2841}
2842
2848
2849Value *OpenMPIRBuilder::getGPUThreadID() {
  // Emit a call to the device runtime to obtain the hardware thread id of
  // the current thread within its block.
2852 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2853 {});
2854}
2855
2856Value *OpenMPIRBuilder::getGPUWarpSize() {
  // Emit a call to the device runtime to query the target's warp size.
2858 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2859}
2860
2861Value *OpenMPIRBuilder::getNVPTXWarpID() {
2862 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2863 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2864}
2865
2866Value *OpenMPIRBuilder::getNVPTXLaneID() {
2867 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2868 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2869 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2870 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2871 "nvptx_lane_id");
2872}
2873
2874Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2875 Type *ToType) {
2876 Type *FromType = From->getType();
2877 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2878 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2879 assert(FromSize > 0 && "From size must be greater than zero");
2880 assert(ToSize > 0 && "To size must be greater than zero");
2881 if (FromType == ToType)
2882 return From;
2883 if (FromSize == ToSize)
2884 return Builder.CreateBitCast(From, ToType);
2885 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2886 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2887 InsertPointTy SaveIP = Builder.saveIP();
2888 Builder.restoreIP(AllocaIP);
2889 Value *CastItem = Builder.CreateAlloca(ToType);
2890 Builder.restoreIP(SaveIP);
2891
2892 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2893 CastItem, Builder.getPtrTy(0));
2894 Builder.CreateStore(From, ValCastItem);
2895 return Builder.CreateLoad(ToType, CastItem);
2896}
2897
2898Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2899 Value *Element,
2900 Type *ElementType,
2901 Value *Offset) {
2902 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2903 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2904
2905 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2906 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2907 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2908 Value *WarpSize =
2909 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
  // Pick the 32- or 64-bit shuffle entry point to match CastTy.
2911 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2912 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
  // NOTE(review): WarpSize was already cast to i16 above; this second
  // CreateIntCast to i16 appears to be a no-op — confirm whether it can
  // be dropped.
2913 Value *WarpSizeCast =
2914 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2915 Value *ShuffleCall =
2916 createRuntimeFunctionCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
  // Convert the shuffled value back through the same canonical integer type.
2917 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2918}
2919
2920void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2921 Value *DstAddr, Type *ElemType,
2922 Value *Offset, Type *ReductionArrayTy,
2923 bool IsByRefElem) {
  // NOTE(review): ReductionArrayTy and IsByRefElem are not referenced in
  // this body — confirm whether they are still needed in the signature.
2924 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2925 // Create the loop over the big sized data.
2926 // ptr = (void*)Elem;
2927 // ptrEnd = (void*) Elem + 1;
2928 // Step = 8;
2929 // while (ptr + Step < ptrEnd)
2930 // shuffle((int64_t)*ptr);
2931 // Step = 4;
2932 // while (ptr + Step < ptrEnd)
2933 // shuffle((int32_t)*ptr);
2934 // ...
2935 Type *IndexTy = Builder.getIndexTy(
2936 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2937 Value *ElemPtr = DstAddr;
2938 Value *Ptr = SrcAddr;
  // Decompose the element into 8-, 4-, 2- and 1-byte chunks; each pass
  // shuffles as many whole chunks of the current width as remain.
2939 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2940 if (Size < IntSize)
2941 continue;
2942 Type *IntType = Builder.getIntNTy(IntSize * 8);
2943 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2944 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
  // One-past-the-end address of the source element, used as loop bound.
2945 Value *SrcAddrGEP =
2946 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2947 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2948 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2949
2950 Function *CurFunc = Builder.GetInsertBlock()->getParent();
  // More than one chunk of this width: emit an explicit pre-cond loop with
  // PHIs advancing the source/destination cursors per iteration.
2951 if ((Size / IntSize) > 1) {
2952 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2953 SrcAddrGEP, Builder.getPtrTy());
2954 BasicBlock *PreCondBB =
2955 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2956 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2957 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2958 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2959 emitBlock(PreCondBB, CurFunc);
2960 PHINode *PhiSrc =
2961 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2962 PhiSrc->addIncoming(Ptr, CurrentBB);
2963 PHINode *PhiDest =
2964 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2965 PhiDest->addIncoming(ElemPtr, CurrentBB);
2966 Ptr = PhiSrc;
2967 ElemPtr = PhiDest;
  // Continue while at least IntSize bytes remain before PtrEnd.
2968 Value *PtrDiff = Builder.CreatePtrDiff(
2969 Builder.getInt8Ty(), PtrEnd,
2970 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2971 Builder.CreateCondBr(
2972 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2973 ExitBB);
2974 emitBlock(ThenBB, CurFunc)
2975 Value *Res = createRuntimeShuffleFunction(
2976 AllocaIP,
2977 Builder.CreateAlignedLoad(
2978 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2979 IntType, Offset);
2980 Builder.CreateAlignedStore(Res, ElemPtr,
2981 M.getDataLayout().getPrefTypeAlign(ElemType));
  // Advance both cursors by one chunk and loop back to the pre-condition.
2982 Value *LocalPtr =
2983 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2984 Value *LocalElemPtr =
2985 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2986 PhiSrc->addIncoming(LocalPtr, ThenBB);
2987 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2988 emitBranch(PreCondBB);
2989 emitBlock(ExitBB, CurFunc);
2990 } else {
  // Exactly one chunk of this width: shuffle it straight-line.
2991 Value *Res = createRuntimeShuffleFunction(
2992 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2993 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2994 Res->getType()->getScalarSizeInBits())
2995 Res = Builder.CreateTrunc(Res, ElemType);
2996 Builder.CreateStore(Res, ElemPtr);
2997 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2998 ElemPtr =
2999 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
3000 }
  // Remaining bytes to transfer with the next, smaller chunk width.
3001 Size = Size % IntSize;
3002 }
3003}
3004
3005Error OpenMPIRBuilder::emitReductionListCopy(
3006 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
3007 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
3008 ArrayRef<bool> IsByRef, CopyOptionsTy CopyOptions) {
3009 Type *IndexTy = Builder.getIndexTy(
3010 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3011 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
3012
3013 // Iterates, element-by-element, through the source Reduce list and
3014 // make a copy.
3015 for (auto En : enumerate(ReductionInfos)) {
3016 const ReductionInfo &RI = En.value();
3017 Value *SrcElementAddr = nullptr;
3018 AllocaInst *DestAlloca = nullptr;
3019 Value *DestElementAddr = nullptr;
3020 Value *DestElementPtrAddr = nullptr;
3021 // Should we shuffle in an element from a remote lane?
3022 bool ShuffleInElement = false;
3023 // Set to true to update the pointer in the dest Reduce list to a
3024 // newly created element.
3025 bool UpdateDestListPtr = false;
3026
3027 // Step 1.1: Get the address for the src element in the Reduce list.
3028 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
3029 ReductionArrayTy, SrcBase,
3030 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3031 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
3032
3033 // Step 1.2: Create a temporary to store the element in the destination
3034 // Reduce list.
3035 DestElementPtrAddr = Builder.CreateInBoundsGEP(
3036 ReductionArrayTy, DestBase,
3037 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3038 bool IsByRefElem = (!IsByRef.empty() && IsByRef[En.index()]);
3039 switch (Action) {
  // First arm: allocate a private destination element at the alloca
  // insertion point and arrange for the remote value to be shuffled in.
3041 InsertPointTy CurIP = Builder.saveIP();
3042 Builder.restoreIP(AllocaIP);
3043
3044 Type *DestAllocaType =
3045 IsByRefElem ? RI.ByRefAllocatedType : RI.ElementType;
3046 DestAlloca = Builder.CreateAlloca(DestAllocaType, nullptr,
3047 ".omp.reduction.element");
3048 DestAlloca->setAlignment(
3049 M.getDataLayout().getPrefTypeAlign(DestAllocaType));
3050 DestElementAddr = DestAlloca;
3051 DestElementAddr =
3052 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
3053 DestElementAddr->getName() + ".ascast");
3054 Builder.restoreIP(CurIP);
3055 ShuffleInElement = true;
3056 UpdateDestListPtr = true;
3057 break;
3058 }
  // Second arm: reuse the existing destination slot from the dest list.
3060 DestElementAddr =
3061 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
3062 break;
3063 }
3064 }
3065
3066 // Now that all active lanes have read the element in the
3067 // Reduce list, shuffle over the value from the remote lane.
3068 if (ShuffleInElement) {
3069 Type *ShuffleType = RI.ElementType;
3070 Value *ShuffleSrcAddr = SrcElementAddr;
3071 Value *ShuffleDestAddr = DestElementAddr;
3072 AllocaInst *LocalStorage = nullptr;
3073
3074 if (IsByRefElem) {
3075 assert(RI.ByRefElementType && "Expected by-ref element type to be set");
3076 assert(RI.ByRefAllocatedType &&
3077 "Expected by-ref allocated type to be set");
3078 // For by-ref reductions, we need to copy from the remote lane the
3079 // actual value of the partial reduction computed by that remote lane;
3080 // rather than, for example, a pointer to that data or, even worse, a
3081 // pointer to the descriptor of the by-ref reduction element.
3082 ShuffleType = RI.ByRefElementType;
3083
3084 InsertPointOrErrorTy GenResult =
3085 RI.DataPtrPtrGen(Builder.saveIP(), ShuffleSrcAddr, ShuffleSrcAddr);
3086
3087 if (!GenResult)
3088 return GenResult.takeError();
3089
3090 ShuffleSrcAddr = Builder.CreateLoad(Builder.getPtrTy(), ShuffleSrcAddr);
3091
  // Stage the shuffled-in data in a local alloca; the descriptor is
  // rebuilt around it below.
3092 {
3093 InsertPointTy OldIP = Builder.saveIP();
3094 Builder.restoreIP(AllocaIP);
3095
3096 LocalStorage = Builder.CreateAlloca(ShuffleType);
3097 Builder.restoreIP(OldIP);
3098 ShuffleDestAddr = LocalStorage;
3099 }
3100 }
3101
3102 shuffleAndStore(AllocaIP, ShuffleSrcAddr, ShuffleDestAddr, ShuffleType,
3103 RemoteLaneOffset, ReductionArrayTy, IsByRefElem);
3104
3105 if (IsByRefElem) {
3106 // Copy descriptor from source and update base_ptr to shuffled data
3107 Value *DestDescriptorAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3108 DestAlloca, Builder.getPtrTy(), ".ascast");
3109
3110 InsertPointOrErrorTy GenResult = generateReductionDescriptor(
3111 DestDescriptorAddr, LocalStorage, SrcElementAddr,
3112 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
3113
3114 if (!GenResult)
3115 return GenResult.takeError();
3116 }
3117 } else {
  // Plain copy path: copy by value according to the element's evaluation
  // kind (scalar, complex pair, or aggregate memcpy).
3118 switch (RI.EvaluationKind) {
3119 case EvalKind::Scalar: {
3120 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
3121 // Store the source element value to the dest element address.
3122 Builder.CreateStore(Elem, DestElementAddr);
3123 break;
3124 }
3125 case EvalKind::Complex: {
3126 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3127 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
3128 Value *SrcReal = Builder.CreateLoad(
3129 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3130 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3131 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
3132 Value *SrcImg = Builder.CreateLoad(
3133 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3134
3135 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3136 RI.ElementType, DestElementAddr, 0, 0, ".realp");
3137 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3138 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
3139 Builder.CreateStore(SrcReal, DestRealPtr);
3140 Builder.CreateStore(SrcImg, DestImgPtr);
3141 break;
3142 }
3143 case EvalKind::Aggregate: {
3144 Value *SizeVal = Builder.getInt64(
3145 M.getDataLayout().getTypeStoreSize(RI.ElementType));
3146 Builder.CreateMemCpy(
3147 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3148 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3149 SizeVal, false);
3150 break;
3151 }
3152 };
3153 }
3154
3155 // Step 3.1: Modify reference in dest Reduce list as needed.
3156 // Modifying the reference in Reduce list to point to the newly
3157 // created element. The element is live in the current function
3158 // scope and that of functions it invokes (i.e., reduce_function).
3159 // RemoteReduceData[i] = (void*)&RemoteElem
3160 if (UpdateDestListPtr) {
3161 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3162 DestElementAddr, Builder.getPtrTy(),
3163 DestElementAddr->getName() + ".ascast");
3164 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
3165 }
3166 }
3167
3168 return Error::success();
3169}
3170
3171Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
3172 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
3173 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3174 InsertPointTy SavedIP = Builder.saveIP();
3175 LLVMContext &Ctx = M.getContext();
  // The helper takes the thread-local reduce list and the number of active
  // warps: void(ptr ReduceList, i32 NumWarps).
3176 FunctionType *FuncTy = FunctionType::get(
3177 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
3178 /* IsVarArg */ false);
3179 Function *WcFunc =
3181 "_omp_reduction_inter_warp_copy_func", &M);
3182 WcFunc->setAttributes(FuncAttrs);
3183 WcFunc->addParamAttr(0, Attribute::NoUndef);
3184 WcFunc->addParamAttr(1, Attribute::NoUndef);
3185 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
3186 Builder.SetInsertPoint(EntryBB);
3187
3188 // ReduceList: thread local Reduce list.
3189 // At the stage of the computation when this function is called, partially
3190 // aggregated values reside in the first lane of every active warp.
3191 Argument *ReduceListArg = WcFunc->getArg(0);
3192 // NumWarps: number of warps active in the parallel region. This could
3193 // be smaller than 32 (max warps in a CTA) for partial block reduction.
3194 Argument *NumWarpsArg = WcFunc->getArg(1);
3195
3196 // This array is used as a medium to transfer, one reduce element at a time,
3197 // the data from the first lane of every warp to lanes in the first warp
3198 // in order to perform the final step of a reduction in a parallel region
3199 // (reduction across warps). The array is placed in NVPTX __shared__ memory
3200 // for reduced latency, as well as to have a distinct copy for concurrently
3201 // executing target regions. The array is declared with common linkage so
3202 // as to be shared across compilation units.
3203 StringRef TransferMediumName =
3204 "__openmp_nvptx_data_transfer_temporary_storage";
3205 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
3206 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
3207 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
3208 if (!TransferMedium) {
3209 TransferMedium = new GlobalVariable(
3210 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
3211 UndefValue::get(ArrayTy), TransferMediumName,
3212 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
3213 /*AddressSpace=*/3);
3214 }
3215
3216 // Get the CUDA thread id of the current OpenMP thread on the GPU.
3217 Value *GPUThreadID = getGPUThreadID();
3218 // nvptx_lane_id = nvptx_id % warpsize
3219 Value *LaneID = getNVPTXLaneID();
3220 // nvptx_warp_id = nvptx_id / warpsize
3221 Value *WarpID = getNVPTXWarpID();
3222
  // Spill both arguments to allocas (with address-space casts) so they can
  // be reloaded below; AllocaIP/CodeGenIP mark where further allocas and
  // code should be inserted.
3223 InsertPointTy AllocaIP =
3224 InsertPointTy(Builder.GetInsertBlock(),
3225 Builder.GetInsertBlock()->getFirstInsertionPt());
3226 Type *Arg0Type = ReduceListArg->getType();
3227 Type *Arg1Type = NumWarpsArg->getType();
3228 Builder.restoreIP(AllocaIP);
3229 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
3230 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
3231 AllocaInst *NumWarpsAlloca =
3232 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
3233 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3234 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
3235 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3236 NumWarpsAlloca, Builder.getPtrTy(0),
3237 NumWarpsAlloca->getName() + ".ascast");
3238 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3239 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
3240 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
3241 InsertPointTy CodeGenIP =
3242 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
3243 Builder.restoreIP(CodeGenIP);
3244
3245 Value *ReduceList =
3246 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
3247
3248 for (auto En : enumerate(ReductionInfos)) {
3249 //
3250 // Warp master copies reduce element to transfer medium in __shared__
3251 // memory.
3252 //
3253 const ReductionInfo &RI = En.value();
3254 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
3255 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(
3256 IsByRefElem ? RI.ByRefElementType : RI.ElementType);
  // Transfer the element through the i32-typed medium in 4-, 2- and 1-byte
  // pieces; NumIters iterations of TySize bytes each per pass.
3257 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
3258 Type *CType = Builder.getIntNTy(TySize * 8);
3259
3260 unsigned NumIters = RealTySize / TySize;
3261 if (NumIters == 0)
3262 continue;
3263 Value *Cnt = nullptr;
3264 Value *CntAddr = nullptr;
3265 BasicBlock *PrecondBB = nullptr;
3266 BasicBlock *ExitBB = nullptr;
  // Multiple iterations need an explicit counter loop around the copy.
3267 if (NumIters > 1) {
3268 CodeGenIP = Builder.saveIP();
3269 Builder.restoreIP(AllocaIP);
3270 CntAddr =
3271 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
3272
3273 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
3274 CntAddr->getName() + ".ascast");
3275 Builder.restoreIP(CodeGenIP);
3276 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
3277 CntAddr,
3278 /*Volatile=*/false);
3279 PrecondBB = BasicBlock::Create(Ctx, "precond");
3280 ExitBB = BasicBlock::Create(Ctx, "exit");
3281 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
3282 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
3283 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
3284 /*Volatile=*/false);
3285 Value *Cmp = Builder.CreateICmpULT(
3286 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
3287 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
3288 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
3289 }
3290
3291 // kmpc_barrier.
3292 InsertPointOrErrorTy BarrierIP1 =
3293 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3294 omp::Directive::OMPD_unknown,
3295 /* ForceSimpleCall */ false,
3296 /* CheckCancelFlag */ true);
3297 if (!BarrierIP1)
3298 return BarrierIP1.takeError();
3299 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3300 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3301 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3302
3303 // if (lane_id == 0)
3304 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
3305 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3306 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3307
3308 // Reduce element = LocalReduceList[i]
3309 auto *RedListArrayTy =
3310 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3311 Type *IndexTy = Builder.getIndexTy(
3312 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3313 Value *ElemPtrPtr =
3314 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3315 {ConstantInt::get(IndexTy, 0),
3316 ConstantInt::get(IndexTy, En.index())});
3317 // elemptr = ((CopyType*)(elemptrptr)) + I
3318 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3319
  // By-ref elements: chase the descriptor to the actual data pointer.
3320 if (IsByRefElem) {
3321 InsertPointOrErrorTy GenRes =
3322 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3323
3324 if (!GenRes)
3325 return GenRes.takeError();
3326
3327 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3328 }
3329
3330 if (NumIters > 1)
3331 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
3332
3333 // Get pointer to location in transfer medium.
3334 // MediumPtr = &medium[warp_id]
3335 Value *MediumPtr = Builder.CreateInBoundsGEP(
3336 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
3337 // elem = *elemptr
3338 //*MediumPtr = elem
3339 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
3340 // Store the source element value to the dest element address.
3341 Builder.CreateStore(Elem, MediumPtr,
3342 /*IsVolatile*/ true);
3343 Builder.CreateBr(MergeBB);
3344
3345 // else
3346 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3347 Builder.CreateBr(MergeBB);
3348
3349 // endif
3350 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3351 InsertPointOrErrorTy BarrierIP2 =
3352 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3353 omp::Directive::OMPD_unknown,
3354 /* ForceSimpleCall */ false,
3355 /* CheckCancelFlag */ true);
3356 if (!BarrierIP2)
3357 return BarrierIP2.takeError();
3358
3359 // Warp 0 copies reduce element from transfer medium
3360 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
3361 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
3362 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
3363
3364 Value *NumWarpsVal =
3365 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
3366 // Up to 32 threads in warp 0 are active.
3367 Value *IsActiveThread =
3368 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
3369 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3370
3371 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
3372
3373 // SecMediumPtr = &medium[tid]
3374 // SrcMediumVal = *SrcMediumPtr
3375 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
3376 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
3377 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
3378 Value *TargetElemPtrPtr =
3379 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3380 {ConstantInt::get(IndexTy, 0),
3381 ConstantInt::get(IndexTy, En.index())});
3382 Value *TargetElemPtrVal =
3383 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
3384 Value *TargetElemPtr = TargetElemPtrVal;
3385
3386 if (IsByRefElem) {
3387 InsertPointOrErrorTy GenRes =
3388 RI.DataPtrPtrGen(Builder.saveIP(), TargetElemPtr, TargetElemPtr);
3389
3390 if (!GenRes)
3391 return GenRes.takeError();
3392
3393 TargetElemPtr = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtr);
3394 }
3395
3396 if (NumIters > 1)
3397 TargetElemPtr =
3398 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
3399
3400 // *TargetElemPtr = SrcMediumVal;
3401 Value *SrcMediumValue =
3402 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
3403 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
3404 Builder.CreateBr(W0MergeBB);
3405
3406 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
3407 Builder.CreateBr(W0MergeBB);
3408
3409 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
3410
  // Close the counter loop: increment, store, branch back, emit exit.
3411 if (NumIters > 1) {
3412 Cnt = Builder.CreateNSWAdd(
3413 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
3414 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
3415
3416 auto *CurFn = Builder.GetInsertBlock()->getParent();
3417 emitBranch(PrecondBB);
3418 emitBlock(ExitBB, CurFn);
3419 }
3420 RealTySize %= TySize;
3421 }
3422 }
3423
3424 Builder.CreateRetVoid();
3425 Builder.restoreIP(SavedIP);
3426
3427 return WcFunc;
3428}
3429
3430Expected<Function *> OpenMPIRBuilder::emitShuffleAndReduceFunction(
3431 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3432 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3433 LLVMContext &Ctx = M.getContext();
3434 FunctionType *FuncTy =
3435 FunctionType::get(Builder.getVoidTy(),
3436 {Builder.getPtrTy(), Builder.getInt16Ty(),
3437 Builder.getInt16Ty(), Builder.getInt16Ty()},
3438 /* IsVarArg */ false);
3439 Function *SarFunc =
3441 "_omp_reduction_shuffle_and_reduce_func", &M);
3442 SarFunc->setAttributes(FuncAttrs);
3443 SarFunc->addParamAttr(0, Attribute::NoUndef);
3444 SarFunc->addParamAttr(1, Attribute::NoUndef);
3445 SarFunc->addParamAttr(2, Attribute::NoUndef);
3446 SarFunc->addParamAttr(3, Attribute::NoUndef);
3447 SarFunc->addParamAttr(1, Attribute::SExt);
3448 SarFunc->addParamAttr(2, Attribute::SExt);
3449 SarFunc->addParamAttr(3, Attribute::SExt);
3450 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
3451 Builder.SetInsertPoint(EntryBB);
3452
3453 // Thread local Reduce list used to host the values of data to be reduced.
3454 Argument *ReduceListArg = SarFunc->getArg(0);
3455 // Current lane id; could be logical.
3456 Argument *LaneIDArg = SarFunc->getArg(1);
3457 // Offset of the remote source lane relative to the current lane.
3458 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
3459 // Algorithm version. This is expected to be known at compile time.
3460 Argument *AlgoVerArg = SarFunc->getArg(3);
3461
3462 Type *ReduceListArgType = ReduceListArg->getType();
3463 Type *LaneIDArgType = LaneIDArg->getType();
3464 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
3465 Value *ReduceListAlloca = Builder.CreateAlloca(
3466 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
3467 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3468 LaneIDArg->getName() + ".addr");
3469 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
3470 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
3471 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3472 AlgoVerArg->getName() + ".addr");
3473 ArrayType *RedListArrayTy =
3474 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3475
3476 // Create a local thread-private variable to host the Reduce list
3477 // from a remote lane.
3478 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
3479 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
3480
3481 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3482 ReduceListAlloca, ReduceListArgType,
3483 ReduceListAlloca->getName() + ".ascast");
3484 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3485 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
3486 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3487 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
3488 RemoteLaneOffsetAlloca->getName() + ".ascast");
3489 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3490 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
3491 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3492 RemoteReductionListAlloca, Builder.getPtrTy(),
3493 RemoteReductionListAlloca->getName() + ".ascast");
3494
3495 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3496 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
3497 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
3498 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
3499
3500 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
3501 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
3502 Value *RemoteLaneOffset =
3503 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
3504 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
3505
3506 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
3507
3508 // This loop iterates through the list of reduce elements and copies,
3509 // element by element, from a remote lane in the warp to RemoteReduceList,
3510 // hosted on the thread's stack.
3511 Error EmitRedLsCpRes = emitReductionListCopy(
3512 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
3513 ReduceList, RemoteListAddrCast, IsByRef,
3514 {RemoteLaneOffset, nullptr, nullptr});
3515
3516 if (EmitRedLsCpRes)
3517 return EmitRedLsCpRes;
3518
3519 // The actions to be performed on the Remote Reduce list is dependent
3520 // on the algorithm version.
3521 //
3522 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3523 // LaneId % 2 == 0 && Offset > 0):
3524 // do the reduction value aggregation
3525 //
3526 // The thread local variable Reduce list is mutated in place to host the
3527 // reduced data, which is the aggregated value produced from local and
3528 // remote lanes.
3529 //
3530 // Note that AlgoVer is expected to be a constant integer known at compile
3531 // time.
3532 // When AlgoVer==0, the first conjunction evaluates to true, making
3533 // the entire predicate true during compile time.
3534 // When AlgoVer==1, the second conjunction has only the second part to be
3535 // evaluated during runtime. Other conjunctions evaluates to false
3536 // during compile time.
3537 // When AlgoVer==2, the third conjunction has only the second part to be
3538 // evaluated during runtime. Other conjunctions evaluates to false
3539 // during compile time.
3540 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
3541 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3542 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
3543 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
3544 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
3545 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
3546 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
3547 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
3548 Value *RemoteOffsetComp =
3549 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
3550 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
3551 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3552 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3553
3554 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3555 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3556 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3557
3558 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3559 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3560 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3561 ReduceList, Builder.getPtrTy());
3562 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3563 RemoteListAddrCast, Builder.getPtrTy());
3564 createRuntimeFunctionCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3565 ->addFnAttr(Attribute::NoUnwind);
3566 Builder.CreateBr(MergeBB);
3567
3568 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3569 Builder.CreateBr(MergeBB);
3570
3571 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3572
3573 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3574 // Reduce list.
3575 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3576 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3577 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3578
3579 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3580 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3581 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3582 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3583
3584 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3585
3586 EmitRedLsCpRes = emitReductionListCopy(
3587 AllocaIP, CopyAction::ThreadCopy, RedListArrayTy, ReductionInfos,
3588 RemoteListAddrCast, ReduceList, IsByRef);
3589
3590 if (EmitRedLsCpRes)
3591 return EmitRedLsCpRes;
3592
3593 Builder.CreateBr(CpyMergeBB);
3594
3595 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3596 Builder.CreateBr(CpyMergeBB);
3597
3598 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3599
3600 Builder.CreateRetVoid();
3601
3602 return SarFunc;
3603}
3604
3606OpenMPIRBuilder::generateReductionDescriptor(
3607 Value *DescriptorAddr, Value *DataPtr, Value *SrcDescriptorAddr,
3608 Type *DescriptorType,
3609 function_ref<InsertPointOrErrorTy(InsertPointTy, Value *, Value *&)>
3610 DataPtrPtrGen) {
3611
3612 // Copy the source descriptor to preserve all metadata (rank, extents,
3613 // strides, etc.)
3614 Value *DescriptorSize =
3615 Builder.getInt64(M.getDataLayout().getTypeStoreSize(DescriptorType));
3616 Builder.CreateMemCpy(
3617 DescriptorAddr, M.getDataLayout().getPrefTypeAlign(DescriptorType),
3618 SrcDescriptorAddr, M.getDataLayout().getPrefTypeAlign(DescriptorType),
3619 DescriptorSize);
3620
3621 // Update the base pointer field to point to the local shuffled data
3622 Value *DataPtrField;
3623 InsertPointOrErrorTy GenResult =
3624 DataPtrPtrGen(Builder.saveIP(), DescriptorAddr, DataPtrField);
3625
3626 if (!GenResult)
3627 return GenResult.takeError();
3628
3629 Builder.CreateStore(Builder.CreatePointerBitCastOrAddrSpaceCast(
3630 DataPtr, Builder.getPtrTy(), ".ascast"),
3631 DataPtrField);
3632
3633 return Builder.saveIP();
3634}
3635
3636Expected<Function *> OpenMPIRBuilder::emitListToGlobalCopyFunction(
3637 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3638 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3639 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3640 LLVMContext &Ctx = M.getContext();
3641 FunctionType *FuncTy = FunctionType::get(
3642 Builder.getVoidTy(),
3643 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3644 /* IsVarArg */ false);
3645 Function *LtGCFunc =
3647 "_omp_reduction_list_to_global_copy_func", &M);
3648 LtGCFunc->setAttributes(FuncAttrs);
3649 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3650 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3651 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3652
3653 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3654 Builder.SetInsertPoint(EntryBlock);
3655
3656 // Buffer: global reduction buffer.
3657 Argument *BufferArg = LtGCFunc->getArg(0);
3658 // Idx: index of the buffer.
3659 Argument *IdxArg = LtGCFunc->getArg(1);
3660 // ReduceList: thread local Reduce list.
3661 Argument *ReduceListArg = LtGCFunc->getArg(2);
3662
3663 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3664 BufferArg->getName() + ".addr");
3665 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3666 IdxArg->getName() + ".addr");
3667 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3668 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3669 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3670 BufferArgAlloca, Builder.getPtrTy(),
3671 BufferArgAlloca->getName() + ".ascast");
3672 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3673 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3674 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3675 ReduceListArgAlloca, Builder.getPtrTy(),
3676 ReduceListArgAlloca->getName() + ".ascast");
3677
3678 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3679 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3680 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3681
3682 Value *LocalReduceList =
3683 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3684 Value *BufferArgVal =
3685 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3686 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3687 Type *IndexTy = Builder.getIndexTy(
3688 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3689 for (auto En : enumerate(ReductionInfos)) {
3690 const ReductionInfo &RI = En.value();
3691 auto *RedListArrayTy =
3692 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3693 // Reduce element = LocalReduceList[i]
3694 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3695 RedListArrayTy, LocalReduceList,
3696 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3697 // elemptr = ((CopyType*)(elemptrptr)) + I
3698 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3699
3700 // Global = Buffer.VD[Idx];
3701 Value *BufferVD =
3702 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3703 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3704 ReductionsBufferTy, BufferVD, 0, En.index());
3705
3706 switch (RI.EvaluationKind) {
3707 case EvalKind::Scalar: {
3708 Value *TargetElement;
3709
3710 if (IsByRef.empty() || !IsByRef[En.index()]) {
3711 TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3712 } else {
3713 InsertPointOrErrorTy GenResult =
3714 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3715
3716 if (!GenResult)
3717 return GenResult.takeError();
3718
3719 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3720 TargetElement = Builder.CreateLoad(RI.ByRefElementType, ElemPtr);
3721 }
3722
3723 Builder.CreateStore(TargetElement, GlobVal);
3724 break;
3725 }
3726 case EvalKind::Complex: {
3727 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3728 RI.ElementType, ElemPtr, 0, 0, ".realp");
3729 Value *SrcReal = Builder.CreateLoad(
3730 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3731 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3732 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3733 Value *SrcImg = Builder.CreateLoad(
3734 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3735
3736 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3737 RI.ElementType, GlobVal, 0, 0, ".realp");
3738 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3739 RI.ElementType, GlobVal, 0, 1, ".imagp");
3740 Builder.CreateStore(SrcReal, DestRealPtr);
3741 Builder.CreateStore(SrcImg, DestImgPtr);
3742 break;
3743 }
3744 case EvalKind::Aggregate: {
3745 Value *SizeVal =
3746 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3747 Builder.CreateMemCpy(
3748 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3749 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3750 break;
3751 }
3752 }
3753 }
3754
3755 Builder.CreateRetVoid();
3756 Builder.restoreIP(OldIP);
3757 return LtGCFunc;
3758}
3759
3760Expected<Function *> OpenMPIRBuilder::emitListToGlobalReduceFunction(
3761 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3762 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3763 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3764 LLVMContext &Ctx = M.getContext();
3765 FunctionType *FuncTy = FunctionType::get(
3766 Builder.getVoidTy(),
3767 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3768 /* IsVarArg */ false);
3769 Function *LtGRFunc =
3771 "_omp_reduction_list_to_global_reduce_func", &M);
3772 LtGRFunc->setAttributes(FuncAttrs);
3773 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3774 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3775 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3776
3777 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3778 Builder.SetInsertPoint(EntryBlock);
3779
3780 // Buffer: global reduction buffer.
3781 Argument *BufferArg = LtGRFunc->getArg(0);
3782 // Idx: index of the buffer.
3783 Argument *IdxArg = LtGRFunc->getArg(1);
3784 // ReduceList: thread local Reduce list.
3785 Argument *ReduceListArg = LtGRFunc->getArg(2);
3786
3787 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3788 BufferArg->getName() + ".addr");
3789 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3790 IdxArg->getName() + ".addr");
3791 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3792 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3793 auto *RedListArrayTy =
3794 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3795
3796 // 1. Build a list of reduction variables.
3797 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3798 Value *LocalReduceList =
3799 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3800
3801 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
3802
3803 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3804 BufferArgAlloca, Builder.getPtrTy(),
3805 BufferArgAlloca->getName() + ".ascast");
3806 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3807 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3808 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3809 ReduceListArgAlloca, Builder.getPtrTy(),
3810 ReduceListArgAlloca->getName() + ".ascast");
3811 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3812 LocalReduceList, Builder.getPtrTy(),
3813 LocalReduceList->getName() + ".ascast");
3814
3815 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3816 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3817 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3818
3819 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3820 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3821 Type *IndexTy = Builder.getIndexTy(
3822 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3823 for (auto En : enumerate(ReductionInfos)) {
3824 const ReductionInfo &RI = En.value();
3825 Value *ByRefAlloc;
3826
3827 if (!IsByRef.empty() && IsByRef[En.index()]) {
3828 InsertPointTy OldIP = Builder.saveIP();
3829 Builder.restoreIP(AllocaIP);
3830
3831 ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
3832 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
3833 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
3834
3835 Builder.restoreIP(OldIP);
3836 }
3837
3838 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3839 RedListArrayTy, LocalReduceListAddrCast,
3840 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3841 Value *BufferVD =
3842 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3843 // Global = Buffer.VD[Idx];
3844 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3845 ReductionsBufferTy, BufferVD, 0, En.index());
3846
3847 if (!IsByRef.empty() && IsByRef[En.index()]) {
3848 // Get source descriptor from the reduce list argument
3849 Value *ReduceList =
3850 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3851 Value *SrcElementPtrPtr =
3852 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3853 {ConstantInt::get(IndexTy, 0),
3854 ConstantInt::get(IndexTy, En.index())});
3855 Value *SrcDescriptorAddr =
3856 Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrPtr);
3857
3858 // Copy descriptor from source and update base_ptr to global buffer data
3859 InsertPointOrErrorTy GenResult =
3860 generateReductionDescriptor(ByRefAlloc, GlobValPtr, SrcDescriptorAddr,
3861 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
3862
3863 if (!GenResult)
3864 return GenResult.takeError();
3865
3866 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
3867 } else {
3868 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3869 }
3870 }
3871
3872 // Call reduce_function(GlobalReduceList, ReduceList)
3873 Value *ReduceList =
3874 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3875 createRuntimeFunctionCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3876 ->addFnAttr(Attribute::NoUnwind);
3877 Builder.CreateRetVoid();
3878 Builder.restoreIP(OldIP);
3879 return LtGRFunc;
3880}
3881
3882Expected<Function *> OpenMPIRBuilder::emitGlobalToListCopyFunction(
3883 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3884 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3885 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3886 LLVMContext &Ctx = M.getContext();
3887 FunctionType *FuncTy = FunctionType::get(
3888 Builder.getVoidTy(),
3889 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3890 /* IsVarArg */ false);
3891 Function *GtLCFunc =
3893 "_omp_reduction_global_to_list_copy_func", &M);
3894 GtLCFunc->setAttributes(FuncAttrs);
3895 GtLCFunc->addParamAttr(0, Attribute::NoUndef);
3896 GtLCFunc->addParamAttr(1, Attribute::NoUndef);
3897 GtLCFunc->addParamAttr(2, Attribute::NoUndef);
3898
3899 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLCFunc);
3900 Builder.SetInsertPoint(EntryBlock);
3901
3902 // Buffer: global reduction buffer.
3903 Argument *BufferArg = GtLCFunc->getArg(0);
3904 // Idx: index of the buffer.
3905 Argument *IdxArg = GtLCFunc->getArg(1);
3906 // ReduceList: thread local Reduce list.
3907 Argument *ReduceListArg = GtLCFunc->getArg(2);
3908
3909 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3910 BufferArg->getName() + ".addr");
3911 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3912 IdxArg->getName() + ".addr");
3913 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3914 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3915 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3916 BufferArgAlloca, Builder.getPtrTy(),
3917 BufferArgAlloca->getName() + ".ascast");
3918 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3919 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3920 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3921 ReduceListArgAlloca, Builder.getPtrTy(),
3922 ReduceListArgAlloca->getName() + ".ascast");
3923 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3924 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3925 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3926
3927 Value *LocalReduceList =
3928 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3929 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3930 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3931 Type *IndexTy = Builder.getIndexTy(
3932 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3933 for (auto En : enumerate(ReductionInfos)) {
3934 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3935 auto *RedListArrayTy =
3936 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3937 // Reduce element = LocalReduceList[i]
3938 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3939 RedListArrayTy, LocalReduceList,
3940 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3941 // elemptr = ((CopyType*)(elemptrptr)) + I
3942 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3943 // Global = Buffer.VD[Idx];
3944 Value *BufferVD =
3945 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3946 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3947 ReductionsBufferTy, BufferVD, 0, En.index());
3948
3949 switch (RI.EvaluationKind) {
3950 case EvalKind::Scalar: {
3951 Type *ElemType = RI.ElementType;
3952
3953 if (!IsByRef.empty() && IsByRef[En.index()]) {
3954 ElemType = RI.ByRefElementType;
3955 InsertPointOrErrorTy GenResult =
3956 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3957
3958 if (!GenResult)
3959 return GenResult.takeError();
3960
3961 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3962 }
3963
3964 Value *TargetElement = Builder.CreateLoad(ElemType, GlobValPtr);
3965 Builder.CreateStore(TargetElement, ElemPtr);
3966 break;
3967 }
3968 case EvalKind::Complex: {
3969 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3970 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3971 Value *SrcReal = Builder.CreateLoad(
3972 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3973 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3974 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3975 Value *SrcImg = Builder.CreateLoad(
3976 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3977
3978 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3979 RI.ElementType, ElemPtr, 0, 0, ".realp");
3980 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3981 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3982 Builder.CreateStore(SrcReal, DestRealPtr);
3983 Builder.CreateStore(SrcImg, DestImgPtr);
3984 break;
3985 }
3986 case EvalKind::Aggregate: {
3987 Value *SizeVal =
3988 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3989 Builder.CreateMemCpy(
3990 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3991 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3992 SizeVal, false);
3993 break;
3994 }
3995 }
3996 }
3997
3998 Builder.CreateRetVoid();
3999 Builder.restoreIP(OldIP);
4000 return GtLCFunc;
4001}
4002
4003Expected<Function *> OpenMPIRBuilder::emitGlobalToListReduceFunction(
4004 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
4005 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
4006 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
4007 LLVMContext &Ctx = M.getContext();
4008 auto *FuncTy = FunctionType::get(
4009 Builder.getVoidTy(),
4010 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
4011 /* IsVarArg */ false);
4012 Function *GtLRFunc =
4014 "_omp_reduction_global_to_list_reduce_func", &M);
4015 GtLRFunc->setAttributes(FuncAttrs);
4016 GtLRFunc->addParamAttr(0, Attribute::NoUndef);
4017 GtLRFunc->addParamAttr(1, Attribute::NoUndef);
4018 GtLRFunc->addParamAttr(2, Attribute::NoUndef);
4019
4020 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLRFunc);
4021 Builder.SetInsertPoint(EntryBlock);
4022
4023 // Buffer: global reduction buffer.
4024 Argument *BufferArg = GtLRFunc->getArg(0);
4025 // Idx: index of the buffer.
4026 Argument *IdxArg = GtLRFunc->getArg(1);
4027 // ReduceList: thread local Reduce list.
4028 Argument *ReduceListArg = GtLRFunc->getArg(2);
4029
4030 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
4031 BufferArg->getName() + ".addr");
4032 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
4033 IdxArg->getName() + ".addr");
4034 Value *ReduceListArgAlloca = Builder.CreateAlloca(
4035 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
4036 ArrayType *RedListArrayTy =
4037 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4038
4039 // 1. Build a list of reduction variables.
4040 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4041 Value *LocalReduceList =
4042 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
4043
4044 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
4045
4046 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4047 BufferArgAlloca, Builder.getPtrTy(),
4048 BufferArgAlloca->getName() + ".ascast");
4049 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4050 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
4051 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4052 ReduceListArgAlloca, Builder.getPtrTy(),
4053 ReduceListArgAlloca->getName() + ".ascast");
4054 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
4055 LocalReduceList, Builder.getPtrTy(),
4056 LocalReduceList->getName() + ".ascast");
4057
4058 Builder.CreateStore(BufferArg, BufferArgAddrCast);
4059 Builder.CreateStore(IdxArg, IdxArgAddrCast);
4060 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
4061
4062 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
4063 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
4064 Type *IndexTy = Builder.getIndexTy(
4065 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4066 for (auto En : enumerate(ReductionInfos)) {
4067 const ReductionInfo &RI = En.value();
4068 Value *ByRefAlloc;
4069
4070 if (!IsByRef.empty() && IsByRef[En.index()]) {
4071 InsertPointTy OldIP = Builder.saveIP();
4072 Builder.restoreIP(AllocaIP);
4073
4074 ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
4075 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
4076 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
4077
4078 Builder.restoreIP(OldIP);
4079 }
4080
4081 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
4082 RedListArrayTy, ReductionList,
4083 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4084 // Global = Buffer.VD[Idx];
4085 Value *BufferVD =
4086 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
4087 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
4088 ReductionsBufferTy, BufferVD, 0, En.index());
4089
4090 if (!IsByRef.empty() && IsByRef[En.index()]) {
4091 // Get source descriptor from the reduce list
4092 Value *ReduceListVal =
4093 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4094 Value *SrcElementPtrPtr =
4095 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceListVal,
4096 {ConstantInt::get(IndexTy, 0),
4097 ConstantInt::get(IndexTy, En.index())});
4098 Value *SrcDescriptorAddr =
4099 Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrPtr);
4100
4101 // Copy descriptor from source and update base_ptr to global buffer data
4102 InsertPointOrErrorTy GenResult =
4103 generateReductionDescriptor(ByRefAlloc, GlobValPtr, SrcDescriptorAddr,
4104 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
4105 if (!GenResult)
4106 return GenResult.takeError();
4107
4108 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
4109 } else {
4110 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
4111 }
4112 }
4113
4114 // Call reduce_function(ReduceList, GlobalReduceList)
4115 Value *ReduceList =
4116 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4117 createRuntimeFunctionCall(ReduceFn, {ReduceList, ReductionList})
4118 ->addFnAttr(Attribute::NoUnwind);
4119 Builder.CreateRetVoid();
4120 Builder.restoreIP(OldIP);
4121 return GtLRFunc;
4122}
4123
4124std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
4125 std::string Suffix =
4126 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
4127 return (Name + Suffix).str();
4128}
4129
// Create the combiner function used for device reductions. The emitted
// function has the signature
//   void <ReducerName>.omp.reduction.func(ptr LHSArray, ptr RHSArray)
// where both arguments point to arrays of type-erased element pointers, one
// slot per entry in \p ReductionInfos. Each LHS/RHS pair is combined with the
// per-entry reduction generator callback. Returns the new function, or an
// error propagated from a generator callback.
4130Expected<Function *> OpenMPIRBuilder::createReductionFunction(
4131 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
4133 AttributeList FuncAttrs) {
4134 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
4135 {Builder.getPtrTy(), Builder.getPtrTy()},
4136 /* IsVarArg */ false);
4137 std::string Name = getReductionFuncName(ReducerName);
4138 Function *ReductionFunc =
4140 ReductionFunc->setAttributes(FuncAttrs);
  // Both array pointers are required to be well-defined on entry.
4141 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
4142 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
4143 BasicBlock *EntryBB =
4144 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
4145 Builder.SetInsertPoint(EntryBB);
4146
4147 // Need to alloca memory here and deal with the pointers before getting
4148 // LHS/RHS pointers out
4149 Value *LHSArrayPtr = nullptr;
4150 Value *RHSArrayPtr = nullptr;
4151 Argument *Arg0 = ReductionFunc->getArg(0);
4152 Argument *Arg1 = ReductionFunc->getArg(1);
4153 Type *Arg0Type = Arg0->getType();
4154 Type *Arg1Type = Arg1->getType();
4155
  // Spill the incoming array pointers to allocas and reload them through an
  // address-space cast so the GEPs below operate on compatible pointers.
4156 Value *LHSAlloca =
4157 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
4158 Value *RHSAlloca =
4159 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
4160 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4161 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
4162 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4163 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
4164 Builder.CreateStore(Arg0, LHSAddrCast);
4165 Builder.CreateStore(Arg1, RHSAddrCast);
4166 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
4167 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
4168
4169 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4170 Type *IndexTy = Builder.getIndexTy(
4171 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4172 SmallVector<Value *> LHSPtrs, RHSPtrs;
  // For each reduction entry, pull the i-th element pointer out of the RHS
  // and LHS arrays and cast it to the pointer type the entry expects.
4173 for (auto En : enumerate(ReductionInfos)) {
4174 const ReductionInfo &RI = En.value();
4175 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
4176 RedArrayTy, RHSArrayPtr,
4177 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4178 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
4179 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4180 RHSI8Ptr, RI.PrivateVariable->getType(),
4181 RHSI8Ptr->getName() + ".ascast");
4182
4183 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
4184 RedArrayTy, LHSArrayPtr,
4185 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4186 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
4187 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4188 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
4189
    // Clang-callback path: the combiner is emitted later (see the
    // ReductionGenClang loop below); just remember the element pointers so
    // the callback's placeholder values can be patched afterwards.
4191 LHSPtrs.emplace_back(LHSPtr);
4192 RHSPtrs.emplace_back(RHSPtr);
4193 } else {
    // Otherwise emit the combiner inline via the generic callback; by-value
    // reductions are handed the loaded elements rather than the pointers.
4194 Value *LHS = LHSPtr;
4195 Value *RHS = RHSPtr;
4196
4197 if (!IsByRef.empty() && !IsByRef[En.index()]) {
4198 LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
4199 RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
4200 }
4201
4202 Value *Reduced;
4203 InsertPointOrErrorTy AfterIP =
4204 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
4205 if (!AfterIP)
4206 return AfterIP.takeError();
      // A cleared insert point signals the callback bailed out; return the
      // (possibly incomplete) function as-is.
4207 if (!Builder.GetInsertBlock())
4208 return ReductionFunc;
4209
4210 Builder.restoreIP(*AfterIP);
4211
      // For by-ref entries the generator stores the result itself.
4212 if (!IsByRef.empty() && !IsByRef[En.index()])
4213 Builder.CreateStore(Reduced, LHSPtr);
4214 }
4215 }
4216
  // Clang-callback path: let each callback emit its combiner, then rewrite
  // the placeholder LHS/RHS values it used to the element pointers computed
  // above (only uses inside ReductionFunc are rewritten).
4218 for (auto En : enumerate(ReductionInfos)) {
4219 unsigned Index = En.index();
4220 const ReductionInfo &RI = En.value();
4221 Value *LHSFixupPtr, *RHSFixupPtr;
4222 Builder.restoreIP(RI.ReductionGenClang(
4223 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
4224
4225 // Fix the CallBack code generated to use the correct Values for the LHS
4226 // and RHS
4227 LHSFixupPtr->replaceUsesWithIf(
4228 LHSPtrs[Index], [ReductionFunc](const Use &U) {
4229 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4230 ReductionFunc;
4231 });
4232 RHSFixupPtr->replaceUsesWithIf(
4233 RHSPtrs[Index], [ReductionFunc](const Use &U) {
4234 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4235 ReductionFunc;
4236 });
4237 }
4238
4239 Builder.CreateRetVoid();
4240 // Compiling with `-O0`, `alloca`s emitted in non-entry blocks are not hoisted
4241 // to the entry block (this is done for higher opt levels by later passes in
4242 // the pipeline). This has caused issues because non-entry `alloca`s force the
4243 // function to use dynamic stack allocations and we might run out of scratch
4244 // memory.
4245 hoistNonEntryAllocasToEntryBlock(ReductionFunc);
4246
4247 return ReductionFunc;
4248}
4249
// Debug-build validation of reduction descriptors: every entry must name a
// variable, a private copy, and at least one reduction generator callback,
// and the variable must be a pointer. On the host the variable and its
// private copy must additionally have the same type. Compiles to nothing in
// NDEBUG builds (asserts only).
4250static void
4252 bool IsGPU) {
4253 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
    // Avoid -Wunused-variable in NDEBUG builds where the asserts vanish.
4254 (void)RI;
4255 assert(RI.Variable && "expected non-null variable");
4256 assert(RI.PrivateVariable && "expected non-null private variable");
4257 assert((RI.ReductionGen || RI.ReductionGenClang) &&
4258 "expected non-null reduction generator callback");
4259 if (!IsGPU) {
4260 assert(
4261 RI.Variable->getType() == RI.PrivateVariable->getType() &&
4262 "expected variables and their private equivalents to have the same "
4263 "type");
4264 }
4265 assert(RI.Variable->getType()->isPointerTy() &&
4266 "expected variables to be pointers");
4267 }
4268}
4269
// GPU lowering of OpenMP reductions: builds the type-erased list of private
// reduction values, emits the helper functions the device runtime needs
// (combiner, shuffle-and-reduce, inter-warp copy, and — for teams — the
// list<->global buffer copy/reduce helpers), calls the appropriate
// __kmpc_nvptx_*_reduce_nowait_v2 entry point, and on the winning thread
// folds the private values back into the original variables.
4271 const LocationDescription &Loc, InsertPointTy AllocaIP,
4272 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
4273 ArrayRef<bool> IsByRef, bool IsNoWait, bool IsTeamsReduction,
4274 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
4275 unsigned ReductionBufNum, Value *SrcLocInfo) {
4276 if (!updateToLocation(Loc))
4277 return InsertPointTy();
4278 Builder.restoreIP(CodeGenIP);
4279 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
4280 LLVMContext &Ctx = M.getContext();
4281
4282 // Source location for the ident struct
4283 if (!SrcLocInfo) {
4284 uint32_t SrcLocStrSize;
4285 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4286 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4287 }
4288
  // Nothing to reduce.
4289 if (ReductionInfos.size() == 0)
4290 return Builder.saveIP();
4291
4292 BasicBlock *ContinuationBlock = nullptr;
4294 // Copied code from createReductions
4295 BasicBlock *InsertBlock = Loc.IP.getBlock();
4296 ContinuationBlock =
4297 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
4298 InsertBlock->getTerminator()->eraseFromParent();
4299 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
4300 }
4301
  // Copy the enclosing function's attributes onto the helper functions,
  // dropping OptimizeNone.
4302 Function *CurFunc = Builder.GetInsertBlock()->getParent();
4303 AttributeList FuncAttrs;
4304 AttrBuilder AttrBldr(Ctx);
4305 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
4306 AttrBldr.addAttribute(Attr);
4307 AttrBldr.removeAttribute(Attribute::OptimizeNone);
4308 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
4309
  // Emit the element-wise combiner function; save/restore the insert point
  // around it since the helper moves the builder.
4310 CodeGenIP = Builder.saveIP();
4311 Expected<Function *> ReductionResult = createReductionFunction(
4312 Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, IsByRef,
4313 ReductionGenCBKind, FuncAttrs);
4314 if (!ReductionResult)
4315 return ReductionResult.takeError();
4316 Function *ReductionFunc = *ReductionResult;
4317 Builder.restoreIP(CodeGenIP);
4318
4319 // Set the grid value in the config needed for lowering later on
4320 if (GridValue.has_value())
4321 Config.setGridValue(GridValue.value());
4322 else
4323 Config.setGridValue(getGridValue(T, ReductionFunc));
4324
4325 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
4326 // RedList, shuffle_reduce_func, interwarp_copy_func);
4327 // or
4328 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
4329 Value *Res;
4330
4331 // 1. Build a list of reduction variables.
4332 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4333 auto Size = ReductionInfos.size();
4334 Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS());
4335 Type *FuncPtrTy =
4336 Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace());
4337 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
  // The list itself must live in the alloca block.
4338 CodeGenIP = Builder.saveIP();
4339 Builder.restoreIP(AllocaIP);
4340 Value *ReductionListAlloca =
4341 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
4342 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
4343 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
4344 Builder.restoreIP(CodeGenIP);
4345 Type *IndexTy = Builder.getIndexTy(
4346 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  // Store each private value's pointer into its slot of the list.
4347 for (auto En : enumerate(ReductionInfos)) {
4348 const ReductionInfo &RI = En.value();
4349 Value *ElemPtr = Builder.CreateInBoundsGEP(
4350 RedArrayTy, ReductionList,
4351 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4352
4353 Value *PrivateVar = RI.PrivateVariable;
4354 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
    // For by-ref entries the private variable holds a pointer to the data;
    // load it so the list contains the data pointer itself.
4355 if (IsByRefElem)
4356 PrivateVar = Builder.CreateLoad(RI.ElementType, PrivateVar);
4357
4358 Value *CastElem =
4359 Builder.CreatePointerBitCastOrAddrSpaceCast(PrivateVar, PtrTy);
4360 Builder.CreateStore(CastElem, ElemPtr);
4361 }
  // Emit the shuffle-and-reduce and inter-warp copy helpers the runtime
  // calls back into.
4362 CodeGenIP = Builder.saveIP();
4363 Expected<Function *> SarFunc = emitShuffleAndReduceFunction(
4364 ReductionInfos, ReductionFunc, FuncAttrs, IsByRef);
4365
4366 if (!SarFunc)
4367 return SarFunc.takeError();
4368
4369 Expected<Function *> CopyResult =
4370 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs, IsByRef);
4371 if (!CopyResult)
4372 return CopyResult.takeError();
4373 Function *WcFunc = *CopyResult;
4374 Builder.restoreIP(CodeGenIP);
4375
4376 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
4377
  // Compute the runtime's data-size argument (max element store size times
  // the number of entries) and collect the per-entry element types.
4378 unsigned MaxDataSize = 0;
4379 SmallVector<Type *> ReductionTypeArgs;
4380 for (auto En : enumerate(ReductionInfos)) {
4381 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
4382 if (Size > MaxDataSize)
4383 MaxDataSize = Size;
4384 Type *RedTypeArg = (!IsByRef.empty() && IsByRef[En.index()])
4385 ? En.value().ByRefElementType
4386 : En.value().ElementType;
4387 ReductionTypeArgs.emplace_back(RedTypeArg);
4388 }
4389 Value *ReductionDataSize =
4390 Builder.getInt64(MaxDataSize * ReductionInfos.size());
  // Parallel (non-teams) reduction: a single call into
  // __kmpc_nvptx_parallel_reduce_nowait_v2 with the two helper functions.
4391 if (!IsTeamsReduction) {
4392 Value *SarFuncCast =
4393 Builder.CreatePointerBitCastOrAddrSpaceCast(*SarFunc, FuncPtrTy);
4394 Value *WcFuncCast =
4395 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy);
4396 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
4397 WcFuncCast};
4399 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
4400 Res = createRuntimeFunctionCall(Pv2Ptr, Args);
4401 } else {
    // Teams reduction: additionally emit the four helpers that copy/reduce
    // between the reduction list and the runtime's global buffer, then call
    // __kmpc_nvptx_teams_reduce_nowait_v2.
4402 CodeGenIP = Builder.saveIP();
4403 StructType *ReductionsBufferTy = StructType::create(
4404 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
4405 Function *RedFixedBufferFn = getOrCreateRuntimeFunctionPtr(
4406 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
4407
4408 Expected<Function *> LtGCFunc = emitListToGlobalCopyFunction(
4409 ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
4410 if (!LtGCFunc)
4411 return LtGCFunc.takeError();
4412
4413 Expected<Function *> LtGRFunc = emitListToGlobalReduceFunction(
4414 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
4415 if (!LtGRFunc)
4416 return LtGRFunc.takeError();
4417
4418 Expected<Function *> GtLCFunc = emitGlobalToListCopyFunction(
4419 ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
4420 if (!GtLCFunc)
4421 return GtLCFunc.takeError();
4422
4423 Expected<Function *> GtLRFunc = emitGlobalToListReduceFunction(
4424 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
4425 if (!GtLRFunc)
4426 return GtLRFunc.takeError();
4427
4428 Builder.restoreIP(CodeGenIP);
4429
4430 Value *KernelTeamsReductionPtr = createRuntimeFunctionCall(
4431 RedFixedBufferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
4432
4433 Value *Args3[] = {SrcLocInfo,
4434 KernelTeamsReductionPtr,
4435 Builder.getInt32(ReductionBufNum),
4436 ReductionDataSize,
4437 RL,
4438 *SarFunc,
4439 WcFunc,
4440 *LtGCFunc,
4441 *LtGRFunc,
4442 *GtLCFunc,
4443 *GtLRFunc};
4444
4445 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
4446 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
4447 Res = createRuntimeFunctionCall(TeamsReduceFn, Args3);
4448 }
4449
4450 // 5. Build if (res == 1)
4451 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
4452 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
4453 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
4454 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
4455
4456 // 6. Build then branch: where we have reduced values in the master
4457 // thread in each team.
4458 // __kmpc_end_reduce{_nowait}(<gtid>);
4459 // break;
4460 emitBlock(ThenBB, CurFunc);
4461
4462 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
  // Fold each private value back into the original reduction variable.
4463 for (auto En : enumerate(ReductionInfos)) {
4464 const ReductionInfo &RI = En.value();
4466 Value *RedValue = RI.Variable;
4467 Value *RHS =
4468 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
4469
    // Clang-callback path: emit the combiner via the callback, then patch
    // its placeholder LHS/RHS values.
4471 Value *LHSPtr, *RHSPtr;
4472 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
4473 &LHSPtr, &RHSPtr, CurFunc));
4474
4475 // Fix the CallBack code generated to use the correct Values for the LHS
4476 // and RHS
4477 LHSPtr->replaceUsesWithIf(RedValue, [ReductionFunc](const Use &U) {
4478 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4479 ReductionFunc;
4480 });
4481 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
4482 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4483 ReductionFunc;
4484 });
4485 } else {
      // Generic path: load the values (by-value entries only), combine them
      // with the generator, and store the result for by-value entries.
4486 if (IsByRef.empty() || !IsByRef[En.index()]) {
4487 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
4488 "red.value." + Twine(En.index()));
4489 }
4490 Value *PrivateRedValue = Builder.CreateLoad(
4491 ValueType, RHS, "red.private.value" + Twine(En.index()));
4492 Value *Reduced;
4493 InsertPointOrErrorTy AfterIP =
4494 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4495 if (!AfterIP)
4496 return AfterIP.takeError();
4497 Builder.restoreIP(*AfterIP);
4498
4499 if (!IsByRef.empty() && !IsByRef[En.index()])
4500 Builder.CreateStore(Reduced, RI.Variable);
4501 }
4502 }
4503 emitBlock(ExitBB, CurFunc);
4504 if (ContinuationBlock) {
4505 Builder.CreateBr(ContinuationBlock);
4506 Builder.SetInsertPoint(ContinuationBlock);
4507 }
4508 Config.setEmitLLVMUsed();
4509
4510 return Builder.saveIP();
4511}
4512
  // Declare a fresh, empty function of type void(ptr, ptr) named
  // ".omp.reduction.func" in module M. Its body is filled in later (see
  // populateReductionFunction, which the host createReductions path calls).
4514 Type *VoidTy = Type::getVoidTy(M.getContext());
4515 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
4516 auto *FuncTy =
4517 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
4519 ".omp.reduction.func", &M);
4520}
4521
// Fill in the body of the outlined reduction function: for each reduction
// entry, load the i-th element pointer out of the function's LHS/RHS array
// arguments, load both element values, combine them with RI.ReductionGen,
// and store the result back to the LHS slot (by-value entries only; by-ref
// entries store inside the reduction region). On the GPU the incoming array
// pointers are first spilled/reloaded through address-space casts.
4523 Function *ReductionFunc,
4525 IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
4526 Module *Module = ReductionFunc->getParent();
4527 BasicBlock *ReductionFuncBlock =
4528 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
4529 Builder.SetInsertPoint(ReductionFuncBlock);
4530 Value *LHSArrayPtr = nullptr;
4531 Value *RHSArrayPtr = nullptr;
4532 if (IsGPU) {
4533 // Need to alloca memory here and deal with the pointers before getting
4534 // LHS/RHS pointers out
4535 //
4536 Argument *Arg0 = ReductionFunc->getArg(0);
4537 Argument *Arg1 = ReductionFunc->getArg(1);
4538 Type *Arg0Type = Arg0->getType();
4539 Type *Arg1Type = Arg1->getType();
4540
4541 Value *LHSAlloca =
4542 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
4543 Value *RHSAlloca =
4544 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
4545 Value *LHSAddrCast =
4546 Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
4547 Value *RHSAddrCast =
4548 Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
4549 Builder.CreateStore(Arg0, LHSAddrCast);
4550 Builder.CreateStore(Arg1, RHSAddrCast);
4551 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
4552 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
4553 } else {
    // Host: the arguments are usable directly as the array base pointers.
4554 LHSArrayPtr = ReductionFunc->getArg(0);
4555 RHSArrayPtr = ReductionFunc->getArg(1);
4556 }
4557
4558 unsigned NumReductions = ReductionInfos.size();
4559 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4560
4561 for (auto En : enumerate(ReductionInfos)) {
4562 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
    // Load the i-th type-erased pointer from each array and cast it to the
    // pointer type the entry expects before loading the element value.
4563 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4564 RedArrayTy, LHSArrayPtr, 0, En.index());
4565 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
4566 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4567 LHSI8Ptr, RI.Variable->getType());
4568 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
4569 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4570 RedArrayTy, RHSArrayPtr, 0, En.index());
4571 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
4572 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4573 RHSI8Ptr, RI.PrivateVariable->getType());
4574 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
4575 Value *Reduced;
4577 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
4578 if (!AfterIP)
4579 return AfterIP.takeError();
4580
4581 Builder.restoreIP(*AfterIP);
4582 // TODO: Consider flagging an error.
    // A cleared insert point signals the generator bailed out early.
4583 if (!Builder.GetInsertBlock())
4584 return Error::success();
4585
4586 // store is inside of the reduction region when using by-ref
4587 if (!IsByRef[En.index()])
4588 Builder.CreateStore(Reduced, LHSPtr);
4589 }
4590 Builder.CreateRetVoid();
4591 return Error::success();
4592}
4593
// Host lowering of OpenMP reductions (GPU configs are forwarded to
// createReductionsGPU): build a type-erased array of pointers to the private
// values, call __kmpc_reduce{_nowait}, and emit the non-atomic and atomic
// finalization paths selected by the runtime's return value (1 = non-atomic,
// 2 = atomic, anything else falls through to the continuation block).
4595 const LocationDescription &Loc, InsertPointTy AllocaIP,
4596 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
4597 bool IsNoWait, bool IsTeamsReduction) {
4598 assert(ReductionInfos.size() == IsByRef.size());
4599 if (Config.isGPU())
4600 return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
4601 IsByRef, IsNoWait, IsTeamsReduction);
4602
4603 checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
4604
4605 if (!updateToLocation(Loc))
4606 return InsertPointTy();
4607
  // Nothing to reduce.
4608 if (ReductionInfos.size() == 0)
4609 return Builder.saveIP();
4610
  // Split off the code following the insertion point into a continuation
  // block; all reduction paths branch there when done.
4611 BasicBlock *InsertBlock = Loc.IP.getBlock();
4612 BasicBlock *ContinuationBlock =
4613 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
4614 InsertBlock->getTerminator()->eraseFromParent();
4615
4616 // Create and populate array of type-erased pointers to private reduction
4617 // values.
4618 unsigned NumReductions = ReductionInfos.size();
4619 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4620 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
4621 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
4622
4623 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
4624
4625 for (auto En : enumerate(ReductionInfos)) {
4626 unsigned Index = En.index();
4627 const ReductionInfo &RI = En.value();
4628 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
4629 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
4630 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
4631 }
4632
4633 // Emit a call to the runtime function that orchestrates the reduction.
4634 // Declare the reduction function in the process.
4635 Type *IndexTy = Builder.getIndexTy(
4636 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4637 Function *Func = Builder.GetInsertBlock()->getParent();
4638 Module *Module = Func->getParent();
4639 uint32_t SrcLocStrSize;
4640 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  // The atomic path is only viable if every entry supplies an atomic
  // generator; advertise that to the runtime via the ident flags.
4641 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
4642 return RI.AtomicReductionGen;
4643 });
4644 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
4645 CanGenerateAtomic
4646 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
4647 : IdentFlag(0));
4648 Value *ThreadId = getOrCreateThreadID(Ident);
4649 Constant *NumVariables = Builder.getInt32(NumReductions);
4650 const DataLayout &DL = Module->getDataLayout();
4651 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
4652 Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
4653 Function *ReductionFunc = getFreshReductionFunc(*Module);
4654 Value *Lock = getOMPCriticalRegionLock(".reduction");
4656 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
4657 : RuntimeFunction::OMPRTL___kmpc_reduce);
4658 CallInst *ReduceCall =
4659 createRuntimeFunctionCall(ReduceFunc,
4660 {Ident, ThreadId, NumVariables, RedArraySize,
4661 RedArray, ReductionFunc, Lock},
4662 "reduce");
4663
4664 // Create final reduction entry blocks for the atomic and non-atomic case.
4665 // Emit IR that dispatches control flow to one of the blocks based on the
4666 // reduction supporting the atomic mode.
4667 BasicBlock *NonAtomicRedBlock =
4668 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
4669 BasicBlock *AtomicRedBlock =
4670 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
4671 SwitchInst *Switch =
4672 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
4673 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
4674 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
4675
4676 // Populate the non-atomic reduction using the elementwise reduction function.
4677 // This loads the elements from the global and private variables and reduces
4678 // them before storing back the result to the global variable.
4679 Builder.SetInsertPoint(NonAtomicRedBlock);
4680 for (auto En : enumerate(ReductionInfos)) {
4681 const ReductionInfo &RI = En.value();
4683 // We have one less load for by-ref case because that load is now inside of
4684 // the reduction region
4685 Value *RedValue = RI.Variable;
4686 if (!IsByRef[En.index()]) {
4687 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
4688 "red.value." + Twine(En.index()));
4689 }
4690 Value *PrivateRedValue =
4691 Builder.CreateLoad(ValueType, RI.PrivateVariable,
4692 "red.private.value." + Twine(En.index()));
4693 Value *Reduced;
4694 InsertPointOrErrorTy AfterIP =
4695 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4696 if (!AfterIP)
4697 return AfterIP.takeError();
4698 Builder.restoreIP(*AfterIP);
4699
    // A cleared insert point signals the generator bailed out early.
4700 if (!Builder.GetInsertBlock())
4701 return InsertPointTy();
4702 // for by-ref case, the load is inside of the reduction region
4703 if (!IsByRef[En.index()])
4704 Builder.CreateStore(Reduced, RI.Variable);
4705 }
4706 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
4707 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
4708 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
4709 createRuntimeFunctionCall(EndReduceFunc, {Ident, ThreadId, Lock});
4710 Builder.CreateBr(ContinuationBlock);
4711
4712 // Populate the atomic reduction using the atomic elementwise reduction
4713 // function. There are no loads/stores here because they will be happening
4714 // inside the atomic elementwise reduction.
4715 Builder.SetInsertPoint(AtomicRedBlock);
  // The atomic block is only reachable when every entry can be reduced
  // atomically and nothing is by-ref; otherwise it is unreachable.
4716 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
4717 for (const ReductionInfo &RI : ReductionInfos) {
4719 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
4720 if (!AfterIP)
4721 return AfterIP.takeError();
4722 Builder.restoreIP(*AfterIP);
4723 if (!Builder.GetInsertBlock())
4724 return InsertPointTy();
4725 }
4726 Builder.CreateBr(ContinuationBlock);
4727 } else {
4728 Builder.CreateUnreachable();
4729 }
4730
4731 // Populate the outlined reduction function using the elementwise reduction
4732 // function. Partial values are extracted from the type-erased array of
4733 // pointers to private variables.
4734 Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
4735 IsByRef, /*isGPU=*/false);
4736 if (Err)
4737 return Err;
4738
4739 if (!Builder.GetInsertBlock())
4740 return InsertPointTy();
4741
4742 Builder.SetInsertPoint(ContinuationBlock);
4743 return Builder.saveIP();
4744}
4745
// Emit an OpenMP "master" construct: the user body is wrapped between
// __kmpc_master / __kmpc_end_master calls as a conditional inlined region
// (only taken when the entry call returns non-zero) with finalization.
4748 BodyGenCallbackTy BodyGenCB,
4749 FinalizeCallbackTy FiniCB) {
4750 if (!updateToLocation(Loc))
4751 return Loc.IP;
4752
4753 Directive OMPD = Directive::OMPD_master;
  // Both runtime calls take the same (ident, global_tid) argument pair.
4754 uint32_t SrcLocStrSize;
4755 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4756 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4757 Value *ThreadId = getOrCreateThreadID(Ident);
4758 Value *Args[] = {Ident, ThreadId};
4759
4760 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
4761 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
4762
4763 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
4764 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
4765
4766 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4767 /*Conditional*/ true, /*hasFinalize*/ true);
4768}
4769
// Emit an OpenMP "masked" construct: like "master", but __kmpc_masked takes
// an extra \p Filter thread-id argument selecting which thread executes the
// body; the end call takes only (ident, global_tid).
4772 BodyGenCallbackTy BodyGenCB,
4773 FinalizeCallbackTy FiniCB, Value *Filter) {
4774 if (!updateToLocation(Loc))
4775 return Loc.IP;
4776
4777 Directive OMPD = Directive::OMPD_masked;
4778 uint32_t SrcLocStrSize;
4779 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4780 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4781 Value *ThreadId = getOrCreateThreadID(Ident);
  // Entry call gets the filter; the end call does not.
4782 Value *Args[] = {Ident, ThreadId, Filter};
4783 Value *ArgsEnd[] = {Ident, ThreadId};
4784
4785 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
4786 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
4787
4788 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
4789 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, ArgsEnd);
4790
4791 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4792 /*Conditional*/ true, /*hasFinalize*/ true);
4793}
4794
// Emit a call to \p Callee with no operand bundles and mark it nounwind
// (setDoesNotThrow). Returns the created call instruction.
4796 llvm::FunctionCallee Callee,
4798 const llvm::Twine &Name) {
4799 llvm::CallInst *Call = Builder.CreateCall(
4800 Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
4801 Call->setDoesNotThrow();
4802 return Call;
4803}
4804
4805// Expects the input basic block to be dominated by BeforeScanBB.
4806// Once the Scan directive is encountered, the code after the scan directive
4807// should be dominated by AfterScanBB. The Scan directive splits the code
4808// sequence into scan and input phases. Based on whether the inclusive or
4809// exclusive clause is used in the scan directive and whether the input loop
4810// or the scan loop is lowered, it adds jumps to the input and scan phases.
4811// The first Scan loop is the input loop and the second is the scan loop.
4812// The code generated handles only inclusive scans now.
4814 const LocationDescription &Loc, InsertPointTy AllocaIP,
4815 ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
4816 bool IsInclusive, ScanInfo *ScanRedInfo) {
  // On the first (input) loop, lazily emit the buffer declarations and their
  // master-thread allocation.
4817 if (ScanRedInfo->OMPFirstScanLoop) {
4818 llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
4819 ScanVarsType, ScanRedInfo);
4820 if (Err)
4821 return Err;
4822 }
4823 if (!updateToLocation(Loc))
4824 return Loc.IP;
4825
4826 llvm::Value *IV = ScanRedInfo->IV;
4827
4828 if (ScanRedInfo->OMPFirstScanLoop) {
4829 // Emit buffer[i] = red; at the end of the input phase.
4830 for (size_t i = 0; i < ScanVars.size(); i++) {
4831 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4832 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4833 Type *DestTy = ScanVarsType[i];
4834 Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4835 Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);
4836
4837 Builder.CreateStore(Src, Val);
4838 }
4839 }
4840 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4841 emitBlock(ScanRedInfo->OMPScanDispatch,
4842 Builder.GetInsertBlock()->getParent());
4843
4844 if (!ScanRedInfo->OMPFirstScanLoop) {
4845 IV = ScanRedInfo->IV;
4846 // Emit red = buffer[i]; at the entrance to the scan phase.
4847 // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
4848 for (size_t i = 0; i < ScanVars.size(); i++) {
4849 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4850 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4851 Type *DestTy = ScanVarsType[i];
4852 Value *SrcPtr =
4853 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4854 Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
4855 Builder.CreateStore(Src, ScanVars[i]);
4856 }
4857 }
4858
  // Branch to the before/after-scan block depending on which loop is being
  // lowered and whether the scan is inclusive. The condition is a constant
  // true, so only one successor is ever taken.
4859 // TODO: Update it to CreateBr and remove dead blocks
4860 llvm::Value *CmpI = Builder.getInt1(true);
4861 if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
4862 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
4863 ScanRedInfo->OMPAfterScanBlock);
4864 } else {
4865 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
4866 ScanRedInfo->OMPBeforeScanBlock);
4867 }
4868 emitBlock(ScanRedInfo->OMPAfterScanBlock,
4869 Builder.GetInsertBlock()->getParent());
4870 Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
4871 return Builder.saveIP();
4872}
4873
// Emit the declarations for a scan-based directive: one pointer alloca per
// scan variable (recorded in ScanRedInfo->ScanBuffPtrs), plus a masked
// region (filter thread 0) that mallocs a temporary buffer of Span+1
// elements per variable, followed by a barrier so all threads see the
// allocation.
4874Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
4875 InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
4876 ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
4877
4878 Builder.restoreIP(AllocaIP);
4879 // Create the shared pointer at alloca IP.
4880 for (size_t i = 0; i < ScanVars.size(); i++) {
4881 llvm::Value *BuffPtr =
4882 Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
4883 (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
4884 }
4885
4886 // Allocate temporary buffer by master thread
4887 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4888 InsertPointTy CodeGenIP) -> Error {
4889 Builder.restoreIP(CodeGenIP);
    // Span + 1 elements: the extra slot holds the final reduced value (see
    // emitScanBasedDirectiveFinalsIR, which reads buffer[Span]).
4890 Value *AllocSpan =
4891 Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
4892 for (size_t i = 0; i < ScanVars.size(); i++) {
4893 Type *IntPtrTy = Builder.getInt32Ty();
4894 Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
4895 Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
4896 Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
4897 AllocSpan, nullptr, "arr");
4898 Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
4899 }
4900 return Error::success();
4901 };
4902 // TODO: Perform finalization actions for variables. This has to be
4903 // called for variables which have destructors/finalizers.
4904 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4905
  // Run the allocation under a masked region restricted to thread 0.
4906 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
4907 llvm::Value *FilterVal = Builder.getInt32(0);
4909 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4910
4911 if (!AfterIP)
4912 return AfterIP.takeError();
4913 Builder.restoreIP(*AfterIP);
4914 BasicBlock *InputBB = Builder.GetInsertBlock();
4915 if (InputBB->getTerminator())
4916 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  // Barrier: all threads must wait for the buffer allocation.
4917 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4918 if (!AfterIP)
4919 return AfterIP.takeError();
4920 Builder.restoreIP(*AfterIP);
4921
4922 return Error::success();
4923}
4924
// Emit the finalization for a scan-based directive: inside a masked region
// (filter thread 0), copy each variable's final value (buffer[Span]) back
// into the original reduction variable and free the temporary buffer, then
// emit a barrier so all threads synchronize on the result.
4925Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
4926 ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
4927 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4928 InsertPointTy CodeGenIP) -> Error {
4929 Builder.restoreIP(CodeGenIP);
4930 for (ReductionInfo RedInfo : ReductionInfos) {
4931 Value *PrivateVar = RedInfo.PrivateVariable;
4932 Value *OrigVar = RedInfo.Variable;
4933 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
4934 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4935
      // buffer[Span] holds the final value (the buffer was allocated with
      // Span + 1 elements in emitScanBasedDirectiveDeclsIR).
4936 Type *SrcTy = RedInfo.ElementType;
4937 Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
4938 "arrayOffset");
4939 Value *Src = Builder.CreateLoad(SrcTy, Val);
4940
4941 Builder.CreateStore(Src, OrigVar);
4942 Builder.CreateFree(Buff);
4943 }
4944 return Error::success();
4945 };
4946 // TODO: Perform finalization actions for variables. This has to be
4947 // called for variables which have destructors/finalizers.
4948 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4949
4950 if (ScanRedInfo->OMPScanFinish->getTerminator())
4951 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish->getTerminator());
4952 else
4953 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
4954
  // Only thread 0 performs the copy-back and free.
4955 llvm::Value *FilterVal = Builder.getInt32(0);
4957 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4958
4959 if (!AfterIP)
4960 return AfterIP.takeError();
4961 Builder.restoreIP(*AfterIP);
4962 BasicBlock *InputBB = Builder.GetInsertBlock();
4963 if (InputBB->getTerminator())
4964 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  // Barrier: the other threads must wait until the copy-back is done.
4965 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4966 if (!AfterIP)
4967 return AfterIP.takeError();
4968 Builder.restoreIP(*AfterIP);
4969 return Error::success();
4970}
4971
4973 const LocationDescription &Loc,
4975 ScanInfo *ScanRedInfo) {
4976
4977 if (!updateToLocation(Loc))
4978 return Loc.IP;
// NOTE(review): the opening line of this definition (return type + name) is
// above this excerpt; judging by the body, it emits the log-time in-place
// combine ("up-sweep") over the temporary scan buffers inside a masked
// region, followed by a barrier and the scan finalization — confirm against
// the declaration.
// Masked-region body: for k in 0..ceil(log2(Span)), combine
// tmp[i] op= tmp[i - 2^k] for all i >= 2^k, for every reduction variable.
4979 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4980 InsertPointTy CodeGenIP) -> Error {
4981 Builder.restoreIP(CodeGenIP);
4982 Function *CurFn = Builder.GetInsertBlock()->getParent();
4983 // for (int k = 0; k <= ceil(log2(n)); ++k)
4984 llvm::BasicBlock *LoopBB =
4985 BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
4986 llvm::BasicBlock *ExitBB =
4987 splitBB(Builder, false, "omp.outer.log.scan.exit");
4989 Builder.GetInsertBlock()->getModule(),
4990 (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
4991 llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
// Compute the outer iteration count as ceil(log2(Span)) via the double
// precision log2/ceil intrinsics, then truncate back to i32.
4992 llvm::Value *Arg =
4993 Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
4994 llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
4996 Builder.GetInsertBlock()->getModule(),
4997 (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
4998 LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
4999 LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
5000 llvm::Value *NMin1 = Builder.CreateNUWSub(
5001 ScanRedInfo->Span,
5002 llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
5003 Builder.SetInsertPoint(InputBB);
5004 Builder.CreateBr(LoopBB);
5005 emitBlock(LoopBB, CurFn);
5006 Builder.SetInsertPoint(LoopBB);
5007
5008 PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5009 // size pow2k = 1;
5010 PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5011 Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
5012 InputBB);
5013 Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
5014 InputBB);
5015 // for (size i = n - 1; i >= 2 ^ k; --i)
5016 // tmp[i] op= tmp[i-pow2k];
5017 llvm::BasicBlock *InnerLoopBB =
5018 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
5019 llvm::BasicBlock *InnerExitBB =
5020 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
5021 llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
5022 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
5023 emitBlock(InnerLoopBB, CurFn);
5024 Builder.SetInsertPoint(InnerLoopBB);
5025 PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5026 IVal->addIncoming(NMin1, LoopBB);
// One combine step per reduction variable: load tmp[i] and tmp[i-pow2k],
// apply the user reduction, and store the result back into tmp[i].
5027 for (ReductionInfo RedInfo : ReductionInfos) {
5028 Value *ReductionVal = RedInfo.PrivateVariable;
5029 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
5030 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
5031 Type *DestTy = RedInfo.ElementType;
5032 Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
5033 Value *LHSPtr =
5034 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
5035 Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
5036 Value *RHSPtr =
5037 Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
5038 Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
5039 Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
5040 llvm::Value *Result;
5041 InsertPointOrErrorTy AfterIP =
5042 RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
5043 if (!AfterIP)
5044 return AfterIP.takeError();
5045 Builder.CreateStore(Result, LHSPtr);
5046 }
5047 llvm::Value *NextIVal = Builder.CreateNUWSub(
5048 IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
5049 IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
5050 CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
5051 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
5052 emitBlock(InnerExitBB, CurFn);
5053 llvm::Value *Next = Builder.CreateNUWAdd(
5054 Counter, llvm::ConstantInt::get(Counter->getType(), 1));
5055 Counter->addIncoming(Next, Builder.GetInsertBlock());
5056 // pow2k <<= 1;
5057 llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
5058 Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
5059 llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
5060 Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
5061 Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
5062 return Error::success();
5063 };
5064
5065 // TODO: Perform finalization actions for variables. This has to be
5066 // called for variables which have destructors/finalizers.
5067 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
5068
// Filter value 0: the masked region is executed by thread 0 only.
5069 llvm::Value *FilterVal = Builder.getInt32(0);
5071 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
5072
5073 if (!AfterIP)
5074 return AfterIP.takeError();
5075 Builder.restoreIP(*AfterIP);
// Synchronize all threads after the masked combine before any thread reads
// the scanned buffers.
5076 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
5077
5078 if (!AfterIP)
5079 return AfterIP.takeError();
5080 Builder.restoreIP(*AfterIP);
// Presumably copies the final reduced values back to the original variables
// and frees the temporary buffers (see emitScanBasedDirectiveFinalsIR).
5081 Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
5082 if (Err)
5083 return Err;
5084
5085 return AfterIP;
5086}
5087
5088Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
5089 llvm::function_ref<Error()> InputLoopGen,
5090 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
5091 ScanInfo *ScanRedInfo) {
5092
5093 {
5094 // Emit loop with input phase:
5095 // for (i: 0..<num_iters>) {
5096 // <input phase>;
5097 // buffer[i] = red;
5098 // }
5099 ScanRedInfo->OMPFirstScanLoop = true;
5100 Error Err = InputLoopGen();
5101 if (Err)
5102 return Err;
5103 }
5104 {
5105 // Emit loop with scan phase:
5106 // for (i: 0..<num_iters>) {
5107 // red = buffer[i];
5108 // <scan phase>;
5109 // }
5110 ScanRedInfo->OMPFirstScanLoop = false;
5111 Error Err = ScanLoopGen(Builder.saveIP());
5112 if (Err)
5113 return Err;
5114 }
5115 return Error::success();
5116}
5117
5118void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
5119 Function *Fun = Builder.GetInsertBlock()->getParent();
5120 ScanRedInfo->OMPScanDispatch =
5121 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
5122 ScanRedInfo->OMPAfterScanBlock =
5123 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
5124 ScanRedInfo->OMPBeforeScanBlock =
5125 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
5126 ScanRedInfo->OMPScanLoopExit =
5127 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
5128}
5130 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
5131 BasicBlock *PostInsertBefore, const Twine &Name) {
// NOTE(review): the first line of this definition is above this excerpt; the
// body builds the block skeleton of a canonical loop —
//   preheader -> header -> cond -> body -> inc -> (back to header); cond -> exit -> after
// — with an induction variable counting from 0 up to (exclusive) TripCount
// by 1, compared unsigned.
5132 Module *M = F->getParent();
5133 LLVMContext &Ctx = M->getContext();
5134 Type *IndVarTy = TripCount->getType();
5135
5136 // Create the basic block structure.
5137 BasicBlock *Preheader =
5138 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
5139 BasicBlock *Header =
5140 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
5141 BasicBlock *Cond =
5142 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
5143 BasicBlock *Body =
5144 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
5145 BasicBlock *Latch =
5146 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
5147 BasicBlock *Exit =
5148 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
5149 BasicBlock *After =
5150 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
5151
5152 // Use specified DebugLoc for new instructions.
5153 Builder.SetCurrentDebugLocation(DL);
5154
5155 Builder.SetInsertPoint(Preheader);
5156 Builder.CreateBr(Header);
5157
5158 Builder.SetInsertPoint(Header);
5159 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
5160 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
5161 Builder.CreateBr(Cond);
5162
5163 Builder.SetInsertPoint(Cond);
5164 Value *Cmp =
5165 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
5166 Builder.CreateCondBr(Cmp, Body, Exit);
5167
5168 Builder.SetInsertPoint(Body);
5169 Builder.CreateBr(Latch);
5170
5171 Builder.SetInsertPoint(Latch);
// NUW: the IV never wraps because the compare against TripCount (same
// unsigned type) exits the loop first.
5172 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
5173 "omp_" + Name + ".next", /*HasNUW=*/true);
5174 Builder.CreateBr(Header);
5175 IndVarPHI->addIncoming(Next, Latch);
5176
5177 Builder.SetInsertPoint(Exit);
5178 Builder.CreateBr(After);
5179
5180 // Remember and return the canonical control flow.
5181 LoopInfos.emplace_front();
5182 CanonicalLoopInfo *CL = &LoopInfos.front();
5183
// Only these four blocks are recorded; the others are presumably reachable
// via CanonicalLoopInfo's accessors — TODO confirm.
5184 CL->Header = Header;
5185 CL->Cond = Cond;
5186 CL->Latch = Latch;
5187 CL->Exit = Exit;
5188
5189#ifndef NDEBUG
5190 CL->assertOK();
5191#endif
5192 return CL;
5193}
5194
// NOTE(review): the first lines of this definition are above this excerpt;
// this is the TripCount-based createCanonicalLoop overload: it builds the
// loop skeleton next to the current block, optionally wires it into the
// CFG, and then invokes the body-generation callback.
5197 LoopBodyGenCallbackTy BodyGenCB,
5198 Value *TripCount, const Twine &Name) {
5199 BasicBlock *BB = Loc.IP.getBlock();
5200 BasicBlock *NextBB = BB->getNextNode();
5201
5202 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
5203 NextBB, NextBB, Name);
5204 BasicBlock *After = CL->getAfter();
5205
5206 // If location is not set, don't connect the loop.
5207 if (updateToLocation(Loc)) {
5208 // Split the loop at the insertion point: Branch to the preheader and move
5209 // every following instruction to after the loop (the After BB). Also, the
5210 // new successor is the loop's after block.
5211 spliceBB(Builder, After, /*CreateBranch=*/false);
5212 Builder.CreateBr(CL->getPreheader());
5213 }
5214
5215 // Emit the body content. We do it after connecting the loop to the CFG to
5216 // avoid that the callback encounters degenerate BBs.
5217 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
5218 return Err;
5219
5220#ifndef NDEBUG
5221 CL->assertOK();
5222#endif
5223 return CL;
5224}
5225
// NOTE(review): the signature line is above this excerpt; allocates a fresh
// ScanInfo owned by this builder. emplace_front() suggests a node-based
// container, so the returned pointer stays valid as more entries are
// added — TODO confirm the container type.
5227 ScanInfos.emplace_front();
5228 ScanInfo *Result = &ScanInfos.front();
5229 return Result;
5230}
5231
// NOTE(review): the first lines of this definition are above this excerpt;
// it creates the two canonical loops (input phase and scan phase) used to
// lower a worksharing loop carrying an inscan reduction.
5235 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
5236 InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
5237 LocationDescription ComputeLoc =
5238 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
5239 updateToLocation(ComputeLoc);
5240
5242
5244 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
5245 ScanRedInfo->Span = TripCount;
5246 ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
5247 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
5248
// Per-iteration body: route the iteration through the scan dispatch block
// and the before-scan block before handing control to the user callback.
5249 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
5250 Builder.restoreIP(CodeGenIP);
5251 ScanRedInfo->IV = IV;
5252 createScanBBs(ScanRedInfo);
5253 BasicBlock *InputBlock = Builder.GetInsertBlock();
5254 Instruction *Terminator = InputBlock->getTerminator();
5255 assert(Terminator->getNumSuccessors() == 1);
5256 BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
5257 Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
5258 emitBlock(ScanRedInfo->OMPBeforeScanBlock,
5259 Builder.GetInsertBlock()->getParent());
5260 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
5261 emitBlock(ScanRedInfo->OMPScanLoopExit,
5262 Builder.GetInsertBlock()->getParent());
5263 Builder.CreateBr(ContinueBlock);
5264 Builder.SetInsertPoint(
5265 ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
5266 return BodyGenCB(Builder.saveIP(), IV);
5267 };
5268
// First loop: runs the input phase (filling the scan buffers).
5269 const auto &&InputLoopGen = [&]() -> Error {
5271 Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
5272 ComputeIP, Name, true, ScanRedInfo);
5273 if (!LoopInfo)
5274 return LoopInfo.takeError();
5275 Result.push_back(*LoopInfo);
5276 Builder.restoreIP((*LoopInfo)->getAfterIP());
5277 return Error::success();
5278 };
// Second loop: runs the scan phase; remembers the block after the loop so
// finalization code can later be emitted there (OMPScanFinish).
5279 const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
5281 createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
5282 InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
5283 if (!LoopInfo)
5284 return LoopInfo.takeError();
5285 Result.push_back(*LoopInfo);
5286 Builder.restoreIP((*LoopInfo)->getAfterIP());
5287 ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
5288 return Error::success();
5289 };
5290 Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
5291 if (Err)
5292 return Err;
5293 return Result;
5294}
5295
// NOTE(review): the first line of this definition is above this excerpt;
// computes the canonical (unsigned) trip count of a loop running from
// Start to Stop by Step, handling negative steps and inclusive stops
// without overflowing.
5297 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
5298 bool IsSigned, bool InclusiveStop, const Twine &Name) {
5299
5300 // Consider the following difficulties (assuming 8-bit signed integers):
5301 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
5302 // DO I = 1, 100, 50
5303 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
5304 // DO I = 100, 0, -128
5305
5306 // Start, Stop and Step must be of the same integer type.
5307 auto *IndVarTy = cast<IntegerType>(Start->getType());
5308 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
5309 assert(IndVarTy == Step->getType() && "Step type mismatch");
5310
5312
5313 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
5314 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
5315
5316 // Like Step, but always positive.
5317 Value *Incr = Step;
5318
5319 // Distance between Start and Stop; always positive.
5320 Value *Span;
5321
5322 // Condition checking whether no iterations are executed at all, e.g. because
5323 // UB < LB.
5324 Value *ZeroCmp;
5325
5326 if (IsSigned) {
5327 // Ensure that increment is positive. If not, negate and invert LB and UB.
5328 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
5329 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
5330 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
5331 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
5332 Span = Builder.CreateSub(UB, LB, "", false, true);
5333 ZeroCmp = Builder.CreateICmp(
5334 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
5335 } else {
5336 Span = Builder.CreateSub(Stop, Start, "", true);
5337 ZeroCmp = Builder.CreateICmp(
5338 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
5339 }
5340
5341 Value *CountIfLooping;
5342 if (InclusiveStop) {
5343 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
5344 } else {
5345 // Avoid incrementing past stop since it could overflow.
5346 Value *CountIfTwo = Builder.CreateAdd(
5347 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
5348 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
5349 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
5350 }
5351
// Zero iterations is a special case the div/add arithmetic above does not
// cover; select it away here.
5352 return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
5353 "omp_" + Name + ".tripcount");
5354}
5355
// NOTE(review): the first lines of this definition are above this excerpt;
// this is the Start/Stop/Step overload of createCanonicalLoop: it computes
// the canonical trip count, then maps the canonical 0..TripCount IV back to
// the user's induction variable as Start + IV * Step in the body callback.
5358 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
5359 InsertPointTy ComputeIP, const Twine &Name, bool InScan,
5360 ScanInfo *ScanRedInfo) {
5361 LocationDescription ComputeLoc =
5362 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
5363
5365 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
5366
5367 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
5368 Builder.restoreIP(CodeGenIP);
5369 Value *Span = Builder.CreateMul(IV, Step);
5370 Value *IndVar = Builder.CreateAdd(Span, Start);
5371 if (InScan)
5372 ScanRedInfo->IV = IndVar;
5373 return BodyGenCB(Builder.saveIP(), IndVar);
5374 };
// If the trip count was emitted at a separate ComputeIP, the original Loc
// is still the right place for the loop itself; otherwise continue at the
// current insertion point.
5375 LocationDescription LoopLoc =
5376 ComputeIP.isSet()
5377 ? Loc
5378 : LocationDescription(Builder.saveIP(),
5379 Builder.getCurrentDebugLocation());
5380 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
5381}
5382
5383// Returns an LLVM function to call for initializing loop bounds using OpenMP
5384// static scheduling for composite `distribute parallel for` depending on
5385// `type`. Only i32 and i64 are supported by the runtime. Always interpret
5386// integers as unsigned similarly to CanonicalLoopInfo.
5387static FunctionCallee
5389 OpenMPIRBuilder &OMPBuilder) {
5390 unsigned Bitwidth = Ty->getIntegerBitWidth();
5391 if (Bitwidth == 32)
5392 return OMPBuilder.getOrCreateRuntimeFunction(
5393 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
5394 if (Bitwidth == 64)
5395 return OMPBuilder.getOrCreateRuntimeFunction(
5396 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
// Any other IV width is a bug in the caller: the runtime only provides
// 32- and 64-bit unsigned entry points.
5397 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5398}
5399
5400// Returns an LLVM function to call for initializing loop bounds using OpenMP
5401// static scheduling depending on `type`. Only i32 and i64 are supported by the
5402// runtime. Always interpret integers as unsigned similarly to
5403// CanonicalLoopInfo.
5405 OpenMPIRBuilder &OMPBuilder) {
5406 unsigned Bitwidth = Ty->getIntegerBitWidth();
5407 if (Bitwidth == 32)
5408 return OMPBuilder.getOrCreateRuntimeFunction(
5409 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
5410 if (Bitwidth == 64)
5411 return OMPBuilder.getOrCreateRuntimeFunction(
5412 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
// Any other IV width is a bug in the caller: the runtime only provides
// 32- and 64-bit unsigned entry points.
5413 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5414}
5415
/// Lower an already-created canonical loop to a statically scheduled
/// (unchunked) OpenMP worksharing loop: call __kmpc_(dist_)for_static_init_*
/// in the preheader, rebase the induction variable by the lower bound the
/// runtime handed back, and call __kmpc_for_static_fini in the exit block.
5416OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
5417 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5418 WorksharingLoopType LoopType, bool NeedsBarrier, bool HasDistSchedule,
5419 OMPScheduleType DistScheduleSchedType) {
5420 assert(CLI->isValid() && "Requires a valid canonical loop");
5421 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
5422 "Require dedicated allocate IP");
5423
5424 // Set up the source location value for OpenMP runtime.
5425 Builder.restoreIP(CLI->getPreheaderIP());
5426 Builder.SetCurrentDebugLocation(DL);
5427
5428 uint32_t SrcLocStrSize;
5429 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5430 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5431
5432 // Declare useful OpenMP runtime functions.
5433 Value *IV = CLI->getIndVar();
5434 Type *IVTy = IV->getType();
5435 FunctionCallee StaticInit =
5436 LoopType == WorksharingLoopType::DistributeForStaticLoop
5437 ? getKmpcDistForStaticInitForType(IVTy, M, *this)
5438 : getKmpcForStaticInitForType(IVTy, M, *this);
5439 FunctionCallee StaticFini =
5440 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5441
5442 // Allocate space for computed loop bounds as expected by the "init" function.
5443 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
5444
5445 Type *I32Type = Type::getInt32Ty(M.getContext());
5446 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5447 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
5448 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
5449 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
5450 CLI->setLastIter(PLastIter);
5451
5452 // At the end of the preheader, prepare for calling the "init" function by
5453 // storing the current loop bounds into the allocated space. A canonical loop
5454 // always iterates from 0 to trip-count with step 1. Note that "init" expects
5455 // and produces an inclusive upper bound.
5456 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
5457 Constant *Zero = ConstantInt::get(IVTy, 0);
5458 Constant *One = ConstantInt::get(IVTy, 1);
5459 Builder.CreateStore(Zero, PLowerBound);
5460 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
5461 Builder.CreateStore(UpperBound, PUpperBound);
5462 Builder.CreateStore(One, PStride);
5463
5464 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5465
5466 OMPScheduleType SchedType =
5467 (LoopType == WorksharingLoopType::DistributeStaticLoop)
5468 ? OMPScheduleType::OrderedDistribute
5470 Constant *SchedulingType =
5471 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5472
5473 // Call the "init" function and update the trip count of the loop with the
5474 // value it produced.
// The `dist_for` flavor takes one extra out-parameter (the distribute upper
// bound), hence the conditional extra alloca inside the lambda.
5475 auto BuildInitCall = [LoopType, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5476 PUpperBound, IVTy, PStride, One, Zero, StaticInit,
5477 this](Value *SchedulingType, auto &Builder) {
5478 SmallVector<Value *, 10> Args({SrcLoc, ThreadNum, SchedulingType, PLastIter,
5479 PLowerBound, PUpperBound});
5480 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5481 Value *PDistUpperBound =
5482 Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
5483 Args.push_back(PDistUpperBound);
5484 }
5485 Args.append({PStride, One, Zero});
5486 createRuntimeFunctionCall(StaticInit, Args);
5487 };
5488 BuildInitCall(SchedulingType, Builder);
5489 if (HasDistSchedule &&
5490 LoopType != WorksharingLoopType::DistributeStaticLoop) {
5491 Constant *DistScheduleSchedType = ConstantInt::get(
5492 I32Type, static_cast<int>(omp::OMPScheduleType::OrderedDistribute));
5493 // We want to emit a second init function call for the dist_schedule clause
5494 // to the Distribute construct. This should only be done however if a
5495 // Workshare Loop is nested within a Distribute Construct
5496 BuildInitCall(DistScheduleSchedType, Builder);
5497 }
// The runtime wrote this thread's chunk bounds back; shrink the loop's
// trip count to the size of the assigned (inclusive) range.
5498 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
5499 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
5500 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
5501 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
5502 CLI->setTripCount(TripCount);
5503
5504 // Update all uses of the induction variable except the one in the condition
5505 // block that compares it with the actual upper bound, and the increment in
5506 // the latch block.
5507
5508 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
5509 Builder.SetInsertPoint(CLI->getBody(),
5510 CLI->getBody()->getFirstInsertionPt());
5511 Builder.SetCurrentDebugLocation(DL);
5512 return Builder.CreateAdd(OldIV, LowerBound);
5513 });
5514
5515 // In the "exit" block, call the "fini" function.
5516 Builder.SetInsertPoint(CLI->getExit(),
5517 CLI->getExit()->getTerminator()->getIterator());
5518 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
5519
5520 // Add the barrier if requested.
5521 if (NeedsBarrier) {
5522 InsertPointOrErrorTy BarrierIP =
5524 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5525 /* CheckCancelFlag */ false);
5526 if (!BarrierIP)
5527 return BarrierIP.takeError();
5528 }
5529
// The rewritten loop no longer satisfies the canonical-loop contract, so
// mark the CanonicalLoopInfo as consumed before returning.
5530 InsertPointTy AfterIP = CLI->getAfterIP();
5531 CLI->invalidate();
5532
5533 return AfterIP;
5534}
5535
// Forward declarations; the definitions appear later in this file.
5536static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup,
5537 LoopInfo &LI);
5538static void addLoopMetadata(CanonicalLoopInfo *Loop,
5539 ArrayRef<Metadata *> Properties);
5540
// NOTE(review): the first lines of this definition are above this excerpt;
// it attaches a fresh distinct access group to all memory accesses in the
// loop (excluding the header and condition blocks) and records the matching
// llvm.loop.parallel_accesses property for the loop's metadata list.
5542 LLVMContext &Ctx, Loop *Loop,
5544 SmallVector<Metadata *> &LoopMDList) {
5545 SmallSet<BasicBlock *, 8> Reachable;
5546
5547 // Get the basic blocks from the loop in which memref instructions
5548 // can be found.
5549 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
5550 // preferably without running any passes.
5551 for (BasicBlock *Block : Loop->getBlocks()) {
5552 if (Block == CLI->getCond() || Block == CLI->getHeader())
5553 continue;
5554 Reachable.insert(Block);
5555 }
5556
5557 // Add access group metadata to memory-access instructions.
5558 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5559 for (BasicBlock *BB : Reachable)
5560 addAccessGroupMetadata(BB, AccessGroup, LoopInfo);
5561 // TODO: If the loop has existing parallel access metadata, have
5562 // to combine two lists.
5563 LoopMDList.push_back(MDNode::get(
5564 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5565}
5566
/// Lower a canonical loop to a statically *chunked* OpenMP worksharing loop:
/// the runtime init call yields the first chunk and a stride; an outer
/// "dispatch" loop enumerates chunk start offsets, and the original loop is
/// rewired to become the per-chunk loop nested inside it.
5568OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
5569 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5570 bool NeedsBarrier, Value *ChunkSize, OMPScheduleType SchedType,
5571 Value *DistScheduleChunkSize, OMPScheduleType DistScheduleSchedType) {
5572 assert(CLI->isValid() && "Requires a valid canonical loop");
5573 assert((ChunkSize || DistScheduleChunkSize) && "Chunk size is required");
5574
5575 LLVMContext &Ctx = CLI->getFunction()->getContext();
5576 Value *IV = CLI->getIndVar();
5577 Value *OrigTripCount = CLI->getTripCount();
5578 Type *IVTy = IV->getType();
5579 assert(IVTy->getIntegerBitWidth() <= 64 &&
5580 "Max supported tripcount bitwidth is 64 bits");
// The runtime only provides 32-/64-bit entry points, so widen the IV type
// to the nearest supported width for all runtime interaction.
5581 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
5582 : Type::getInt64Ty(Ctx);
5583 Type *I32Type = Type::getInt32Ty(M.getContext());
5584 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
5585 Constant *One = ConstantInt::get(InternalIVTy, 1);
5586
// Run LoopAnalysis on demand to find the Loop object for this canonical
// loop so parallel-access metadata can be attached.
5587 Function *F = CLI->getFunction();
5589 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5590 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5591 LoopAnalysis LIA;
5592 LoopInfo &&LI = LIA.run(*F, FAM);
5593 Loop *L = LI.getLoopFor(CLI->getHeader());
5594 SmallVector<Metadata *> LoopMDList;
5595 if (ChunkSize || DistScheduleChunkSize)
5596 applyParallelAccessesMetadata(CLI, Ctx, L, LI, LoopMDList);
5597 addLoopMetadata(CLI, LoopMDList);
5598
5599 // Declare useful OpenMP runtime functions.
5600 FunctionCallee StaticInit =
5601 getKmpcForStaticInitForType(InternalIVTy, M, *this);
5602 FunctionCallee StaticFini =
5603 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5604
5605 // Allocate space for computed loop bounds as expected by the "init" function.
5606 Builder.restoreIP(AllocaIP);
5607 Builder.SetCurrentDebugLocation(DL);
5608 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5609 Value *PLowerBound =
5610 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
5611 Value *PUpperBound =
5612 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
5613 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
5614 CLI->setLastIter(PLastIter);
5615
5616 // Set up the source location value for the OpenMP runtime.
5617 Builder.restoreIP(CLI->getPreheaderIP());
5618 Builder.SetCurrentDebugLocation(DL);
5619
5620 // TODO: Detect overflow in ubsan or max-out with current tripcount.
5621 Value *CastedChunkSize = Builder.CreateZExtOrTrunc(
5622 ChunkSize ? ChunkSize : Zero, InternalIVTy, "chunksize");
5623 Value *CastedDistScheduleChunkSize = Builder.CreateZExtOrTrunc(
5624 DistScheduleChunkSize ? DistScheduleChunkSize : Zero, InternalIVTy,
5625 "distschedulechunksize");
5626 Value *CastedTripCount =
5627 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
5628
5629 Constant *SchedulingType =
5630 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5631 Constant *DistSchedulingType =
5632 ConstantInt::get(I32Type, static_cast<int>(DistScheduleSchedType));
5633 Builder.CreateStore(Zero, PLowerBound);
5634 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
// Guard against a zero trip count: the inclusive upper bound would
// otherwise wrap around to the unsigned maximum.
5635 Value *IsTripCountZero = Builder.CreateICmpEQ(CastedTripCount, Zero);
5636 Value *UpperBound =
5637 Builder.CreateSelect(IsTripCountZero, Zero, OrigUpperBound);
5638 Builder.CreateStore(UpperBound, PUpperBound);
5639 Builder.CreateStore(One, PStride);
5640
5641 // Call the "init" function and update the trip count of the loop with the
5642 // value it produced.
5643 uint32_t SrcLocStrSize;
5644 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5645 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5646 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5647 auto BuildInitCall = [StaticInit, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5648 PUpperBound, PStride, One,
5649 this](Value *SchedulingType, Value *ChunkSize,
5650 auto &Builder) {
5652 StaticInit, {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
5653 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
5654 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
5655 /*pstride=*/PStride, /*incr=*/One,
5656 /*chunk=*/ChunkSize});
5657 };
5658 BuildInitCall(SchedulingType, CastedChunkSize, Builder);
5659 if (DistScheduleSchedType != OMPScheduleType::None &&
5660 SchedType != OMPScheduleType::OrderedDistributeChunked &&
5661 SchedType != OMPScheduleType::OrderedDistribute) {
5662 // We want to emit a second init function call for the dist_schedule clause
5663 // to the Distribute construct. This should only be done however if a
5664 // Workshare Loop is nested within a Distribute Construct
5665 BuildInitCall(DistSchedulingType, CastedDistScheduleChunkSize, Builder);
5666 }
5667
5668 // Load values written by the "init" function.
5669 Value *FirstChunkStart =
5670 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
5671 Value *FirstChunkStop =
5672 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
5673 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
5674 Value *ChunkRange =
5675 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
5676 Value *NextChunkStride =
5677 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
5678
5679 // Create outer "dispatch" loop for enumerating the chunks.
5680 BasicBlock *DispatchEnter = splitBB(Builder, true);
5681 Value *DispatchCounter;
5682
5683 // It is safe to assume this didn't return an error because the callback
5684 // passed into createCanonicalLoop is the only possible error source, and it
5685 // always returns success.
5686 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
5687 {Builder.saveIP(), DL},
5688 [&](InsertPointTy BodyIP, Value *Counter) {
5689 DispatchCounter = Counter;
5690 return Error::success();
5691 },
5692 FirstChunkStart, CastedTripCount, NextChunkStride,
5693 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
5694 "dispatch"));
5695
5696 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
5697 // not have to preserve the canonical invariant.
5698 BasicBlock *DispatchBody = DispatchCLI->getBody();
5699 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
5700 BasicBlock *DispatchExit = DispatchCLI->getExit();
5701 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
5702 DispatchCLI->invalidate();
5703
5704 // Rewire the original loop to become the chunk loop inside the dispatch loop.
5705 redirectTo(DispatchAfter, CLI->getAfter(), DL);
5706 redirectTo(CLI->getExit(), DispatchLatch, DL);
5707 redirectTo(DispatchBody, DispatchEnter, DL);
5708
5709 // Prepare the prolog of the chunk loop.
5710 Builder.restoreIP(CLI->getPreheaderIP());
5711 Builder.SetCurrentDebugLocation(DL);
5712
5713 // Compute the number of iterations of the chunk loop.
5714 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
5715 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
5716 Value *IsLastChunk =
5717 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
5718 Value *CountUntilOrigTripCount =
5719 Builder.CreateSub(CastedTripCount, DispatchCounter);
5720 Value *ChunkTripCount = Builder.CreateSelect(
5721 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
5722 Value *BackcastedChunkTC =
5723 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
5724 CLI->setTripCount(BackcastedChunkTC);
5725
5726 // Update all uses of the induction variable except the one in the condition
5727 // block that compares it with the actual upper bound, and the increment in
5728 // the latch block.
5729 Value *BackcastedDispatchCounter =
5730 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
5731 CLI->mapIndVar([&](Instruction *) -> Value * {
5732 Builder.restoreIP(CLI->getBodyIP());
5733 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
5734 });
5735
5736 // In the "exit" block, call the "fini" function.
5737 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
5738 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
5739
5740 // Add the barrier if requested.
5741 if (NeedsBarrier) {
5742 InsertPointOrErrorTy AfterIP =
5743 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
5744 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
5745 if (!AfterIP)
5746 return AfterIP.takeError();
5747 }
5748
5749#ifndef NDEBUG
5750 // Even though we currently do not support applying additional methods to it,
5751 // the chunk loop should remain a canonical loop.
5752 CLI->assertOK();
5753#endif
5754
5755 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
5756}
5757
5758// Returns an LLVM function to call for executing an OpenMP static worksharing
5759// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
5760// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
5761static FunctionCallee
5763 WorksharingLoopType LoopType) {
5764 unsigned Bitwidth = Ty->getIntegerBitWidth();
5765 Module &M = OMPBuilder->M;
// Select the device-RTL entry point matching both the worksharing-loop
// kind and the induction-variable width.
5766 switch (LoopType) {
5767 case WorksharingLoopType::ForStaticLoop:
5768 if (Bitwidth == 32)
5769 return OMPBuilder->getOrCreateRuntimeFunction(
5770 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
5771 if (Bitwidth == 64)
5772 return OMPBuilder->getOrCreateRuntimeFunction(
5773 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
5774 break;
5775 case WorksharingLoopType::DistributeStaticLoop:
5776 if (Bitwidth == 32)
5777 return OMPBuilder->getOrCreateRuntimeFunction(
5778 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
5779 if (Bitwidth == 64)
5780 return OMPBuilder->getOrCreateRuntimeFunction(
5781 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
5782 break;
5783 case WorksharingLoopType::DistributeForStaticLoop:
5784 if (Bitwidth == 32)
5785 return OMPBuilder->getOrCreateRuntimeFunction(
5786 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
5787 if (Bitwidth == 64)
5788 return OMPBuilder->getOrCreateRuntimeFunction(
5789 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
5790 break;
5791 }
// Falling out of the switch: either the IV width had no entry point, or
// the enum value itself was unexpected.
5792 if (Bitwidth != 32 && Bitwidth != 64) {
5793 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
5794 }
5795 llvm_unreachable("Unknown type of OpenMP worksharing loop");
5796}
5797
5798// Inserts a call to proper OpenMP Device RTL function which handles
5799// loop worksharing.
5801 WorksharingLoopType LoopType,
5802 BasicBlock *InsertBlock, Value *Ident,
5803 Value *LoopBodyArg, Value *TripCount,
5804 Function &LoopBodyFn, bool NoLoop) {
5805 Type *TripCountTy = TripCount->getType();
5806 Module &M = OMPBuilder->M;
5807 IRBuilder<> &Builder = OMPBuilder->Builder;
5808 FunctionCallee RTLFn =
5809 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
5810 SmallVector<Value *, 8> RealArgs;
5811 RealArgs.push_back(Ident);
5812 RealArgs.push_back(&LoopBodyFn);
5813 RealArgs.push_back(LoopBodyArg);
5814 RealArgs.push_back(TripCount);
5815 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
5816 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5817 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5818 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5819 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
5820 return;
5821 }
5822 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
5823 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
5824 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5825 Value *NumThreads = OMPBuilder->createRuntimeFunctionCall(RTLNumThreads, {});
5826
5827 RealArgs.push_back(
5828 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
5829 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5830 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5831 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5832 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop));
5833 } else {
5834 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5835 }
5836
5837 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
5838}
5839
// Post-outlining callback for device worksharing loops: deletes the original
// loop control flow and replaces it by a single call to the OpenMP device RTL
// worksharing entry point emitted in the loop preheader.
static void workshareLoopTargetCallback(
    OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
    Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
    WorksharingLoopType LoopType, bool NoLoop) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  BasicBlock *Preheader = CLI->getPreheader();
  Value *TripCount = CLI->getTripCount();

  // After loop body outlining, the loop body contains only the setup
  // of the loop body argument structure and the call to the outlined
  // loop body function. Firstly, we need to move the setup of the loop body
  // args into the loop preheader.
  Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
                    CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));

  // The next step is to remove the whole loop. We do not need it anymore.
  // That's why we make an unconditional branch from loop preheader to loop
  // exit block.
  Builder.restoreIP({Preheader, Preheader->end()});
  Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
  Preheader->getTerminator()->eraseFromParent();
  Builder.CreateBr(CLI->getExit());

  // Delete dead loop blocks. OutlineInfo::collectBlocks gathers everything
  // between the (now unreachable) header and the exit.
  OpenMPIRBuilder::OutlineInfo CleanUpInfo;
  SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
  SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
  CleanUpInfo.EntryBB = CLI->getHeader();
  CleanUpInfo.ExitBB = CLI->getExit();
  CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
  DeleteDeadBlocks(BlocksToBeRemoved);

  // Find the instruction which corresponds to the loop body argument
  // structure and remove the call to the loop body function instruction.
  Value *LoopBodyArg;
  User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
  assert(OutlinedFnUser &&
         "Expected unique undroppable user of outlined function");
  CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
  assert(OutlinedFnCallInstruction && "Expected outlined function call");
  assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
         "Expected outlined function call to be located in loop preheader");
  // Check in case no argument structure has been passed.
  if (OutlinedFnCallInstruction->arg_size() > 1)
    LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
  else
    LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
  OutlinedFnCallInstruction->eraseFromParent();

  createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
                                LoopBodyArg, TripCount, OutlinedFn, NoLoop);

  // Drop the helper instructions queued by the caller; the CLI's loop no
  // longer exists, so invalidate it.
  for (auto &ToBeDeletedItem : ToBeDeleted)
    ToBeDeletedItem->eraseFromParent();
  CLI->invalidate();
}
5896
// Lowers a worksharing loop for a target device by outlining the loop body
// into a function and deferring (via PostOutlineCB) the emission of the
// device RTL call that drives the iterations.
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
    WorksharingLoopType LoopType, bool NoLoop) {
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);

  OutlineInfo OI;
  OI.OuterAllocaBB = CLI->getPreheader();
  Function *OuterFn = CLI->getPreheader()->getParent();

  // Instructions which need to be deleted at the end of code generation
  SmallVector<Instruction *, 4> ToBeDeleted;

  // NOTE(review): this overwrites the preheader assignment above; the alloca
  // block from the caller wins.
  OI.OuterAllocaBB = AllocaIP.getBlock();

  // Mark the body loop as region which needs to be extracted
  OI.EntryBB = CLI->getBody();
  OI.ExitBB = CLI->getLatch()->splitBasicBlockBefore(CLI->getLatch()->begin(),
                                                     "omp.prelatch");

  // Prepare loop body for extraction
  Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});

  // Insert new loop counter variable which will be used only in loop
  // body.
  AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
  Instruction *NewLoopCntLoad =
      Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
  // New loop counter instructions are redundant in the loop preheader when
  // code generation for workshare loop is finished. That's why mark them as
  // ready for deletion.
  ToBeDeleted.push_back(NewLoopCntLoad);
  ToBeDeleted.push_back(NewLoopCnt);

  // Analyse loop body region. Find all input variables which are used inside
  // loop body region.
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  OI.collectBlocks(ParallelRegionBlockSet, Blocks);

  CodeExtractorAnalysisCache CEAC(*OuterFn);
  CodeExtractor Extractor(Blocks,
                          /* DominatorTree */ nullptr,
                          /* AggregateArgs */ true,
                          /* BlockFrequencyInfo */ nullptr,
                          /* BranchProbabilityInfo */ nullptr,
                          /* AssumptionCache */ nullptr,
                          /* AllowVarArgs */ true,
                          /* AllowAlloca */ true,
                          /* AllocationBlock */ CLI->getPreheader(),
                          /* Suffix */ ".omp_wsloop",
                          /* AggrArgsIn0AddrSpace */ true);

  BasicBlock *CommonExit = nullptr;
  SetVector<Value *> SinkingCands, HoistingCands;

  // Find allocas outside the loop body region which are used inside loop
  // body
  Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);

  // We need to model loop body region as the function f(cnt, loop_arg).
  // That's why we replace loop induction variable by the new counter
  // which will be one of loop body function argument
  SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
                            CLI->getIndVar()->user_end());
  for (auto Use : Users) {
    if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
      if (ParallelRegionBlockSet.count(Inst->getParent())) {
        Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
      }
    }
  }
  // Make sure that loop counter variable is not merged into loop body
  // function argument structure and it is passed as separate variable
  OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);

  // PostOutline CB is invoked when loop body function is outlined and
  // loop body is replaced by call to outlined function. We need to add
  // call to OpenMP device rtl inside loop preheader. OpenMP device rtl
  // function will handle loop control logic.
  //
  OI.PostOutlineCB = [=, ToBeDeletedVec =
                             std::move(ToBeDeleted)](Function &OutlinedFn) {
    workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
                                LoopType, NoLoop);
  };
  addOutlineInfo(std::move(OI));
  return CLI->getAfterIP();
}
5987
// Dispatches a worksharing loop to the appropriate lowering depending on the
// target (host vs. device) and the effective OpenMP schedule.
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
    bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
    bool HasSimdModifier, bool HasMonotonicModifier,
    bool HasNonmonotonicModifier, bool HasOrderedClause,
    WorksharingLoopType LoopType, bool NoLoop, bool HasDistSchedule,
    Value *DistScheduleChunkSize) {
  // On the device, worksharing loops are lowered via the device RTL instead
  // of the host scheduling runtime.
  if (Config.isTargetDevice())
    return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
  // Fold schedule kind, modifiers and chunking into one runtime schedule
  // value.
  OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
      SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause, DistScheduleChunkSize);

  bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
                   OMPScheduleType::ModifierOrdered;
  // Schedule value used for the dist_schedule clause, if present.
  OMPScheduleType DistScheduleSchedType = OMPScheduleType::None;
  if (HasDistSchedule) {
    DistScheduleSchedType = DistScheduleChunkSize
                                ? OMPScheduleType::OrderedDistributeChunked
                                : OMPScheduleType::OrderedDistribute;
  }
  // Dispatch on the base schedule, stripping the modifier bits.
  switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
  case OMPScheduleType::BaseStatic:
  case OMPScheduleType::BaseDistribute:
    assert((!ChunkSize || !DistScheduleChunkSize) &&
           "No chunk size with static-chunked schedule");
    // The ordered modifier (without dist_schedule) forces the dynamic
    // dispatch-based lowering.
    if (IsOrdered && !HasDistSchedule)
      return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
                                       NeedsBarrier, ChunkSize);
    // FIXME: Monotonicity ignored?
    if (DistScheduleChunkSize)
      return applyStaticChunkedWorkshareLoop(
          DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
          DistScheduleChunkSize, DistScheduleSchedType);
    return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier,
                                    HasDistSchedule);

  case OMPScheduleType::BaseStaticChunked:
  case OMPScheduleType::BaseDistributeChunked:
    if (IsOrdered && !HasDistSchedule)
      return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
                                       NeedsBarrier, ChunkSize);
    // FIXME: Monotonicity ignored?
    return applyStaticChunkedWorkshareLoop(
        DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
        DistScheduleChunkSize, DistScheduleSchedType);

  case OMPScheduleType::BaseRuntime:
  case OMPScheduleType::BaseAuto:
  case OMPScheduleType::BaseGreedy:
  case OMPScheduleType::BaseBalanced:
  case OMPScheduleType::BaseSteal:
  case OMPScheduleType::BaseRuntimeSimd:
    assert(!ChunkSize &&
           "schedule type does not support user-defined chunk sizes");
    [[fallthrough]];
  case OMPScheduleType::BaseGuidedSimd:
  case OMPScheduleType::BaseDynamicChunked:
  case OMPScheduleType::BaseGuidedChunked:
  case OMPScheduleType::BaseGuidedIterativeChunked:
  case OMPScheduleType::BaseGuidedAnalyticalChunked:
  case OMPScheduleType::BaseStaticBalancedChunked:
    // All remaining schedules are handled by the dynamic-dispatch runtime.
    return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
                                     NeedsBarrier, ChunkSize);

  default:
    llvm_unreachable("Unknown/unimplemented schedule kind");
  }
}
6057
6058/// Returns an LLVM function to call for initializing loop bounds using OpenMP
6059/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
6060/// the runtime. Always interpret integers as unsigned similarly to
6061/// CanonicalLoopInfo.
6062static FunctionCallee
6064 unsigned Bitwidth = Ty->getIntegerBitWidth();
6065 if (Bitwidth == 32)
6066 return OMPBuilder.getOrCreateRuntimeFunction(
6067 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
6068 if (Bitwidth == 64)
6069 return OMPBuilder.getOrCreateRuntimeFunction(
6070 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
6071 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6072}
6073
6074/// Returns an LLVM function to call for updating the next loop using OpenMP
6075/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
6076/// the runtime. Always interpret integers as unsigned similarly to
6077/// CanonicalLoopInfo.
6078static FunctionCallee
6080 unsigned Bitwidth = Ty->getIntegerBitWidth();
6081 if (Bitwidth == 32)
6082 return OMPBuilder.getOrCreateRuntimeFunction(
6083 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
6084 if (Bitwidth == 64)
6085 return OMPBuilder.getOrCreateRuntimeFunction(
6086 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
6087 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6088}
6089
6090/// Returns an LLVM function to call for finalizing the dynamic loop using
6091/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
6092/// interpret integers as unsigned similarly to CanonicalLoopInfo.
6093static FunctionCallee
6095 unsigned Bitwidth = Ty->getIntegerBitWidth();
6096 if (Bitwidth == 32)
6097 return OMPBuilder.getOrCreateRuntimeFunction(
6098 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
6099 if (Bitwidth == 64)
6100 return OMPBuilder.getOrCreateRuntimeFunction(
6101 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
6102 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6103}
6104
// Lowers a canonical loop to a dynamically scheduled worksharing loop by
// wrapping it in an outer dispatch loop driven by __kmpc_dispatch_init/next.
// The input CLI is invalidated; the returned insert point is after the loop.
OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
                                           InsertPointTy AllocaIP,
                                           OMPScheduleType SchedType,
                                           bool NeedsBarrier, Value *Chunk) {
  assert(CLI->isValid() && "Requires a valid canonical loop");
  assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
         "Require dedicated allocate IP");
  assert(isValidWorkshareLoopScheduleType(SchedType) &&
         "Require valid schedule type");

  bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
                 OMPScheduleType::ModifierOrdered;

  // Set up the source location value for OpenMP runtime.
  Builder.SetCurrentDebugLocation(DL);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
  Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);

  // Declare useful OpenMP runtime functions.
  Value *IV = CLI->getIndVar();
  Type *IVTy = IV->getType();
  FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
  FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);

  // Allocate space for computed loop bounds as expected by the "init" function.
  Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
  Type *I32Type = Type::getInt32Ty(M.getContext());
  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
  Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
  Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
  Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
  CLI->setLastIter(PLastIter);

  // At the end of the preheader, prepare for calling the "init" function by
  // storing the current loop bounds into the allocated space. A canonical loop
  // always iterates from 0 to trip-count with step 1. Note that "init" expects
  // and produces an inclusive upper bound.
  BasicBlock *PreHeader = CLI->getPreheader();
  Builder.SetInsertPoint(PreHeader->getTerminator());
  Constant *One = ConstantInt::get(IVTy, 1);
  Builder.CreateStore(One, PLowerBound);
  Value *UpperBound = CLI->getTripCount();
  Builder.CreateStore(UpperBound, PUpperBound);
  Builder.CreateStore(One, PStride);

  BasicBlock *Header = CLI->getHeader();
  BasicBlock *Exit = CLI->getExit();
  BasicBlock *Cond = CLI->getCond();
  BasicBlock *Latch = CLI->getLatch();
  InsertPointTy AfterIP = CLI->getAfterIP();

  // The CLI will be "broken" in the code below, as the loop is no longer
  // a valid canonical loop.

  if (!Chunk)
    Chunk = One;

  Value *ThreadNum = getOrCreateThreadID(SrcLoc);

  Constant *SchedulingType =
      ConstantInt::get(I32Type, static_cast<int>(SchedType));

  // Call the "init" function.
  createRuntimeFunctionCall(DynamicInit, {SrcLoc, ThreadNum, SchedulingType,
                                          /* LowerBound */ One, UpperBound,
                                          /* step */ One, Chunk});

  // An outer loop around the existing one.
  BasicBlock *OuterCond = BasicBlock::Create(
      PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
      PreHeader->getParent());
  // This needs to be 32-bit always, so can't use the IVTy Zero above.
  Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
  Value *Res = createRuntimeFunctionCall(
      DynamicNext,
      {SrcLoc, ThreadNum, PLastIter, PLowerBound, PUpperBound, PStride});
  Constant *Zero32 = ConstantInt::get(I32Type, 0);
  Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
  // "next" produces an inclusive 1-based lower bound; subtract one to get the
  // 0-based induction value.
  Value *LowerBound =
      Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
  Builder.CreateCondBr(MoreWork, Header, Exit);

  // Change PHI-node in loop header to use outer cond rather than preheader,
  // and set IV to the LowerBound.
  Instruction *Phi = &Header->front();
  auto *PI = cast<PHINode>(Phi);
  PI->setIncomingBlock(0, OuterCond);
  PI->setIncomingValue(0, LowerBound);

  // Then set the pre-header to jump to the OuterCond
  Instruction *Term = PreHeader->getTerminator();
  auto *Br = cast<BranchInst>(Term);
  Br->setSuccessor(0, OuterCond);

  // Modify the inner condition:
  // * Use the UpperBound returned from the DynamicNext call.
  // * jump to the loop outer loop when done with one of the inner loops.
  Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
  UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
  Instruction *Comp = &*Builder.GetInsertPoint();
  auto *CI = cast<CmpInst>(Comp);
  CI->setOperand(1, UpperBound);
  // Redirect the inner exit to branch to outer condition.
  Instruction *Branch = &Cond->back();
  auto *BI = cast<BranchInst>(Branch);
  assert(BI->getSuccessor(1) == Exit);
  BI->setSuccessor(1, OuterCond);

  // Call the "fini" function if "ordered" is present in wsloop directive.
  if (Ordered) {
    Builder.SetInsertPoint(&Latch->back());
    FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
    createRuntimeFunctionCall(DynamicFini, {SrcLoc, ThreadNum});
  }

  // Add the barrier if requested.
  if (NeedsBarrier) {
    Builder.SetInsertPoint(&Exit->back());
    InsertPointOrErrorTy BarrierIP =
        createBarrier(LocationDescription(Builder.saveIP(), DL),
                      omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
                      /* CheckCancelFlag */ false);
    if (!BarrierIP)
      return BarrierIP.takeError();
  }

  CLI->invalidate();
  return AfterIP;
}
6237
/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
/// after this \p OldTarget will be orphaned.
static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
                                      BasicBlock *NewTarget, DebugLoc DL) {
  // redirectTo rewrites each predecessor's terminator, which mutates
  // OldTarget's use list; make_early_inc_range keeps the iteration safe.
  for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
    redirectTo(Pred, NewTarget, DL);
}
6245
6246/// Determine which blocks in \p BBs are reachable from outside and remove the
6247/// ones that are not reachable from the function.
6250 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
6251 for (Use &U : BB->uses()) {
6252 auto *UseInst = dyn_cast<Instruction>(U.getUser());
6253 if (!UseInst)
6254 continue;
6255 if (BBsToErase.count(UseInst->getParent()))
6256 continue;
6257 return true;
6258 }
6259 return false;
6260 };
6261
6262 while (BBsToErase.remove_if(HasRemainingUses)) {
6263 // Try again if anything was removed.
6264 }
6265
6266 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
6267 DeleteDeadBlocks(BBVec);
6268}
6269
// Collapses a perfect (or near-perfect) loop nest into a single canonical
// loop whose trip count is the product of the input trip counts. The input
// CLIs are invalidated and replaced by the returned collapsed loop.
CanonicalLoopInfo *
OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
                               InsertPointTy ComputeIP) {
  assert(Loops.size() >= 1 && "At least one loop required");
  size_t NumLoops = Loops.size();

  // Nothing to do if there is already just one loop.
  if (NumLoops == 1)
    return Loops.front();

  CanonicalLoopInfo *Outermost = Loops.front();
  CanonicalLoopInfo *Innermost = Loops.back();
  BasicBlock *OrigPreheader = Outermost->getPreheader();
  BasicBlock *OrigAfter = Outermost->getAfter();
  Function *F = OrigPreheader->getParent();

  // Loop control blocks that may become orphaned later.
  SmallVector<BasicBlock *, 12> OldControlBBs;
  OldControlBBs.reserve(6 * Loops.size());
  for (CanonicalLoopInfo *Loop : Loops)
    Loop->collectControlBlocks(OldControlBBs);

  // Setup the IRBuilder for inserting the trip count computation.
  Builder.SetCurrentDebugLocation(DL);
  if (ComputeIP.isSet())
    Builder.restoreIP(ComputeIP);
  else
    Builder.restoreIP(Outermost->getPreheaderIP());

  // Derive the collapsed loop's trip count.
  // TODO: Find common/largest indvar type.
  Value *CollapsedTripCount = nullptr;
  for (CanonicalLoopInfo *L : Loops) {
    assert(L->isValid() &&
           "All loops to collapse must be valid canonical loops");
    Value *OrigTripCount = L->getTripCount();
    if (!CollapsedTripCount) {
      CollapsedTripCount = OrigTripCount;
      continue;
    }

    // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
    CollapsedTripCount =
        Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
  }

  // Create the collapsed loop control flow.
  CanonicalLoopInfo *Result =
      createLoopSkeleton(DL, CollapsedTripCount, F,
                         OrigPreheader->getNextNode(), OrigAfter, "collapsed");

  // Build the collapsed loop body code.
  // Start with deriving the input loop induction variables from the collapsed
  // one, using a divmod scheme. To preserve the original loops' order, the
  // innermost loop uses the least significant bits.
  Builder.restoreIP(Result->getBodyIP());

  Value *Leftover = Result->getIndVar();
  SmallVector<Value *> NewIndVars;
  NewIndVars.resize(NumLoops);
  for (int i = NumLoops - 1; i >= 1; --i) {
    Value *OrigTripCount = Loops[i]->getTripCount();

    Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
    NewIndVars[i] = NewIndVar;

    Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
  }
  // Outermost loop gets all the remaining bits.
  NewIndVars[0] = Leftover;

  // Construct the loop body control flow.
  // We progressively construct the branch structure following in direction of
  // the control flow, from the leading in-between code, the loop nest body, the
  // trailing in-between code, and rejoining the collapsed loop's latch.
  // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
  // the ContinueBlock is set, continue with that block. If ContinuePred, use
  // its predecessors as sources.
  BasicBlock *ContinueBlock = Result->getBody();
  BasicBlock *ContinuePred = nullptr;
  auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
                                                          BasicBlock *NextSrc) {
    if (ContinueBlock)
      redirectTo(ContinueBlock, Dest, DL);
    else
      redirectAllPredecessorsTo(ContinuePred, Dest, DL);

    ContinueBlock = nullptr;
    ContinuePred = NextSrc;
  };

  // The code before the nested loop of each level.
  // Because we are sinking it into the nest, it will be executed more often
  // than the original loop. More sophisticated schemes could keep track of what
  // the in-between code is and instantiate it only once per thread.
  for (size_t i = 0; i < NumLoops - 1; ++i)
    ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());

  // Connect the loop nest body.
  ContinueWith(Innermost->getBody(), Innermost->getLatch());

  // The code after the nested loop at each level.
  for (size_t i = NumLoops - 1; i > 0; --i)
    ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());

  // Connect the finished loop to the collapsed loop latch.
  ContinueWith(Result->getLatch(), nullptr);

  // Replace the input loops with the new collapsed loop.
  redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
  redirectTo(Result->getAfter(), Outermost->getAfter(), DL);

  // Replace the input loop indvars with the derived ones.
  for (size_t i = 0; i < NumLoops; ++i)
    Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);

  // Remove unused parts of the input loops.
  removeUnusedBlocksFromParent(OldControlBBs);

  for (CanonicalLoopInfo *L : Loops)
    L->invalidate();

#ifndef NDEBUG
  Result->assertOK();
#endif
  return Result;
}
6397
// Tiles a loop nest: each input loop is replaced by a "floor" loop over tiles
// and a "tile" loop over the elements of one tile. Returns the floor loops
// followed by the tile loops; the input CLIs are invalidated.
std::vector<CanonicalLoopInfo *>
OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
                           ArrayRef<Value *> TileSizes) {
  assert(TileSizes.size() == Loops.size() &&
         "Must pass as many tile sizes as there are loops");
  int NumLoops = Loops.size();
  assert(NumLoops >= 1 && "At least one loop to tile required");

  CanonicalLoopInfo *OutermostLoop = Loops.front();
  CanonicalLoopInfo *InnermostLoop = Loops.back();
  Function *F = OutermostLoop->getBody()->getParent();
  BasicBlock *InnerEnter = InnermostLoop->getBody();
  BasicBlock *InnerLatch = InnermostLoop->getLatch();

  // Loop control blocks that may become orphaned later.
  SmallVector<BasicBlock *, 12> OldControlBBs;
  OldControlBBs.reserve(6 * Loops.size());
  for (CanonicalLoopInfo *Loop : Loops)
    Loop->collectControlBlocks(OldControlBBs);

  // Collect original trip counts and induction variable to be accessible by
  // index. Also, the structure of the original loops is not preserved during
  // the construction of the tiled loops, so do it before we scavenge the BBs of
  // any original CanonicalLoopInfo.
  SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
  for (CanonicalLoopInfo *L : Loops) {
    assert(L->isValid() && "All input loops must be valid canonical loops");
    OrigTripCounts.push_back(L->getTripCount());
    OrigIndVars.push_back(L->getIndVar());
  }

  // Collect the code between loop headers. These may contain SSA definitions
  // that are used in the loop nest body. To be usable within the innermost
  // body, these BasicBlocks will be sunk into the loop nest body. That is,
  // these instructions may be executed more often than before the tiling.
  // TODO: It would be sufficient to only sink them into body of the
  // corresponding tile loop.
  SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
  for (int i = 0; i < NumLoops - 1; ++i) {
    CanonicalLoopInfo *Surrounding = Loops[i];
    CanonicalLoopInfo *Nested = Loops[i + 1];

    BasicBlock *EnterBB = Surrounding->getBody();
    BasicBlock *ExitBB = Nested->getHeader();
    InbetweenCode.emplace_back(EnterBB, ExitBB);
  }

  // Compute the trip counts of the floor loops.
  Builder.SetCurrentDebugLocation(DL);
  Builder.restoreIP(OutermostLoop->getPreheaderIP());
  SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
  for (int i = 0; i < NumLoops; ++i) {
    Value *TileSize = TileSizes[i];
    Value *OrigTripCount = OrigTripCounts[i];
    Type *IVType = OrigTripCount->getType();

    Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
    Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);

    // 0 if tripcount divides the tilesize, 1 otherwise.
    // 1 means we need an additional iteration for a partial tile.
    //
    // Unfortunately we cannot just use the roundup-formula
    //   (tripcount + tilesize - 1)/tilesize
    // because the summation might overflow. We do not want introduce undefined
    // behavior when the untiled loop nest did not.
    Value *FloorTripOverflow =
        Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));

    FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
    Value *FloorTripCount =
        Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
                          "omp_floor" + Twine(i) + ".tripcount", true);

    // Remember some values for later use.
    FloorCompleteCount.push_back(FloorCompleteTripCount);
    FloorCount.push_back(FloorTripCount);
    FloorRems.push_back(FloorTripRem);
  }

  // Generate the new loop nest, from the outermost to the innermost.
  std::vector<CanonicalLoopInfo *> Result;
  Result.reserve(NumLoops * 2);

  // The basic block of the surrounding loop that enters the nest generated
  // loop.
  BasicBlock *Enter = OutermostLoop->getPreheader();

  // The basic block of the surrounding loop where the inner code should
  // continue.
  BasicBlock *Continue = OutermostLoop->getAfter();

  // Where the next loop basic block should be inserted.
  BasicBlock *OutroInsertBefore = InnermostLoop->getExit();

  auto EmbeddNewLoop =
      [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
          Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
    CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
        DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
    redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
    redirectTo(EmbeddedLoop->getAfter(), Continue, DL);

    // Setup the position where the next embedded loop connects to this loop.
    Enter = EmbeddedLoop->getBody();
    Continue = EmbeddedLoop->getLatch();
    OutroInsertBefore = EmbeddedLoop->getLatch();
    return EmbeddedLoop;
  };

  auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
                                                  const Twine &NameBase) {
    for (auto P : enumerate(TripCounts)) {
      CanonicalLoopInfo *EmbeddedLoop =
          EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
      Result.push_back(EmbeddedLoop);
    }
  };

  EmbeddNewLoops(FloorCount, "floor");

  // Within the innermost floor loop, emit the code that computes the tile
  // sizes.
  Builder.SetInsertPoint(Enter->getTerminator());
  SmallVector<Value *, 4> TileCounts;
  for (int i = 0; i < NumLoops; ++i) {
    CanonicalLoopInfo *FloorLoop = Result[i];
    Value *TileSize = TileSizes[i];

    // The last floor iteration runs the partial tile (the division
    // remainder); all complete tiles run the full tile size.
    Value *FloorIsEpilogue =
        Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
    Value *TileTripCount =
        Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);

    TileCounts.push_back(TileTripCount);
  }

  // Create the tile loops.
  EmbeddNewLoops(TileCounts, "tile");

  // Insert the inbetween code into the body.
  BasicBlock *BodyEnter = Enter;
  BasicBlock *BodyEntered = nullptr;
  for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
    BasicBlock *EnterBB = P.first;
    BasicBlock *ExitBB = P.second;

    if (BodyEnter)
      redirectTo(BodyEnter, EnterBB, DL);
    else
      redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);

    BodyEnter = nullptr;
    BodyEntered = ExitBB;
  }

  // Append the original loop nest body into the generated loop nest body.
  if (BodyEnter)
    redirectTo(BodyEnter, InnerEnter, DL);
  else
    redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
  redirectAllPredecessorsTo(InnerLatch, Continue, DL);

  // Replace the original induction variable with an induction variable computed
  // from the tile and floor induction variables.
  Builder.restoreIP(Result.back()->getBodyIP());
  for (int i = 0; i < NumLoops; ++i) {
    CanonicalLoopInfo *FloorLoop = Result[i];
    CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
    Value *OrigIndVar = OrigIndVars[i];
    Value *Size = TileSizes[i];

    // Original IV = floor_iv * tilesize + tile_iv.
    Value *Scale =
        Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
    Value *Shift =
        Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
    OrigIndVar->replaceAllUsesWith(Shift);
  }

  // Remove unused parts of the original loops.
  removeUnusedBlocksFromParent(OldControlBBs);

  for (CanonicalLoopInfo *L : Loops)
    L->invalidate();

#ifndef NDEBUG
  for (CanonicalLoopInfo *GenL : Result)
    GenL->assertOK();
#endif
  return Result;
}
6589
6590/// Attach metadata \p Properties to the basic block described by \p BB. If the
6591/// basic block already has metadata, the basic block properties are appended.
6593 ArrayRef<Metadata *> Properties) {
6594 // Nothing to do if no property to attach.
6595 if (Properties.empty())
6596 return;
6597
6598 LLVMContext &Ctx = BB->getContext();
6599 SmallVector<Metadata *> NewProperties;
6600 NewProperties.push_back(nullptr);
6601
6602 // If the basic block already has metadata, prepend it to the new metadata.
6603 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
6604 if (Existing)
6605 append_range(NewProperties, drop_begin(Existing->operands(), 1));
6606
6607 append_range(NewProperties, Properties);
6608 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
6609 BasicBlockID->replaceOperandWith(0, BasicBlockID);
6610
6611 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
6612}
6613
6614/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
6615/// loop already has metadata, the loop properties are appended.
6617 ArrayRef<Metadata *> Properties) {
6618 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
6619
6620 // Attach metadata to the loop's latch
6621 BasicBlock *Latch = Loop->getLatch();
6622 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
6623 addBasicBlockMetadata(Latch, Properties);
6624}
6625
6626/// Attach llvm.access.group metadata to the memref instructions of \p Block
6628 LoopInfo &LI) {
6629 for (Instruction &I : *Block) {
6630 if (I.mayReadOrWriteMemory()) {
6631 // TODO: This instruction may already have access group from
6632 // other pragmas e.g. #pragma clang loop vectorize. Append
6633 // so that the existing metadata is not overwritten.
6634 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
6635 }
6636 }
6637}
6638
6639CanonicalLoopInfo *
6641 CanonicalLoopInfo *firstLoop = Loops.front();
6642 CanonicalLoopInfo *lastLoop = Loops.back();
6643 Function *F = firstLoop->getPreheader()->getParent();
6644
6645 // Loop control blocks that will become orphaned later
6646 SmallVector<BasicBlock *> oldControlBBs;
6648 Loop->collectControlBlocks(oldControlBBs);
6649
6650 // Collect original trip counts
6651 SmallVector<Value *> origTripCounts;
6652 for (CanonicalLoopInfo *L : Loops) {
6653 assert(L->isValid() && "All input loops must be valid canonical loops");
6654 origTripCounts.push_back(L->getTripCount());
6655 }
6656
6657 Builder.SetCurrentDebugLocation(DL);
6658
6659 // Compute max trip count.
6660 // The fused loop will be from 0 to max(origTripCounts)
6661 BasicBlock *TCBlock = BasicBlock::Create(F->getContext(), "omp.fuse.comp.tc",
6662 F, firstLoop->getHeader());
6663 Builder.SetInsertPoint(TCBlock);
6664 Value *fusedTripCount = nullptr;
6665 for (CanonicalLoopInfo *L : Loops) {
6666 assert(L->isValid() && "All loops to fuse must be valid canonical loops");
6667 Value *origTripCount = L->getTripCount();
6668 if (!fusedTripCount) {
6669 fusedTripCount = origTripCount;
6670 continue;
6671 }
6672 Value *condTP = Builder.CreateICmpSGT(fusedTripCount, origTripCount);
6673 fusedTripCount = Builder.CreateSelect(condTP, fusedTripCount, origTripCount,
6674 ".omp.fuse.tc");
6675 }
6676
6677 // Generate new loop
6678 CanonicalLoopInfo *fused =
6679 createLoopSkeleton(DL, fusedTripCount, F, firstLoop->getBody(),
6680 lastLoop->getLatch(), "fused");
6681
6682 // Replace original loops with the fused loop
6683 // Preheader and After are not considered inside the CLI.
6684 // These are used to compute the individual TCs of the loops
6685 // so they have to be put before the resulting fused loop.
6686 // Moving them up for readability.
6687 for (size_t i = 0; i < Loops.size() - 1; ++i) {
6688 Loops[i]->getPreheader()->moveBefore(TCBlock);
6689 Loops[i]->getAfter()->moveBefore(TCBlock);
6690 }
6691 lastLoop->getPreheader()->moveBefore(TCBlock);
6692
6693 for (size_t i = 0; i < Loops.size() - 1; ++i) {
6694 redirectTo(Loops[i]->getPreheader(), Loops[i]->getAfter(), DL);
6695 redirectTo(Loops[i]->getAfter(), Loops[i + 1]->getPreheader(), DL);
6696 }
6697 redirectTo(lastLoop->getPreheader(), TCBlock, DL);
6698 redirectTo(TCBlock, fused->getPreheader(), DL);
6699 redirectTo(fused->getAfter(), lastLoop->getAfter(), DL);
6700
6701 // Build the fused body
6702 // Create new Blocks with conditions that jump to the original loop bodies
6704 SmallVector<Value *> condValues;
6705 for (size_t i = 0; i < Loops.size(); ++i) {
6706 BasicBlock *condBlock = BasicBlock::Create(
6707 F->getContext(), "omp.fused.inner.cond", F, Loops[i]->getBody());
6708 Builder.SetInsertPoint(condBlock);
6709 Value *condValue =
6710 Builder.CreateICmpSLT(fused->getIndVar(), origTripCounts[i]);
6711 condBBs.push_back(condBlock);
6712 condValues.push_back(condValue);
6713 }
6714 // Join the condition blocks with the bodies of the original loops
6715 redirectTo(fused->getBody(), condBBs[0], DL);
6716 for (size_t i = 0; i < Loops.size() - 1; ++i) {
6717 Builder.SetInsertPoint(condBBs[i]);
6718 Builder.CreateCondBr(condValues[i], Loops[i]->getBody(), condBBs[i + 1]);
6719 redirectAllPredecessorsTo(Loops[i]->getLatch(), condBBs[i + 1], DL);
6720 // Replace the IV with the fused IV
6721 Loops[i]->getIndVar()->replaceAllUsesWith(fused->getIndVar());
6722 }
6723 // Last body jumps to the created end body block
6724 Builder.SetInsertPoint(condBBs.back());
6725 Builder.CreateCondBr(condValues.back(), lastLoop->getBody(),
6726 fused->getLatch());
6727 redirectAllPredecessorsTo(lastLoop->getLatch(), fused->getLatch(), DL);
6728 // Replace the IV with the fused IV
6729 lastLoop->getIndVar()->replaceAllUsesWith(fused->getIndVar());
6730
6731 // The loop latch must have only one predecessor. Currently it is branched to
6732 // from both the last condition block and the last loop body
6733 fused->getLatch()->splitBasicBlockBefore(fused->getLatch()->begin(),
6734 "omp.fused.pre_latch");
6735
6736 // Remove unused parts
6737 removeUnusedBlocksFromParent(oldControlBBs);
6738
6739 // Invalidate old CLIs
6740 for (CanonicalLoopInfo *L : Loops)
6741 L->invalidate();
6742
6743#ifndef NDEBUG
6744 fused->assertOK();
6745#endif
6746 return fused;
6747}
6748
6750 LLVMContext &Ctx = Builder.getContext();
6752 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6753 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
6754}
6755
6757 LLVMContext &Ctx = Builder.getContext();
6759 Loop, {
6760 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6761 });
6762}
6763
/// Create an if-version of the canonical loop for the `if` clause of a
/// `simd` directive: inside the single shared loop body, branch on
/// \p IfCond between the original body (which will be vectorized) and a
/// cloned copy of it (left scalar).  Keeping one loop skeleton — rather
/// than duplicating the entire loop — preserves the CanonicalLoopInfo so
/// that later loop-associated constructs keep working (see the comment
/// block below).
///
/// \param CanonicalLoop The canonical loop to version.
/// \param IfCond        `if` clause condition; non-zero selects the
///                      vectorizable ("then") body.
/// \param VMap          Value map filled while cloning the body blocks.
/// \param LIA           Loop analysis. NOTE(review): unused in this
///                      function body — confirm whether it can be dropped.
/// \param LI            LoopInfo; updated with the new 'then' block.
/// \param L             The llvm::Loop corresponding to \p CanonicalLoop.
/// \param NamePrefix    Prefix for the names of created basic blocks.
6764void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
6765 Value *IfCond, ValueToValueMapTy &VMap,
6766 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
6767 const Twine &NamePrefix) {
6768 Function *F = CanonicalLoop->getFunction();
6769
6770 // We can't do
6771 // if (cond) {
6772 // simd_loop;
6773 // } else {
6774 // non_simd_loop;
6775 // }
6776 // because then the CanonicalLoopInfo would only point to one of the loops:
6777 // leading to other constructs operating on the same loop to malfunction.
6778 // Instead generate
6779 // while (...) {
6780 // if (cond) {
6781 // simd_body;
6782 // } else {
6783 // not_simd_body;
6784 // }
6785 // }
6786 // At least for simple loops, LLVM seems able to hoist the if out of the loop
6787 // body at -O3
6788
6789 // Define where if branch should be inserted
6790 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
6791
6792 // Create additional blocks for the if statement
6793 BasicBlock *Cond = SplitBeforeIt->getParent();
6794 llvm::LLVMContext &C = Cond->getContext();
// NOTE(review): the `BasicBlock *ThenBlock = BasicBlock::Create(` and
// `BasicBlock *ElseBlock = BasicBlock::Create(` declaration lines appear to
// have been lost in extraction (numbering jumps 6794->6796->6798); the next
// two lines are their argument continuations.
6796 C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
6798 C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
6799
6800 // Create if condition branch.
6801 Builder.SetInsertPoint(SplitBeforeIt);
6802 Instruction *BrInstr =
6803 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
6804 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
6805 // Then block contains branch to omp loop body which needs to be vectorized
6806 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
6807 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
6808
6809 Builder.SetInsertPoint(ElseBlock);
6810
6811 // Clone loop for the else branch
// NOTE(review): the declaration of `NewBlocks` (a SmallVector<BasicBlock *>)
// appears to have been lost in extraction here (numbering jumps 6811->6813).
6813
6814 SmallVector<BasicBlock *, 8> ExistingBlocks;
6815 ExistingBlocks.reserve(L->getNumBlocks() + 1);
6816 ExistingBlocks.push_back(ThenBlock);
6817 ExistingBlocks.append(L->block_begin(), L->block_end());
6818 // Cond is the block that has the if clause condition
6819 // LoopCond is omp_loop.cond
6820 // LoopHeader is omp_loop.header
6821 BasicBlock *LoopCond = Cond->getUniquePredecessor();
6822 BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
6823 assert(LoopCond && LoopHeader && "Invalid loop structure");
// Clone every block of the loop body except the shared skeleton blocks
// (preheader, latch, header, cond, and the condition block itself).
6824 for (BasicBlock *Block : ExistingBlocks) {
6825 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
6826 Block == LoopHeader || Block == LoopCond || Block == Cond) {
6827 continue;
6828 }
6829 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
6830
6831 // fix name not to be omp.if.then
6832 if (Block == ThenBlock)
6833 NewBB->setName(NamePrefix + ".if.else");
6834
6835 NewBB->moveBefore(CanonicalLoop->getExit());
6836 VMap[Block] = NewBB;
6837 NewBlocks.push_back(NewBB);
6838 }
// Rewrite cloned instructions so they reference the clones, then enter the
// cloned ("else") body from ElseBlock.
6839 remapInstructionsInBlocks(NewBlocks, VMap);
6840 Builder.CreateBr(NewBlocks.front());
6841
6842 // The loop latch must have only one predecessor. Currently it is branched to
6843 // from both the 'then' and 'else' branches.
6844 L->getLoopLatch()->splitBasicBlockBefore(L->getLoopLatch()->begin(),
6845 NamePrefix + ".pre_latch");
6846
6847 // Ensure that the then block is added to the loop so we add the attributes in
6848 // the next step
6849 L->addBasicBlockToLoop(ThenBlock, LI);
6850}
6851
6852unsigned
6854 const StringMap<bool> &Features) {
6855 if (TargetTriple.isX86()) {
6856 if (Features.lookup("avx512f"))
6857 return 512;
6858 else if (Features.lookup("avx"))
6859 return 256;
6860 return 128;
6861 }
6862 if (TargetTriple.isPPC())
6863 return 128;
6864 if (TargetTriple.isWasm())
6865 return 128;
6866 return 0;
6867}
6868
6870 MapVector<Value *, Value *> AlignedVars,
6871 Value *IfCond, OrderKind Order,
6872 ConstantInt *Simdlen, ConstantInt *Safelen) {
6873 LLVMContext &Ctx = Builder.getContext();
6874
6875 Function *F = CanonicalLoop->getFunction();
6876
6877 // TODO: We should not rely on pass manager. Currently we use pass manager
6878 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
6879 // object. We should have a method which returns all blocks between
6880 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
6882 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6883 FAM.registerPass([]() { return LoopAnalysis(); });
6884 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6885
6886 LoopAnalysis LIA;
6887 LoopInfo &&LI = LIA.run(*F, FAM);
6888
6889 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
6890 if (AlignedVars.size()) {
6891 InsertPointTy IP = Builder.saveIP();
6892 for (auto &AlignedItem : AlignedVars) {
6893 Value *AlignedPtr = AlignedItem.first;
6894 Value *Alignment = AlignedItem.second;
6895 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
6896 Builder.SetInsertPoint(loadInst->getNextNode());
6897 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
6898 Alignment);
6899 }
6900 Builder.restoreIP(IP);
6901 }
6902
6903 if (IfCond) {
6904 ValueToValueMapTy VMap;
6905 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
6906 }
6907
6909
6910 // Get the basic blocks from the loop in which memref instructions
6911 // can be found.
6912 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
6913 // preferably without running any passes.
6914 for (BasicBlock *Block : L->getBlocks()) {
6915 if (Block == CanonicalLoop->getCond() ||
6916 Block == CanonicalLoop->getHeader())
6917 continue;
6918 Reachable.insert(Block);
6919 }
6920
6921 SmallVector<Metadata *> LoopMDList;
6922
6923 // In presence of finite 'safelen', it may be unsafe to mark all
6924 // the memory instructions parallel, because loop-carried
6925 // dependences of 'safelen' iterations are possible.
6926 // If clause order(concurrent) is specified then the memory instructions
6927 // are marked parallel even if 'safelen' is finite.
6928 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent))
6929 applyParallelAccessesMetadata(CanonicalLoop, Ctx, L, LI, LoopMDList);
6930
6931 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
6932 // versions so we can't add the loop attributes in that case.
6933 if (IfCond) {
6934 // we can still add llvm.loop.parallel_access
6935 addLoopMetadata(CanonicalLoop, LoopMDList);
6936 return;
6937 }
6938
6939 // Use the above access group metadata to create loop level
6940 // metadata, which should be distinct for each loop.
6941 ConstantAsMetadata *BoolConst =
6943 LoopMDList.push_back(MDNode::get(
6944 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
6945
6946 if (Simdlen || Safelen) {
6947 // If both simdlen and safelen clauses are specified, the value of the
6948 // simdlen parameter must be less than or equal to the value of the safelen
6949 // parameter. Therefore, use safelen only in the absence of simdlen.
6950 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
6951 LoopMDList.push_back(
6952 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
6953 ConstantAsMetadata::get(VectorizeWidth)}));
6954 }
6955
6956 addLoopMetadata(CanonicalLoop, LoopMDList);
6957}
6958
6959/// Create the TargetMachine object to query the backend for optimization
6960/// preferences.
6961///
6962/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
6963/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
6964/// needed for the LLVM pass pipeline. We use some default options to avoid
6965/// having to pass too many settings from the frontend that probably do not
6966/// matter.
6967///
6968/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
6969/// method. If we are going to use TargetMachine for more purposes, especially
6970/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
6971/// might be worth requiring front-ends to pass on their TargetMachine,
6972/// or at least cache it between methods. Note that while front-ends such as Clang
6973/// have just a single main TargetMachine per translation unit, "target-cpu" and
6974/// "target-features" that determine the TargetMachine are per-function and can
6975/// be overridden using __attribute__((target("OPTIONS"))).
6976static std::unique_ptr<TargetMachine>
6978 Module *M = F->getParent();
6979
6980 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
6981 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
6982 const llvm::Triple &Triple = M->getTargetTriple();
6983
6984 std::string Error;
6986 if (!TheTarget)
6987 return {};
6988
6990 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
6991 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
6992 /*CodeModel=*/std::nullopt, OptLevel));
6993}
6994
6995/// Heuristically determine the best-performant unroll factor for \p CLI. This
6996/// depends on the target processor. We are re-using the same heuristics as the
6997/// LoopUnrollPass.
6999 Function *F = CLI->getFunction();
7000
7001 // Assume the user requests the most aggressive unrolling, even if the rest of
7002 // the code is optimized using a lower setting.
7004 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
7005
7007 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
7008 FAM.registerPass([]() { return AssumptionAnalysis(); });
7009 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
7010 FAM.registerPass([]() { return LoopAnalysis(); });
7011 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
7012 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
7013 TargetIRAnalysis TIRA;
7014 if (TM)
7015 TIRA = TargetIRAnalysis(
7016 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
7017 FAM.registerPass([&]() { return TIRA; });
7018
7019 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
7021 ScalarEvolution &&SE = SEA.run(*F, FAM);
7023 DominatorTree &&DT = DTA.run(*F, FAM);
7024 LoopAnalysis LIA;
7025 LoopInfo &&LI = LIA.run(*F, FAM);
7027 AssumptionCache &&AC = ACT.run(*F, FAM);
7029
7030 Loop *L = LI.getLoopFor(CLI->getHeader());
7031 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
7032
7034 L, SE, TTI,
7035 /*BlockFrequencyInfo=*/nullptr,
7036 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
7037 /*UserThreshold=*/std::nullopt,
7038 /*UserCount=*/std::nullopt,
7039 /*UserAllowPartial=*/true,
7040 /*UserAllowRuntime=*/true,
7041 /*UserUpperBound=*/std::nullopt,
7042 /*UserFullUnrollMaxCount=*/std::nullopt);
7043
7044 UP.Force = true;
7045
7046 // Account for additional optimizations taking place before the LoopUnrollPass
7047 // would unroll the loop.
7050
7051 // Use normal unroll factors even if the rest of the code is optimized for
7052 // size.
7055
7056 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
7057 << " Threshold=" << UP.Threshold << "\n"
7058 << " PartialThreshold=" << UP.PartialThreshold << "\n"
7059 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
7060 << " PartialOptSizeThreshold="
7061 << UP.PartialOptSizeThreshold << "\n");
7062
7063 // Disable peeling.
7066 /*UserAllowPeeling=*/false,
7067 /*UserAllowProfileBasedPeeling=*/false,
7068 /*UnrollingSpecficValues=*/false);
7069
7071 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
7072
7073 // Assume that reads and writes to stack variables can be eliminated by
7074 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
7075 // size.
7076 for (BasicBlock *BB : L->blocks()) {
7077 for (Instruction &I : *BB) {
7078 Value *Ptr;
7079 if (auto *Load = dyn_cast<LoadInst>(&I)) {
7080 Ptr = Load->getPointerOperand();
7081 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
7082 Ptr = Store->getPointerOperand();
7083 } else
7084 continue;
7085
7086 Ptr = Ptr->stripPointerCasts();
7087
7088 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
7089 if (Alloca->getParent() == &F->getEntryBlock())
7090 EphValues.insert(&I);
7091 }
7092 }
7093 }
7094
7095 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
7096
7097 // Loop is not unrollable if the loop contains certain instructions.
7098 if (!UCE.canUnroll()) {
7099 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
7100 return 1;
7101 }
7102
7103 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
7104 << "\n");
7105
7106 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
7107 // be able to use it.
7108 int TripCount = 0;
7109 int MaxTripCount = 0;
7110 bool MaxOrZero = false;
7111 unsigned TripMultiple = 0;
7112
7113 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
7114 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP);
7115 unsigned Factor = UP.Count;
7116 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
7117
7118 // This function returns 1 to signal to not unroll a loop.
7119 if (Factor == 0)
7120 return 1;
7121 return Factor;
7122}
7123
7125 int32_t Factor,
7126 CanonicalLoopInfo **UnrolledCLI) {
7127 assert(Factor >= 0 && "Unroll factor must not be negative");
7128
7129 Function *F = Loop->getFunction();
7130 LLVMContext &Ctx = F->getContext();
7131
7132 // If the unrolled loop is not used for another loop-associated directive, it
7133 // is sufficient to add metadata for the LoopUnrollPass.
7134 if (!UnrolledCLI) {
7135 SmallVector<Metadata *, 2> LoopMetadata;
7136 LoopMetadata.push_back(
7137 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
7138
7139 if (Factor >= 1) {
7141 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
7142 LoopMetadata.push_back(MDNode::get(
7143 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
7144 }
7145
7146 addLoopMetadata(Loop, LoopMetadata);
7147 return;
7148 }
7149
7150 // Heuristically determine the unroll factor.
7151 if (Factor == 0)
7153
7154 // No change required with unroll factor 1.
7155 if (Factor == 1) {
7156 *UnrolledCLI = Loop;
7157 return;
7158 }
7159
7160 assert(Factor >= 2 &&
7161 "unrolling only makes sense with a factor of 2 or larger");
7162
7163 Type *IndVarTy = Loop->getIndVarType();
7164
7165 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
7166 // unroll the inner loop.
7167 Value *FactorVal =
7168 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
7169 /*isSigned=*/false));
7170 std::vector<CanonicalLoopInfo *> LoopNest =
7171 tileLoops(DL, {Loop}, {FactorVal});
7172 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
7173 *UnrolledCLI = LoopNest[0];
7174 CanonicalLoopInfo *InnerLoop = LoopNest[1];
7175
7176 // LoopUnrollPass can only fully unroll loops with constant trip count.
7177 // Unroll by the unroll factor with a fallback epilog for the remainder
7178 // iterations if necessary.
7180 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
7182 InnerLoop,
7183 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
7185 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
7186
7187#ifndef NDEBUG
7188 (*UnrolledCLI)->assertOK();
7189#endif
7190}
7191
7194 llvm::Value *BufSize, llvm::Value *CpyBuf,
7195 llvm::Value *CpyFn, llvm::Value *DidIt) {
7196 if (!updateToLocation(Loc))
7197 return Loc.IP;
7198
7199 uint32_t SrcLocStrSize;
7200 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7201 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7202 Value *ThreadId = getOrCreateThreadID(Ident);
7203
7204 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
7205
7206 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
7207
7208 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
7209 createRuntimeFunctionCall(Fn, Args);
7210
7211 return Builder.saveIP();
7212}
7213
7215 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7216 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
7218
7219 if (!updateToLocation(Loc))
7220 return Loc.IP;
7221
7222 // If needed allocate and initialize `DidIt` with 0.
7223 // DidIt: flag variable: 1=single thread; 0=not single thread.
7224 llvm::Value *DidIt = nullptr;
7225 if (!CPVars.empty()) {
7226 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
7227 Builder.CreateStore(Builder.getInt32(0), DidIt);
7228 }
7229
7230 Directive OMPD = Directive::OMPD_single;
7231 uint32_t SrcLocStrSize;
7232 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7233 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7234 Value *ThreadId = getOrCreateThreadID(Ident);
7235 Value *Args[] = {Ident, ThreadId};
7236
7237 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
7238 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7239
7240 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
7241 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7242
7243 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
7244 if (Error Err = FiniCB(IP))
7245 return Err;
7246
7247 // The thread that executes the single region must set `DidIt` to 1.
7248 // This is used by __kmpc_copyprivate, to know if the caller is the
7249 // single thread or not.
7250 if (DidIt)
7251 Builder.CreateStore(Builder.getInt32(1), DidIt);
7252
7253 return Error::success();
7254 };
7255
7256 // generates the following:
7257 // if (__kmpc_single()) {
7258 // .... single region ...
7259 // __kmpc_end_single
7260 // }
7261 // __kmpc_copyprivate
7262 // __kmpc_barrier
7263
7264 InsertPointOrErrorTy AfterIP =
7265 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
7266 /*Conditional*/ true,
7267 /*hasFinalize*/ true);
7268 if (!AfterIP)
7269 return AfterIP.takeError();
7270
7271 if (DidIt) {
7272 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
7273 // NOTE BufSize is currently unused, so just pass 0.
7275 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
7276 CPFuncs[I], DidIt);
7277 // NOTE __kmpc_copyprivate already inserts a barrier
7278 } else if (!IsNowait) {
7279 InsertPointOrErrorTy AfterIP =
7281 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
7282 /* CheckCancelFlag */ false);
7283 if (!AfterIP)
7284 return AfterIP.takeError();
7285 }
7286 return Builder.saveIP();
7287}
7288
7290 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7291 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
7292
7293 if (!updateToLocation(Loc))
7294 return Loc.IP;
7295
7296 Directive OMPD = Directive::OMPD_critical;
7297 uint32_t SrcLocStrSize;
7298 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7299 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7300 Value *ThreadId = getOrCreateThreadID(Ident);
7301 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
7302 Value *Args[] = {Ident, ThreadId, LockVar};
7303
7304 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
7305 Function *RTFn = nullptr;
7306 if (HintInst) {
7307 // Add Hint to entry Args and create call
7308 EnterArgs.push_back(HintInst);
7309 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
7310 } else {
7311 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
7312 }
7313 Instruction *EntryCall = createRuntimeFunctionCall(RTFn, EnterArgs);
7314
7315 Function *ExitRTLFn =
7316 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
7317 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7318
7319 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7320 /*Conditional*/ false, /*hasFinalize*/ true);
7321}
7322
7325 InsertPointTy AllocaIP, unsigned NumLoops,
7326 ArrayRef<llvm::Value *> StoreValues,
7327 const Twine &Name, bool IsDependSource) {
7328 assert(
7329 llvm::all_of(StoreValues,
7330 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
7331 "OpenMP runtime requires depend vec with i64 type");
7332
7333 if (!updateToLocation(Loc))
7334 return Loc.IP;
7335
7336 // Allocate space for vector and generate alloc instruction.
7337 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
7338 Builder.restoreIP(AllocaIP);
7339 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
7340 ArgsBase->setAlignment(Align(8));
7342
7343 // Store the index value with offset in depend vector.
7344 for (unsigned I = 0; I < NumLoops; ++I) {
7345 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
7346 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
7347 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
7348 STInst->setAlignment(Align(8));
7349 }
7350
7351 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
7352 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
7353
7354 uint32_t SrcLocStrSize;
7355 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7356 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7357 Value *ThreadId = getOrCreateThreadID(Ident);
7358 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
7359
7360 Function *RTLFn = nullptr;
7361 if (IsDependSource)
7362 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
7363 else
7364 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
7365 createRuntimeFunctionCall(RTLFn, Args);
7366
7367 return Builder.saveIP();
7368}
7369
7371 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7372 FinalizeCallbackTy FiniCB, bool IsThreads) {
7373 if (!updateToLocation(Loc))
7374 return Loc.IP;
7375
7376 Directive OMPD = Directive::OMPD_ordered;
7377 Instruction *EntryCall = nullptr;
7378 Instruction *ExitCall = nullptr;
7379
7380 if (IsThreads) {
7381 uint32_t SrcLocStrSize;
7382 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7383 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7384 Value *ThreadId = getOrCreateThreadID(Ident);
7385 Value *Args[] = {Ident, ThreadId};
7386
7387 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
7388 EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7389
7390 Function *ExitRTLFn =
7391 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
7392 ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7393 }
7394
7395 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7396 /*Conditional*/ false, /*hasFinalize*/ true);
7397}
7398
/// Emit an OpenMP directive as an inlined region: the entry call, body and
/// exit call are all materialized in the current function (no outlining).
///
/// The current insertion block is split into entry / finalization / exit
/// parts.  \p EntryCall is emitted by emitCommonDirectiveEntry (guarded by
/// its return value when \p Conditional is set), the body is produced by
/// \p BodyGenCB, and \p ExitCall plus any pending finalization is emitted
/// by emitCommonDirectiveExit in the finalization block.
///
/// \param OMPD          Kind of the directive being emitted.
/// \param EntryCall     Runtime call marking region entry (may be null).
/// \param ExitCall      Runtime call marking region exit (may be null).
/// \param BodyGenCB     Callback that generates the region body.
/// \param FiniCB        Finalization callback; pushed onto
///                      FinalizationStack when \p HasFinalize is set.
/// \param Conditional   If true, only execute the body when \p EntryCall
///                      returned non-zero (see emitCommonDirectiveEntry).
/// \param HasFinalize   Whether finalization must run at region exit.
/// \param IsCancellable Whether the region is cancellable.
/// \returns the insertion point after the region, or the error produced by
///          \p BodyGenCB.
7399OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
7400 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
7401 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
7402 bool HasFinalize, bool IsCancellable) {
7403
7404 if (HasFinalize)
7405 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
7406
7407 // Create inlined region's entry and body blocks, in preparation
7408 // for conditional creation
7409 BasicBlock *EntryBB = Builder.GetInsertBlock();
7410 Instruction *SplitPos = EntryBB->getTerminator();
// If the entry block does not end in a branch, insert a temporary
// unreachable terminator so the block can be split; it is erased below.
7411 if (!isa_and_nonnull<BranchInst>(SplitPos))
7412 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
7413 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
7414 BasicBlock *FiniBB =
7415 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
7416
7417 Builder.SetInsertPoint(EntryBB->getTerminator());
7418 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
7419
7420 // generate body
7421 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
7422 /* CodeGenIP */ Builder.saveIP()))
7423 return Err;
7424
7425 // emit exit call and do any needed finalization.
7426 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
7427 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
7428 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
7429 "Unexpected control flow graph state!!");
7430 InsertPointOrErrorTy AfterIP =
7431 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
7432 if (!AfterIP)
7433 return AfterIP.takeError();
7434
7435 // If we are skipping the region of a non conditional, remove the exit
7436 // block, and clear the builder's insertion point.
7437 assert(SplitPos->getParent() == ExitBB &&
7438 "Unexpected Insertion point location!");
// Try to fold ExitBB into its predecessor; whichever block survives
// becomes the new insertion block.
7439 auto merged = MergeBlockIntoPredecessor(ExitBB);
7440 BasicBlock *ExitPredBB = SplitPos->getParent();
7441 auto InsertBB = merged ? ExitPredBB : ExitBB;
// Erase the temporary unreachable terminator created above, if any.
7442 if (!isa_and_nonnull<BranchInst>(SplitPos))
7443 SplitPos->eraseFromParent();
7444 Builder.SetInsertPoint(InsertBB);
7445
7446 return Builder.saveIP();
7447}
7448
// Emit the guarded entry of a directive region: when \p Conditional is set,
// branch on the (non-null) result of \p EntryCall into a new "omp_region.body"
// block, otherwise fall through to \p ExitBB. Returns an insertion point in
// ExitBB. No-op (returns the current IP) if not conditional or no entry call.
// NOTE(review): leading "NNNN" tokens are extraction residue, not program text.
7449OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
7450 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
7451 // if nothing to do, Return current insertion point.
7452 if (!Conditional || !EntryCall)
7453 return Builder.saveIP();
7454
7455 BasicBlock *EntryBB = Builder.GetInsertBlock();
// The runtime entry call's truthiness decides whether this thread runs the body.
7456 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
7457 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
// Temporary unreachable acts as a placeholder terminator for ThenBB.
7458 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
7459
7460 // Emit thenBB and set the Builder's insertion point there for
7461 // body generation next. Place the block after the current block.
7462 Function *CurFn = EntryBB->getParent();
7463 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
7464
7465 // Move Entry branch to end of ThenBB, and replace with conditional
7466 // branch (If-stmt)
7467 Instruction *EntryBBTI = EntryBB->getTerminator();
7468 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
7469 EntryBBTI->removeFromParent();
7470 Builder.SetInsertPoint(UI);
7471 Builder.Insert(EntryBBTI);
7472 UI->eraseFromParent();
7473 Builder.SetInsertPoint(ThenBB->getTerminator());
7474
7475 // return an insertion point to ExitBB.
7476 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
7477}
7478
// Emit the exit of a directive region: run any registered finalization
// (popping the finalization stack) in the block at \p FinIP, then re-insert
// \p ExitCall as the last instruction before that block's terminator.
// Returns the insertion point at the exit call (or current IP if none).
// NOTE(review): leading "NNNN" tokens are extraction residue, not program text.
7479OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
7480 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
7481 bool HasFinalize) {
7482
7483 Builder.restoreIP(FinIP);
7484
7485 // If there is finalization to do, emit it before the exit call
7486 if (HasFinalize) {
7487 assert(!FinalizationStack.empty() &&
7488 "Unexpected finalization stack state!");
7489
// Pop the entry pushed by EmitOMPInlinedRegion; it must match this directive.
7490 FinalizationInfo Fi = FinalizationStack.pop_back_val();
7491 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
7492
7493 if (Error Err = Fi.mergeFiniBB(Builder, FinIP.getBlock()))
7494 return std::move(Err);
7495
7496 // Exit condition: insertion point is before the terminator of the new Fini
7497 // block
7498 Builder.SetInsertPoint(FinIP.getBlock()->getTerminator());
7499 }
7500
7501 if (!ExitCall)
7502 return Builder.saveIP();
7503
7504 // place the Exitcall as last instruction before Finalization block terminator
7505 ExitCall->removeFromParent();
7506 Builder.Insert(ExitCall);
7507
7508 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
7509 ExitCall->getIterator());
7510}
7511
// Builds the CFG for a copyin clause: compares master vs. private address
// (as integers of \p IntPtrTy) and branches into "copyin.not.master" only
// when they differ; returns an IP inside the copy block.
// NOTE(review): the function head (return type + qualified name) and at least
// one statement were dropped by the documentation extraction (stripped
// hyperlinked tokens) — verify against upstream LLVM before editing. The
// leading "NNNN" tokens are extraction residue, not program text.
7513 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
7514 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
7515 if (!IP.isSet())
7516 return IP;
7517
// NOTE(review): a statement (source line 7518) is missing here due to
// extraction loss.
7519
7520 // creates the following CFG structure
7521 // OMP_Entry : (MasterAddr != PrivateAddr)?
7522 // F T
7523 // | \
7524 // | copin.not.master
7525 // | /
7526 // v /
7527 // copyin.not.master.end
7528 // |
7529 // v
7530 // OMP.Entry.Next
7531
7532 BasicBlock *OMP_Entry = IP.getBlock();
7533 Function *CurFn = OMP_Entry->getParent();
7534 BasicBlock *CopyBegin =
7535 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
7536 BasicBlock *CopyEnd = nullptr;
7537
7538 // If entry block is terminated, split to preserve the branch to following
7539 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
7540 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
7541 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
7542 "copyin.not.master.end");
7543 OMP_Entry->getTerminator()->eraseFromParent();
7544 } else {
7545 CopyEnd =
7546 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
7547 }
7548
// Compare the two addresses as integers; only non-master threads (different
// address) enter the copy block.
7549 Builder.SetInsertPoint(OMP_Entry);
7550 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
7551 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
7552 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
7553 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
7554
7555 Builder.SetInsertPoint(CopyBegin);
7556 if (BranchtoEnd)
7557 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
7558
7559 return Builder.saveIP();
7560}
7561
// Emits a call to __kmpc_alloc(thread_id, Size, Allocator).
// NOTE(review): the function head and two statements were dropped by the
// documentation extraction — verify against upstream LLVM. Leading "NNNN"
// tokens are extraction residue, not program text.
7563 Value *Size, Value *Allocator,
7564 std::string Name) {
// NOTE(review): source lines 7565-7566 are missing here (extraction loss).
7567
7568 uint32_t SrcLocStrSize;
7569 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7570 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7571 Value *ThreadId = getOrCreateThreadID(Ident);
7572 Value *Args[] = {ThreadId, Size, Allocator};
7573
7574 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
7575
7576 return createRuntimeFunctionCall(Fn, Args, Name);
7577}
7578
// Emits a call to __kmpc_free(thread_id, Addr, Allocator), the counterpart of
// the __kmpc_alloc emitter above.
// NOTE(review): the function head and two statements were dropped by the
// documentation extraction — verify against upstream LLVM. Leading "NNNN"
// tokens are extraction residue, not program text.
7580 Value *Addr, Value *Allocator,
7581 std::string Name) {
// NOTE(review): source lines 7582-7583 are missing here (extraction loss).
7584
7585 uint32_t SrcLocStrSize;
7586 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7587 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7588 Value *ThreadId = getOrCreateThreadID(Ident);
7589 Value *Args[] = {ThreadId, Addr, Allocator};
7590 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
7591 return createRuntimeFunctionCall(Fn, Args, Name);
7592}
7593
// Emits a call to __tgt_interop_init. Null Device defaults to -1 (all-ones),
// and a null NumDependences defaults to 0 with a null dependence address.
// NOTE(review): the function head and some statements were dropped by the
// documentation extraction — verify against upstream LLVM. Leading "NNNN"
// tokens are extraction residue, not program text.
7595 const LocationDescription &Loc, Value *InteropVar,
7596 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
7597 Value *DependenceAddress, bool HaveNowaitClause) {
// NOTE(review): source lines 7598-7599 are missing here (extraction loss).
7600
7601 uint32_t SrcLocStrSize;
7602 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7603 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7604 Value *ThreadId = getOrCreateThreadID(Ident);
7605 if (Device == nullptr)
7606 Device = Constant::getAllOnesValue(Int32);
7607 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
7608 if (NumDependences == nullptr) {
7609 NumDependences = ConstantInt::get(Int32, 0);
7610 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7611 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7612 }
7613 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7614 Value *Args[] = {
7615 Ident, ThreadId, InteropVar, InteropTypeVal,
7616 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
7617
7618 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
7619
7620 return createRuntimeFunctionCall(Fn, Args);
7621}
7622
// Emits a call to __tgt_interop_destroy, with the same defaulting of Device
// and NumDependences/DependenceAddress as the interop-init emitter above.
// NOTE(review): the function head and some statements were dropped by the
// documentation extraction — verify against upstream LLVM. Leading "NNNN"
// tokens are extraction residue, not program text.
7624 const LocationDescription &Loc, Value *InteropVar, Value *Device,
7625 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
// NOTE(review): source lines 7626-7627 are missing here (extraction loss).
7628
7629 uint32_t SrcLocStrSize;
7630 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7631 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7632 Value *ThreadId = getOrCreateThreadID(Ident);
7633 if (Device == nullptr)
7634 Device = Constant::getAllOnesValue(Int32);
7635 if (NumDependences == nullptr) {
7636 NumDependences = ConstantInt::get(Int32, 0);
7637 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7638 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7639 }
7640 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7641 Value *Args[] = {
7642 Ident, ThreadId, InteropVar, Device,
7643 NumDependences, DependenceAddress, HaveNowaitClauseVal};
7644
7645 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
7646
7647 return createRuntimeFunctionCall(Fn, Args);
7648}
7649
// Emits a call to __tgt_interop_use, with the same argument defaulting as the
// interop init/destroy emitters above.
// NOTE(review): the function head and some statements were dropped by the
// documentation extraction — verify against upstream LLVM. Leading "NNNN"
// tokens are extraction residue, not program text.
7651 Value *InteropVar, Value *Device,
7652 Value *NumDependences,
7653 Value *DependenceAddress,
7654 bool HaveNowaitClause) {
// NOTE(review): source lines 7655-7656 are missing here (extraction loss).
7657 uint32_t SrcLocStrSize;
7658 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7659 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7660 Value *ThreadId = getOrCreateThreadID(Ident);
7661 if (Device == nullptr)
7662 Device = Constant::getAllOnesValue(Int32);
7663 if (NumDependences == nullptr) {
7664 NumDependences = ConstantInt::get(Int32, 0);
7665 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7666 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7667 }
7668 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7669 Value *Args[] = {
7670 Ident, ThreadId, InteropVar, Device,
7671 NumDependences, DependenceAddress, HaveNowaitClauseVal};
7672
7673 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
7674
7675 return createRuntimeFunctionCall(Fn, Args);
7676}
7677
// Emits a call to __kmpc_threadprivate_cached using an internal global as the
// per-variable cache.
// NOTE(review): the function head and some statements were dropped by the
// documentation extraction — verify against upstream LLVM. Leading "NNNN"
// tokens are extraction residue, not program text.
7680 llvm::ConstantInt *Size, const llvm::Twine &Name) {
// NOTE(review): source lines 7681-7682 are missing here (extraction loss).
7683
7684 uint32_t SrcLocStrSize;
7685 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7686 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7687 Value *ThreadId = getOrCreateThreadID(Ident);
// The cache is a module-level internal variable shared across calls.
7688 Constant *ThreadPrivateCache =
7689 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
7690 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
7691
7692 Function *Fn =
7693 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
7694
7695 return createRuntimeFunctionCall(Fn, Args);
7696}
7697
// Emits the device-kernel prologue: builds the dynamic/configuration/kernel
// environment globals, calls __kmpc_target_init, and branches worker threads
// to "worker.exit" while user code continues in "user_code.entry".
// NOTE(review): the function head and a few statements were dropped by the
// documentation extraction (stripped hyperlinked tokens) — verify against
// upstream LLVM. Leading "NNNN" tokens are extraction residue, not program
// text.
7699 const LocationDescription &Loc,
7701 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
7702 "expected num_threads and num_teams to be specified");
7703
7704 if (!updateToLocation(Loc))
7705 return Loc.IP;
7706
7707 uint32_t SrcLocStrSize;
7708 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7709 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7710 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
// The generic state machine is only needed outside SPMD mode.
7711 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
7712 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
7713 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
7714 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
7715
7716 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
7717 Function *Kernel = DebugKernelWrapper;
7718
7719 // We need to strip the debug prefix to get the correct kernel name.
7720 StringRef KernelName = Kernel->getName();
7721 const std::string DebugPrefix = "_debug__";
7722 if (KernelName.ends_with(DebugPrefix)) {
7723 KernelName = KernelName.drop_back(DebugPrefix.length());
7724 Kernel = M.getFunction(KernelName);
7725 assert(Kernel && "Expected the real kernel to exist");
7726 }
7727
7728 // Manifest the launch configuration in the metadata matching the kernel
7729 // environment.
7730 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
7731 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
7732
7733 // If MaxThreads not set, select the maximum between the default workgroup
7734 // size and the MinThreads value.
7735 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
7736 if (MaxThreadsVal < 0)
7737 MaxThreadsVal = std::max(
7738 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads);
7739
7740 if (MaxThreadsVal > 0)
7741 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
7742
7743 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
7744 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
7745 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
7746 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
7747 Constant *ReductionDataSize =
7748 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
7749 Constant *ReductionBufferLength =
7750 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
7751
// NOTE(review): source line 7752 (declaration of Fn) is missing here
// (extraction loss); Fn is the __kmpc_target_init runtime function.
7753 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
7754 const DataLayout &DL = Fn->getDataLayout();
7755
7756 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
7757 Constant *DynamicEnvironmentInitializer =
7758 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
7759 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
7760 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
7761 DynamicEnvironmentInitializer, DynamicEnvironmentName,
7762 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
7763 DL.getDefaultGlobalsAddressSpace());
7764 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
7765
// Address-space-cast the global if its type differs from the expected
// pointer type.
7766 Constant *DynamicEnvironment =
7767 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
7768 ? DynamicEnvironmentGV
7769 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
7770 DynamicEnvironmentPtr);
7771
7772 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
7773 ConfigurationEnvironment, {
7774 UseGenericStateMachineVal,
7775 MayUseNestedParallelismVal,
7776 IsSPMDVal,
7777 MinThreads,
7778 MaxThreads,
7779 MinTeams,
7780 MaxTeams,
7781 ReductionDataSize,
7782 ReductionBufferLength,
7783 });
7784 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
7785 KernelEnvironment, {
7786 ConfigurationEnvironmentInitializer,
7787 Ident,
7788 DynamicEnvironment,
7789 });
7790 std::string KernelEnvironmentName =
7791 (KernelName + "_kernel_environment").str();
7792 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
7793 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
7794 KernelEnvironmentInitializer, KernelEnvironmentName,
7795 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
7796 DL.getDefaultGlobalsAddressSpace());
7797 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
7798
7799 Constant *KernelEnvironment =
7800 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
7801 ? KernelEnvironmentGV
7802 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
7803 KernelEnvironmentPtr);
7804 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
7805 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
7806 KernelLaunchEnvironment =
7807 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
7808 ? KernelLaunchEnvironment
7809 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
7810 KernelLaunchEnvParamTy);
7811 CallInst *ThreadKind = createRuntimeFunctionCall(
7812 Fn, {KernelEnvironment, KernelLaunchEnvironment});
7813
// __kmpc_target_init returns -1 for threads that should execute user code.
7814 Value *ExecUserCode = Builder.CreateICmpEQ(
7815 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
7816 "exec_user_code");
7817
7818 // ThreadKind = __kmpc_target_init(...)
7819 // if (ThreadKind == -1)
7820 // user_code
7821 // else
7822 // return;
7823
7824 auto *UI = Builder.CreateUnreachable();
7825 BasicBlock *CheckBB = UI->getParent();
7826 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
7827
7828 BasicBlock *WorkerExitBB = BasicBlock::Create(
7829 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
7830 Builder.SetInsertPoint(WorkerExitBB);
7831 Builder.CreateRetVoid();
7832
7833 auto *CheckBBTI = CheckBB->getTerminator();
7834 Builder.SetInsertPoint(CheckBBTI);
7835 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
7836
7837 CheckBBTI->eraseFromParent();
7838 UI->eraseFromParent();
7839
7840 // Continue in the "user_code" block, see diagram above and in
7841 // openmp/libomptarget/deviceRTLs/common/include/target.h .
7842 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
7843}
7844
// Emits the device-kernel epilogue (__kmpc_target_deinit) and, when a teams
// reduction is used, patches the kernel-environment global's configuration
// with the reduction data size and buffer length.
// NOTE(review): the function head and some statements were dropped by the
// documentation extraction — verify against upstream LLVM. Leading "NNNN"
// tokens are extraction residue, not program text.
7846 int32_t TeamsReductionDataSize,
7847 int32_t TeamsReductionBufferLength) {
7848 if (!updateToLocation(Loc))
7849 return;
7850
// NOTE(review): source line 7851 (declaration of Fn) is missing here
// (extraction loss).
7852 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
7853
// NOTE(review): source line 7854 (the call emission) is missing here
// (extraction loss).
7855
7856 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
7857 return;
7858
7859 Function *Kernel = Builder.GetInsertBlock()->getParent();
7860 // We need to strip the debug prefix to get the correct kernel name.
7861 StringRef KernelName = Kernel->getName();
7862 const std::string DebugPrefix = "_debug__";
7863 if (KernelName.ends_with(DebugPrefix))
7864 KernelName = KernelName.drop_back(DebugPrefix.length());
7865 auto *KernelEnvironmentGV =
7866 M.getNamedGlobal((KernelName + "_kernel_environment").str());
7867 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
7868 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
// Indices {0,7} / {0,8} address ReductionDataSize / ReductionBufferLength in
// the configuration environment struct (see createTargetInit's initializer).
7869 auto *NewInitializer = ConstantFoldInsertValueInstruction(
7870 KernelEnvironmentInitializer,
7871 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
7872 NewInitializer = ConstantFoldInsertValueInstruction(
7873 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
7874 {0, 8});
7875 KernelEnvironmentGV->setInitializer(NewInitializer);
7876}
7877
7878static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
7879 bool Min) {
7880 if (Kernel.hasFnAttribute(Name)) {
7881 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
7882 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
7883 }
7884 Kernel.addFnAttr(Name, llvm::utostr(Value));
7885}
7886
// Reads the {lower, upper} thread-count bounds recorded on a kernel: the
// "omp_target_thread_limit" attribute, clamped by target-specific attributes
// ("amdgpu-flat-work-group-size" on AMDGPU, "nvvm.maxntid" on NVPTX).
// NOTE(review): the function-head line (qualified name + parameters) was
// dropped by the documentation extraction — verify against upstream LLVM.
// Leading "NNNN" tokens are extraction residue, not program text.
7887std::pair<int32_t, int32_t>
7889 int32_t ThreadLimit =
7890 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
7891
7892 if (T.isAMDGPU()) {
7893 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
7894 if (!Attr.isValid() || !Attr.isStringAttribute())
7895 return {0, ThreadLimit};
// The attribute value has the form "LB,UB"; fall back progressively if
// either half fails to parse.
7896 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
7897 int32_t LB, UB;
7898 if (!llvm::to_integer(UBStr, UB, 10))
7899 return {0, ThreadLimit};
7900 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
7901 if (!llvm::to_integer(LBStr, LB, 10))
7902 return {0, UB};
7903 return {LB, UB};
7904 }
7905
7906 if (Kernel.hasFnAttribute("nvvm.maxntid")) {
7907 int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
7908 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
7909 }
7910 return {0, ThreadLimit};
7911}
7912
// Records the {LB, UB} thread-count bounds on a kernel: always sets
// "omp_target_thread_limit", plus the target-specific attribute
// ("amdgpu-flat-work-group-size" or "nvvm.maxntid").
// NOTE(review): the function-head line was dropped by the documentation
// extraction — verify against upstream LLVM. Leading "NNNN" tokens are
// extraction residue, not program text.
7914 Function &Kernel, int32_t LB,
7915 int32_t UB) {
7916 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
7917
7918 if (T.isAMDGPU()) {
7919 Kernel.addFnAttr("amdgpu-flat-work-group-size",
7920 llvm::utostr(LB) + "," + llvm::utostr(UB));
7921 return;
7922 }
7923
// Min=true: keep the smaller of any existing maxntid and UB.
7924 updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
7925}
7926
// Reads the {lower, upper} team-count bounds recorded on a kernel via the
// "omp_target_num_teams" attribute (lower bound is always reported as 0).
// NOTE(review): the function-head line was dropped by the documentation
// extraction — verify against upstream LLVM. Leading "NNNN" tokens are
// extraction residue, not program text.
7927std::pair<int32_t, int32_t>
7929 // TODO: Read from backend annotations if available.
7930 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
7931}
7932
// Records the {LB, UB} team-count bounds on a kernel: target-specific
// attributes for NVPTX/AMDGPU plus the generic "omp_target_num_teams".
// NOTE(review): the function-head line was dropped by the documentation
// extraction — verify against upstream LLVM. Leading "NNNN" tokens are
// extraction residue, not program text.
7934 int32_t LB, int32_t UB) {
7935 if (T.isNVPTX())
7936 if (UB > 0)
7937 Kernel.addFnAttr("nvvm.maxclusterrank", llvm::utostr(UB));
7938 if (T.isAMDGPU())
7939 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
7940
7941 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
7942}
7943
// Applies device-side attributes to an outlined target-region function
// (non-DSO-local, plus target-specific adjustments for AMDGCN/NVPTX/SPIRV).
// NOTE(review): several statements (the bodies of the per-target branches)
// were dropped by the documentation extraction — verify against upstream
// LLVM. Leading "NNNN" tokens are extraction residue, not program text.
7944void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
7945 Function *OutlinedFn) {
7946 if (Config.isTargetDevice()) {
7948 // TODO: Determine if DSO local can be set to true.
7949 OutlinedFn->setDSOLocal(false);
// NOTE(review): the statements executed for each target below (source lines
// 7952, 7954, 7956) are missing due to extraction loss.
7951 if (T.isAMDGCN())
7953 else if (T.isNVPTX())
7955 else if (T.isSPIRV())
7957 }
7958}
7959
7960Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
7961 StringRef EntryFnIDName) {
7962 if (Config.isTargetDevice()) {
7963 assert(OutlinedFn && "The outlined function must exist if embedded");
7964 return OutlinedFn;
7965 }
7966
7967 return new GlobalVariable(
7968 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
7969 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
7970}
7971
7972Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
7973 StringRef EntryFnName) {
7974 if (OutlinedFn)
7975 return OutlinedFn;
7976
7977 assert(!M.getGlobalVariable(EntryFnName, true) &&
7978 "Named kernel already exists?");
7979 return new GlobalVariable(
7980 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
7981 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
7982}
7983
// Generates (via callback) and optionally registers an outlined target-region
// function, producing both the function and its region ID.
// NOTE(review): the function-head line was dropped by the documentation
// extraction — verify against upstream LLVM. Leading "NNNN" tokens are
// extraction residue, not program text.
7985 TargetRegionEntryInfo &EntryInfo,
7986 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
7987 Function *&OutlinedFn, Constant *&OutlinedFnID) {
7988
7989 SmallString<64> EntryFnName;
7990 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
7991
// Only generate the function when on the device, or when offload is not
// mandatory on the host.
7992 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
7993 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
7994 if (!CBResult)
7995 return CBResult.takeError();
7996 OutlinedFn = *CBResult;
7997 } else {
7998 OutlinedFn = nullptr;
7999 }
8000
8001 // If this target outline function is not an offload entry, we don't need to
8002 // register it. This may be in the case of a false if clause, or if there are
8003 // no OpenMP targets.
8004 if (!IsOffloadEntry)
8005 return Error::success();
8006
8007 std::string EntryFnIDName =
8008 Config.isTargetDevice()
8009 ? std::string(EntryFnName)
8010 : createPlatformSpecificName({EntryFnName, "region_id"});
8011
8012 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
8013 EntryFnName, EntryFnIDName);
8014 return Error::success();
8015}
8016
// Applies device attributes to an outlined target function (if any), creates
// its ID and entry address, registers the entry with the offload-info
// manager, and returns the ID.
// NOTE(review): the function head and the final argument of the register call
// were dropped by the documentation extraction — verify against upstream
// LLVM. Leading "NNNN" tokens are extraction residue, not program text.
8018 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
8019 StringRef EntryFnName, StringRef EntryFnIDName) {
8020 if (OutlinedFn)
8021 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
8022 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
8023 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
8024 OffloadInfoManager.registerTargetRegionEntryInfo(
8025 EntryInfo, EntryAddr, OutlinedFnID,
// NOTE(review): source line 8026 (the entry-kind argument) is missing here
// (extraction loss).
8027 return OutlinedFnID;
8028}
8029
// Emits an OpenMP target-data region: builds the offloading arrays, calls
// __tgt_target_data_begin_mapper / _end_mapper (or a standalone mapper call,
// possibly wrapped in a target task for nowait), and runs the body callback
// in the Priv/NoPriv/DupNoPriv variants, honoring an optional if-clause.
// NOTE(review): the function head and a few statements were dropped by the
// documentation extraction (stripped hyperlinked tokens) — verify against
// upstream LLVM. Leading "NNNN" tokens are extraction residue, not program
// text.
8031 const LocationDescription &Loc, InsertPointTy AllocaIP,
8032 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
8033 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
8034 CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc,
8036 BodyGenTy BodyGenType)>
8037 BodyGenCB,
8038 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
8039 if (!updateToLocation(Loc))
8040 return InsertPointTy();
8041
8042 Builder.restoreIP(CodeGenIP);
8043
// With no body callback this is a standalone directive (enter/exit/update).
8044 bool IsStandAlone = !BodyGenCB;
8045 MapInfosTy *MapInfo;
8046 // Generate the code for the opening of the data environment. Capture all the
8047 // arguments of the runtime call by reference because they are used in the
8048 // closing of the region.
8049 auto BeginThenGen = [&](InsertPointTy AllocaIP,
8050 InsertPointTy CodeGenIP) -> Error {
8051 MapInfo = &GenMapInfoCB(Builder.saveIP());
8052 if (Error Err = emitOffloadingArrays(
8053 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
8054 /*IsNonContiguous=*/true, DeviceAddrCB))
8055 return Err;
8056
8057 TargetDataRTArgs RTArgs;
// NOTE(review): source line 8058 (emitOffloadingArraysArgument call) is
// missing here (extraction loss).
8059
8060 // Emit the number of elements in the offloading arrays.
8061 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
8062
8063 // Source location for the ident struct
8064 if (!SrcLocInfo) {
8065 uint32_t SrcLocStrSize;
8066 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8067 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8068 }
8069
8070 SmallVector<llvm::Value *, 13> OffloadingArgs = {
8071 SrcLocInfo, DeviceID,
8072 PointerNum, RTArgs.BasePointersArray,
8073 RTArgs.PointersArray, RTArgs.SizesArray,
8074 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
8075 RTArgs.MappersArray};
8076
8077 if (IsStandAlone) {
8078 assert(MapperFunc && "MapperFunc missing for standalone target data");
8079
8080 auto TaskBodyCB = [&](Value *, Value *,
// NOTE(review): the remainder of this lambda signature (source line 8081)
// is missing here (extraction loss).
8082 if (Info.HasNoWait) {
8083 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
// NOTE(review): source lines 8084-8086 (remaining appended arguments) are
// missing here (extraction loss).
8087 }
8088
// NOTE(review): source line 8089 (the mapper runtime call) is missing here
// (extraction loss).
8090 OffloadingArgs);
8091
8092 if (Info.HasNoWait) {
8093 BasicBlock *OffloadContBlock =
8094 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
8095 Function *CurFn = Builder.GetInsertBlock()->getParent();
8096 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
8097 Builder.restoreIP(Builder.saveIP());
8098 }
8099 return Error::success();
8100 };
8101
// nowait requires wrapping the runtime call in an outer target task.
8102 bool RequiresOuterTargetTask = Info.HasNoWait;
8103 if (!RequiresOuterTargetTask)
8104 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
8105 /*TargetTaskAllocaIP=*/{}));
8106 else
8107 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
8108 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
8109 } else {
8110 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
8111 omp::OMPRTL___tgt_target_data_begin_mapper);
8112
8113 createRuntimeFunctionCall(BeginMapperFunc, OffloadingArgs);
8114
8115 for (auto DeviceMap : Info.DevicePtrInfoMap) {
8116 if (isa<AllocaInst>(DeviceMap.second.second)) {
8117 auto *LI =
8118 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
8119 Builder.CreateStore(LI, DeviceMap.second.second);
8120 }
8121 }
8122
8123 // If device pointer privatization is required, emit the body of the
8124 // region here. It will have to be duplicated: with and without
8125 // privatization.
8126 InsertPointOrErrorTy AfterIP =
8127 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
8128 if (!AfterIP)
8129 return AfterIP.takeError();
8130 Builder.restoreIP(*AfterIP);
8131 }
8132 return Error::success();
8133 };
8134
8135 // If we need device pointer privatization, we need to emit the body of the
8136 // region with no privatization in the 'else' branch of the conditional.
8137 // Otherwise, we don't have to do anything.
8138 auto BeginElseGen = [&](InsertPointTy AllocaIP,
8139 InsertPointTy CodeGenIP) -> Error {
8140 InsertPointOrErrorTy AfterIP =
8141 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
8142 if (!AfterIP)
8143 return AfterIP.takeError();
8144 Builder.restoreIP(*AfterIP);
8145 return Error::success();
8146 };
8147
8148 // Generate code for the closing of the data region.
8149 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
8150 TargetDataRTArgs RTArgs;
8151 Info.EmitDebug = !MapInfo->Names.empty();
8152 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
8153
8154 // Emit the number of elements in the offloading arrays.
8155 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
8156
8157 // Source location for the ident struct
8158 if (!SrcLocInfo) {
8159 uint32_t SrcLocStrSize;
8160 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8161 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8162 }
8163
8164 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
8165 PointerNum, RTArgs.BasePointersArray,
8166 RTArgs.PointersArray, RTArgs.SizesArray,
8167 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
8168 RTArgs.MappersArray};
8169 Function *EndMapperFunc =
8170 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
8171
8172 createRuntimeFunctionCall(EndMapperFunc, OffloadingArgs);
8173 return Error::success();
8174 };
8175
8176 // We don't have to do anything to close the region if the if clause evaluates
8177 // to false.
8178 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
8179 return Error::success();
8180 };
8181
8182 Error Err = [&]() -> Error {
8183 if (BodyGenCB) {
8184 Error Err = [&]() {
8185 if (IfCond)
8186 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
8187 return BeginThenGen(AllocaIP, Builder.saveIP());
8188 }();
8189
8190 if (Err)
8191 return Err;
8192
8193 // If we don't require privatization of device pointers, we emit the body
8194 // in between the runtime calls. This avoids duplicating the body code.
8195 InsertPointOrErrorTy AfterIP =
8196 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
8197 if (!AfterIP)
8198 return AfterIP.takeError();
8199 restoreIPandDebugLoc(Builder, *AfterIP);
8200
8201 if (IfCond)
8202 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
8203 return EndThenGen(AllocaIP, Builder.saveIP());
8204 }
8205 if (IfCond)
8206 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
8207 return BeginThenGen(AllocaIP, Builder.saveIP());
8208 }();
8209
8210 if (Err)
8211 return Err;
8212
8213 return Builder.saveIP();
8214}
8215
// Selects the __kmpc_{for,distribute}_static_init_{4,4u,8,8u} runtime
// function matching the IV size/signedness and distribute flag.
// NOTE(review): the function-head line was dropped by the documentation
// extraction — verify against upstream LLVM. Leading "NNNN" tokens are
// extraction residue, not program text.
8218 bool IsGPUDistribute) {
8219 assert((IVSize == 32 || IVSize == 64) &&
8220 "IV size is not compatible with the omp runtime");
8221 RuntimeFunction Name;
8222 if (IsGPUDistribute)
8223 Name = IVSize == 32
8224 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
8225 : omp::OMPRTL___kmpc_distribute_static_init_4u)
8226 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
8227 : omp::OMPRTL___kmpc_distribute_static_init_8u);
8228 else
8229 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
8230 : omp::OMPRTL___kmpc_for_static_init_4u)
8231 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
8232 : omp::OMPRTL___kmpc_for_static_init_8u);
8233
8234 return getOrCreateRuntimeFunction(M, Name);
8235}
8236
// Selects the __kmpc_dispatch_init_{4,4u,8,8u} runtime function matching the
// IV size and signedness.
// NOTE(review): the function-head line was dropped by the documentation
// extraction — verify against upstream LLVM. Leading "NNNN" tokens are
// extraction residue, not program text.
8238 bool IVSigned) {
8239 assert((IVSize == 32 || IVSize == 64) &&
8240 "IV size is not compatible with the omp runtime");
8241 RuntimeFunction Name = IVSize == 32
8242 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
8243 : omp::OMPRTL___kmpc_dispatch_init_4u)
8244 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
8245 : omp::OMPRTL___kmpc_dispatch_init_8u);
8246
8247 return getOrCreateRuntimeFunction(M, Name);
8248}
8249
// Selects the __kmpc_dispatch_next_{4,4u,8,8u} runtime function matching the
// IV size and signedness.
// NOTE(review): the function-head line was dropped by the documentation
// extraction — verify against upstream LLVM. Leading "NNNN" tokens are
// extraction residue, not program text.
8251 bool IVSigned) {
8252 assert((IVSize == 32 || IVSize == 64) &&
8253 "IV size is not compatible with the omp runtime");
8254 RuntimeFunction Name = IVSize == 32
8255 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
8256 : omp::OMPRTL___kmpc_dispatch_next_4u)
8257 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
8258 : omp::OMPRTL___kmpc_dispatch_next_8u);
8259
8260 return getOrCreateRuntimeFunction(M, Name);
8261}
8262
// Selects the __kmpc_dispatch_fini_{4,4u,8,8u} runtime function matching the
// IV size and signedness.
// NOTE(review): the function-head line was dropped by the documentation
// extraction — verify against upstream LLVM. Leading "NNNN" tokens are
// extraction residue, not program text.
8264 bool IVSigned) {
8265 assert((IVSize == 32 || IVSize == 64) &&
8266 "IV size is not compatible with the omp runtime");
8267 RuntimeFunction Name = IVSize == 32
8268 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
8269 : omp::OMPRTL___kmpc_dispatch_fini_4u)
8270 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
8271 : omp::OMPRTL___kmpc_dispatch_fini_8u);
8272
8273 return getOrCreateRuntimeFunction(M, Name);
8274}
8275
// Returns the __kmpc_dispatch_deinit runtime function.
// NOTE(review): the function-head line was dropped by the documentation
// extraction — verify against upstream LLVM. Leading "NNNN" tokens are
// extraction residue, not program text.
8277 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
8278}
8279
// Rewrites debug records inside an outlined function: remaps variable
// locations through ValueReplacementMap, recreates DILocalVariables with the
// new argument numbers, and (on the device) declares the implicit "dyn_ptr"
// first parameter.
// NOTE(review): the function head and a few statements were dropped by the
// documentation extraction — verify against upstream LLVM. Leading "NNNN"
// tokens are extraction residue, not program text.
8281 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
8282 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
8283
8284 DISubprogram *NewSP = Func->getSubprogram();
8285 if (!NewSP)
8286 return;
8287
// NOTE(review): source line 8288 (declaration of RemappedVariables, the
// old-to-new DILocalVariable cache) is missing here (extraction loss).
8289
8290 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
8291 DILocalVariable *&NewVar = RemappedVariables[OldVar];
8292 // Only use cached variable if the arg number matches. This is important
8293 // so that DIVariable created for privatized variables are not discarded.
8294 if (NewVar && (arg == NewVar->getArg()))
8295 return NewVar;
8296
// NOTE(review): source line 8297 (start of the DILocalVariable creation
// call) is missing here (extraction loss).
8298 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
8299 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
8300 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
8301 return NewVar;
8302 };
8303
8304 auto UpdateDebugRecord = [&](auto *DR) {
8305 DILocalVariable *OldVar = DR->getVariable();
8306 unsigned ArgNo = 0;
8307 for (auto Loc : DR->location_ops()) {
8308 auto Iter = ValueReplacementMap.find(Loc);
8309 if (Iter != ValueReplacementMap.end()) {
8310 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
// Argument numbers in debug info are 1-based.
8311 ArgNo = std::get<1>(Iter->second) + 1;
8312 }
8313 }
8314 if (ArgNo != 0)
8315 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
8316 };
8317
8318 // The location and scope of variable intrinsics and records still point to
8319 // the parent function of the target region. Update them.
8320 for (Instruction &I : instructions(Func)) {
// NOTE(review): source line 8321 (the assert condition) is missing here
// (extraction loss).
8322 "Unexpected debug intrinsic");
8323 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
8324 UpdateDebugRecord(&DVR);
8325 }
8326 // An extra argument is passed to the device. Create the debug data for it.
8327 if (OMPBuilder.Config.isTargetDevice()) {
8328 DICompileUnit *CU = NewSP->getUnit();
8329 Module *M = Func->getParent();
8330 DIBuilder DB(*M, true, CU);
8331 DIType *VoidPtrTy =
8332 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
8333 DILocalVariable *Var = DB.createParameterVariable(
8334 NewSP, "dyn_ptr", /*ArgNo*/ 1, NewSP->getFile(), /*LineNo=*/0,
8335 VoidPtrTy, /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
8336 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
8337 DB.insertDeclare(&(*Func->arg_begin()), Var, DB.createExpression(), Loc,
8338 &(*Func->begin()));
8339 }
8340}
8341
// Look through a single addrspacecast (handles both the instruction and the
// constant-expression form via Operator) and return its pointer operand;
// any other value is returned unchanged.
8343 if (Operator::getOpcode(V) == Instruction::AddrSpaceCast)
8344 return cast<Operator>(V)->getOperand(0);
8345 return V;
8346}
8347
8349 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8351 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
// Build the parameter list for the outlined target-region function. Device
// kernels receive an extra leading pointer ("dyn_ptr", see below) and every
// input is passed either as a pointer or widened to i64.
8354 SmallVector<Type *> ParameterTypes;
8355 if (OMPBuilder.Config.isTargetDevice()) {
8356 // Add the "implicit" runtime argument we use to provide launch specific
8357 // information for target devices.
8358 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
8359 ParameterTypes.push_back(Int8PtrTy);
8360
8361 // All parameters to target devices are passed as pointers
8362 // or i64. This assumes 64-bit address spaces/pointers.
8363 for (auto &Arg : Inputs)
8364 ParameterTypes.push_back(Arg->getType()->isPointerTy()
8365 ? Arg->getType()
8366 : Type::getInt64Ty(Builder.getContext()));
8367 } else {
8368 for (auto &Arg : Inputs)
8369 ParameterTypes.push_back(Arg->getType());
8370 }
8371
8372 auto BB = Builder.GetInsertBlock();
8373 auto M = BB->getModule();
8374 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
8375 /*isVarArg*/ false);
8376 auto Func =
8377 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
8378
8379 // Forward target-cpu and target-features function attributes from the
8380 // original function to the new outlined function.
8381 Function *ParentFn = Builder.GetInsertBlock()->getParent();
8382
8383 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
8384 if (TargetCpuAttr.isStringAttribute())
8385 Func->addFnAttr(TargetCpuAttr);
8386
8387 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
8388 if (TargetFeaturesAttr.isStringAttribute())
8389 Func->addFnAttr(TargetFeaturesAttr);
8390
8391 if (OMPBuilder.Config.isTargetDevice()) {
8392 Value *ExecMode =
8393 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
8394 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
8395 }
8396
8397 // Save insert point.
8398 IRBuilder<>::InsertPointGuard IPG(Builder);
8399 // We will generate the entries in the outlined function but the debug
8400 // location may still be pointing to the parent function. Reset it now.
8401 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
8402
8403 // Generate the region into the function.
8404 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
8405 Builder.SetInsertPoint(EntryBB);
8406
8407 // Insert target init call in the device compilation pass.
8408 if (OMPBuilder.Config.isTargetDevice())
8409 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
8410
8411 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
8412
8413 // As we embed the user code in the middle of our target region after we
8414 // generate entry code, we must move what allocas we can into the entry
8415 // block to avoid possible breaking optimisations for device
8416 if (OMPBuilder.Config.isTargetDevice())
8418
8419 // Insert target deinit call in the device compilation pass.
8420 BasicBlock *OutlinedBodyBB =
8421 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
8423 Builder.saveIP(),
8424 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
8425 if (!AfterIP)
8426 return AfterIP.takeError();
8427 Builder.restoreIP(*AfterIP);
8428 if (OMPBuilder.Config.isTargetDevice())
8429 OMPBuilder.createTargetDeinit(Builder);
8430
8431 // Insert return instruction.
8432 Builder.CreateRetVoid();
8433
8434 // New Alloca IP at entry point of created device function.
8435 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
8436 auto AllocaIP = Builder.saveIP();
8437
8438 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
8439
8440 // Skip the artificial dyn_ptr on the device.
8441 const auto &ArgRange =
8442 OMPBuilder.Config.isTargetDevice()
8443 ? make_range(Func->arg_begin() + 1, Func->arg_end())
8444 : Func->args();
8445
8447
8448 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
8449 // Things like GEP's can come in the form of Constants. Constants and
8450 // ConstantExpr's do not have access to the knowledge of what they're
8451 // contained in, so we must dig a little to find an instruction so we
8452 // can tell if they're used inside of the function we're outlining. We
8453 // also replace the original constant expression with a new instruction
8454 // equivalent; an instruction as it allows easy modification in the
8455 // following loop, as we can now know the constant (instruction) is
8456 // owned by our target function and replaceUsesOfWith can now be invoked
8457 // on it (cannot do this with constants it seems). A brand new one also
8458 // allows us to be cautious as it is perhaps possible the old expression
8459 // was used inside of the function but exists and is used externally
8460 // (unlikely by the nature of a Constant, but still).
8461 // NOTE: We cannot remove dead constants that have been rewritten to
8462 // instructions at this stage, we run the risk of breaking later lowering
8463 // by doing so as we could still be in the process of lowering the module
8464 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
8465 // constants we have created rewritten versions of.
8466 if (auto *Const = dyn_cast<Constant>(Input))
8467 convertUsersOfConstantsToInstructions(Const, Func, false);
8468
8469 // Collect users before iterating over them to avoid invalidating the
8470 // iteration in case a user uses Input more than once (e.g. a call
8471 // instruction).
8472 SetVector<User *> Users(Input->users().begin(), Input->users().end());
8473 // Collect all the instructions
8475 if (auto *Instr = dyn_cast<Instruction>(User))
8476 if (Instr->getFunction() == Func)
8477 Instr->replaceUsesOfWith(Input, InputCopy);
8478 };
8479
8480 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
8481
8482 // Rewrite uses of input values to parameters.
8483 for (auto InArg : zip(Inputs, ArgRange)) {
8484 Value *Input = std::get<0>(InArg);
8485 Argument &Arg = std::get<1>(InArg);
8486 Value *InputCopy = nullptr;
8487
8489 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
8490 if (!AfterIP)
8491 return AfterIP.takeError();
8492 Builder.restoreIP(*AfterIP);
8493 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
8494
8495 // In certain cases a Global may be set up for replacement, however, this
8496 // Global may be used in multiple arguments to the kernel, just segmented
8497 // apart, for example, if we have a global array, that is sectioned into
8498 // multiple mappings (technically not legal in OpenMP, but there is a case
8499 // in Fortran for Common Blocks where this is necessary), we will end up
8500 // with GEP's into this array inside the kernel, that refer to the Global
8501 // but are technically separate arguments to the kernel for all intents and
8502 // purposes. If we have mapped a segment that requires a GEP into the 0-th
8503 // index, it will fold into a referral to the Global, if we then encounter
8504 // this folded GEP during replacement all of the references to the
8505 // Global in the kernel will be replaced with the argument we have generated
8506 // that corresponds to it, including any other GEP's that refer to the
8507 // Global that may be other arguments. This will invalidate all of the other
8508 // preceding mapped arguments that refer to the same global that may be
8509 // separate segments. To prevent this, we defer global processing until all
8510 // other processing has been performed.
8513 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
8514 continue;
8515 }
8516
8518 continue;
8519
8520 ReplaceValue(Input, InputCopy, Func);
8521 }
8522
8523 // Replace all of our deferred Input values, currently just Globals.
8524 for (auto Deferred : DeferredReplacement)
8525 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
8526
// Re-point debug records at the replacement values now that every input has
// been rewritten to a function argument/copy.
8527 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
8528 ValueReplacementMap);
8529 return Func;
8530}
8531/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
8532/// of pointers containing shared data between the parent task and the created
8533/// task.
8535 IRBuilderBase &Builder,
8536 Value *TaskWithPrivates,
8537 Type *TaskWithPrivatesTy) {
8538
8539 Type *TaskTy = OMPIRBuilder.Task;
8540 LLVMContext &Ctx = Builder.getContext();
8541 Value *TaskT =
8542 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
8543 Value *Shareds = TaskT;
8544 // TaskWithPrivatesTy can be one of the following
8545 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
8546 // %struct.privates }
8547 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
8548 //
8549 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
8550 // its first member has to be the task descriptor. TaskTy is the type of the
8551 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
8552 // first member of TaskT, gives us the pointer to shared data.
8553 if (TaskWithPrivatesTy != TaskTy)
8554 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
// The shareds field is the first member of the task descriptor; load it as
// an opaque pointer.
8555 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
8556}
8557/// Create an entry point for a target task with the following.
8558/// It'll have the following signature
8559/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
8560/// This function is called from emitTargetTask once the
8561/// code to launch the target kernel has been outlined already.
8562/// NumOffloadingArrays is the number of offloading arrays that we need to copy
8563/// into the task structure so that the deferred target task can access this
8564/// data even after the stack frame of the generating task has been rolled
8565/// back. Offloading arrays contain base pointers, pointers, sizes etc
8566/// of the data that the target kernel will access. These in effect are the
8567/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
8569 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
8570 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
8571 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
8572
8573 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
8574 // This is because PrivatesTy is the type of the structure in which
8575 // we pass the offloading arrays to the deferred target task.
8576 assert((!NumOffloadingArrays || PrivatesTy) &&
8577 "PrivatesTy cannot be nullptr when there are offloadingArrays"
8578 "to privatize");
8579
8580 Module &M = OMPBuilder.M;
8581 // KernelLaunchFunction is the target launch function, i.e.
8582 // the function that sets up kernel arguments and calls
8583 // __tgt_target_kernel to launch the kernel on the device.
8584 //
8585 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
8586
8587 // StaleCI is the CallInst which is the call to the outlined
8588 // target kernel launch function. If there are local live-in values
8589 // that the outlined function uses then these are aggregated into a structure
8590 // which is passed as the second argument. If there are no local live-in
8591 // values or if all values used by the outlined kernel are global variables,
8592 // then there's only one argument, the threadID. So, StaleCI can be
8593 //
8594 // %structArg = alloca { ptr, ptr }, align 8
8595 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
8596 // store ptr %20, ptr %gep_, align 8
8597 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
8598 // store ptr %21, ptr %gep_8, align 8
8599 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
8600 //
8601 // OR
8602 //
8603 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
8605 StaleCI->getIterator());
8606
8607 LLVMContext &Ctx = StaleCI->getParent()->getContext();
8608
8609 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
8610 Type *TaskPtrTy = OMPBuilder.TaskPtr;
8611 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
8612
8613 auto ProxyFnTy =
8614 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
8615 /* isVarArg */ false);
8616 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
8617 ".omp_target_task_proxy_func",
8618 Builder.GetInsertBlock()->getModule());
8619 Value *ThreadId = ProxyFn->getArg(0);
8620 Value *TaskWithPrivates = ProxyFn->getArg(1);
8621 ThreadId->setName("thread.id");
8622 TaskWithPrivates->setName("task");
8623
// An operand number of 0 signals that StaleCI carries no aggregated shareds
// argument (see the SharedArgOperandNo computation in the caller).
8624 bool HasShareds = SharedArgsOperandNo > 0;
8625 bool HasOffloadingArrays = NumOffloadingArrays > 0;
8626 BasicBlock *EntryBB =
8627 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
8628 Builder.SetInsertPoint(EntryBB);
8629
8630 SmallVector<Value *> KernelLaunchArgs;
8631 KernelLaunchArgs.reserve(StaleCI->arg_size());
8632 KernelLaunchArgs.push_back(ThreadId);
8633
8634 if (HasOffloadingArrays) {
8635 assert(TaskTy != TaskWithPrivatesTy &&
8636 "If there are offloading arrays to pass to the target"
8637 "TaskTy cannot be the same as TaskWithPrivatesTy");
8638 (void)TaskTy;
8639 Value *Privates =
8640 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
8641 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
8642 KernelLaunchArgs.push_back(
8643 Builder.CreateStructGEP(PrivatesTy, Privates, i));
8644 }
8645
8646 if (HasShareds) {
8647 auto *ArgStructAlloca =
8648 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
8649 assert(ArgStructAlloca &&
8650 "Unable to find the alloca instruction corresponding to arguments "
8651 "for extracted function");
8652 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
8653 std::optional<TypeSize> ArgAllocSize =
8654 ArgStructAlloca->getAllocationSize(M.getDataLayout());
8655 assert(ArgStructType && ArgAllocSize &&
8656 "Unable to determine size of arguments for extracted function");
8657 uint64_t StructSize = ArgAllocSize->getFixedValue();
8658
// Copy the shareds block out of the task descriptor into a fresh local
// alloca so the kernel launch function receives the same argument shape as
// the original (stale) call.
8659 AllocaInst *NewArgStructAlloca =
8660 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
8661
8662 Value *SharedsSize = Builder.getInt64(StructSize);
8663
8665 OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
8666
8667 Builder.CreateMemCpy(
8668 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
8669 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
8670 KernelLaunchArgs.push_back(NewArgStructAlloca);
8671 }
8672 OMPBuilder.createRuntimeFunctionCall(KernelLaunchFunction, KernelLaunchArgs);
8673 Builder.CreateRetVoid();
8674 return ProxyFn;
8675}
8677
// Recover the array type behind a pointer to an offloading array: either the
// source element type of a GEP or the allocated type of an alloca. Any other
// producer is a programming error (llvm_unreachable below).
8678 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
8679 return GEP->getSourceElementType();
8680 if (auto *Alloca = dyn_cast<AllocaInst>(V))
8681 return Alloca->getAllocatedType();
8682
8683 llvm_unreachable("Unhandled Instruction type");
8684 return nullptr;
8685}
8686// This function returns a struct that has at most two members.
8687// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
8688// descriptor. The second member, if needed, is a struct containing arrays
8689// that need to be passed to the offloaded target kernel. For example,
8690// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
8691// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
8692// respectively, then the types created by this function are
8693//
8694// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
8695// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
8696// %struct.privates }
8697// %struct.task_with_privates is returned by this function.
8698// If there aren't any offloading arrays to pass to the target kernel,
8699// %struct.kmp_task_ompbuilder_t is returned.
8700 static StructType *
8702 ArrayRef<Value *> OffloadingArraysToPrivatize) {
8703
// With nothing to privatize, the plain task descriptor type suffices.
8704 if (OffloadingArraysToPrivatize.empty())
8705 return OMPIRBuilder.Task;
8706
8707 SmallVector<Type *, 4> StructFieldTypes;
8708 for (Value *V : OffloadingArraysToPrivatize) {
8709 assert(V->getType()->isPointerTy() &&
8710 "Expected pointer to array to privatize. Got a non-pointer value "
8711 "instead");
8712 Type *ArrayTy = getOffloadingArrayType(V);
8713 assert(ArrayTy && "ArrayType cannot be nullptr");
8714 StructFieldTypes.push_back(ArrayTy);
8715 }
// Wrap the task descriptor and the privatized-array struct into one type so
// both travel together in a single runtime allocation.
8716 StructType *PrivatesStructTy =
8717 StructType::create(StructFieldTypes, "struct.privates");
8718 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
8719 "struct.task_with_privates");
8720}
8722 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
8723 TargetRegionEntryInfo &EntryInfo,
8725 Function *&OutlinedFn, Constant *&OutlinedFnID,
8729
// Wrap createOutlinedFunction in a callback so emitTargetRegionFunction can
// choose the entry-point name and (when IsOffloadEntry) register the result
// as an offload entry.
8730 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
8731 [&](StringRef EntryFnName) {
8732 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
8733 EntryFnName, Inputs, CBFunc,
8734 ArgAccessorFuncCB);
8735 };
8736
8737 return OMPBuilder.emitTargetRegionFunction(
8738 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
8739 OutlinedFnID);
8740}
8741
8743 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
8746 const TargetDataRTArgs &RTArgs, bool HasNoWait) {
8747
8748 // The following explains the code-gen scenario for the `target` directive. A
8749 // similar scenario is followed for other device-related directives (e.g.
8750 // `target enter data`) but in similar fashion since we only need to emit task
8751 // that encapsulates the proper runtime call.
8752 //
8753 // When we arrive at this function, the target region itself has been
8754 // outlined into the function OutlinedFn.
8755 // So at this point, for
8756 // --------------------------------------------------------------
8757 // void user_code_that_offloads(...) {
8758 // omp target depend(..) map(from:a) map(to:b) private(i)
8759 // do i = 1, 10
8760 // a(i) = b(i) + n
8761 // }
8762 //
8763 // --------------------------------------------------------------
8764 //
8765 // we have
8766 //
8767 // --------------------------------------------------------------
8768 //
8769 // void user_code_that_offloads(...) {
8770 // %.offload_baseptrs = alloca [2 x ptr], align 8
8771 // %.offload_ptrs = alloca [2 x ptr], align 8
8772 // %.offload_mappers = alloca [2 x ptr], align 8
8773 // ;; target region has been outlined and now we need to
8774 // ;; offload to it via a target task.
8775 // }
8776 // void outlined_device_function(ptr a, ptr b, ptr n) {
8777 // n = *n_ptr;
8778 // do i = 1, 10
8779 // a(i) = b(i) + n
8780 // }
8781 //
8782 // We have to now do the following
8783 // (i) Make an offloading call to outlined_device_function using the OpenMP
8784 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
8785 // emitted by emitKernelLaunch
8786 // (ii) Create a task entry point function that calls kernel_launch_function
8787 // and is the entry point for the target task. See
8788 // '@.omp_target_task_proxy_func in the pseudocode below.
8789 // (iii) Create a task with the task entry point created in (ii)
8790 //
8791 // That is we create the following
8792 // struct task_with_privates {
8793 // struct kmp_task_ompbuilder_t task_struct;
8794 // struct privates {
8795 // [2 x ptr] ; baseptrs
8796 // [2 x ptr] ; ptrs
8797 // [2 x i64] ; sizes
8798 // }
8799 // }
8800 // void user_code_that_offloads(...) {
8801 // %.offload_baseptrs = alloca [2 x ptr], align 8
8802 // %.offload_ptrs = alloca [2 x ptr], align 8
8803 // %.offload_sizes = alloca [2 x i64], align 8
8804 //
8805 // %structArg = alloca { ptr, ptr, ptr }, align 8
8806 // %strucArg[0] = a
8807 // %strucArg[1] = b
8808 // %strucArg[2] = &n
8809 //
8810 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
8811 // sizeof(kmp_task_ompbuilder_t),
8812 // sizeof(structArg),
8813 // @.omp_target_task_proxy_func,
8814 // ...)
8815 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
8816 // sizeof(structArg))
8817 // memcpy(target_task_with_privates->privates->baseptrs,
8818 // offload_baseptrs, sizeof(offload_baseptrs)
8819 // memcpy(target_task_with_privates->privates->ptrs,
8820 // offload_ptrs, sizeof(offload_ptrs)
8821 // memcpy(target_task_with_privates->privates->sizes,
8822 // offload_sizes, sizeof(offload_sizes)
8823 // dependencies_array = ...
8824 // ;; if nowait not present
8825 // call @__kmpc_omp_wait_deps(..., dependencies_array)
8826 // call @__kmpc_omp_task_begin_if0(...)
8827 // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
8828 // %target_task_with_privates)
8829 // call @__kmpc_omp_task_complete_if0(...)
8830 // }
8831 //
8832 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
8833 // ptr %task) {
8834 // %structArg = alloca {ptr, ptr, ptr}
8835 // %task_ptr = getelementptr(%task, 0, 0)
8836 // %shared_data = load (getelementptr %task_ptr, 0, 0)
8837 // memcpy(%structArg, %shared_data, sizeof(%structArg))
8838 //
8839 // %offloading_arrays = getelementptr(%task, 0, 1)
8840 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
8841 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
8842 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
8843 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
8844 // %offload_sizes, %structArg)
8845 // }
8846 //
8847 // We need the proxy function because the signature of the task entry point
8848 // expected by kmpc_omp_task is always the same and will be different from
8849 // that of the kernel_launch function.
8850 //
8851 // kernel_launch_function is generated by emitKernelLaunch and has the
8852 // always_inline attribute. For this example, it'll look like so:
8853 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
8854 // %offload_sizes, %structArg) alwaysinline {
8855 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
8856 // ; load aggregated data from %structArg
8857 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
8858 // ; offload_sizes
8859 // call i32 @__tgt_target_kernel(...,
8860 // outlined_device_function,
8861 // ptr %kernel_args)
8862 // }
8863 // void outlined_device_function(ptr a, ptr b, ptr n) {
8864 // n = *n_ptr;
8865 // do i = 1, 10
8866 // a(i) = b(i) + n
8867 // }
8868 //
8869 BasicBlock *TargetTaskBodyBB =
8870 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
8871 BasicBlock *TargetTaskAllocaBB =
8872 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
8873
8874 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
8875 TargetTaskAllocaBB->begin());
8876 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
8877
8878 OutlineInfo OI;
8879 OI.EntryBB = TargetTaskAllocaBB;
8880 OI.OuterAllocaBB = AllocaIP.getBlock();
8881
8882 // Add the thread ID argument.
8885 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
8886
8887 // Generate the task body which will subsequently be outlined.
8888 Builder.restoreIP(TargetTaskBodyIP);
8889 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
8890 return Err;
8891
8892 // The outliner (CodeExtractor) extracts a sequence or vector of blocks that
8893 // it is given. These blocks are enumerated by
8894 // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
8895 // to be outside the region. In other words, OI.ExitBlock is expected to be
8896 // the start of the region after the outlining. We used to set OI.ExitBlock
8897 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
8898 // except when the task body is a single basic block. In that case,
8899 // OI.ExitBlock is set to the single task body block and will get left out of
8900 // the outlining process. So, simply create a new empty block to which we
8901 // unconditionally branch from where TaskBodyCB left off
8902 OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
8903 emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
8904 /*IsFinished=*/true);
8905
8906 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
// Only a nowait construct with a known device ID becomes a deferrable target
// task; everything else is emitted as an included task (if0 path below).
8907 bool NeedsTargetTask = HasNoWait && DeviceID;
8908 if (NeedsTargetTask) {
8909 for (auto *V :
8910 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
8911 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
8912 RTArgs.SizesArray}) {
8914 OffloadingArraysToPrivatize.push_back(V);
8916 }
8917 }
8918 }
// After outlining, replace the stale call with the task allocation,
// privatization copies, and the appropriate task-spawn runtime calls.
8919 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
8920 DeviceID, OffloadingArraysToPrivatize](
8921 Function &OutlinedFn) mutable {
8922 assert(OutlinedFn.hasOneUse() &&
8923 "there must be a single user for the outlined function");
8924
8925 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
8926
8927 // The first argument of StaleCI is always the thread id.
8928 // The next few arguments are the pointers to offloading arrays
8929 // if any. (see OffloadingArraysToPrivatize)
8930 // Finally, all other local values that are live-in into the outlined region
8931 // end up in a structure whose pointer is passed as the last argument. This
8932 // piece of data is passed in the "shared" field of the task structure. So,
8933 // we know we have to pass shareds to the task if the number of arguments is
8934 // greater than OffloadingArraysToPrivatize.size() + 1 The 1 is for the
8935 // thread id. Further, for safety, we assert that the number of arguments of
8936 // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2
8937 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
8938 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
8939 assert((!HasShareds ||
8940 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
8941 "Wrong number of arguments for StaleCI when shareds are present");
8942 int SharedArgOperandNo =
8943 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
8944
8945 StructType *TaskWithPrivatesTy =
8946 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
8947 StructType *PrivatesTy = nullptr;
8948
8949 if (!OffloadingArraysToPrivatize.empty())
8950 PrivatesTy =
8951 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
8952
8954 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
8955 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
8956
8957 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
8958 << "\n");
8959
8960 Builder.SetInsertPoint(StaleCI);
8961
8962 // Gather the arguments for emitting the runtime call.
8963 uint32_t SrcLocStrSize;
8964 Constant *SrcLocStr =
8966 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8967
8968 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
8969 //
8970 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
8971 // the DeviceID to the deferred task and also since
8972 // @__kmpc_omp_target_task_alloc creates an untied/async task.
8973 Function *TaskAllocFn =
8974 !NeedsTargetTask
8975 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
8977 OMPRTL___kmpc_omp_target_task_alloc);
8978
8979 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
8980 // call.
8981 Value *ThreadID = getOrCreateThreadID(Ident);
8982
8983 // Argument - `sizeof_kmp_task_t` (TaskSize)
8984 // Tasksize refers to the size in bytes of kmp_task_t data structure
8985 // plus any other data to be passed to the target task, if any, which
8986 // is packed into a struct. kmp_task_t and the struct so created are
8987 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
8988 Value *TaskSize = Builder.getInt64(
8989 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
8990
8991 // Argument - `sizeof_shareds` (SharedsSize)
8992 // SharedsSize refers to the shareds array size in the kmp_task_t data
8993 // structure.
8994 Value *SharedsSize = Builder.getInt64(0);
8995 if (HasShareds) {
8996 auto *ArgStructAlloca =
8997 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
8998 assert(ArgStructAlloca &&
8999 "Unable to find the alloca instruction corresponding to arguments "
9000 "for extracted function");
9001 std::optional<TypeSize> ArgAllocSize =
9002 ArgStructAlloca->getAllocationSize(M.getDataLayout());
9003 assert(ArgAllocSize &&
9004 "Unable to determine size of arguments for extracted function");
9005 SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
9006 }
9007
9008 // Argument - `flags`
9009 // Task is tied iff (Flags & 1) == 1.
9010 // Task is untied iff (Flags & 1) == 0.
9011 // Task is final iff (Flags & 2) == 2.
9012 // Task is not final iff (Flags & 2) == 0.
9013 // A target task is not final and is untied.
9014 Value *Flags = Builder.getInt32(0);
9015
9016 // Emit the @__kmpc_omp_task_alloc runtime call
9017 // The runtime call returns a pointer to an area where the task captured
9018 // variables must be copied before the task is run (TaskData)
9019 CallInst *TaskData = nullptr;
9020
9021 SmallVector<llvm::Value *> TaskAllocArgs = {
9022 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
9023 /*flags=*/Flags,
9024 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
9025 /*task_func=*/ProxyFn};
9026
9027 if (NeedsTargetTask) {
9028 assert(DeviceID && "Expected non-empty device ID.");
9029 TaskAllocArgs.push_back(DeviceID);
9030 }
9031
9032 TaskData = createRuntimeFunctionCall(TaskAllocFn, TaskAllocArgs);
9033
9034 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
9035 if (HasShareds) {
9036 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
9038 *this, Builder, TaskData, TaskWithPrivatesTy);
9039 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
9040 SharedsSize);
9041 }
9042 if (!OffloadingArraysToPrivatize.empty()) {
9043 Value *Privates =
9044 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
9045 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
9046 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
9047 [[maybe_unused]] Type *ArrayType =
9048 getOffloadingArrayType(PtrToPrivatize);
9049 assert(ArrayType && "ArrayType cannot be nullptr");
9050
9051 Type *ElementType = PrivatesTy->getElementType(i);
9052 assert(ElementType == ArrayType &&
9053 "ElementType should match ArrayType");
9054 (void)ArrayType;
9055
9056 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
9057 Builder.CreateMemCpy(
9058 Dst, Alignment, PtrToPrivatize, Alignment,
9059 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
9060 }
9061 }
9062
9063 Value *DepArray = emitTaskDependencies(*this, Dependencies);
9064
9065 // ---------------------------------------------------------------
9066 // V5.2 13.8 target construct
9067 // If the nowait clause is present, execution of the target task
9068 // may be deferred. If the nowait clause is not present, the target task is
9069 // an included task.
9070 // ---------------------------------------------------------------
9071 // The above means that the lack of a nowait on the target construct
9072 // translates to '#pragma omp task if(0)'
9073 if (!NeedsTargetTask) {
9074 if (DepArray) {
9075 Function *TaskWaitFn =
9076 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
9078 TaskWaitFn,
9079 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
9080 /*ndeps=*/Builder.getInt32(Dependencies.size()),
9081 /*dep_list=*/DepArray,
9082 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
9083 /*noalias_dep_list=*/
9085 }
9086 // Included task.
9087 Function *TaskBeginFn =
9088 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
9089 Function *TaskCompleteFn =
9090 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
9091 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
9092 CallInst *CI = createRuntimeFunctionCall(ProxyFn, {ThreadID, TaskData});
9093 CI->setDebugLoc(StaleCI->getDebugLoc());
9094 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
9095 } else if (DepArray) {
9096 // HasNoWait - meaning the task may be deferred. Call
9097 // __kmpc_omp_task_with_deps if there are dependencies,
9098 // else call __kmpc_omp_task
9099 Function *TaskFn =
9100 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
9102 TaskFn,
9103 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
9104 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
9106 } else {
9107 // Emit the @__kmpc_omp_task runtime call to spawn the task
9108 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
9109 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
9110 }
9111
9112 StaleCI->eraseFromParent();
9113 for (Instruction *I : llvm::reverse(ToBeDeleted))
9114 I->eraseFromParent();
9115 };
9116 addOutlineInfo(std::move(OI));
9117
9118 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
9119 << *(Builder.GetInsertBlock()) << "\n");
9120 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
9121 << *(Builder.GetInsertBlock()->getParent()->getParent())
9122 << "\n");
9123 return Builder.saveIP();
9124}
9125
9127 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
9128 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
9129 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
9130 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
9131 if (Error Err =
9132 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
9133 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
9134 return Err;
9135 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
9136 return Error::success();
9137}
9138
9139static void emitTargetCall(
9140 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
9145 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
9150 bool HasNoWait, Value *DynCGroupMem,
9151 OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
9152 // Generate a function call to the host fallback implementation of the target
9153 // region. This is called by the host when no offload entry was generated for
9154 // the target region and when the offloading call fails at runtime.
9155 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
9157 Builder.restoreIP(IP);
9158 OMPBuilder.createRuntimeFunctionCall(OutlinedFn, Args);
9159 return Builder.saveIP();
9160 };
9161
9162 bool HasDependencies = Dependencies.size() > 0;
9163 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
9164
9166
9167 auto TaskBodyCB =
9168 [&](Value *DeviceID, Value *RTLoc,
9169 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
9170 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
9171 // produce any.
9173 // emitKernelLaunch makes the necessary runtime call to offload the
9174 // kernel. We then outline all that code into a separate function
9175 // ('kernel_launch_function' in the pseudo code above). This function is
9176 // then called by the target task proxy function (see
9177 // '@.omp_target_task_proxy_func' in the pseudo code above)
9178 // "@.omp_target_task_proxy_func' is generated by
9179 // emitTargetTaskProxyFunction.
9180 if (OutlinedFnID && DeviceID)
9181 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
9182 EmitTargetCallFallbackCB, KArgs,
9183 DeviceID, RTLoc, TargetTaskAllocaIP);
9184
9185 // We only need to do the outlining if `DeviceID` is set to avoid calling
9186 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
9187 // generating the `else` branch of an `if` clause.
9188 //
9189 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
9190 // In this case, we execute the host implementation directly.
9191 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
9192 }());
9193
9194 OMPBuilder.Builder.restoreIP(AfterIP);
9195 return Error::success();
9196 };
9197
9198 auto &&EmitTargetCallElse =
9199 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
9201 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
9202 // produce any.
9204 if (RequiresOuterTargetTask) {
9205 // Arguments that are intended to be directly forwarded to an
9206 // emitKernelLaunch call are pased as nullptr, since
9207 // OutlinedFnID=nullptr results in that call not being done.
9209 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
9210 /*RTLoc=*/nullptr, AllocaIP,
9211 Dependencies, EmptyRTArgs, HasNoWait);
9212 }
9213 return EmitTargetCallFallbackCB(Builder.saveIP());
9214 }());
9215
9216 Builder.restoreIP(AfterIP);
9217 return Error::success();
9218 };
9219
9220 auto &&EmitTargetCallThen =
9221 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
9223 Info.HasNoWait = HasNoWait;
9224 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
9226 if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
9227 AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
9228 /*IsNonContiguous=*/true,
9229 /*ForEndCall=*/false))
9230 return Err;
9231
9232 SmallVector<Value *, 3> NumTeamsC;
9233 for (auto [DefaultVal, RuntimeVal] :
9234 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
9235 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
9236 : Builder.getInt32(DefaultVal));
9237
9238 // Calculate number of threads: 0 if no clauses specified, otherwise it is
9239 // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
9240 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
9241 if (Clause)
9242 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
9243 /*isSigned=*/false);
9244 return Clause;
9245 };
9246 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
9247 if (Clause)
9248 Result =
9249 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
9250 Result, Clause)
9251 : Clause;
9252 };
9253
9254 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
9255 // the NUM_THREADS clause is overriden by THREAD_LIMIT.
9256 SmallVector<Value *, 3> NumThreadsC;
9257 Value *MaxThreadsClause =
9258 RuntimeAttrs.TeamsThreadLimit.size() == 1
9259 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
9260 : nullptr;
9261
9262 for (auto [TeamsVal, TargetVal] : zip_equal(
9263 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
9264 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
9265 Value *NumThreads = InitMaxThreadsClause(TargetVal);
9266
9267 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
9268 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
9269
9270 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
9271 }
9272
9273 unsigned NumTargetItems = Info.NumberOfPtrs;
9274 uint32_t SrcLocStrSize;
9275 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
9276 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
9277 llvm::omp::IdentFlag(0), 0);
9278
9279 Value *TripCount = RuntimeAttrs.LoopTripCount
9280 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
9281 Builder.getInt64Ty(),
9282 /*isSigned=*/false)
9283 : Builder.getInt64(0);
9284
9285 // Request zero groupprivate bytes by default.
9286 if (!DynCGroupMem)
9287 DynCGroupMem = Builder.getInt32(0);
9288
9290 NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem,
9291 HasNoWait, DynCGroupMemFallback);
9292
9293 // Assume no error was returned because TaskBodyCB and
9294 // EmitTargetCallFallbackCB don't produce any.
9296 // The presence of certain clauses on the target directive require the
9297 // explicit generation of the target task.
9298 if (RequiresOuterTargetTask)
9299 return OMPBuilder.emitTargetTask(TaskBodyCB, RuntimeAttrs.DeviceID,
9300 RTLoc, AllocaIP, Dependencies,
9301 KArgs.RTArgs, Info.HasNoWait);
9302
9303 return OMPBuilder.emitKernelLaunch(
9304 Builder, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
9305 RuntimeAttrs.DeviceID, RTLoc, AllocaIP);
9306 }());
9307
9308 Builder.restoreIP(AfterIP);
9309 return Error::success();
9310 };
9311
9312 // If we don't have an ID for the target region, it means an offload entry
9313 // wasn't created. In this case we just run the host fallback directly and
9314 // ignore any potential 'if' clauses.
9315 if (!OutlinedFnID) {
9316 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
9317 return;
9318 }
9319
9320 // If there's no 'if' clause, only generate the kernel launch code path.
9321 if (!IfCond) {
9322 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
9323 return;
9324 }
9325
9326 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
9327 EmitTargetCallElse, AllocaIP));
9328}
9329
9331 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
9332 InsertPointTy CodeGenIP, TargetDataInfo &Info,
9333 TargetRegionEntryInfo &EntryInfo,
9334 const TargetKernelDefaultAttrs &DefaultAttrs,
9335 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
9336 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
9339 CustomMapperCallbackTy CustomMapperCB,
9340 const SmallVector<DependData> &Dependencies, bool HasNowait,
9341 Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
9342
9343 if (!updateToLocation(Loc))
9344 return InsertPointTy();
9345
9346 Builder.restoreIP(CodeGenIP);
9347
9348 Function *OutlinedFn;
9349 Constant *OutlinedFnID = nullptr;
9350 // The target region is outlined into its own function. The LLVM IR for
9351 // the target region itself is generated using the callbacks CBFunc
9352 // and ArgAccessorFuncCB
9354 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
9355 OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
9356 return Err;
9357
9358 // If we are not on the target device, then we need to generate code
9359 // to make a remote call (offload) to the previously outlined function
9360 // that represents the target region. Do that now.
9361 if (!Config.isTargetDevice())
9362 emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
9363 IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
9364 CustomMapperCB, Dependencies, HasNowait, DynCGroupMem,
9365 DynCGroupMemFallback);
9366 return Builder.saveIP();
9367}
9368
9369std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
9370 StringRef FirstSeparator,
9371 StringRef Separator) {
9372 SmallString<128> Buffer;
9373 llvm::raw_svector_ostream OS(Buffer);
9374 StringRef Sep = FirstSeparator;
9375 for (StringRef Part : Parts) {
9376 OS << Sep << Part;
9377 Sep = Separator;
9378 }
9379 return OS.str().str();
9380}
9381
9382std::string
9384 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
9385 Config.separator());
9386}
9387
9389 Type *Ty, const StringRef &Name, std::optional<unsigned> AddressSpace) {
9390 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
9391 if (Elem.second) {
9392 assert(Elem.second->getValueType() == Ty &&
9393 "OMP internal variable has different type than requested");
9394 } else {
9395 // TODO: investigate the appropriate linkage type used for the global
9396 // variable for possibly changing that to internal or private, or maybe
9397 // create different versions of the function for different OMP internal
9398 // variables.
9399 const DataLayout &DL = M.getDataLayout();
9400 // TODO: Investigate why AMDGPU expects AS 0 for globals even though the
9401 // default global AS is 1.
9402 // See double-target-call-with-declare-target.f90 and
9403 // declare-target-vars-in-target-region.f90 libomptarget
9404 // tests.
9405 unsigned AddressSpaceVal = AddressSpace ? *AddressSpace
9406 : M.getTargetTriple().isAMDGPU()
9407 ? 0
9408 : DL.getDefaultGlobalsAddressSpace();
9409 auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
9412 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
9413 Constant::getNullValue(Ty), Elem.first(),
9414 /*InsertBefore=*/nullptr,
9415 GlobalValue::NotThreadLocal, AddressSpaceVal);
9416 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
9417 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpaceVal);
9418 GV->setAlignment(std::max(TypeAlign, PtrAlign));
9419 Elem.second = GV;
9420 }
9421
9422 return Elem.second;
9423}
9424
9425Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
9426 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
9427 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
9428 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
9429}
9430
9432 LLVMContext &Ctx = Builder.getContext();
9433 Value *Null =
9434 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
9435 Value *SizeGep =
9436 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
9437 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
9438 return SizePtrToInt;
9439}
9440
9443 std::string VarName) {
9444 llvm::Constant *MaptypesArrayInit =
9445 llvm::ConstantDataArray::get(M.getContext(), Mappings);
9446 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
9447 M, MaptypesArrayInit->getType(),
9448 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
9449 VarName);
9450 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
9451 return MaptypesArrayGlobal;
9452}
9453
9455 InsertPointTy AllocaIP,
9456 unsigned NumOperands,
9457 struct MapperAllocas &MapperAllocas) {
9458 if (!updateToLocation(Loc))
9459 return;
9460
9461 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9462 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
9463 Builder.restoreIP(AllocaIP);
9464 AllocaInst *ArgsBase = Builder.CreateAlloca(
9465 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
9466 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
9467 ".offload_ptrs");
9468 AllocaInst *ArgSizes = Builder.CreateAlloca(
9469 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
9471 MapperAllocas.ArgsBase = ArgsBase;
9472 MapperAllocas.Args = Args;
9473 MapperAllocas.ArgSizes = ArgSizes;
9474}
9475
9477 Function *MapperFunc, Value *SrcLocInfo,
9478 Value *MaptypesArg, Value *MapnamesArg,
9480 int64_t DeviceID, unsigned NumOperands) {
9481 if (!updateToLocation(Loc))
9482 return;
9483
9484 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9485 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
9486 Value *ArgsBaseGEP =
9487 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
9488 {Builder.getInt32(0), Builder.getInt32(0)});
9489 Value *ArgsGEP =
9490 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
9491 {Builder.getInt32(0), Builder.getInt32(0)});
9492 Value *ArgSizesGEP =
9493 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
9494 {Builder.getInt32(0), Builder.getInt32(0)});
9495 Value *NullPtr =
9496 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
9497 createRuntimeFunctionCall(MapperFunc, {SrcLocInfo, Builder.getInt64(DeviceID),
9498 Builder.getInt32(NumOperands),
9499 ArgsBaseGEP, ArgsGEP, ArgSizesGEP,
9500 MaptypesArg, MapnamesArg, NullPtr});
9501}
9502
9504 TargetDataRTArgs &RTArgs,
9505 TargetDataInfo &Info,
9506 bool ForEndCall) {
9507 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
9508 "expected region end call to runtime only when end call is separate");
9509 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
9510 auto VoidPtrTy = UnqualPtrTy;
9511 auto VoidPtrPtrTy = UnqualPtrTy;
9512 auto Int64Ty = Type::getInt64Ty(M.getContext());
9513 auto Int64PtrTy = UnqualPtrTy;
9514
9515 if (!Info.NumberOfPtrs) {
9516 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9517 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9518 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
9519 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
9520 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
9521 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9522 return;
9523 }
9524
9525 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
9526 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
9527 Info.RTArgs.BasePointersArray,
9528 /*Idx0=*/0, /*Idx1=*/0);
9529 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
9530 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
9531 /*Idx0=*/0,
9532 /*Idx1=*/0);
9533 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
9534 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
9535 /*Idx0=*/0, /*Idx1=*/0);
9536 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
9537 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
9538 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
9539 : Info.RTArgs.MapTypesArray,
9540 /*Idx0=*/0,
9541 /*Idx1=*/0);
9542
9543 // Only emit the mapper information arrays if debug information is
9544 // requested.
9545 if (!Info.EmitDebug)
9546 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
9547 else
9548 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
9549 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
9550 /*Idx0=*/0,
9551 /*Idx1=*/0);
9552 // If there is no user-defined mapper, set the mapper array to nullptr to
9553 // avoid an unnecessary data privatization
9554 if (!Info.HasMapper)
9555 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9556 else
9557 RTArgs.MappersArray =
9558 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
9559}
9560
9562 InsertPointTy CodeGenIP,
9563 MapInfosTy &CombinedInfo,
9564 TargetDataInfo &Info) {
9566 CombinedInfo.NonContigInfo;
9567
9568 // Build an array of struct descriptor_dim and then assign it to
9569 // offload_args.
9570 //
9571 // struct descriptor_dim {
9572 // uint64_t offset;
9573 // uint64_t count;
9574 // uint64_t stride
9575 // };
9576 Type *Int64Ty = Builder.getInt64Ty();
9578 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
9579 "struct.descriptor_dim");
9580
9581 enum { OffsetFD = 0, CountFD, StrideFD };
9582 // We need two index variable here since the size of "Dims" is the same as
9583 // the size of Components, however, the size of offset, count, and stride is
9584 // equal to the size of base declaration that is non-contiguous.
9585 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
9586 // Skip emitting ir if dimension size is 1 since it cannot be
9587 // non-contiguous.
9588 if (NonContigInfo.Dims[I] == 1)
9589 continue;
9590 Builder.restoreIP(AllocaIP);
9591 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
9592 AllocaInst *DimsAddr =
9593 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
9594 Builder.restoreIP(CodeGenIP);
9595 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
9596 unsigned RevIdx = EE - II - 1;
9597 Value *DimsLVal = Builder.CreateInBoundsGEP(
9598 ArrayTy, DimsAddr, {Builder.getInt64(0), Builder.getInt64(II)});
9599 // Offset
9600 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
9601 Builder.CreateAlignedStore(
9602 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
9603 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
9604 // Count
9605 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
9606 Builder.CreateAlignedStore(
9607 NonContigInfo.Counts[L][RevIdx], CountLVal,
9608 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
9609 // Stride
9610 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
9611 Builder.CreateAlignedStore(
9612 NonContigInfo.Strides[L][RevIdx], StrideLVal,
9613 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
9614 }
9615 // args[I] = &dims
9616 Builder.restoreIP(CodeGenIP);
9617 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
9618 DimsAddr, Builder.getPtrTy());
9619 Value *P = Builder.CreateConstInBoundsGEP2_32(
9620 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
9621 Info.RTArgs.PointersArray, 0, I);
9622 Builder.CreateAlignedStore(
9623 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
9624 ++L;
9625 }
9626}
9627
9628void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
9629 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
9630 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
9631 BasicBlock *ExitBB, bool IsInit) {
9632 StringRef Prefix = IsInit ? ".init" : ".del";
9633
9634 // Evaluate if this is an array section.
9636 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
9637 Value *IsArray =
9638 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
9639 Value *DeleteBit = Builder.CreateAnd(
9640 MapType,
9641 Builder.getInt64(
9642 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9643 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
9644 Value *DeleteCond;
9645 Value *Cond;
9646 if (IsInit) {
9647 // base != begin?
9648 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
9649 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
9650 DeleteCond = Builder.CreateIsNull(
9651 DeleteBit,
9652 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
9653 } else {
9654 Cond = IsArray;
9655 DeleteCond = Builder.CreateIsNotNull(
9656 DeleteBit,
9657 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
9658 }
9659 Cond = Builder.CreateAnd(Cond, DeleteCond);
9660 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
9661
9662 emitBlock(BodyBB, MapperFn);
9663 // Get the array size by multiplying element size and element number (i.e., \p
9664 // Size).
9665 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
9666 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
9667 // memory allocation/deletion purpose only.
9668 Value *MapTypeArg = Builder.CreateAnd(
9669 MapType,
9670 Builder.getInt64(
9671 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9672 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9673 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9674 MapTypeArg = Builder.CreateOr(
9675 MapTypeArg,
9676 Builder.getInt64(
9677 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9678 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
9679
9680 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
9681 // data structure.
9682 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
9683 ArraySize, MapTypeArg, MapName};
9685 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
9686 OffloadingArgs);
9687}
9688
9691 llvm::Value *BeginArg)>
9692 GenMapInfoCB,
9693 Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
9694 SmallVector<Type *> Params;
9695 Params.emplace_back(Builder.getPtrTy());
9696 Params.emplace_back(Builder.getPtrTy());
9697 Params.emplace_back(Builder.getPtrTy());
9698 Params.emplace_back(Builder.getInt64Ty());
9699 Params.emplace_back(Builder.getInt64Ty());
9700 Params.emplace_back(Builder.getPtrTy());
9701
9702 auto *FnTy =
9703 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
9704
9705 SmallString<64> TyStr;
9706 raw_svector_ostream Out(TyStr);
9707 Function *MapperFn =
9709 MapperFn->addFnAttr(Attribute::NoInline);
9710 MapperFn->addFnAttr(Attribute::NoUnwind);
9711 MapperFn->addParamAttr(0, Attribute::NoUndef);
9712 MapperFn->addParamAttr(1, Attribute::NoUndef);
9713 MapperFn->addParamAttr(2, Attribute::NoUndef);
9714 MapperFn->addParamAttr(3, Attribute::NoUndef);
9715 MapperFn->addParamAttr(4, Attribute::NoUndef);
9716 MapperFn->addParamAttr(5, Attribute::NoUndef);
9717
9718 // Start the mapper function code generation.
9719 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
9720 auto SavedIP = Builder.saveIP();
9721 Builder.SetInsertPoint(EntryBB);
9722
9723 Value *MapperHandle = MapperFn->getArg(0);
9724 Value *BaseIn = MapperFn->getArg(1);
9725 Value *BeginIn = MapperFn->getArg(2);
9726 Value *Size = MapperFn->getArg(3);
9727 Value *MapType = MapperFn->getArg(4);
9728 Value *MapName = MapperFn->getArg(5);
9729
9730 // Compute the starting and end addresses of array elements.
9731 // Prepare common arguments for array initiation and deletion.
9732 // Convert the size in bytes into the number of array elements.
9733 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
9734 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
9735 Value *PtrBegin = BeginIn;
9736 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
9737
9738 // Emit array initiation if this is an array section and \p MapType indicates
9739 // that memory allocation is required.
9740 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
9741 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
9742 MapType, MapName, ElementSize, HeadBB,
9743 /*IsInit=*/true);
9744
9745 // Emit a for loop to iterate through SizeArg of elements and map all of them.
9746
9747 // Emit the loop header block.
9748 emitBlock(HeadBB, MapperFn);
9749 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
9750 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
9751 // Evaluate whether the initial condition is satisfied.
9752 Value *IsEmpty =
9753 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
9754 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
9755
9756 // Emit the loop body block.
9757 emitBlock(BodyBB, MapperFn);
9758 BasicBlock *LastBB = BodyBB;
9759 PHINode *PtrPHI =
9760 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
9761 PtrPHI->addIncoming(PtrBegin, HeadBB);
9762
9763 // Get map clause information. Fill up the arrays with all mapped variables.
9764 MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
9765 if (!Info)
9766 return Info.takeError();
9767
9768 // Call the runtime API __tgt_mapper_num_components to get the number of
9769 // pre-existing components.
9770 Value *OffloadingArgs[] = {MapperHandle};
9771 Value *PreviousSize = createRuntimeFunctionCall(
9772 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
9773 OffloadingArgs);
9774 Value *ShiftedPreviousSize =
9775 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
9776
9777 // Fill up the runtime mapper handle for all components.
9778 for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
9779 Value *CurBaseArg = Info->BasePointers[I];
9780 Value *CurBeginArg = Info->Pointers[I];
9781 Value *CurSizeArg = Info->Sizes[I];
9782 Value *CurNameArg = Info->Names.size()
9783 ? Info->Names[I]
9784 : Constant::getNullValue(Builder.getPtrTy());
9785
9786 // Extract the MEMBER_OF field from the map type.
9787 Value *OriMapType = Builder.getInt64(
9788 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9789 Info->Types[I]));
9790 Value *MemberMapType =
9791 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
9792
9793 // Combine the map type inherited from user-defined mapper with that
9794 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
9795 // bits of the \a MapType, which is the input argument of the mapper
9796 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
9797 // bits of MemberMapType.
9798 // [OpenMP 5.0], 1.2.6. map-type decay.
9799 // | alloc | to | from | tofrom | release | delete
9800 // ----------------------------------------------------------
9801 // alloc | alloc | alloc | alloc | alloc | release | delete
9802 // to | alloc | to | alloc | to | release | delete
9803 // from | alloc | alloc | from | from | release | delete
9804 // tofrom | alloc | to | from | tofrom | release | delete
9805 Value *LeftToFrom = Builder.CreateAnd(
9806 MapType,
9807 Builder.getInt64(
9808 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9809 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9810 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9811 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
9812 BasicBlock *AllocElseBB =
9813 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
9814 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
9815 BasicBlock *ToElseBB =
9816 BasicBlock::Create(M.getContext(), "omp.type.to.else");
9817 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
9818 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
9819 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
9820 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
9821 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
9822 emitBlock(AllocBB, MapperFn);
9823 Value *AllocMapType = Builder.CreateAnd(
9824 MemberMapType,
9825 Builder.getInt64(
9826 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9827 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9828 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9829 Builder.CreateBr(EndBB);
9830 emitBlock(AllocElseBB, MapperFn);
9831 Value *IsTo = Builder.CreateICmpEQ(
9832 LeftToFrom,
9833 Builder.getInt64(
9834 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9835 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
9836 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
9837 // In case of to, clear OMP_MAP_FROM.
9838 emitBlock(ToBB, MapperFn);
9839 Value *ToMapType = Builder.CreateAnd(
9840 MemberMapType,
9841 Builder.getInt64(
9842 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9843 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9844 Builder.CreateBr(EndBB);
9845 emitBlock(ToElseBB, MapperFn);
9846 Value *IsFrom = Builder.CreateICmpEQ(
9847 LeftToFrom,
9848 Builder.getInt64(
9849 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9850 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9851 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
9852 // In case of from, clear OMP_MAP_TO.
9853 emitBlock(FromBB, MapperFn);
9854 Value *FromMapType = Builder.CreateAnd(
9855 MemberMapType,
9856 Builder.getInt64(
9857 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9858 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
9859 // In case of tofrom, do nothing.
9860 emitBlock(EndBB, MapperFn);
9861 LastBB = EndBB;
9862 PHINode *CurMapType =
9863 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
9864 CurMapType->addIncoming(AllocMapType, AllocBB);
9865 CurMapType->addIncoming(ToMapType, ToBB);
9866 CurMapType->addIncoming(FromMapType, FromBB);
9867 CurMapType->addIncoming(MemberMapType, ToElseBB);
9868
9869 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
9870 CurSizeArg, CurMapType, CurNameArg};
9871
9872 auto ChildMapperFn = CustomMapperCB(I);
9873 if (!ChildMapperFn)
9874 return ChildMapperFn.takeError();
9875 if (*ChildMapperFn) {
9876 // Call the corresponding mapper function.
9877 createRuntimeFunctionCall(*ChildMapperFn, OffloadingArgs)
9878 ->setDoesNotThrow();
9879 } else {
9880 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
9881 // data structure.
9883 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
9884 OffloadingArgs);
9885 }
9886 }
9887
9888 // Update the pointer to point to the next element that needs to be mapped,
9889 // and check whether we have mapped all elements.
9890 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
9891 "omp.arraymap.next");
9892 PtrPHI->addIncoming(PtrNext, LastBB);
9893 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
9894 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
9895 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
9896
9897 emitBlock(ExitBB, MapperFn);
9898 // Emit array deletion if this is an array section and \p MapType indicates
9899 // that deletion is required.
9900 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
9901 MapType, MapName, ElementSize, DoneBB,
9902 /*IsInit=*/false);
9903
9904 // Emit the function exit block.
9905 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
9906
9907 Builder.CreateRetVoid();
9908 Builder.restoreIP(SavedIP);
9909 return MapperFn;
9910}
9911
9913 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
9914 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
9915 bool IsNonContiguous,
9916 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
9917
9918 // Reset the array information.
9919 Info.clearArrayInfo();
9920 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
9921
9922 if (Info.NumberOfPtrs == 0)
9923 return Error::success();
9924
9925 Builder.restoreIP(AllocaIP);
9926 // Detect if we have any capture size requiring runtime evaluation of the
9927 // size so that a constant array could be eventually used.
9928 ArrayType *PointerArrayType =
9929 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
9930
9931 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
9932 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
9933
9934 Info.RTArgs.PointersArray = Builder.CreateAlloca(
9935 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
9936 AllocaInst *MappersArray = Builder.CreateAlloca(
9937 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
9938 Info.RTArgs.MappersArray = MappersArray;
9939
9940 // If we don't have any VLA types or other types that require runtime
9941 // evaluation, we can use a constant array for the map sizes, otherwise we
9942 // need to fill up the arrays as we do for the pointers.
9943 Type *Int64Ty = Builder.getInt64Ty();
9944 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
9945 ConstantInt::get(Int64Ty, 0));
9946 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
9947 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
9948 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
9949 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
9950 if (IsNonContiguous &&
9951 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9952 CombinedInfo.Types[I] &
9953 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
9954 ConstSizes[I] =
9955 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
9956 else
9957 ConstSizes[I] = CI;
9958 continue;
9959 }
9960 }
9961 RuntimeSizes.set(I);
9962 }
9963
9964 if (RuntimeSizes.all()) {
9965 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9966 Info.RTArgs.SizesArray = Builder.CreateAlloca(
9967 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9968 restoreIPandDebugLoc(Builder, CodeGenIP);
9969 } else {
9970 auto *SizesArrayInit = ConstantArray::get(
9971 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
9972 std::string Name = createPlatformSpecificName({"offload_sizes"});
9973 auto *SizesArrayGbl =
9974 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
9975 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
9976 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
9977
9978 if (!RuntimeSizes.any()) {
9979 Info.RTArgs.SizesArray = SizesArrayGbl;
9980 } else {
9981 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9982 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
9983 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9984 AllocaInst *Buffer = Builder.CreateAlloca(
9985 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9986 Buffer->setAlignment(OffloadSizeAlign);
9987 restoreIPandDebugLoc(Builder, CodeGenIP);
9988 Builder.CreateMemCpy(
9989 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
9990 SizesArrayGbl, OffloadSizeAlign,
9991 Builder.getIntN(
9992 IndexSize,
9993 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
9994
9995 Info.RTArgs.SizesArray = Buffer;
9996 }
9997 restoreIPandDebugLoc(Builder, CodeGenIP);
9998 }
9999
10000 // The map types are always constant so we don't need to generate code to
10001 // fill arrays. Instead, we create an array constant.
10003 for (auto mapFlag : CombinedInfo.Types)
10004 Mapping.push_back(
10005 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10006 mapFlag));
10007 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
10008 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
10009 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
10010
10011 // The information types are only built if provided.
10012 if (!CombinedInfo.Names.empty()) {
10013 auto *MapNamesArrayGbl = createOffloadMapnames(
10014 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
10015 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
10016 Info.EmitDebug = true;
10017 } else {
10018 Info.RTArgs.MapNamesArray =
10020 Info.EmitDebug = false;
10021 }
10022
10023 // If there's a present map type modifier, it must not be applied to the end
10024 // of a region, so generate a separate map type array in that case.
10025 if (Info.separateBeginEndCalls()) {
10026 bool EndMapTypesDiffer = false;
10027 for (uint64_t &Type : Mapping) {
10028 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10029 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
10030 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10031 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
10032 EndMapTypesDiffer = true;
10033 }
10034 }
10035 if (EndMapTypesDiffer) {
10036 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
10037 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
10038 }
10039 }
10040
10041 PointerType *PtrTy = Builder.getPtrTy();
10042 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
10043 Value *BPVal = CombinedInfo.BasePointers[I];
10044 Value *BP = Builder.CreateConstInBoundsGEP2_32(
10045 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
10046 0, I);
10047 Builder.CreateAlignedStore(BPVal, BP,
10048 M.getDataLayout().getPrefTypeAlign(PtrTy));
10049
10050 if (Info.requiresDevicePointerInfo()) {
10051 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
10052 CodeGenIP = Builder.saveIP();
10053 Builder.restoreIP(AllocaIP);
10054 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
10055 Builder.restoreIP(CodeGenIP);
10056 if (DeviceAddrCB)
10057 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
10058 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
10059 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
10060 if (DeviceAddrCB)
10061 DeviceAddrCB(I, BP);
10062 }
10063 }
10064
10065 Value *PVal = CombinedInfo.Pointers[I];
10066 Value *P = Builder.CreateConstInBoundsGEP2_32(
10067 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
10068 I);
10069 // TODO: Check alignment correct.
10070 Builder.CreateAlignedStore(PVal, P,
10071 M.getDataLayout().getPrefTypeAlign(PtrTy));
10072
10073 if (RuntimeSizes.test(I)) {
10074 Value *S = Builder.CreateConstInBoundsGEP2_32(
10075 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
10076 /*Idx0=*/0,
10077 /*Idx1=*/I);
10078 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
10079 Int64Ty,
10080 /*isSigned=*/true),
10081 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
10082 }
10083 // Fill up the mapper array.
10084 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
10085 Value *MFunc = ConstantPointerNull::get(PtrTy);
10086
10087 auto CustomMFunc = CustomMapperCB(I);
10088 if (!CustomMFunc)
10089 return CustomMFunc.takeError();
10090 if (*CustomMFunc)
10091 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
10092
10093 Value *MAddr = Builder.CreateInBoundsGEP(
10094 PointerArrayType, MappersArray,
10095 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
10096 Builder.CreateAlignedStore(
10097 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
10098 }
10099
10100 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
10101 Info.NumberOfPtrs == 0)
10102 return Error::success();
10103 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
10104 return Error::success();
10105}
10106
10108 BasicBlock *CurBB = Builder.GetInsertBlock();
10109
10110 if (!CurBB || CurBB->getTerminator()) {
10111 // If there is no insert point or the previous block is already
10112 // terminated, don't touch it.
10113 } else {
10114 // Otherwise, create a fall-through branch.
10115 Builder.CreateBr(Target);
10116 }
10117
10118 Builder.ClearInsertionPoint();
10119}
10120
10122 bool IsFinished) {
10123 BasicBlock *CurBB = Builder.GetInsertBlock();
10124
10125 // Fall out of the current block (if necessary).
10126 emitBranch(BB);
10127
10128 if (IsFinished && BB->use_empty()) {
10129 BB->eraseFromParent();
10130 return;
10131 }
10132
10133 // Place the block after the current block, if possible, or else at
10134 // the end of the function.
10135 if (CurBB && CurBB->getParent())
10136 CurFn->insert(std::next(CurBB->getIterator()), BB);
10137 else
10138 CurFn->insert(CurFn->end(), BB);
10139 Builder.SetInsertPoint(BB);
10140}
10141
10143 BodyGenCallbackTy ElseGen,
10144 InsertPointTy AllocaIP) {
10145 // If the condition constant folds and can be elided, try to avoid emitting
10146 // the condition and the dead arm of the if/else.
10147 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
10148 auto CondConstant = CI->getSExtValue();
10149 if (CondConstant)
10150 return ThenGen(AllocaIP, Builder.saveIP());
10151
10152 return ElseGen(AllocaIP, Builder.saveIP());
10153 }
10154
10155 Function *CurFn = Builder.GetInsertBlock()->getParent();
10156
10157 // Otherwise, the condition did not fold, or we couldn't elide it. Just
10158 // emit the conditional branch.
10159 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
10160 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
10161 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
10162 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
10163 // Emit the 'then' code.
10164 emitBlock(ThenBlock, CurFn);
10165 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
10166 return Err;
10167 emitBranch(ContBlock);
10168 // Emit the 'else' code if present.
10169 // There is no need to emit line number for unconditional branch.
10170 emitBlock(ElseBlock, CurFn);
10171 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
10172 return Err;
10173 // There is no need to emit line number for unconditional branch.
10174 emitBranch(ContBlock);
10175 // Emit the continuation block for code after the if.
10176 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
10177 return Error::success();
10178}
10179
// Decide whether an explicit OpenMP flush must be emitted after an atomic
// construct of kind \p AK with memory ordering \p AO, and emit the flush
// runtime call if so. Returns true iff a flush was emitted.
//
// NOTE(review): this listing is a rendered extraction with hyperlinked lines
// stripped — the leading assert text, the declaration of `FlushAO`, the
// `if (AO == ...)` guards and several `case` labels of the inner switch are
// missing here. Verify against the original OMPIRBuilder.cpp before editing.
10180bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
10181    const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
10184         "Unexpected Atomic Ordering.");
10185
10186  bool Flush = false;
10188
10189  switch (AK) {
  // Atomic read: a flush with acquire semantics is required (the guarding
  // ordering check is missing from this extract — presumably it tests for
  // acquire-or-stronger orderings; confirm against the original source).
10190  case Read:
10193      FlushAO = AtomicOrdering::Acquire;
10194      Flush = true;
10195    }
10196    break;
  // Write / compare / update: a flush with release semantics is required
  // (guard condition likewise stripped from this extract).
10197  case Write:
10198  case Compare:
10199  case Update:
10202      FlushAO = AtomicOrdering::Release;
10203      Flush = true;
10204    }
10205    break;
  // Capture: the flush ordering depends on the exact requested ordering
  // (the `case` labels of this inner switch were stripped by the extraction).
10206  case Capture:
10207    switch (AO) {
10209      FlushAO = AtomicOrdering::Acquire;
10210      Flush = true;
10211      break;
10213      FlushAO = AtomicOrdering::Release;
10214      Flush = true;
10215      break;
10219      Flush = true;
10220      break;
10221    default:
10222      // do nothing - leave silently.
10223      break;
10224    }
10225  }
10226
10227  if (Flush) {
10228    // Currently Flush RT call still doesn't take memory_ordering, so for when
10229    // that happens, this tries to do the resolution of which atomic ordering
10230    // to use with but issue the flush call
10231    // TODO: pass `FlushAO` after memory ordering support is added
10232    (void)FlushAO;
10233    emitFlush(Loc);
10234  }
10235
10236  // for AO == AtomicOrdering::Monotonic and all other case combinations
10237  // do nothing
10238  return Flush;
10239}
10240
10244 AtomicOrdering AO, InsertPointTy AllocaIP) {
10245 if (!updateToLocation(Loc))
10246 return Loc.IP;
10247
10248 assert(X.Var->getType()->isPointerTy() &&
10249 "OMP Atomic expects a pointer to target memory");
10250 Type *XElemTy = X.ElemTy;
10251 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10252 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10253 "OMP atomic read expected a scalar type");
10254
10255 Value *XRead = nullptr;
10256
10257 if (XElemTy->isIntegerTy()) {
10258 LoadInst *XLD =
10259 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
10260 XLD->setAtomic(AO);
10261 XRead = cast<Value>(XLD);
10262 } else if (XElemTy->isStructTy()) {
10263 // FIXME: Add checks to ensure __atomic_load is emitted iff the
10264 // target does not support `atomicrmw` of the size of the struct
10265 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10266 OldVal->setAtomic(AO);
10267 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10268 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10269 OpenMPIRBuilder::AtomicInfo atomicInfo(
10270 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10271 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10272 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
10273 XRead = AtomicLoadRes.first;
10274 OldVal->eraseFromParent();
10275 } else {
10276 // We need to perform atomic op as integer
10277 IntegerType *IntCastTy =
10278 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10279 LoadInst *XLoad =
10280 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
10281 XLoad->setAtomic(AO);
10282 if (XElemTy->isFloatingPointTy()) {
10283 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
10284 } else {
10285 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
10286 }
10287 }
10288 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
10289 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
10290 return Builder.saveIP();
10291}
10292
10295 AtomicOpValue &X, Value *Expr,
10296 AtomicOrdering AO, InsertPointTy AllocaIP) {
10297 if (!updateToLocation(Loc))
10298 return Loc.IP;
10299
10300 assert(X.Var->getType()->isPointerTy() &&
10301 "OMP Atomic expects a pointer to target memory");
10302 Type *XElemTy = X.ElemTy;
10303 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10304 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10305 "OMP atomic write expected a scalar type");
10306
10307 if (XElemTy->isIntegerTy()) {
10308 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
10309 XSt->setAtomic(AO);
10310 } else if (XElemTy->isStructTy()) {
10311 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10312 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10313 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10314 OpenMPIRBuilder::AtomicInfo atomicInfo(
10315 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10316 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10317 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
10318 OldVal->eraseFromParent();
10319 } else {
10320 // We need to bitcast and perform atomic op as integers
10321 IntegerType *IntCastTy =
10322 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10323 Value *ExprCast =
10324 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
10325 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
10326 XSt->setAtomic(AO);
10327 }
10328
10329 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
10330 return Builder.saveIP();
10331}
10332
10335 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
10336 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
10337 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10338 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
10339 if (!updateToLocation(Loc))
10340 return Loc.IP;
10341
10342 LLVM_DEBUG({
10343 Type *XTy = X.Var->getType();
10344 assert(XTy->isPointerTy() &&
10345 "OMP Atomic expects a pointer to target memory");
10346 Type *XElemTy = X.ElemTy;
10347 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10348 XElemTy->isPointerTy()) &&
10349 "OMP atomic update expected a scalar type");
10350 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
10351 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
10352 "OpenMP atomic does not support LT or GT operations");
10353 });
10354
10355 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
10356 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
10357 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
10358 if (!AtomicResult)
10359 return AtomicResult.takeError();
10360 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
10361 return Builder.saveIP();
10362}
10363
10364// FIXME: Duplicating AtomicExpand
// Emit the non-atomic arithmetic equivalent of the given atomicrmw binop,
// i.e. compute `Src1 <op> Src2` with ordinary IR instructions. Used to
// reconstruct the "updated" value next to an `atomicrmw` (which only yields
// the *old* value) so capture clauses can read the new value.
//
// NOTE(review): several `case` labels were stripped from this extracted
// listing (e.g. the label before the `CreateNeg(CreateAnd(...))` Nand
// lowering at orig. line 10374, and the labels between Min and the final
// llvm_unreachable). Verify against the original source before editing.
10365Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
10366                                               AtomicRMWInst::BinOp RMWOp) {
10367  switch (RMWOp) {
10368  case AtomicRMWInst::Add:
10369    return Builder.CreateAdd(Src1, Src2);
10370  case AtomicRMWInst::Sub:
10371    return Builder.CreateSub(Src1, Src2);
10372  case AtomicRMWInst::And:
10373    return Builder.CreateAnd(Src1, Src2);
  // Nand lowering: ~(Src1 & Src2), expressed as -(Src1 & Src2)? The `case`
  // label for this arm is missing from the extract — presumably
  // AtomicRMWInst::Nand; confirm against the original file.
10375    return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
10376  case AtomicRMWInst::Or:
10377    return Builder.CreateOr(Src1, Src2);
10378  case AtomicRMWInst::Xor:
10379    return Builder.CreateXor(Src1, Src2);
  // Min/Max (and the stripped labels grouped with them) have no single-
  // instruction arithmetic equivalent here and are rejected.
10384  case AtomicRMWInst::Max:
10385  case AtomicRMWInst::Min:
10396    llvm_unreachable("Unsupported atomic update operation");
10397  }
10398  llvm_unreachable("Unsupported atomic update operation");
10399}
10400
// Emit an atomic update of `*X` (element type \p XElemTy) with \p Expr.
// Returns the pair {old value, updated value} so callers implementing
// capture clauses can pick either. Three lowering strategies:
//   1. a single `atomicrmw` when RMWOp maps directly onto one (integers);
//   2. a libcall-based compare-exchange loop for struct types;
//   3. a generic cmpxchg retry loop, bitcasting through an integer of the
//      same width for float/pointer element types.
//
// NOTE(review): this is an extracted listing with hyperlinked lines
// stripped — among others the `AtomicOrdering AO` parameter line (orig.
// 10403), an rmw `case` label around 10412/10415, the `Failure` ordering
// declarations (orig. 10482-10483 / 10543-10544), and the `dyn_cast`
// condition lines of the two trailing `if (UnreachableInst *ExitTI = ...)`
// statements. Verify against the original OMPIRBuilder.cpp before editing.
10401Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
10402    InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
10404    AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
10405    bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10406  // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
10407  // or a complex datatype.
  // Decide whether a direct `atomicrmw` can be used instead of a cmpxchg
  // loop. Sub only qualifies when `x` is the LHS of the binop (x = x - e);
  // `e - x` cannot be expressed as an atomicrmw sub.
10408  bool emitRMWOp = false;
10409  switch (RMWOp) {
10410  case AtomicRMWInst::Add:
10411  case AtomicRMWInst::And:
10413  case AtomicRMWInst::Or:
10414  case AtomicRMWInst::Xor:
10416    emitRMWOp = XElemTy;
10417    break;
10418  case AtomicRMWInst::Sub:
10419    emitRMWOp = (IsXBinopExpr && XElemTy);
10420    break;
10421  default:
10422    emitRMWOp = false;
10423  }
  // The direct atomicrmw path is only taken for integer element types.
10424  emitRMWOp &= XElemTy->isIntegerTy();
10425
10426  std::pair<Value *, Value *> Res;
10427  if (emitRMWOp) {
    // Strategy 1: single atomicrmw instruction.
10428    AtomicRMWInst *RMWInst =
10429        Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
    // On AMDGPU, attach target metadata controlling denormal handling and
    // fine-grained/remote memory assumptions for the atomic.
10430    if (T.isAMDGPU()) {
10431      if (IsIgnoreDenormalMode)
10432        RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
10433                             llvm::MDNode::get(Builder.getContext(), {}));
10434      if (!IsFineGrainedMemory)
10435        RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
10436                             llvm::MDNode::get(Builder.getContext(), {}));
10437      if (!IsRemoteMemory)
10438        RMWInst->setMetadata("amdgpu.no.remote.memory",
10439                             llvm::MDNode::get(Builder.getContext(), {}));
10440    }
10441    Res.first = RMWInst;
10442    // not needed except in case of postfix captures. Generate anyway for
10443    // consistency with the else part. Will be removed with any DCE pass.
10444    // AtomicRMWInst::Xchg does not have a coressponding instruction.
10445    if (RMWOp == AtomicRMWInst::Xchg)
10446      Res.second = Res.first;
10447    else
10448      Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
10449  } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
10450             XElemTy->isStructTy()) {
    // Strategy 2: struct element type — use the AtomicInfo libcall helpers
    // (__atomic_load / __atomic_compare_exchange) in a retry loop.
10451    LoadInst *OldVal =
10452        Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
10453    OldVal->setAtomic(AO);
10454    const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
10455    unsigned LoadSize =
10456        LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
10457
10458    OpenMPIRBuilder::AtomicInfo atomicInfo(
10459        &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10460        OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
10461    auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
    // Split the current block into CurBB -> ContBB (retry loop body) ->
    // ExitBB; a temporary `unreachable` terminator anchors the split when
    // the block had no terminator yet.
10462    BasicBlock *CurBB = Builder.GetInsertBlock();
10463    Instruction *CurBBTI = CurBB->getTerminator();
10464    CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10465    BasicBlock *ExitBB =
10466        CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
10467    BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
10468                                                X->getName() + ".atomic.cont");
10469    ContBB->getTerminator()->eraseFromParent();
10470    Builder.restoreIP(AllocaIP);
10471    AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
10472    NewAtomicAddr->setName(X->getName() + "x.new.val");
10473    Builder.SetInsertPoint(ContBB);
    // PHI carries the most recently observed value of *X across retries.
10474    llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
10475    PHI->addIncoming(AtomicLoadRes.first, CurBB);
10476    Value *OldExprVal = PHI;
    // The caller-provided callback computes the updated value from the old.
10477    Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
10478    if (!CBResult)
10479      return CBResult.takeError();
10480    Value *Upd = *CBResult;
10481    Builder.CreateStore(Upd, NewAtomicAddr);
    // (The `Failure` ordering declaration lines are missing from this
    // extract — presumably derived from AO via
    // AtomicCmpXchgInst::getStrongestFailureOrdering; confirm.)
10484    auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
10485        AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
10486    LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
10487    PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
    // Success -> exit; failure -> retry with the freshly observed value.
10488    Builder.CreateCondBr(Result.second, ExitBB, ContBB);
10489    OldVal->eraseFromParent();
10490    Res.first = OldExprVal;
10491    Res.second = Upd;
10492
    // Drop the temporary `unreachable` if we created one, and position the
    // builder in the exit block (the `dyn_cast` condition line is missing
    // from this extract).
10493    if (UnreachableInst *ExitTI =
10495      CurBBTI->eraseFromParent();
10496      Builder.SetInsertPoint(ExitBB);
10497    } else {
10498      Builder.SetInsertPoint(ExitTI);
10499    }
10500  } else {
    // Strategy 3: generic cmpxchg retry loop; float/pointer values are
    // round-tripped through a same-width integer type for the cmpxchg.
10501    IntegerType *IntCastTy =
10502        IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10503    LoadInst *OldVal =
10504        Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
10505    OldVal->setAtomic(AO);
10506    // CurBB
10507    //     |     /---\
10508    // ContBB    |
10509    //     |     \---/
10510    // ExitBB
10511    BasicBlock *CurBB = Builder.GetInsertBlock();
10512    Instruction *CurBBTI = CurBB->getTerminator();
10513    CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10514    BasicBlock *ExitBB =
10515        CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
10516    BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
10517                                                X->getName() + ".atomic.cont");
10518    ContBB->getTerminator()->eraseFromParent();
10519    Builder.restoreIP(AllocaIP);
10520    AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
10521    NewAtomicAddr->setName(X->getName() + "x.new.val");
10522    Builder.SetInsertPoint(ContBB);
10523    llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
10524    PHI->addIncoming(OldVal, CurBB);
10525    bool IsIntTy = XElemTy->isIntegerTy();
10526    Value *OldExprVal = PHI;
    // Present the old value to the callback in its natural type.
10527    if (!IsIntTy) {
10528      if (XElemTy->isFloatingPointTy()) {
10529        OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
10530                                           X->getName() + ".atomic.fltCast");
10531      } else {
10532        OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
10533                                            X->getName() + ".atomic.ptrCast");
10534      }
10535    }
10536
10537    Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
10538    if (!CBResult)
10539      return CBResult.takeError();
10540    Value *Upd = *CBResult;
    // Store the updated value and reload it as the integer cmpxchg operand.
10541    Builder.CreateStore(Upd, NewAtomicAddr);
10542    LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
    // (The `Failure` ordering declaration lines are missing from this
    // extract — confirm against the original source.)
10545    AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
10546        X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
10547    Result->setVolatile(VolatileX);
10548    Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
10549    Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10550    PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
10551    Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
10552
10553    Res.first = OldExprVal;
10554    Res.second = Upd;
10555
10556    // set Insertion point in exit block
10557    if (UnreachableInst *ExitTI =
10559      CurBBTI->eraseFromParent();
10560      Builder.SetInsertPoint(ExitBB);
10561    } else {
10562      Builder.SetInsertPoint(ExitTI);
10563    }
10564  }
10565
10566  return Res;
10567}
10568
10571 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
10572 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
10573 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
10574 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10575 if (!updateToLocation(Loc))
10576 return Loc.IP;
10577
10578 LLVM_DEBUG({
10579 Type *XTy = X.Var->getType();
10580 assert(XTy->isPointerTy() &&
10581 "OMP Atomic expects a pointer to target memory");
10582 Type *XElemTy = X.ElemTy;
10583 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10584 XElemTy->isPointerTy()) &&
10585 "OMP atomic capture expected a scalar type");
10586 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
10587 "OpenMP atomic does not support LT or GT operations");
10588 });
10589
10590 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
10591 // 'x' is simply atomically rewritten with 'expr'.
10592 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
10593 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
10594 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
10595 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
10596 if (!AtomicResult)
10597 return AtomicResult.takeError();
10598 Value *CapturedVal =
10599 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
10600 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
10601
10602 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
10603 return Builder.saveIP();
10604}
10605
10609 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
10610 bool IsFailOnly) {
10611
10613 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
10614 IsPostfixUpdate, IsFailOnly, Failure);
10615}
10616
10620 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
10621 bool IsFailOnly, AtomicOrdering Failure) {
10622
10623 if (!updateToLocation(Loc))
10624 return Loc.IP;
10625
10626 assert(X.Var->getType()->isPointerTy() &&
10627 "OMP atomic expects a pointer to target memory");
10628 // compare capture
10629 if (V.Var) {
10630 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
10631 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
10632 }
10633
10634 bool IsInteger = E->getType()->isIntegerTy();
10635
10636 if (Op == OMPAtomicCompareOp::EQ) {
10637 AtomicCmpXchgInst *Result = nullptr;
10638 if (!IsInteger) {
10639 IntegerType *IntCastTy =
10640 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
10641 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
10642 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
10643 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
10644 AO, Failure);
10645 } else {
10646 Result =
10647 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
10648 }
10649
10650 if (V.Var) {
10651 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
10652 if (!IsInteger)
10653 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
10654 assert(OldValue->getType() == V.ElemTy &&
10655 "OldValue and V must be of same type");
10656 if (IsPostfixUpdate) {
10657 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
10658 } else {
10659 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10660 if (IsFailOnly) {
10661 // CurBB----
10662 // | |
10663 // v |
10664 // ContBB |
10665 // | |
10666 // v |
10667 // ExitBB <-
10668 //
10669 // where ContBB only contains the store of old value to 'v'.
10670 BasicBlock *CurBB = Builder.GetInsertBlock();
10671 Instruction *CurBBTI = CurBB->getTerminator();
10672 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10673 BasicBlock *ExitBB = CurBB->splitBasicBlock(
10674 CurBBTI, X.Var->getName() + ".atomic.exit");
10675 BasicBlock *ContBB = CurBB->splitBasicBlock(
10676 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
10677 ContBB->getTerminator()->eraseFromParent();
10678 CurBB->getTerminator()->eraseFromParent();
10679
10680 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
10681
10682 Builder.SetInsertPoint(ContBB);
10683 Builder.CreateStore(OldValue, V.Var);
10684 Builder.CreateBr(ExitBB);
10685
10686 if (UnreachableInst *ExitTI =
10688 CurBBTI->eraseFromParent();
10689 Builder.SetInsertPoint(ExitBB);
10690 } else {
10691 Builder.SetInsertPoint(ExitTI);
10692 }
10693 } else {
10694 Value *CapturedValue =
10695 Builder.CreateSelect(SuccessOrFail, E, OldValue);
10696 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10697 }
10698 }
10699 }
10700 // The comparison result has to be stored.
10701 if (R.Var) {
10702 assert(R.Var->getType()->isPointerTy() &&
10703 "r.var must be of pointer type");
10704 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
10705
10706 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10707 Value *ResultCast = R.IsSigned
10708 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
10709 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
10710 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
10711 }
10712 } else {
10713 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
10714 "Op should be either max or min at this point");
10715 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
10716
10717 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
10718 // Let's take max as example.
10719 // OpenMP form:
10720 // x = x > expr ? expr : x;
10721 // LLVM form:
10722 // *ptr = *ptr > val ? *ptr : val;
10723 // We need to transform to LLVM form.
10724 // x = x <= expr ? x : expr;
10726 if (IsXBinopExpr) {
10727 if (IsInteger) {
10728 if (X.IsSigned)
10729 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
10731 else
10732 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
10734 } else {
10735 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
10737 }
10738 } else {
10739 if (IsInteger) {
10740 if (X.IsSigned)
10741 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
10743 else
10744 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
10746 } else {
10747 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
10749 }
10750 }
10751
10752 AtomicRMWInst *OldValue =
10753 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
10754 if (V.Var) {
10755 Value *CapturedValue = nullptr;
10756 if (IsPostfixUpdate) {
10757 CapturedValue = OldValue;
10758 } else {
10759 CmpInst::Predicate Pred;
10760 switch (NewOp) {
10761 case AtomicRMWInst::Max:
10762 Pred = CmpInst::ICMP_SGT;
10763 break;
10765 Pred = CmpInst::ICMP_UGT;
10766 break;
10768 Pred = CmpInst::FCMP_OGT;
10769 break;
10770 case AtomicRMWInst::Min:
10771 Pred = CmpInst::ICMP_SLT;
10772 break;
10774 Pred = CmpInst::ICMP_ULT;
10775 break;
10777 Pred = CmpInst::FCMP_OLT;
10778 break;
10779 default:
10780 llvm_unreachable("unexpected comparison op");
10781 }
10782 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
10783 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
10784 }
10785 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10786 }
10787 }
10788
10789 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
10790
10791 return Builder.saveIP();
10792}
10793
10796 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
10797 Value *NumTeamsUpper, Value *ThreadLimit,
10798 Value *IfExpr) {
10799 if (!updateToLocation(Loc))
10800 return InsertPointTy();
10801
10802 uint32_t SrcLocStrSize;
10803 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
10804 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
10805 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
10806
10807 // Outer allocation basicblock is the entry block of the current function.
10808 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
10809 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
10810 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
10811 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10812 }
10813
10814 // The current basic block is split into four basic blocks. After outlining,
10815 // they will be mapped as follows:
10816 // ```
10817 // def current_fn() {
10818 // current_basic_block:
10819 // br label %teams.exit
10820 // teams.exit:
10821 // ; instructions after teams
10822 // }
10823 //
10824 // def outlined_fn() {
10825 // teams.alloca:
10826 // br label %teams.body
10827 // teams.body:
10828 // ; instructions within teams body
10829 // }
10830 // ```
10831 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
10832 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
10833 BasicBlock *AllocaBB =
10834 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
10835
10836 bool SubClausesPresent =
10837 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
10838 // Push num_teams
10839 if (!Config.isTargetDevice() && SubClausesPresent) {
10840 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
10841 "if lowerbound is non-null, then upperbound must also be non-null "
10842 "for bounds on num_teams");
10843
10844 if (NumTeamsUpper == nullptr)
10845 NumTeamsUpper = Builder.getInt32(0);
10846
10847 if (NumTeamsLower == nullptr)
10848 NumTeamsLower = NumTeamsUpper;
10849
10850 if (IfExpr) {
10851 assert(IfExpr->getType()->isIntegerTy() &&
10852 "argument to if clause must be an integer value");
10853
10854 // upper = ifexpr ? upper : 1
10855 if (IfExpr->getType() != Int1)
10856 IfExpr = Builder.CreateICmpNE(IfExpr,
10857 ConstantInt::get(IfExpr->getType(), 0));
10858 NumTeamsUpper = Builder.CreateSelect(
10859 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
10860
10861 // lower = ifexpr ? lower : 1
10862 NumTeamsLower = Builder.CreateSelect(
10863 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
10864 }
10865
10866 if (ThreadLimit == nullptr)
10867 ThreadLimit = Builder.getInt32(0);
10868
10869 // The __kmpc_push_num_teams_51 function expects int32 as the arguments. So,
10870 // truncate or sign extend the passed values to match the int32 parameters.
10871 Value *NumTeamsLowerInt32 =
10872 Builder.CreateSExtOrTrunc(NumTeamsLower, Builder.getInt32Ty());
10873 Value *NumTeamsUpperInt32 =
10874 Builder.CreateSExtOrTrunc(NumTeamsUpper, Builder.getInt32Ty());
10875 Value *ThreadLimitInt32 =
10876 Builder.CreateSExtOrTrunc(ThreadLimit, Builder.getInt32Ty());
10877
10878 Value *ThreadNum = getOrCreateThreadID(Ident);
10879
10881 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
10882 {Ident, ThreadNum, NumTeamsLowerInt32, NumTeamsUpperInt32,
10883 ThreadLimitInt32});
10884 }
10885 // Generate the body of teams.
10886 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10887 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10888 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10889 return Err;
10890
10891 OutlineInfo OI;
10892 OI.EntryBB = AllocaBB;
10893 OI.ExitBB = ExitBB;
10894 OI.OuterAllocaBB = &OuterAllocaBB;
10895
10896 // Insert fake values for global tid and bound tid.
10898 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
10900 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
10902 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
10903
10904 auto HostPostOutlineCB = [this, Ident,
10905 ToBeDeleted](Function &OutlinedFn) mutable {
10906 // The stale call instruction will be replaced with a new call instruction
10907 // for runtime call with the outlined function.
10908
10909 assert(OutlinedFn.hasOneUse() &&
10910 "there must be a single user for the outlined function");
10911 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
10912 ToBeDeleted.push_back(StaleCI);
10913
10914 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
10915 "Outlined function must have two or three arguments only");
10916
10917 bool HasShared = OutlinedFn.arg_size() == 3;
10918
10919 OutlinedFn.getArg(0)->setName("global.tid.ptr");
10920 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
10921 if (HasShared)
10922 OutlinedFn.getArg(2)->setName("data");
10923
10924 // Call to the runtime function for teams in the current function.
10925 assert(StaleCI && "Error while outlining - no CallInst user found for the "
10926 "outlined function.");
10927 Builder.SetInsertPoint(StaleCI);
10928 SmallVector<Value *> Args = {
10929 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
10930 if (HasShared)
10931 Args.push_back(StaleCI->getArgOperand(2));
10934 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
10935 Args);
10936
10937 for (Instruction *I : llvm::reverse(ToBeDeleted))
10938 I->eraseFromParent();
10939 };
10940
10941 if (!Config.isTargetDevice())
10942 OI.PostOutlineCB = HostPostOutlineCB;
10943
10944 addOutlineInfo(std::move(OI));
10945
10946 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10947
10948 return Builder.saveIP();
10949}
10950
10953 InsertPointTy OuterAllocaIP,
10954 BodyGenCallbackTy BodyGenCB) {
10955 if (!updateToLocation(Loc))
10956 return InsertPointTy();
10957
10958 BasicBlock *OuterAllocaBB = OuterAllocaIP.getBlock();
10959
10960 if (OuterAllocaBB == Builder.GetInsertBlock()) {
10961 BasicBlock *BodyBB =
10962 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
10963 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10964 }
10965 BasicBlock *ExitBB =
10966 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
10967 BasicBlock *BodyBB =
10968 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
10969 BasicBlock *AllocaBB =
10970 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
10971
10972 // Generate the body of distribute clause
10973 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10974 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10975 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10976 return Err;
10977
10978 // When using target we use different runtime functions which require a
10979 // callback.
10980 if (Config.isTargetDevice()) {
10981 OutlineInfo OI;
10982 OI.OuterAllocaBB = OuterAllocaIP.getBlock();
10983 OI.EntryBB = AllocaBB;
10984 OI.ExitBB = ExitBB;
10985
10986 addOutlineInfo(std::move(OI));
10987 }
10988 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10989
10990 return Builder.saveIP();
10991}
10992
10995 std::string VarName) {
10996 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
10998 Names.size()),
10999 Names);
11000 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
11001 M, MapNamesArrayInit->getType(),
11002 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
11003 VarName);
11004 return MapNamesArrayGlobal;
11005}
11006
11007// Create all simple and struct types exposed by the runtime and remember
11008// the llvm::PointerTypes of them for easy access later.
11009void OpenMPIRBuilder::initializeTypes(Module &M) {
11010 LLVMContext &Ctx = M.getContext();
11011 StructType *T;
11012 unsigned DefaultTargetAS = Config.getDefaultTargetAS();
11013 unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace();
11014#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
11015#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
11016 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
11017 VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
11018#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
11019 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
11020 VarName##Ptr = PointerType::get(Ctx, ProgramAS);
11021#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
11022 T = StructType::getTypeByName(Ctx, StructName); \
11023 if (!T) \
11024 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
11025 VarName = T; \
11026 VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
11027#include "llvm/Frontend/OpenMP/OMPKinds.def"
11028}
11029
11032 SmallVectorImpl<BasicBlock *> &BlockVector) {
11034 BlockSet.insert(EntryBB);
11035 BlockSet.insert(ExitBB);
11036
11037 Worklist.push_back(EntryBB);
11038 while (!Worklist.empty()) {
11039 BasicBlock *BB = Worklist.pop_back_val();
11040 BlockVector.push_back(BB);
11041 for (BasicBlock *SuccBB : successors(BB))
11042 if (BlockSet.insert(SuccBB).second)
11043 Worklist.push_back(SuccBB);
11044 }
11045}
11046
11048 uint64_t Size, int32_t Flags,
11050 StringRef Name) {
11051 if (!Config.isGPU()) {
11054 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
11055 return;
11056 }
11057 // TODO: Add support for global variables on the device after declare target
11058 // support.
11059 Function *Fn = dyn_cast<Function>(Addr);
11060 if (!Fn)
11061 return;
11062
11063 // Add a function attribute for the kernel.
11064 Fn->addFnAttr("kernel");
11065 if (T.isAMDGCN())
11066 Fn->addFnAttr("uniform-work-group-size");
11067 Fn->addFnAttr(Attribute::MustProgress);
11068}
11069
11070// We only generate metadata for function that contain target regions.
11073
11074 // If there are no entries, we don't need to do anything.
11075 if (OffloadInfoManager.empty())
11076 return;
11077
11078 LLVMContext &C = M.getContext();
11081 16>
11082 OrderedEntries(OffloadInfoManager.size());
11083
11084 // Auxiliary methods to create metadata values and strings.
11085 auto &&GetMDInt = [this](unsigned V) {
11086 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
11087 };
11088
11089 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
11090
11091 // Create the offloading info metadata node.
11092 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
11093 auto &&TargetRegionMetadataEmitter =
11094 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
11095 const TargetRegionEntryInfo &EntryInfo,
11097 // Generate metadata for target regions. Each entry of this metadata
11098 // contains:
11099 // - Entry 0 -> Kind of this type of metadata (0).
11100 // - Entry 1 -> Device ID of the file where the entry was identified.
11101 // - Entry 2 -> File ID of the file where the entry was identified.
11102 // - Entry 3 -> Mangled name of the function where the entry was
11103 // identified.
11104 // - Entry 4 -> Line in the file where the entry was identified.
11105 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
11106 // - Entry 6 -> Order the entry was created.
11107 // The first element of the metadata node is the kind.
11108 Metadata *Ops[] = {
11109 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
11110 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
11111 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
11112 GetMDInt(E.getOrder())};
11113
11114 // Save this entry in the right position of the ordered entries array.
11115 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
11116
11117 // Add metadata to the named metadata node.
11118 MD->addOperand(MDNode::get(C, Ops));
11119 };
11120
11121 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
11122
11123 // Create function that emits metadata for each device global variable entry;
11124 auto &&DeviceGlobalVarMetadataEmitter =
11125 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
11126 StringRef MangledName,
11128 // Generate metadata for global variables. Each entry of this metadata
11129 // contains:
11130 // - Entry 0 -> Kind of this type of metadata (1).
11131 // - Entry 1 -> Mangled name of the variable.
11132 // - Entry 2 -> Declare target kind.
11133 // - Entry 3 -> Order the entry was created.
11134 // The first element of the metadata node is the kind.
11135 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
11136 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
11137
11138 // Save this entry in the right position of the ordered entries array.
11139 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
11140 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
11141
11142 // Add metadata to the named metadata node.
11143 MD->addOperand(MDNode::get(C, Ops));
11144 };
11145
11146 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
11147 DeviceGlobalVarMetadataEmitter);
11148
11149 for (const auto &E : OrderedEntries) {
11150 assert(E.first && "All ordered entries must exist!");
11151 if (const auto *CE =
11153 E.first)) {
11154 if (!CE->getID() || !CE->getAddress()) {
11155 // Do not blame the entry if the parent funtion is not emitted.
11156 TargetRegionEntryInfo EntryInfo = E.second;
11157 StringRef FnName = EntryInfo.ParentName;
11158 if (!M.getNamedValue(FnName))
11159 continue;
11160 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
11161 continue;
11162 }
11163 createOffloadEntry(CE->getID(), CE->getAddress(),
11164 /*Size=*/0, CE->getFlags(),
11166 } else if (const auto *CE = dyn_cast<
11168 E.first)) {
11171 CE->getFlags());
11172 switch (Flags) {
11175 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
11176 continue;
11177 if (!CE->getAddress()) {
11178 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
11179 continue;
11180 }
11181 // The vaiable has no definition - no need to add the entry.
11182 if (CE->getVarSize() == 0)
11183 continue;
11184 break;
11186 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
11187 (!Config.isTargetDevice() && CE->getAddress())) &&
11188 "Declaret target link address is set.");
11189 if (Config.isTargetDevice())
11190 continue;
11191 if (!CE->getAddress()) {
11193 continue;
11194 }
11195 break;
11198 if (!CE->getAddress()) {
11199 ErrorFn(EMIT_MD_GLOBAL_VAR_INDIRECT_ERROR, E.second);
11200 continue;
11201 }
11202 break;
11203 default:
11204 break;
11205 }
11206
11207 // Hidden or internal symbols on the device are not externally visible.
11208 // We should not attempt to register them by creating an offloading
11209 // entry. Indirect variables are handled separately on the device.
11210 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
11211 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
11212 (Flags !=
11214 Flags != OffloadEntriesInfoManager::
11215 OMPTargetGlobalVarEntryIndirectVTable))
11216 continue;
11217
11218 // Indirect globals need to use a special name that doesn't match the name
11219 // of the associated host global.
11221 Flags ==
11223 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11224 Flags, CE->getLinkage(), CE->getVarName());
11225 else
11226 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11227 Flags, CE->getLinkage());
11228
11229 } else {
11230 llvm_unreachable("Unsupported entry kind.");
11231 }
11232 }
11233
11234 // Emit requires directive globals to a special entry so the runtime can
11235 // register them when the device image is loaded.
11236 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
11237 // entries should be redesigned to better suit this use-case.
11238 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
11242 ".requires", /*Size=*/0,
11244 Config.getRequiresFlags());
11245}
11246
11249 unsigned FileID, unsigned Line, unsigned Count) {
11250 raw_svector_ostream OS(Name);
11251 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
11252 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
11253 if (Count)
11254 OS << "_" << Count;
11255}
11256
11258 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
11259 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
11261 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
11262 EntryInfo.Line, NewCount);
11263}
11264
11267 vfs::FileSystem &VFS,
11268 StringRef ParentName) {
11269 sys::fs::UniqueID ID(0xdeadf17e, 0);
11270 auto FileIDInfo = CallBack();
11271 uint64_t FileID = 0;
11272 if (ErrorOr<vfs::Status> Status = VFS.status(std::get<0>(FileIDInfo))) {
11273 ID = Status->getUniqueID();
11274 FileID = Status->getUniqueID().getFile();
11275 } else {
11276 // If the inode ID could not be determined, create a hash value
11277 // the current file name and use that as an ID.
11278 FileID = hash_value(std::get<0>(FileIDInfo));
11279 }
11280
11281 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
11282 std::get<1>(FileIDInfo));
11283}
11284
11286 unsigned Offset = 0;
11287 for (uint64_t Remain =
11288 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11290 !(Remain & 1); Remain = Remain >> 1)
11291 Offset++;
11292 return Offset;
11293}
11294
11297 // Rotate by getFlagMemberOffset() bits.
11298 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
11299 << getFlagMemberOffset());
11300}
11301
11304 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
11305 // If the entry is PTR_AND_OBJ but has not been marked with the special
11306 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
11307 // marked as MEMBER_OF.
11308 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11310 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11313 return;
11314
11315 // Entries with ATTACH are not members-of anything. They are handled
11316 // separately by the runtime after other maps have been handled.
11317 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11319 return;
11320
11321 // Reset the placeholder value to prepare the flag for the assignment of the
11322 // proper MEMBER_OF value.
11323 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
11324 Flags |= MemberOfFlag;
11325}
11326
11330 bool IsDeclaration, bool IsExternallyVisible,
11331 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
11332 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
11333 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
11334 std::function<Constant *()> GlobalInitializer,
11335 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
11336 // TODO: convert this to utilise the IRBuilder Config rather than
11337 // a passed down argument.
11338 if (OpenMPSIMD)
11339 return nullptr;
11340
11343 CaptureClause ==
11345 Config.hasRequiresUnifiedSharedMemory())) {
11346 SmallString<64> PtrName;
11347 {
11348 raw_svector_ostream OS(PtrName);
11349 OS << MangledName;
11350 if (!IsExternallyVisible)
11351 OS << format("_%x", EntryInfo.FileID);
11352 OS << "_decl_tgt_ref_ptr";
11353 }
11354
11355 Value *Ptr = M.getNamedValue(PtrName);
11356
11357 if (!Ptr) {
11358 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
11359 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
11360
11361 auto *GV = cast<GlobalVariable>(Ptr);
11362 GV->setLinkage(GlobalValue::WeakAnyLinkage);
11363
11364 if (!Config.isTargetDevice()) {
11365 if (GlobalInitializer)
11366 GV->setInitializer(GlobalInitializer());
11367 else
11368 GV->setInitializer(GlobalValue);
11369 }
11370
11372 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
11373 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
11374 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
11375 }
11376
11377 return cast<Constant>(Ptr);
11378 }
11379
11380 return nullptr;
11381}
11382
11386 bool IsDeclaration, bool IsExternallyVisible,
11387 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
11388 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
11389 std::vector<Triple> TargetTriple,
11390 std::function<Constant *()> GlobalInitializer,
11391 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
11392 Constant *Addr) {
11394 (TargetTriple.empty() && !Config.isTargetDevice()))
11395 return;
11396
11398 StringRef VarName;
11399 int64_t VarSize;
11401
11403 CaptureClause ==
11405 !Config.hasRequiresUnifiedSharedMemory()) {
11407 VarName = MangledName;
11408 GlobalValue *LlvmVal = M.getNamedValue(VarName);
11409
11410 if (!IsDeclaration)
11411 VarSize = divideCeil(
11412 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
11413 else
11414 VarSize = 0;
11415 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
11416
11417 // This is a workaround carried over from Clang which prevents undesired
11418 // optimisation of internal variables.
11419 if (Config.isTargetDevice() &&
11420 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
11421 // Do not create a "ref-variable" if the original is not also available
11422 // on the host.
11423 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
11424 return;
11425
11426 std::string RefName = createPlatformSpecificName({VarName, "ref"});
11427
11428 if (!M.getNamedValue(RefName)) {
11429 Constant *AddrRef =
11430 getOrCreateInternalVariable(Addr->getType(), RefName);
11431 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
11432 GvAddrRef->setConstant(true);
11433 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
11434 GvAddrRef->setInitializer(Addr);
11435 GeneratedRefs.push_back(GvAddrRef);
11436 }
11437 }
11438 } else {
11441 else
11443
11444 if (Config.isTargetDevice()) {
11445 VarName = (Addr) ? Addr->getName() : "";
11446 Addr = nullptr;
11447 } else {
11449 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
11450 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
11451 LlvmPtrTy, GlobalInitializer, VariableLinkage);
11452 VarName = (Addr) ? Addr->getName() : "";
11453 }
11454 VarSize = M.getDataLayout().getPointerSize();
11456 }
11457
11458 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
11459 Flags, Linkage);
11460}
11461
11462/// Loads all the offload entries information from the host IR
11463/// metadata.
11465 // If we are in target mode, load the metadata from the host IR. This code has
11466 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
11467
11468 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
11469 if (!MD)
11470 return;
11471
11472 for (MDNode *MN : MD->operands()) {
11473 auto &&GetMDInt = [MN](unsigned Idx) {
11474 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
11475 return cast<ConstantInt>(V->getValue())->getZExtValue();
11476 };
11477
11478 auto &&GetMDString = [MN](unsigned Idx) {
11479 auto *V = cast<MDString>(MN->getOperand(Idx));
11480 return V->getString();
11481 };
11482
11483 switch (GetMDInt(0)) {
11484 default:
11485 llvm_unreachable("Unexpected metadata!");
11486 break;
11487 case OffloadEntriesInfoManager::OffloadEntryInfo::
11488 OffloadingEntryInfoTargetRegion: {
11489 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
11490 /*DeviceID=*/GetMDInt(1),
11491 /*FileID=*/GetMDInt(2),
11492 /*Line=*/GetMDInt(4),
11493 /*Count=*/GetMDInt(5));
11494 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
11495 /*Order=*/GetMDInt(6));
11496 break;
11497 }
11498 case OffloadEntriesInfoManager::OffloadEntryInfo::
11499 OffloadingEntryInfoDeviceGlobalVar:
11500 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
11501 /*MangledName=*/GetMDString(1),
11503 /*Flags=*/GetMDInt(2)),
11504 /*Order=*/GetMDInt(3));
11505 break;
11506 }
11507 }
11508}
11509
11511 StringRef HostFilePath) {
11512 if (HostFilePath.empty())
11513 return;
11514
11515 auto Buf = VFS.getBufferForFile(HostFilePath);
11516 if (std::error_code Err = Buf.getError()) {
11517 report_fatal_error(("error opening host file from host file path inside of "
11518 "OpenMPIRBuilder: " +
11519 Err.message())
11520 .c_str());
11521 }
11522
11523 LLVMContext Ctx;
11525 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
11526 if (std::error_code Err = M.getError()) {
11528 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
11529 .c_str());
11530 }
11531
11532 loadOffloadInfoMetadata(*M.get());
11533}
11534
11535//===----------------------------------------------------------------------===//
11536// OffloadEntriesInfoManager
11537//===----------------------------------------------------------------------===//
11538
11540 return OffloadEntriesTargetRegion.empty() &&
11541 OffloadEntriesDeviceGlobalVar.empty();
11542}
11543
11544unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
11545 const TargetRegionEntryInfo &EntryInfo) const {
11546 auto It = OffloadEntriesTargetRegionCount.find(
11547 getTargetRegionEntryCountKey(EntryInfo));
11548 if (It == OffloadEntriesTargetRegionCount.end())
11549 return 0;
11550 return It->second;
11551}
11552
11553void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
11554 const TargetRegionEntryInfo &EntryInfo) {
11555 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
11556 EntryInfo.Count + 1;
11557}
11558
11559/// Initialize target region entry.
11561 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
11562 OffloadEntriesTargetRegion[EntryInfo] =
11563 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
11565 ++OffloadingEntriesNum;
11566}
11567
11569 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
11571 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
11572
11573 // Update the EntryInfo with the next available count for this location.
11574 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
11575
11576 // If we are emitting code for a target, the entry is already initialized,
11577 // only has to be registered.
11578 if (OMPBuilder->Config.isTargetDevice()) {
11579 // This could happen if the device compilation is invoked standalone.
11580 if (!hasTargetRegionEntryInfo(EntryInfo)) {
11581 return;
11582 }
11583 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
11584 Entry.setAddress(Addr);
11585 Entry.setID(ID);
11586 Entry.setFlags(Flags);
11587 } else {
11589 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
11590 return;
11591 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
11592 "Target region entry already registered!");
11593 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
11594 OffloadEntriesTargetRegion[EntryInfo] = Entry;
11595 ++OffloadingEntriesNum;
11596 }
11597 incrementTargetRegionEntryInfoCount(EntryInfo);
11598}
11599
11601 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
11602
11603 // Update the EntryInfo with the next available count for this location.
11604 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
11605
11606 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
11607 if (It == OffloadEntriesTargetRegion.end()) {
11608 return false;
11609 }
11610 // Fail if this entry is already registered.
11611 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
11612 return false;
11613 return true;
11614}
11615
11617 const OffloadTargetRegionEntryInfoActTy &Action) {
11618 // Scan all target region entries and perform the provided action.
11619 for (const auto &It : OffloadEntriesTargetRegion) {
11620 Action(It.first, It.second);
11621 }
11622}
11623
11625 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
11626 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
11627 ++OffloadingEntriesNum;
11628}
11629
11631 StringRef VarName, Constant *Addr, int64_t VarSize,
11633 if (OMPBuilder->Config.isTargetDevice()) {
11634 // This could happen if the device compilation is invoked standalone.
11635 if (!hasDeviceGlobalVarEntryInfo(VarName))
11636 return;
11637 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
11638 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
11639 if (Entry.getVarSize() == 0) {
11640 Entry.setVarSize(VarSize);
11641 Entry.setLinkage(Linkage);
11642 }
11643 return;
11644 }
11645 Entry.setVarSize(VarSize);
11646 Entry.setLinkage(Linkage);
11647 Entry.setAddress(Addr);
11648 } else {
11649 if (hasDeviceGlobalVarEntryInfo(VarName)) {
11650 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
11651 assert(Entry.isValid() && Entry.getFlags() == Flags &&
11652 "Entry not initialized!");
11653 if (Entry.getVarSize() == 0) {
11654 Entry.setVarSize(VarSize);
11655 Entry.setLinkage(Linkage);
11656 }
11657 return;
11658 }
11660 Flags ==
11662 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
11663 Addr, VarSize, Flags, Linkage,
11664 VarName.str());
11665 else
11666 OffloadEntriesDeviceGlobalVar.try_emplace(
11667 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
11668 ++OffloadingEntriesNum;
11669 }
11670}
11671
11674 // Scan all target region entries and perform the provided action.
11675 for (const auto &E : OffloadEntriesDeviceGlobalVar)
11676 Action(E.getKey(), E.getValue());
11677}
11678
11679//===----------------------------------------------------------------------===//
11680// CanonicalLoopInfo
11681//===----------------------------------------------------------------------===//
11682
11683void CanonicalLoopInfo::collectControlBlocks(
11685 // We only count those BBs as control block for which we do not need to
11686 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
11687 // flow. For consistency, this also means we do not add the Body block, which
11688 // is just the entry to the body code.
11689 BBs.reserve(BBs.size() + 6);
11690 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
11691}
11692
11694 assert(isValid() && "Requires a valid canonical loop");
11695 for (BasicBlock *Pred : predecessors(Header)) {
11696 if (Pred != Latch)
11697 return Pred;
11698 }
11699 llvm_unreachable("Missing preheader");
11700}
11701
11702void CanonicalLoopInfo::setTripCount(Value *TripCount) {
11703 assert(isValid() && "Requires a valid canonical loop");
11704
11705 Instruction *CmpI = &getCond()->front();
11706 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
11707 CmpI->setOperand(1, TripCount);
11708
11709#ifndef NDEBUG
11710 assertOK();
11711#endif
11712}
11713
11714void CanonicalLoopInfo::mapIndVar(
11715 llvm::function_ref<Value *(Instruction *)> Updater) {
11716 assert(isValid() && "Requires a valid canonical loop");
11717
11718 Instruction *OldIV = getIndVar();
11719
11720 // Record all uses excluding those introduced by the updater. Uses by the
11721 // CanonicalLoopInfo itself to keep track of the number of iterations are
11722 // excluded.
11723 SmallVector<Use *> ReplacableUses;
11724 for (Use &U : OldIV->uses()) {
11725 auto *User = dyn_cast<Instruction>(U.getUser());
11726 if (!User)
11727 continue;
11728 if (User->getParent() == getCond())
11729 continue;
11730 if (User->getParent() == getLatch())
11731 continue;
11732 ReplacableUses.push_back(&U);
11733 }
11734
11735 // Run the updater that may introduce new uses
11736 Value *NewIV = Updater(OldIV);
11737
11738 // Replace the old uses with the value returned by the updater.
11739 for (Use *U : ReplacableUses)
11740 U->set(NewIV);
11741
11742#ifndef NDEBUG
11743 assertOK();
11744#endif
11745}
11746
11748#ifndef NDEBUG
11749 // No constraints if this object currently does not describe a loop.
11750 if (!isValid())
11751 return;
11752
11753 BasicBlock *Preheader = getPreheader();
11754 BasicBlock *Body = getBody();
11755 BasicBlock *After = getAfter();
11756
11757 // Verify standard control-flow we use for OpenMP loops.
11758 assert(Preheader);
11759 assert(isa<BranchInst>(Preheader->getTerminator()) &&
11760 "Preheader must terminate with unconditional branch");
11761 assert(Preheader->getSingleSuccessor() == Header &&
11762 "Preheader must jump to header");
11763
11764 assert(Header);
11765 assert(isa<BranchInst>(Header->getTerminator()) &&
11766 "Header must terminate with unconditional branch");
11767 assert(Header->getSingleSuccessor() == Cond &&
11768 "Header must jump to exiting block");
11769
11770 assert(Cond);
11771 assert(Cond->getSinglePredecessor() == Header &&
11772 "Exiting block only reachable from header");
11773
11774 assert(isa<BranchInst>(Cond->getTerminator()) &&
11775 "Exiting block must terminate with conditional branch");
11776 assert(size(successors(Cond)) == 2 &&
11777 "Exiting block must have two successors");
11778 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
11779 "Exiting block's first successor jump to the body");
11780 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
11781 "Exiting block's second successor must exit the loop");
11782
11783 assert(Body);
11784 assert(Body->getSinglePredecessor() == Cond &&
11785 "Body only reachable from exiting block");
11786 assert(!isa<PHINode>(Body->front()));
11787
11788 assert(Latch);
11789 assert(isa<BranchInst>(Latch->getTerminator()) &&
11790 "Latch must terminate with unconditional branch");
11791 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
11792 // TODO: To support simple redirecting of the end of the body code that has
11793 // multiple; introduce another auxiliary basic block like preheader and after.
11794 assert(Latch->getSinglePredecessor() != nullptr);
11795 assert(!isa<PHINode>(Latch->front()));
11796
11797 assert(Exit);
11798 assert(isa<BranchInst>(Exit->getTerminator()) &&
11799 "Exit block must terminate with unconditional branch");
11800 assert(Exit->getSingleSuccessor() == After &&
11801 "Exit block must jump to after block");
11802
11803 assert(After);
11804 assert(After->getSinglePredecessor() == Exit &&
11805 "After block only reachable from exit block");
11806 assert(After->empty() || !isa<PHINode>(After->front()));
11807
11808 Instruction *IndVar = getIndVar();
11809 assert(IndVar && "Canonical induction variable not found?");
11810 assert(isa<IntegerType>(IndVar->getType()) &&
11811 "Induction variable must be an integer");
11812 assert(cast<PHINode>(IndVar)->getParent() == Header &&
11813 "Induction variable must be a PHI in the loop header");
11814 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
11815 assert(
11816 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
11817 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
11818
11819 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
11820 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
11821 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
11822 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
11823 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
11824 ->isOne());
11825
11826 Value *TripCount = getTripCount();
11827 assert(TripCount && "Loop trip count not found?");
11828 assert(IndVar->getType() == TripCount->getType() &&
11829 "Trip count and induction variable must have the same type");
11830
11831 auto *CmpI = cast<CmpInst>(&Cond->front());
11832 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
11833 "Exit condition must be a signed less-than comparison");
11834 assert(CmpI->getOperand(0) == IndVar &&
11835 "Exit condition must compare the induction variable");
11836 assert(CmpI->getOperand(1) == TripCount &&
11837 "Exit condition must compare with the trip count");
11838#endif
11839}
11840
11842 Header = nullptr;
11843 Cond = nullptr;
11844 Latch = nullptr;
11845 Exit = nullptr;
11846}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Provides definitions for Target specific Grid Values.
static Value * removeASCastIfPresent(Value *V)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn, bool NoLoop)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true, bool Is64Bit=false)
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI, LLVMContext &Ctx, Loop *Loop, LoopInfo &LoopInfo, SmallVector< Metadata * > &LoopMDList)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType, bool NoLoop)
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasDistScheduleChunks)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause, bool HasDistScheduleChunks)
Determine the schedule type using schedule and ordering clause arguments.
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static void hoistNonEntryAllocasToEntryBlock(llvm::BasicBlock &Block)
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait, Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
std::unordered_set< BasicBlock * > BlockSet
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition blake3_impl.h:83
The Input class is used to parse a yaml document into in-memory structs and vectors.
Class for arbitrary precision integers.
Definition APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
LLVM_ABI bool isArrayAllocation() const
Return true if there is an allocation size parameter to the allocation instruction that is not 1.
void setAlignment(Align Align)
const Value * getArraySize() const
Get the number of elements allocated.
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
Class to represent array types.
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
LLVM_ABI std::pair< LoadInst *, AllocaInst * > EmitAtomicLoadLibcall(AtomicOrdering AO)
Definition Atomic.cpp:107
LLVM_ABI void EmitAtomicStoreLibcall(AtomicOrdering AO, Value *Source)
Definition Atomic.cpp:148
an instruction that atomically reads a memory location, combines it with another value,...
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:407
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
iterator end()
Definition BasicBlock.h:483
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:470
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
reverse_iterator rbegin()
Definition BasicBlock.h:486
bool empty() const
Definition BasicBlock.h:492
const Instruction & back() const
Definition BasicBlock.h:495
LLVM_ABI BasicBlock * splitBasicBlockBefore(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction and insert the new basic blo...
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:493
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
reverse_iterator rend()
Definition BasicBlock.h:488
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:397
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:668
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Conditional or Unconditional Branch instruction.
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
void setDoesNotThrow()
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Value * getArgOperand(unsigned i) const
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Class to represented the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
LLVM_ABI void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
void setLastIter(Value *IterVar)
Sets the last iteration variable for this loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
LLVM_ABI void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
LLVM_ABI BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
A cache for the CodeExtractor analysis.
Utility class for extracting code into a new function.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:537
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:720
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a targe...
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:568
Record of a variable value-assignment, aka a non instruction representation of the dbg....
A debug info location.
Definition DebugLoc.h:123
Analysis pass which computes a DominatorTree.
Definition Dominators.h:283
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
Represents either an error or a value T.
Definition ErrorOr.h:56
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
reference get()
Returns a reference to the stored T value.
Definition Error.h:582
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:639
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:168
const BasicBlock & getEntryBlock() const
Definition Function.h:809
Argument * arg_iterator
Definition Function.h:73
bool empty() const
Definition Function.h:859
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition Function.cpp:446
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:764
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:354
const Function & getFunction() const
Definition Function.h:166
iterator begin()
Definition Function.h:853
arg_iterator arg_begin()
Definition Function.h:868
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:357
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition Function.cpp:667
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:755
size_t arg_size() const
Definition Function.h:901
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:216
iterator end()
Definition Function.h:855
void setCallingConv(CallingConv::ID CC)
Definition Function.h:276
Argument * getArg(unsigned i) const
Definition Function.h:886
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition Value.h:603
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
InsertPoint - A saved insertion point.
Definition IRBuilder.h:291
BasicBlock * getBlock() const
Definition IRBuilder.h:306
bool isSet() const
Returns true if this insert point is set.
Definition IRBuilder.h:304
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:307
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
InsertPoint saveIP() const
Returns the current insert point.
Definition IRBuilder.h:311
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition IRBuilder.h:323
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:991
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1080
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1580
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1442
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:614
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type size() const
Definition MapVector.h:56
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
A tuple of MDNodes.
Definition Metadata.h:1760
iterator_range< op_iterator > operands()
Definition Metadata.h:1856
LLVM_ABI void addOperand(MDNode *M)
Class that manages information about offload code regions and data.
function_ref< void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)> OffloadDeviceGlobalVarEntryInfoActTy
Applies action Action on all registered entries.
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions NOTE: Currently not used as a part o...
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
LLVM_ABI void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
LLVM_ABI void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
LLVM_ABI void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
LLVM_ABI bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
LLVM_ABI void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
LLVM_ABI void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
LLVM_ABI void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry.
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
@ OMPTargetGlobalVarEntryLink
Mark the entry as a to declare target link.
@ OMPTargetGlobalVarEntryTo
Mark the entry as a to declare target.
@ OMPTargetGlobalVarEntryIndirectVTable
Mark the entry as a declare target indirect vtable.
function_ref< void(const TargetRegionEntryInfo &EntryInfo, const OffloadEntryInfoTargetRegion &)> OffloadTargetRegionEntryInfoActTy
Applies action Action on all registered entries.
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
LLVM_ABI bool empty() const
Return true if a there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
std::optional< bool > IsGPU
Flag for specifying if the compilation is done for an accelerator.
LLVM_ABI int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
std::optional< bool > OpenMPOffloadMandatory
Flag for specifying if offloading is mandatory.
LLVM_ABI void setHasRequiresReverseOffload(bool Value)
LLVM_ABI bool hasRequiresUnifiedSharedMemory() const
LLVM_ABI void setHasRequiresUnifiedSharedMemory(bool Value)
unsigned getDefaultTargetAS() const
LLVM_ABI bool hasRequiresDynamicAllocators() const
LLVM_ABI void setHasRequiresUnifiedAddress(bool Value)
LLVM_ABI void setHasRequiresDynamicAllocators(bool Value)
LLVM_ABI bool hasRequiresReverseOffload() const
LLVM_ABI bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
LLVM_ABI InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for 'omp ordered [threads | simd]'.
LLVM_ABI Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
LLVM_ABI FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
LLVM_ABI InsertPointOrErrorTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for 'omp cancel'.
std::function< Expected< Function * >(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
ReductionGenCBKind
Enum class for the ReductionGen CallBack type to be used.
LLVM_ABI CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
LLVM_ABI InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={}, bool Mergeable=false, Value *EventHandle=nullptr, Value *Priority=nullptr)
Generator for #omp taskloop
LLVM_ABI void createTaskyield(const LocationDescription &Loc)
Generator for 'omp taskyield'.
std::function< Error(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
LLVM_ABI void emitBranch(BasicBlock *Target)
LLVM_ABI Error emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective)
Generate control flow and cleanup for cancellation.
static LLVM_ABI void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
LLVM_ABI Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attribtues of the function Returns the FunctionID.
LLVM_ABI GlobalVariable * emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode)
Emit the kernel execution mode.
LLVM_ABI InsertPointOrErrorTy createDistribute(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for #omp distribute
LLVM_ABI void initialize()
Initialize the internal state, this will put structures types and potentially other helpers into the ...
LLVM_ABI void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
LLVM_ABI InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
LLVM_ABI InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic write for : X = Expr — Only Scalar data types.
LLVM_ABI void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
function_ref< MapInfosTy &(InsertPointTy CodeGenIP)> GenMapInfoCallbackTy
Callback type for creating the map infos for the kernel parameters.
LLVM_ABI Error emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
LLVM_ABI void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
function_ref< Error(InsertPointTy CodeGenIP, Value *IndVar)> LoopBodyGenCallbackTy
Callback type for loop body code generation.
LLVM_ABI InsertPointOrErrorTy emitScanReduction(const LocationDescription &Loc, ArrayRef< llvm::OpenMPIRBuilder::ReductionInfo > ReductionInfos, ScanInfo *ScanRedInfo)
This function performs the scan reduction of the values updated in the input phase.
LLVM_ABI void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
static LLVM_ABI std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
}
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
LLVM_ABI CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
LLVM_ABI Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy Here is the logic: if (Cond) { Th...
function_ref< InsertPointOrErrorTy( Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> TargetGenArgAccessorsCallbackTy
LLVM_ABI void emitUsed(StringRef Name, ArrayRef< llvm::WeakTrackingVH > List)
Emit the llvm.used metadata.
LLVM_ABI InsertPointOrErrorTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for 'omp single'.
LLVM_ABI InsertPointOrErrorTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for #omp teams
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be free'd.
LLVM_ABI void createTaskwait(const LocationDescription &Loc)
Generator for 'omp taskwait'.
LLVM_ABI CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
LLVM_ABI std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Get the create a name using the platform specific separators.
LLVM_ABI FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static LLVM_ABI void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
function_ref< Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> BodyGenCallbackTy
Callback type for body (=inner region) code generation.
LLVM_ABI void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
LLVM_ABI InsertPointOrErrorTy createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for 'omp parallel'.
LLVM_ABI omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
LLVM_ABI void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
LLVM_ABI void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the allocas instruction used in call to mapper functions.
LLVM_ABI Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
LLVM_ABI Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
LLVM_ABI Expected< SmallVector< llvm::CanonicalLoopInfo * > > createCanonicalScanLoops(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo)
Generator for the control flow structure of an OpenMP canonical loops if the parent directive has an ...
LLVM_ABI FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
function_ref< Error(Value *DeviceID, Value *RTLoc, IRBuilderBase::InsertPoint TargetTaskAllocaIP)> TargetTaskBodyCallbackTy
Callback type for generating the bodies of device directives that require outer target tasks (e....
Expected< MapInfosTy & > MapInfosOrErrorTy
LLVM_ABI void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
LLVM_ABI void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
function_ref< Expected< Function * >(unsigned int)> CustomMapperCallbackTy
LLVM_ABI InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs: — Only scalar data types cond-expr-stmt: x = x ordop expr ?
LLVM_ABI InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for 'omp ordered depend (source | sink)'.
LLVM_ABI InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original, Value &Inner, Value *&ReplVal)> PrivatizeCallbackTy
Callback type for variable privatization (think copy & default constructor).
LLVM_ABI bool isFinalized()
Check whether the finalize function has already run.
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> TargetBodyGenCallbackTy
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
LLVM_ABI std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
LLVM_ABI CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
LLVM_ABI void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
LLVM_ABI Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
LLVM_ABI InsertPointTy createTargetInit(const LocationDescription &Loc, const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs)
The omp target interface.
LLVM_ABI InsertPointOrErrorTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false)
Generator for 'omp reduction'.
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
LLVM_ABI CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
LLVM_ABI FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_Alloc.
LLVM_ABI void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
LLVM_ABI InsertPointOrErrorTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp section'.
function_ref< InsertPointOrErrorTy(InsertPointTy)> EmitFallbackCallbackTy
Callback function type for functions emitting the host fallback code that is executed when the kernel...
static LLVM_ABI TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, vfs::FileSystem &VFS, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number from.
LLVM_ABI void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
LLVM_ABI Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
LLVM_ABI InsertPointOrErrorTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp master'.
LLVM_ABI InsertPointOrErrorTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointOrErrorTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for 'omp target data'.
CallInst * createRuntimeFunctionCall(FunctionCallee Callee, ArrayRef< Value * > Args, StringRef Name="")
LLVM_ABI InsertPointOrErrorTy emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
LLVM_ABI InsertPointOrErrorTy createCancellationPoint(const LocationDescription &Loc, omp::Directive CanceledDirective)
Generator for 'omp cancellation point'.
LLVM_ABI FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI InsertPointOrErrorTy createScan(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< llvm::Value * > ScanVars, ArrayRef< llvm::Type * > ScanVarsType, bool IsInclusive, ScanInfo *ScanRedInfo)
This directive split and directs the control flow to input phase blocks or scan phase blocks based on...
LLVM_ABI CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
LLVM_ABI GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, std::optional< unsigned > AddressSpace={})
Gets (if variable with the given name already exist) or creates internal global variable with the spe...
LLVM_ABI GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
std::forward_list< ScanInfo > ScanInfos
Collection of owned ScanInfo objects that eventually need to be free'd.
static LLVM_ABI void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI Value * calculateCanonicalLoopTripCount(const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, const Twine &Name="loop")
Calculate the trip count of a canonical loop.
LLVM_ABI InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
LLVM_ABI void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
LLVM_ABI Error emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
LLVM_ABI Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
LLVM_ABI InsertPointOrErrorTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for 'omp critical'.
LLVM_ABI void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static LLVM_ABI unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
LLVM_ABI unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop, bool NoLoop=false, bool HasDistSchedule=false, Value *DistScheduleChunkSize=nullptr)
Modifies the canonical loop to be a workshare loop.
LLVM_ABI InsertPointOrErrorTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: — Only Scalar data types V = X; X = X BinOp Expr ,...
LLVM_ABI void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
LLVM_ABI void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
LLVM_ABI InsertPointOrErrorTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X For complex Operations: X = ...
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
LLVM_ABI InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
LLVM_ABI GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
LLVM_ABI CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
LLVM_ABI InsertPointOrErrorTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, CustomMapperCallbackTy CustomMapperCB, const SmallVector< DependData > &Dependencies, bool HasNowait=false, Value *DynCGroupMem=nullptr, omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback=omp::OMPDynGroupprivateFallbackType::Abort)
Generator for 'omp target'.
LLVM_ABI GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
LLVM_ABI void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
LLVM_ABI InsertPointOrErrorTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for 'omp masked'.
LLVM_ABI Expected< CanonicalLoopInfo * > createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
function_ref< Expected< InsertPointTy >( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DestPtr, Value *SrcPtr)> TaskDupCallbackTy
Callback type for task duplication function code generation.
LLVM_ABI Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
LLVM_ABI InsertPointOrErrorTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
LLVM_ABI Expected< Function * > emitUserDefinedMapper(function_ref< MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)> PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB)
Emit the user-defined mapper function.
LLVM_ABI FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
LLVM_ABI void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
BodyGenTy
Type of BodyGen to use for region codegen.
LLVM_ABI CanonicalLoopInfo * fuseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops)
Fuse a sequence of loops.
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions that's constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static LLVM_ABI std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write a bounds on teams for Kernel.
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
Expected< InsertPointTy > InsertPointOrErrorTy
Type used to represent an insertion point or an error value.
LLVM_ABI InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
LLVM_ABI InsertPointOrErrorTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for 'omp sections'.
LLVM_ABI InsertPointOrErrorTy emitTargetTask(TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, const TargetDataRTArgs &RTArgs, bool HasNoWait)
Generate a target-task for the target construct.
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
LLVM_ABI Expected< ScanInfo * > scanInfoInitialize()
Creates a ScanInfo object, allocates and returns the pointer.
LLVM_ABI InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic Read for : V = X — Only Scalar data types.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
LLVM_ABI void createFlush(const LocationDescription &Loc)
Generator for 'omp flush'.
LLVM_ABI Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
ScanInfo holds the information to assist in lowering of Scan reduction.
llvm::SmallDenseMap< llvm::Value *, llvm::Value * > * ScanBuffPtrs
Maps the private reduction variable to the pointer of the temporary buffer.
llvm::BasicBlock * OMPScanLoopExit
Exit block of loop body.
llvm::Value * IV
Keeps track of value of iteration variable for input/scan loop to be used for Scan directive lowering...
llvm::BasicBlock * OMPAfterScanBlock
Dominates the body of the loop before scan directive.
llvm::BasicBlock * OMPScanInit
Block before loop body where scan initializations are done.
llvm::BasicBlock * OMPBeforeScanBlock
Dominates the body of the loop before scan directive.
llvm::BasicBlock * OMPScanFinish
Block after loop body where scan finalizations are done.
llvm::Value * Span
Stores the span of canonical loop being lowered to be used for temporary buffer allocation or Finaliz...
bool OMPFirstScanLoop
If true, it indicates Input phase is lowered; else it indicates ScanPhase is lowered.
llvm::BasicBlock * OMPScanDispatch
Controls the flow to before or after scan blocks.
A vector that has set insertion semantics.
Definition SetVector.h:57
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:230
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition StringMap.h:260
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition StringRef.h:222
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:140
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:143
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:471
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:270
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:636
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:413
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:619
Type * getElementType(unsigned N) const
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1092
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1154
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1170
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
LLVM_ABI unsigned getIntegerBitWidth() const
LLVM_ABI Type * getStructElementType(unsigned N) const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:280
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:135
LLVM_ABI bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:151
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:403
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
User * user_back()
Definition Value.h:413
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:967
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:188
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition Value.cpp:713
bool use_empty() const
Definition Value.h:347
user_iterator user_end()
Definition Value.h:411
LLVM_ABI bool replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition Value.cpp:561
iterator_range< use_iterator > uses()
Definition Value.h:381
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A raw_ostream that writes to an SmallVector or SmallString.
The virtual file system interface.
llvm::ErrorOr< std::unique_ptr< llvm::MemoryBuffer > > getBufferForFile(const Twine &Name, int64_t FileSize=-1, bool RequiresNullTerminator=true, bool IsVolatile=false, bool IsText=true)
This is a convenience method that opens a file, gets its content and then closes the file.
virtual llvm::ErrorOr< Status > status(const Twine &Path)=0
Get the status of the entry at Path, if one exists.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
LLVM_ABI GlobalVariable * emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr, StringRef SectionName="llvm_offload_entries")
Create an offloading section struct used to register this global at runtime.
Definition Utility.cpp:86
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their defintion in openmp/runtime/src/kmp...
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
OMPDynGroupprivateFallbackType
The fallback types for the dyn_groupprivate clause.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
LLVM_ABI BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
hash_code hash_value(const FixedPointSemantics &Val)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
unsigned getPointerAddressSpace(const Type *T)
Definition SPIRVUtils.h:370
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:296
auto successors(const MachineBasicBlock *BB)
LLVM_ABI bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ABI BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, DebugLoc DL, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
std::string utostr(uint64_t X, bool isNeg=false)
void * PointerTy
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
FunctionAddr VTableAddr uintptr_t uintptr_t Version
Definition InstrProf.h:302
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
TargetTransformInfo TTI
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
@ Mul
Product of integers.
@ Add
Sum of integers.
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
LLVM_ABI void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch, DebugLoc DL)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto predecessors(const MachineBasicBlock *BB)
PointerUnion< const Value *, const PseudoSourceValue * > ValueType
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ Continue
Definition DWP.h:22
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0,...
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
a struct to pack relevant information while generating atomic Ops
Error mergeFiniBB(IRBuilderBase &Builder, BasicBlock *ExistingFiniBB)
For cases where there is an unavoidable existing finalization block (e.g.
Expected< BasicBlock * > getFiniBB(IRBuilderBase &Builder)
The basic block to which control should be transferred to implement the FiniCB.
Description of a LLVM-IR insertion point (IP) and a debug/source location (filename,...
This structure contains combined information generated for mappable clauses, including base pointers,...
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
LLVM_ABI void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
ReductionGenDataPtrPtrCBTy DataPtrPtrGen
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * MapTypesArrayEnd
The array of map types passed to the runtime library for the end of the region, or nullptr if there a...
Value * BasePointersArray
The array of base pointer passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entir...
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
Value * DynCGroupMem
The size of the dynamic shared memory.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback
The fallback mechanism for the shared memory.
Container to pass the default attributes with which a kernel must be launched, used to set kernel att...
Container to pass LLVM IR runtime values or constants related to the number of teams and threads with...
Value * DeviceID
Device ID value used in the kernel launch.
Value * MaxThreads
'parallel' construct 'num_threads' clause value, if present and it is an SPMD kernel.
Value * LoopTripCount
Total number of iterations of the SPMD or Generic-SPMD kernel or null if it is a generic kernel.
Data structure to contain the information needed to uniquely identify a target entry.
static LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static constexpr const char * KernelNamePrefix
The prefix used for kernel names.
static const Target * lookupTarget(StringRef TripleStr, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...