doxygen/VPlanTransforms_8cpp_source.html

//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

///

/// \file

/// This file implements a set of utility VPlan to VPlan transformations.

///

//===----------------------------------------------------------------------===//


#include "VPlanTransforms.h"

#include "VPRecipeBuilder.h"

#include "VPlan.h"

#include "VPlanAnalysis.h"

#include "VPlanCFG.h"

#include "VPlanDominatorTree.h"

#include "VPlanHelpers.h"

#include "VPlanPatternMatch.h"

#include "VPlanUtils.h"

#include "VPlanVerifier.h"

#include "llvm/ADT/APInt.h"

#include "llvm/ADT/PostOrderIterator.h"

#include "llvm/ADT/STLExtras.h"

#include "llvm/ADT/SetOperations.h"

#include "llvm/ADT/SetVector.h"

#include "llvm/ADT/SmallPtrSet.h"

#include "llvm/ADT/TypeSwitch.h"

#include "llvm/Analysis/IVDescriptors.h"

#include "llvm/Analysis/InstSimplifyFolder.h"

#include "llvm/Analysis/Loads.h"

#include "llvm/Analysis/LoopInfo.h"

#include "llvm/Analysis/MemoryLocation.h"

#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"

#include "llvm/Analysis/ScopedNoAliasAA.h"

#include "llvm/Analysis/VectorUtils.h"

#include "llvm/IR/Intrinsics.h"

#include "llvm/IR/MDBuilder.h"

#include "llvm/IR/Metadata.h"

#include "llvm/Support/Casting.h"

#include "llvm/Support/TypeSize.h"

#include "llvm/Transforms/Utils/LoopUtils.h"

#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"


using namespace llvm;

using namespace VPlanPatternMatch;

using namespace SCEVPatternMatch;


/// Replace the VPPhi and VPInstruction recipes in the vector loop region of
/// \p Plan that have an underlying IR instruction with widening (or
/// replicating) recipes. \p TLI is used to map calls to vector intrinsic IDs.
/// Returns false as soon as a call is found that cannot be converted; recipes
/// visited before that point have already been replaced.
bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
    VPlan &Plan, const TargetLibraryInfo &TLI) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getVectorLoopRegion());
  for (VPBasicBlock *Block : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    // A block without a parent is outside the region: stop the walk.
    if (!Block->getParent())
      break;

    // Visit all recipes up to (but excluding) the terminator, if there is one.
    VPRecipeBase *Terminator = Block->getTerminator();
    auto Stop = Terminator ? Terminator->getIterator() : Block->end();
    for (VPRecipeBase &Old :
         make_early_inc_range(make_range(Block->begin(), Stop))) {
      // Recipes without an underlying IR value are left untouched.
      VPValue *OldDef = Old.getVPSingleValue();
      Value *Underlying = OldDef->getUnderlyingValue();
      if (!Underlying)
        continue;
      auto *Inst = cast<Instruction>(Underlying);

      VPRecipeBase *Replacement = nullptr;
      if (auto *PhiR = dyn_cast<VPPhi>(&Old)) {
        auto *IRPhi = cast<PHINode>(PhiR->getUnderlyingValue());
        Replacement = new VPWidenPHIRecipe(
            PhiR->operands(), PhiR->getDebugLoc(), IRPhi->getName());
      } else if (auto *VPI = dyn_cast<VPInstruction>(&Old)) {
        assert(!isa<PHINode>(Inst) && "phis should be handled above");
        if (auto *Load = dyn_cast<LoadInst>(Inst)) {
          // Loads become (unmasked, non-consecutive) widened loads.
          Replacement = new VPWidenLoadRecipe(
              *Load, Old.getOperand(0), nullptr /*Mask*/,
              false /*Consecutive*/, *VPI, Old.getDebugLoc());
        } else if (auto *Store = dyn_cast<StoreInst>(Inst)) {
          // Stores become (unmasked, non-consecutive) widened stores.
          Replacement = new VPWidenStoreRecipe(
              *Store, Old.getOperand(1), Old.getOperand(0), nullptr /*Mask*/,
              false /*Consecutive*/, *VPI, Old.getDebugLoc());
        } else if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
          Replacement =
              new VPWidenGEPRecipe(GEP->getSourceElementType(), Old.operands(),
                                   *VPI, Old.getDebugLoc(), GEP);
        } else if (auto *Call = dyn_cast<CallInst>(Inst)) {
          Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(Call, &TLI);
          switch (VectorID) {
          case Intrinsic::not_intrinsic:
          // The noalias.scope.decl intrinsic declares a noalias scope that is
          // valid for a single iteration. Emitting it as a single-scalar
          // replicate would incorrectly extend the scope across multiple
          // original iterations packed into one vector iteration.
          // FIXME: If we want to vectorize this loop, then we have to drop
          // all the associated !alias.scope and !noalias.
          case Intrinsic::experimental_noalias_scope_decl:
            return false;

          // The intrinsics below are recognized by
          // getVectorIntrinsicIDForCall but are not widenable, so they are
          // emitted as replicate recipes instead.
          //
          // If the operand of llvm.assume holds before vectorization, it will
          // also hold per lane. llvm.pseudoprobe has to be duplicated per lane
          // for an accurate sample count. Neither is single-scalar.
          case Intrinsic::assume:
          case Intrinsic::pseudoprobe:
            Replacement = new VPReplicateRecipe(
                Call, Old.operands(), /*IsSingleScalar=*/false,
                /*Mask=*/nullptr, *VPI, *VPI, Old.getDebugLoc());
            break;
          case Intrinsic::lifetime_end:
          case Intrinsic::lifetime_start:
          case Intrinsic::sideeffect:
            Replacement = new VPReplicateRecipe(
                Call, Old.operands(), /*IsSingleScalar=*/true,
                /*Mask=*/nullptr, *VPI, *VPI, Old.getDebugLoc());
            break;

          default:
            // Everything else is widened; the last operand is dropped.
            Replacement = new VPWidenIntrinsicRecipe(
                *Call, VectorID, drop_end(Old.operands()), Call->getType(),
                VPIRFlags(*Call), *VPI, Call->getDebugLoc());
            break;
          }
        } else if (auto *Cast = dyn_cast<CastInst>(Inst)) {
          Replacement = new VPWidenCastRecipe(
              Cast->getOpcode(), Old.getOperand(0), Cast->getType(), Cast,
              VPIRFlags(*Cast), VPIRMetadata(*Cast));
        } else {
          Replacement = new VPWidenRecipe(*Inst, Old.operands(), *VPI, *VPI,
                                          Old.getDebugLoc());
        }
      } else {
        assert(isa<VPWidenIntOrFpInductionRecipe>(&Old) &&
               "inductions must be created earlier");
        continue;
      }

      // Put the new recipe in place of the old one and rewire the users.
      Replacement->insertBefore(&Old);
      const unsigned NumDefs = Replacement->getNumDefinedValues();
      assert(NumDefs <= 1 &&
             "Only recpies with zero or one defined values expected");
      if (NumDefs == 1)
        OldDef->replaceAllUsesWith(Replacement->getVPSingleValue());
      Old.eraseFromParent();
    }
  }
  return true;
}


/// Helper for extra no-alias checks via known-safe recipe and SCEV.


class SinkStoreInfo {

  const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;

  VPReplicateRecipe &GroupLeader;

  PredicatedScalarEvolution &PSE;

  const Loop &L;


  // Return true if \p A and \p B are known to not alias for all VFs in the

  // plan, checked via the distance between the accesses

  bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {

    if (A->getOpcode() != Instruction::Store ||

        B->getOpcode() != Instruction::Store)

      return false;


    VPValue *AddrA = A->getOperand(1);

    const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);

    VPValue *AddrB = B->getOperand(1);

    const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);

    if (isa<SCEVCouldNotCompute>(SCEVA) || isa<SCEVCouldNotCompute>(SCEVB))

      return false;


    const APInt *Distance;

    ScalarEvolution &SE = *PSE.getSE();

    if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))

      return false;


    const DataLayout &DL = SE.getDataLayout();

    Type *TyA = A->getOperand(0)->getScalarType();

    uint64_t SizeA = DL.getTypeStoreSize(TyA);

    Type *TyB = B->getOperand(0)->getScalarType();

    uint64_t SizeB = DL.getTypeStoreSize(TyB);


    // Use the maximum store size to ensure no overlap from either direction.

    // Currently only handles fixed sizes, as it is only used for

    // replicating VPReplicateRecipes.

    uint64_t MaxStoreSize = std::max(SizeA, SizeB);


    auto VFs = B->getParent()->getPlan()->vectorFactors();

    ElementCount MaxVF = *max_element(VFs, ElementCount::isKnownLT);

    if (MaxVF.isScalable())

      return false;

    return Distance->abs().uge(

        MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());

  }


public:


  SinkStoreInfo(const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes,

                VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE,

                const Loop &L)

      : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),

        L(L) {}


  /// Return true if \p R should be skipped during alias checking, either

  /// because it's in the exclude set or because no-alias can be proven via

  /// SCEV.


  bool shouldSkip(VPRecipeBase &R) const {

    auto *Store = dyn_cast<VPReplicateRecipe>(&R);

    return ExcludeRecipes.contains(&R) ||

           (Store && isNoAliasViaDistance(Store, &GroupLeader));

  }


};


/// Check if a memory operation doesn't alias with memory operations using

/// scoped noalias metadata, in blocks in the single-successor chain between \p

/// FirstBB and \p LastBB. If \p SinkInfo is std::nullopt, only recipes that may

/// write to memory are checked (for load hoisting). Otherwise recipes that both

/// read and write memory are checked, and SCEV is used to prove no-alias

/// between the group leader and other replicate recipes (for store sinking).

static bool


canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc,

                               VPBasicBlock *FirstBB, VPBasicBlock *LastBB,

                               std::optional<SinkStoreInfo> SinkInfo = {}) {

  bool CheckReads = SinkInfo.has_value();

  if (!MemLoc.AATags.Scope)

    return false;


  for (VPBasicBlock *VPBB :

       VPBlockUtils::blocksInSingleSuccessorChainBetween(FirstBB, LastBB)) {

    for (VPRecipeBase &R : *VPBB) {

      if (SinkInfo && SinkInfo->shouldSkip(R))

        continue;


      // Skip recipes that don't need checking.

      if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))

        continue;


      auto Loc = vputils::getMemoryLocation(R);

      if (!Loc)

        // Conservatively assume aliasing for memory operations without

        // location.

        return false;


      if (ScopedNoAliasAAResult::alias(*Loc, MemLoc) != AliasResult::NoAlias)

        return false;

    }

  }

  return true;

}


/// Return the scalar type of the value accessed by the replicate load or store
/// \p R. If \p IsLoad is true that is the type of \p R itself, otherwise it is
/// the type of operand 0 of \p R.
static Type *getLoadStoreValueType(VPReplicateRecipe *R, bool IsLoad) {
  VPValue *Accessed = R;
  if (!IsLoad)
    Accessed = R->getOperand(0);
  return Accessed->getScalarType();
}


/// Walk the vector loop region of \p Plan (deep traversal) and collect the
/// replicate recipes with opcode \p Opcode (Load or Store) that are accepted
/// by \p FilterFn. Recipes are grouped by the SCEV of their address and their
/// load/store value type; recipes whose address SCEV cannot be computed are
/// dropped. Each returned group is sorted by dominance, most dominating first.
template <unsigned Opcode>
static SmallVector<SmallVector<VPReplicateRecipe *, 4>>
collectGroupedReplicateMemOps(
    VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
    function_ref<bool(VPReplicateRecipe *)> FilterFn) {
  static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
                "Only Load and Store opcodes supported");
  constexpr bool IsLoad = (Opcode == Instruction::Load);
  // For loads, operand 0 is address; for stores, operand 1 is address.
  constexpr unsigned AddrOpIdx = IsLoad ? 0 : 1;

  using GroupKey = std::pair<const SCEV *, const Type *>;
  SmallDenseMap<GroupKey, SmallVector<VPReplicateRecipe *, 4>> Buckets;
  auto DeepBlocks =
      vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
  for (VPBasicBlock *Block :
       VPBlockUtils::blocksOnly<VPBasicBlock>(DeepBlocks)) {
    for (VPRecipeBase &R : *Block) {
      auto *MemOp = dyn_cast<VPReplicateRecipe>(&R);
      const bool IsCandidate =
          MemOp && MemOp->getOpcode() == Opcode && FilterFn(MemOp);
      if (!IsCandidate)
        continue;

      VPValue *Addr = MemOp->getOperand(AddrOpIdx);
      const Type *ValueTy = getLoadStoreValueType(MemOp, IsLoad);
      const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
      if (isa<SCEVCouldNotCompute>(AddrSCEV))
        continue;
      Buckets[{AddrSCEV, ValueTy}].push_back(MemOp);
    }
  }

  // Sort mem ops by dominance order, with earliest (most dominating) first.
  auto Groups = to_vector(Buckets.values());
  VPDominatorTree VPDT(Plan);
  auto DominatesFirst = [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
    return VPDT.properlyDominates(A, B);
  };
  for (auto &Group : Groups)
    stable_sort(Group, DominatesFirst);
  return Groups;
}


/// Sink replicate and scalar-IV-steps recipes that feed recipes in the 'then'
/// block of triangular replicate regions into that block, following operand
/// chains via a worklist. If users outside the target block only need the
/// first lane, a clone of the recipe is kept for them. Returns true if any
/// recipe was moved.
static bool sinkScalarOperands(VPlan &Plan) {
  const bool ScalarVFOnly = Plan.hasScalarVFOnly();
  SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;

  // Queue the recipe defining Op for sinking into SinkTo, if it qualifies.
  auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
                                        VPBasicBlock *SinkTo, VPValue *Op) {
    // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
    // for now.
    auto *Def = dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
    if (!Def || !isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Def))
      return;

    // Nothing to do if it is already in place or must not be moved.
    const bool AlreadyInPlace = Def->getParent() == SinkTo;
    if (AlreadyInPlace ||
        vputils::cannotHoistOrSinkRecipe(*Def, /*Sinking=*/true))
      return;

    // Single-scalar replicate recipes are only sunk for scalar-VF-only plans.
    auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
    if (RepR && !ScalarVFOnly && RepR->isSingleScalar())
      return;

    WorkList.insert({SinkTo, Def});
  };

  // First, collect the operands of all recipes in replicate blocks as seeds
  // for sinking.
  auto DeepBlocks = vp_depth_first_deep(Plan.getEntry());
  for (VPRegionBlock *Region :
       VPBlockUtils::blocksOnly<VPRegionBlock>(DeepBlocks)) {
    VPBasicBlock *RegionEntry = Region->getEntryBasicBlock();
    if (!Region->isReplicator() || RegionEntry->getSuccessors().size() != 2)
      continue;
    // The first successor must lead straight to the region's exiting block.
    auto *ThenBB = cast<VPBasicBlock>(RegionEntry->getSuccessors().front());
    if (ThenBB->getSingleSuccessor() != Region->getExitingBasicBlock())
      continue;
    for (auto &Recipe : *ThenBB)
      for (VPValue *Op : Recipe.operands())
        InsertIfValidSinkCandidate(ThenBB, Op);
  }

  // Try to sink each replicate or scalar IV steps recipe in the worklist. The
  // worklist may grow while it is processed, hence the index-based loop.
  bool Changed = false;
  for (unsigned Idx = 0; Idx != WorkList.size(); ++Idx) {
    VPBasicBlock *SinkTo = WorkList[Idx].first;
    VPSingleDefRecipe *SinkCandidate = WorkList[Idx].second;

    // All recipe users of SinkCandidate must be in the same block SinkTo or
    // all users outside of SinkTo must only use the first lane of
    // SinkCandidate. In the latter case, we need to duplicate SinkCandidate.
    auto IsOutsideSinkTo = [SinkTo](VPUser *U) {
      return cast<VPRecipeBase>(U)->getParent() != SinkTo;
    };
    auto UsersOutsideSinkTo =
        make_filter_range(SinkCandidate->users(), IsOutsideSinkTo);
    if (!all_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
          return U->usesFirstLaneOnly(SinkCandidate);
        }))
      continue;

    if (!UsersOutsideSinkTo.empty()) {
      // Duplication is not done for scalar-VF-only plans.
      if (ScalarVFOnly)
        continue;

      VPSingleDefRecipe *Clone = nullptr;
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
        // TODO: Handle converting to uniform recipes as separate transform,
        // then cloning should be sufficient here.
        Instruction *UnderlyingInstr = SinkCandidate->getUnderlyingInstr();
        Clone = new VPReplicateRecipe(UnderlyingInstr,
                                      SinkCandidate->operands(), true,
                                      nullptr /*Mask*/, *RepR, *RepR);
        // TODO: add ".cloned" suffix to name of Clone's VPValue.
      } else {
        Clone = SinkCandidate->clone();
      }

      // The clone stays behind and serves the users outside of SinkTo.
      Clone->insertBefore(SinkCandidate);
      SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
        return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
      });
    }

    SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
    // The operands of the moved recipe may now be sinkable as well.
    for (VPValue *Op : SinkCandidate->operands())
      InsertIfValidSinkCandidate(SinkTo, Op);
    Changed = true;
  }
  return Changed;
}


/// Return the mask (operand 0) of the VPBranchOnMaskRecipe in the entry block
/// of \p R. Returns nullptr unless the entry is a VPBasicBlock that contains
/// exactly one recipe and that recipe is a VPBranchOnMaskRecipe.
static VPValue *getPredicatedMask(VPRegionBlock *R) {
  auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
  if (!EntryBB || EntryBB->size() != 1)
    return nullptr;

  if (auto *BranchOnMask = dyn_cast<VPBranchOnMaskRecipe>(&*EntryBB->begin()))
    return BranchOnMask->getOperand(0);
  return nullptr;
}


/// If \p R is a triangle region, return the 'then' block of the triangle,
/// i.e. the one successor of the entry block whose single successor is the
/// other successor of the entry block. Returns nullptr otherwise.
static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
  auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
  if (EntryBB->getNumSuccessors() != 2)
    return nullptr;

  const auto &Succs = EntryBB->getSuccessors();
  auto *First = dyn_cast<VPBasicBlock>(Succs[0]);
  auto *Second = dyn_cast<VPBasicBlock>(Succs[1]);
  if (!(First && Second))
    return nullptr;

  // Exactly one outgoing edge in total: one block falls through to the other,
  // which has no successors.
  if (First->getNumSuccessors() + Second->getNumSuccessors() != 1)
    return nullptr;

  if (First->getSingleSuccessor() == Second)
    return First;
  return Second->getSingleSuccessor() == First ? Second : nullptr;
}


// Merge replicate regions into their successor region, if a replicate region
// is connected to a successor replicate region with the same predicate by a
// single, empty VPBasicBlock. Returns true if at least one region was merged.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
  // Collect the candidate regions up front, to avoid iterator invalidation
  // issues while merging regions. A candidate is a replicate region followed
  // by an empty block, followed by another replicate region with the same
  // mask.
  SmallVector<VPRegionBlock *, 8> WorkList;
  for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    if (!Region1->isReplicator())
      continue;

    auto *MiddleBasicBlock =
        dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
    if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
      continue;

    auto *Region2 =
        dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
    if (!Region2 || !Region2->isReplicator())
      continue;

    VPValue *Mask1 = getPredicatedMask(Region1);
    VPValue *Mask2 = getPredicatedMask(Region2);
    const bool SameMask = Mask1 && Mask1 == Mask2;
    if (!SameMask)
      continue;

    assert(Mask1 && Mask2 && "both region must have conditions");
    WorkList.push_back(Region1);
  }

  // Move recipes from Region1 to its successor region, if both are triangles.
  SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
  for (VPRegionBlock *Region1 : WorkList) {
    if (TransformedRegions.contains(Region1))
      continue;

    auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
    auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());

    VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
    VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
    if (!(Then1 && Then2))
      continue;

    // Note: No fusion-preventing memory dependencies are expected in either
    // region. Such dependencies should be rejected during earlier dependence
    // checks, which guarantee accesses can be re-ordered for vectorization.
    //
    // Move recipes to the successor region. Walking in reverse and always
    // inserting at the front keeps their relative order.
    for (VPRecipeBase &Moved : make_early_inc_range(reverse(*Then1)))
      Moved.moveBefore(*Then2, Then2->getFirstNonPhi());

    auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
    auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());

    // Move VPPredInstPHIRecipes from the merge block to the successor region's
    // merge block. Update all users inside the successor region to use the
    // original values.
    for (VPRecipeBase &PredPhi : make_early_inc_range(reverse(*Merge1))) {
      VPValue *PredInst1 = cast<VPPredInstPHIRecipe>(&PredPhi)->getOperand(0);
      VPValue *PredPhiV = PredPhi.getVPSingleValue();
      PredPhiV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
        return cast<VPRecipeBase>(&U)->getParent() == Then2;
      });

      // Remove phi recipes that are unused after merging the regions; move
      // the remaining ones.
      if (PredPhiV->getNumUsers() == 0)
        PredPhi.eraseFromParent();
      else
        PredPhi.moveBefore(*Merge2, Merge2->begin());
    }

    // Remove the dead recipes in Region1's entry block.
    for (VPRecipeBase &Dead :
         make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
      Dead.eraseFromParent();

    // Finally, remove the first region: its predecessors are connected
    // directly to the middle block.
    for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
      VPBlockUtils::disconnectBlocks(Pred, Region1);
      VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
    }
    VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
    TransformedRegions.insert(Region1);
  }

  return !TransformedRegions.empty();
}


/// Build a triangular if-then replicate region for the predicated recipe \p
/// PredRecipe: an entry block branching on the recipe's mask, an ".if" block
/// holding an unmasked copy of the recipe, and a ".continue" block that holds
/// a VPPredInstPHIRecipe if \p PredRecipe has users. The new region gets \p
/// ParentRegion as parent; \p PredRecipe is erased. Returns the new region.
static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
                                            VPRegionBlock *ParentRegion,
                                            VPlan &Plan) {
  Instruction *Instr = PredRecipe->getUnderlyingInstr();
  // Blocks are named "pred.<opcode>.{entry,if,continue}".
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");

  // Entry block: branch on the mask, using the debug location of the recipe
  // defining the mask when there is one.
  auto *BlockInMask = PredRecipe->getMask();
  auto *MaskDef = BlockInMask->getDefiningRecipe();
  DebugLoc BranchDL =
      MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown();
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask, BranchDL);
  auto *Entry =
      Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);

  // Replace predicated replicate recipe with a replicate recipe without a
  // mask but in the replicate region. The last operand is dropped.
  auto *Unmasked = new VPReplicateRecipe(
      PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
      PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
      PredRecipe->getDebugLoc());
  auto *Pred = Plan.createVPBasicBlock(Twine(RegionName) + ".if", Unmasked);
  auto *Exiting = Plan.createVPBasicBlock(Twine(RegionName) + ".continue");
  VPRegionBlock *Region =
      Plan.createReplicateRegion(Entry, Exiting, RegionName);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  Region->setParent(ParentRegion);
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
  VPBlockUtils::connectBlocks(Pred, Exiting);

  // Users of the predicated recipe are redirected to a phi in the exiting
  // block.
  if (PredRecipe->getNumUsers() != 0) {
    auto *PHIRecipe =
        new VPPredInstPHIRecipe(Unmasked, Unmasked->getDebugLoc());
    Exiting->appendRecipe(PHIRecipe);
    PredRecipe->replaceAllUsesWith(PHIRecipe);
  }
  PredRecipe->eraseFromParent();
  return Region;
}


/// Wrap every predicated VPReplicateRecipe of \p Plan in its own replicate
/// region. The block holding the recipe is split at the recipe and the new
/// region is inserted on the edge between the two halves.
static void addReplicateRegions(VPlan &Plan) {
  // Collect the predicated recipes first; the CFG is modified below.
  SmallVector<VPReplicateRecipe *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    for (VPRecipeBase &R : *VPBB) {
      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
      if (RepR && RepR->isPredicated())
        WorkList.push_back(RepR);
    }
  }

  unsigned BBNum = 0;
  for (VPReplicateRecipe *RepR : WorkList) {
    VPBasicBlock *CurrentBlock = RepR->getParent();
    VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());

    // Name the split-off block after the original IR block plus a running
    // number, or give it an empty name if the IR block is unnamed.
    BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
    if (OrigBB->hasName())
      SplitBlock->setName(OrigBB->getName() + "." + Twine(BBNum++));
    else
      SplitBlock->setName("");

    VPRegionBlock *Region =
        createReplicateRegion(RepR, CurrentBlock->getParent(), Plan);
    VPBlockUtils::insertOnEdge(CurrentBlock, SplitBlock, Region);

    // If the split block was the exiting block of its parent region, the
    // split-off tail takes over that role.
    VPRegionBlock *ParentRegion = Region->getParent();
    if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
      ParentRegion->setExiting(SplitBlock);
  }
}


/// Fold each VPBasicBlock inside a region into its single predecessor, if
/// that predecessor is a VPBasicBlock (but not a VPIRBasicBlock) with a single
/// successor. Returns true if any block was queued for merging.
bool VPlanTransforms::mergeBlocksIntoPredecessors(VPlan &Plan) {
  SmallVector<VPBasicBlock *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    // Don't fold the blocks in the skeleton of the Plan into their single
    // predecessors for now.
    // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
    if (!VPBB->getParent())
      continue;

    auto *Pred = dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
    if (Pred && Pred->getNumSuccessors() == 1 && !isa<VPIRBasicBlock>(Pred))
      WorkList.push_back(VPBB);
  }

  for (VPBasicBlock *VPBB : WorkList) {
    auto *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());

    // Append all recipes to the predecessor.
    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
      R.moveBefore(*PredVPBB, PredVPBB->end());
    VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);

    // The predecessor takes over as exiting block of the parent region, if
    // the merged block had that role.
    VPRegionBlock *ParentRegion = VPBB->getParent();
    if (ParentRegion && ParentRegion->getExiting() == VPBB)
      ParentRegion->setExiting(PredVPBB);
    VPBlockUtils::transferSuccessors(VPBB, PredVPBB);
    // VPBB is now dead and will be cleaned up when the plan gets destroyed.
  }
  return !WorkList.empty();
}


/// Create replicate regions for the predicated replicate recipes of \p Plan
/// and then repeatedly run the sinking and merging simplifications until none
/// of them reports a change.
void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
  // Convert masked VPReplicateRecipes to if-then region blocks.
  addReplicateRegions(Plan);

  // All three transforms run in every round; another round is started if any
  // of them changed the plan.
  bool Changed;
  do {
    Changed = sinkScalarOperands(Plan);
    Changed |= mergeReplicateRegionsIntoSuccessors(Plan);
    Changed |= mergeBlocksIntoPredecessors(Plan);
  } while (Changed);
}


/// Remove redundant casts of inductions.
///
/// Such redundant casts are casts of induction variables that can be ignored,
/// because we already proved that the casted phi is equal to the uncasted phi
/// in the vectorized loop. There is no need to vectorize the cast - the same
/// value can be used for both the phi and casts in the vector loop.
static void removeRedundantInductionCasts(VPlan &Plan) {
  // Find the single-def recipe among the users of Def whose underlying value
  // is IRCast, or return nullptr if there is none.
  auto FindCastUser = [](VPValue *Def,
                         Instruction *IRCast) -> VPSingleDefRecipe * {
    for (VPUser *U : Def->users())
      if (auto *Cand = dyn_cast<VPSingleDefRecipe>(U);
          Cand && Cand->getUnderlyingValue() == IRCast)
        return Cand;
    return nullptr;
  };

  for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (!IV || IV->getTruncInst())
      continue;

    // A sequence of IR Casts has potentially been recorded for IV, which
    // *must be bypassed* when the IV is vectorized, because the vectorized IV
    // will produce the desired casted value. This sequence forms a def-use
    // chain and is provided in reverse order, ending with the cast that uses
    // the IV phi. Search for the recipe of the last cast in the chain and
    // replace it with the original IV. Note that only the final cast is
    // expected to have users outside the cast-chain and the dead casts left
    // over will be cleaned up later.
    ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
    VPValue *ChainEnd = IV;
    for (Instruction *IRCast : reverse(Casts)) {
      VPSingleDefRecipe *NextCast = FindCastUser(ChainEnd, IRCast);
      // A cast recipe in the chain may have been removed by earlier DCE.
      if (!NextCast)
        break;
      ChainEnd = NextCast;
    }

    // Redirect the users of the last cast that was found to the IV itself.
    if (ChainEnd != IV)
      ChainEnd->replaceAllUsesWith(IV);
  }
}


/// Create scalar IV steps for an induction of kind \p Kind that starts at \p
/// StartV and advances by \p Step, derived from the canonical IV of the vector
/// loop region of \p Plan. If \p TruncI is set, the derived IV is truncated to
/// the type of \p TruncI. If the type of \p Step differs from the resulting IV
/// type, \p Step is truncated in the block preceding the loop header. Recipes
/// are created via \p Builder with debug location \p DL.
static VPScalarIVStepsRecipe *
createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
                    Instruction::BinaryOps InductionOpcode,
                    FPMathOperator *FPBinOp, Instruction *TruncI,
                    VPIRValue *StartV, VPValue *Step, DebugLoc DL,
                    VPBuilder &Builder) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
  VPValue *CanonicalIV = LoopRegion->getCanonicalIV();
  VPSingleDefRecipe *BaseIV =
      Builder.createDerivedIV(Kind, FPBinOp, StartV, CanonicalIV, Step);

  // Truncate base induction if needed.
  Type *IVTy = BaseIV->getScalarType();
  if (TruncI) {
    Type *NarrowTy = TruncI->getType();
    assert(IVTy->getScalarSizeInBits() > NarrowTy->getScalarSizeInBits() &&
           "Not truncating.");
    assert(IVTy->isIntegerTy() && "Truncation requires an integer type");
    BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, NarrowTy, DL);
    IVTy = NarrowTy;
  }

  // Truncate step if needed. The cast is emitted in the single hierarchical
  // predecessor of the header; the guard restores the insert point afterwards.
  Type *StepTy = Step->getScalarType();
  if (StepTy != IVTy) {
    assert(StepTy->getScalarSizeInBits() > IVTy->getScalarSizeInBits() &&
           "Not truncating.");
    assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
    auto *VecPreheader =
        cast<VPBasicBlock>(HeaderVPBB->getSingleHierarchicalPredecessor());
    VPBuilder::InsertPointGuard Guard(Builder);
    Builder.setInsertPoint(VecPreheader);
    Step = Builder.createScalarCast(Instruction::Trunc, Step, IVTy, DL);
  }

  return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
                                     &Plan.getVF(), DL);
}


/// Replace the VPWidenCanonicalIVRecipe user of the canonical IV in \p Plan,
/// if there is one, with a cheaper alternative: scalar IV steps when only
/// scalar lanes are needed, an existing canonical wide induction from the loop
/// header, or a newly created wide induction. The new induction is only
/// created if the cost checks based on \p TTI, \p CostKind, \p VF and \p UF
/// pass.
void VPlanTransforms::replaceWideCanonicalIVWithWideIV(
    VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI,
    TargetTransformInfo::TargetCostKind CostKind, ElementCount VF, unsigned UF,
    const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  if (!LoopRegion)
    return;

  auto *WideCanIV =
      findUserOf<VPWidenCanonicalIVRecipe>(LoopRegion->getCanonicalIV());
  if (!WideCanIV)
    return;

  // Redirect all users of the wide canonical IV to New and erase it.
  auto ReplaceWideCanIVWith = [WideCanIV](VPValue *New) {
    WideCanIV->replaceAllUsesWith(New);
    WideCanIV->eraseFromParent();
  };

  Type *CanIVTy = LoopRegion->getCanonicalIVType();

  // Replace the wide canonical IV with a scalar-iv-steps over the canonical
  // IV.
  if (Plan.hasScalarVFOnly() || vputils::onlyFirstLaneUsed(WideCanIV)) {
    VPBuilder Builder(WideCanIV);
    VPValue *Steps = createScalarIVSteps(
        Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
        nullptr, Plan.getZero(CanIVTy), Plan.getConstantInt(CanIVTy, 1),
        WideCanIV->getDebugLoc(), Builder);
    ReplaceWideCanIVWith(Steps);
    return;
  }

  if (vputils::onlyScalarValuesUsed(WideCanIV))
    return;

  // If a canonical VPWidenIntOrFpInductionRecipe already produces vector lanes
  // in the header, reuse it instead of introducing another wide induction phi.
  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
  for (VPRecipeBase &Phi : Header->phis()) {
    VPWidenIntOrFpInductionRecipe *ExistingIV;
    if (!match(&Phi, m_CanonicalWidenIV(ExistingIV)))
      continue;
    // The reused wide IV feeds the header mask, whose lanes may extend past
    // the trip count; drop flags that only hold inside the scalar loop.
    ExistingIV->dropPoisonGeneratingFlags();
    ReplaceWideCanIVWith(ExistingIV);
    return;
  }

  // Introduce a new VPWidenIntOrFpInductionRecipe if profitable: give up if a
  // phi costs more than a broadcast.
  auto *VecTy = VectorType::get(CanIVTy, VF);
  InstructionCost BroadcastCost = TTI.getShuffleCost(
      TargetTransformInfo::SK_Broadcast, VecTy, VecTy, {}, CostKind);
  InstructionCost PHICost = TTI.getCFInstrCost(Instruction::PHI, CostKind);
  if (PHICost > BroadcastCost)
    return;

  // Bail out if the additional wide induction phi increases the expected
  // spill cost. The per-class usage is scaled by UF before the comparison.
  VPRegisterUsage UnrolledBase =
      calculateRegisterUsageForPlan(Plan, VF, TTI, ValuesToIgnore)[0];
  for (unsigned &NumUsers : make_second_range(UnrolledBase.MaxLocalUsers))
    NumUsers *= UF;
  unsigned RegClass = TTI.getRegisterClassForType(/*Vector=*/true, VecTy);
  VPRegisterUsage WithNewIV = UnrolledBase;
  WithNewIV.MaxLocalUsers[RegClass] += TTI.getRegUsageForType(VecTy);
  if (WithNewIV.spillCost(TTI, CostKind) >
      UnrolledBase.spillCost(TTI, CostKind))
    return;

  // Create the new wide induction (start 0, step 1) at the first non-phi
  // position of the header and switch all users over to it.
  InductionDescriptor ID =
      InductionDescriptor::getCanonicalIntInduction(CanIVTy, SE);
  VPValue *StepV = Plan.getConstantInt(CanIVTy, 1);
  auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
      /*IV=*/nullptr, Plan.getZero(CanIVTy), StepV, &Plan.getVF(), ID,
      WideCanIV->getNoWrapFlags(), WideCanIV->getDebugLoc());
  NewWideIV->insertBefore(&*Header->getFirstNonPhi());
  ReplaceWideCanIVWith(NewWideIV);
}


/// Decide whether \p R can be erased. A predicated replicate recipe wrapping
/// an llvm.assume call is always treated as dead; any other recipe is dead
/// only if it has no side effects and none of its defined values has a user.
static bool isDeadRecipe(VPRecipeBase &R) {
  // Conditional assumes are dropped unconditionally, as their conditions may
  // be flattened.
  if (auto *Rep = dyn_cast<VPReplicateRecipe>(&R)) {
    if (Rep->isPredicated() && match(Rep, m_Intrinsic<Intrinsic::assume>()))
      return true;
  }

  // Recipes with side effects must be kept even when nothing uses them.
  if (R.mayHaveSideEffects())
    return false;

  // Dead exactly when no defined value still has a user.
  return none_of(R.definedValues(),
                 [](VPValue *Def) { return Def->getNumUsers() != 0; });
}


void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
  PostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> POT(
      Plan.getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(POT)) {
    // Visit each block bottom-up, so that erasing a recipe can expose the
    // recipes feeding it as dead before they are visited.
    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
      if (isDeadRecipe(R)) {
        R.eraseFromParent();
        continue;
      }

      // Otherwise look for a two-operand VPPhi whose only user is the recipe
      // defining its second operand, with that operand in turn used only by
      // the phi: such a phi <-> update cycle is dead as a whole.
      VPValue *Start = nullptr;
      VPValue *Incoming = nullptr;
      if (!match(&R, m_VPPhi(m_VPValue(Start), m_VPValue(Incoming))))
        continue;

      auto *PhiR = cast<VPPhi>(&R);
      VPUser *OnlyUser = PhiR->getSingleUser();
      bool IsDeadCycle = OnlyUser &&
                         OnlyUser == Incoming->getDefiningRecipe() &&
                         Incoming->getNumUsers() == 1;
      if (!IsDeadCycle)
        continue;

      // Reroute the update to the start value, then drop the phi followed by
      // the update.
      PhiR->replaceAllUsesWith(Start);
      PhiR->eraseFromParent();
      Incoming->getDefiningRecipe()->eraseFromParent();
    }
  }
}


/// Collect the transitive users of \p V. The result is in discovery order:
/// the direct users of \p V come first, followed by the users of the values
/// defined by already collected users. Every user appears once.
///
/// Header phi recipes are recorded as users, but the walk does not continue
/// through them: the users of a header phi consume the phi's value rather
/// than \p V directly, so looking through the phi would follow the def-use
/// chain around the loop and collect users that are only related to \p V
/// via the phi.
static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
  SetVector<VPUser *> Users(llvm::from_range, V->users());
  // Users grows while it is being scanned, hence the index-based loop that
  // re-reads size() on every iteration.
  for (unsigned I = 0; I != Users.size(); ++I) {
    VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]);
    // Stop at header phis (see the function comment).
    if (isa<VPHeaderPHIRecipe>(Cur))
      continue;
    for (VPValue *DefV : Cur->definedValues())
      Users.insert_range(DefV->users());
  }
  return Users.takeVector();
}


/// Build the scalar form of \p PtrIV using \p Builder and return it: a PtrAdd
/// of the induction's start value and scalar IV steps that count from zero
/// with the induction's step operand. Intended for pointer inductions that
/// only need to produce scalar values. \p PtrIV itself is not modified.
static VPValue *
scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV,
                                 VPlan &Plan, VPBuilder &Builder) {
  // The offsets are an integer add-induction in the type of the step,
  // starting at zero.
  Type *StepTy = PtrIV->getInductionDescriptor().getStep()->getType();
  VPIRValue *Zero = Plan.getZero(StepTy);
  VPValue *Stride = PtrIV->getOperand(1);
  VPScalarIVStepsRecipe *Offsets = createScalarIVSteps(
      Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
      nullptr, Zero, Stride, PtrIV->getDebugLoc(), Builder);

  // Apply the offsets to the start pointer.
  return Builder.createPtrAdd(PtrIV->getStartValue(), Offsets,
                              PtrIV->getDebugLoc(), "next.gep");
}


/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
/// VPWidenPointerInductionRecipe will generate vectors only. If some users
/// require vectors while others require scalars, the scalar uses need to
/// extract the scalars from the generated vectors (Note that this is different
/// to how int/fp inductions are handled). Legalize extract-from-ends using
/// uniform VPReplicateRecipe of wide inductions to use regular
/// VPReplicateRecipe, so the correct end value is available. Also optimize
/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
/// providing them scalar steps built on the canonical scalar IV and update the
/// original IV's users. This is an optional optimization to reduce the needs of
/// vector extracts.
///
/// The function visits every wide induction phi in the header of \p Plan's
/// vector loop region and performs, in order: (1) narrowing of transitive
/// users to single-scalar replicate recipes, (2) scalarization of pointer
/// inductions, (3) introduction of scalar steps for int/fp inductions.
static void legalizeAndOptimizeInductions(VPlan &Plan) {
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
  // All recipes created through Builder are placed at the first non-phi
  // position of the header.
  VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    // Only wide inductions are of interest; skip all other header phis.
    auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
    if (!PhiR)
      continue;

    // Try to narrow wide and replicating recipes to uniform recipes, based on
    // VPlan analysis.
    // TODO: Apply to all recipes in the future, to replace legacy uniformity
    // analysis.
    // The user list is snapshotted up front and walked in reverse discovery
    // order; the use lists are modified inside the loop.
    auto Users = collectUsersRecursively(PhiR);
    for (VPUser *U : reverse(Users)) {
      auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
      auto *RepR = dyn_cast<VPReplicateRecipe>(U);
      // Skip recipes that shouldn't be narrowed: anything that is not a
      // replicate or widen recipe, has no users, has no underlying IR value,
      // or is a replicate recipe that is already single-scalar or predicated.
      if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
          Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
          (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
        continue;

      // Skip recipes that may have other lanes than their first used.
      if (!vputils::isSingleScalar(Def) && !vputils::onlyFirstLaneUsed(Def))
        continue;

      // TODO: Support scalarizing ExtractValue.
      if (match(Def,
                m_Binary<Instruction::ExtractValue>(m_VPValue(), m_VPValue())))
        continue;

      // Create an unmasked, uniform replicate recipe with the same underlying
      // instruction, operands and flags directly after Def and move all users
      // over to it. Def itself is not erased here.
      auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
                                          Def->operands(), /*IsUniform*/ true,
                                          /*Mask*/ nullptr, /*Flags*/ *Def);
      Clone->insertAfter(Def);
      Def->replaceAllUsesWith(Clone);
    }

    // Replace wide pointer inductions which have only their scalars used by
    // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
    if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
      // Keep the wide pointer induction if the plan has vector VFs and not
      // only scalars are generated for it.
      if (!Plan.hasScalarVFOnly() &&
          !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
        continue;

      VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
      PtrIV->replaceAllUsesWith(PtrAdd);
      continue;
    }

    // Replace widened induction with scalar steps for users that only use
    // scalars.
    // Any wide induction that is not a pointer induction is expected to be an
    // int/fp induction here (cast<> asserts otherwise).
    auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    // With vector VFs, nothing to do unless at least one direct user uses
    // scalars of the induction.
    if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
          return U->usesScalars(WideIV);
        }))
      continue;

    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
        Plan, ID.getKind(), ID.getInductionOpcode(),
        dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
        WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
        WideIV->getDebugLoc(), Builder);

    // Update scalar users of IV to use Step instead.
    if (!HasOnlyVectorVFs) {
      // Scalar-only plan: every user can use the scalar steps.
      assert(!Plan.hasScalableVF() &&
             "plans containing a scalar VF cannot also include scalable VFs");
      WideIV->replaceAllUsesWith(Steps);
    } else {
      // Vector plan: only redirect users that use scalars of the induction;
      // for scalable VFs, only users that use just the first lane.
      bool HasScalableVF = Plan.hasScalableVF();
      WideIV->replaceUsesWithIf(Steps,
                                [WideIV, HasScalableVF](VPUser &U, unsigned) {
                                  if (HasScalableVF)
                                    return U.usesFirstLaneOnly(WideIV);
                                  return U.usesScalars(WideIV);
                                });
    }
  }
}


/// Map \p VPV to the wide header induction it belongs to, if any. \p VPV may
/// be the induction phi itself, in which case it must not have a truncate
/// instruction, or a two-operand recipe that advances a wide induction by its
/// step. On success the header induction (the value before the increment) is
/// returned, otherwise nullptr. \p PSE is used to compare steps of
/// subtracting inductions.
static VPWidenInductionRecipe *
getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE) {
  // Case 1: VPV is the header induction. It is usable unless it is an int/fp
  // induction with a truncate instruction.
  if (auto *HeaderIV = dyn_cast<VPWidenInductionRecipe>(VPV)) {
    auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(HeaderIV);
    if (IntOrFpIV && IntOrFpIV->getTruncInst())
      return nullptr;
    return HeaderIV;
  }

  // Case 2: VPV may be the increment of an induction. Find a wide induction
  // among the operands of a two-operand defining recipe, preferring the first
  // operand.
  VPRecipeBase *Def = VPV->getDefiningRecipe();
  if (!Def || Def->getNumOperands() != 2)
    return nullptr;

  auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
  if (!WideIV)
    WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
  if (!WideIV)
    return nullptr;

  // Verify that VPV advances WideIV by exactly the induction step, using the
  // operation the induction descriptor prescribes.
  const auto &ID = WideIV->getInductionDescriptor();
  VPValue *IVStep = WideIV->getStepValue();
  bool IsIncrement = false;
  switch (ID.getInductionOpcode()) {
  case Instruction::Add:
    IsIncrement = match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
    break;
  case Instruction::FAdd:
    IsIncrement = match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
    break;
  case Instruction::FSub:
    IsIncrement = match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
                                                         m_Specific(IVStep)));
    break;
  case Instruction::Sub: {
    // IVStep is the negated step of the subtraction, so require
    // IVStep == -1 * (subtracted operand), compared via SCEV.
    VPValue *SubStep;
    if (match(VPV, m_Sub(m_VPValue(), m_VPValue(SubStep)))) {
      const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
      const SCEV *SubStepSCEV = vputils::getSCEVExprForVPValue(SubStep, PSE);
      ScalarEvolution &SE = *PSE.getSE();
      IsIncrement = !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
                    !isa<SCEVCouldNotCompute>(SubStepSCEV) &&
                    IVStepSCEV == SE.getNegativeSCEV(SubStepSCEV);
    }
    break;
  }
  default:
    // Any other opcode is only accepted for pointer inductions advanced by a
    // GEP of the induction and its step.
    IsIncrement = ID.getKind() == InductionDescriptor::IK_PtrInduction &&
                  match(VPV, m_GetElementPtr(m_Specific(WideIV),
                                             m_Specific(WideIV->getStepValue())));
    break;
  }

  return IsIncrement ? WideIV : nullptr;
}


/// Try to compute the exit value of an induction for a user reached from an
/// early exit. \p Op must be an extract of the lane selected by a
/// first-active-lane of some mask, taken from a wide induction or its
/// increment. Returns the replacement value, built right before \p Op, or
/// nullptr if the pattern does not apply.
static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan, VPValue *Op,
                                               PredicatedScalarEvolution &PSE) {
  VPValue *Incoming = nullptr;
  VPValue *Mask = nullptr;
  if (!match(Op, m_ExtractLane(m_FirstActiveLane(m_VPValue(Mask)),
                               m_VPValue(Incoming))))
    return nullptr;

  VPWidenInductionRecipe *WideIV = getOptimizableIVOf(Incoming, PSE);
  if (!WideIV)
    return nullptr;

  // Inductions with a truncate instruction are not handled here.
  if (auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV))
    if (IntOrFpIV->getTruncInst())
      return nullptr;

  // Calculate the final index: canonical IV + index of the first active lane,
  // with the lane index converted to the canonical IV type.
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  auto *CanIV = LoopRegion->getCanonicalIV();
  Type *CanIVTy = LoopRegion->getCanonicalIVType();
  auto *ExtractR = cast<VPInstruction>(Op);
  VPBuilder B(ExtractR);
  DebugLoc DL = ExtractR->getDebugLoc();

  VPValue *Lane = B.createFirstActiveLane(Mask, DL);
  Type *LaneTy = Lane->getScalarType();
  Lane = B.createScalarZExtOrTrunc(Lane, CanIVTy, LaneTy, DL);
  VPValue *Result = B.createAdd(CanIV, Lane, DL);

  // getOptimizableIVOf() hands back the pre-incremented IV. If that differs
  // from the incoming value, the exit uses the incremented value and one more
  // step has to be added.
  if (Incoming != WideIV)
    Result = B.createAdd(Result, Plan.getConstantInt(CanIVTy, 1), DL);

  // For the canonical wide IV the index is already the value; any other
  // induction is derived from its start and step.
  if (match(WideIV, m_CanonicalWidenIV()))
    return Result;

  const InductionDescriptor &ID = WideIV->getInductionDescriptor();
  VPIRValue *Start = WideIV->getStartValue();
  VPValue *Step = WideIV->getStepValue();
  Result = B.createDerivedIV(
      ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
      Start, Result, Step);
  return Result;
}


/// Compute the value \p WideIV has after \p VectorTC iterations, creating any
/// needed recipes with \p VectorPHBuilder. Returns nullptr for inductions
/// with a truncate instruction. Non-canonical inductions get a derived IV
/// built from their start and step; the result is truncated to the scalar
/// type of \p WideIV if the types differ.
static VPValue *tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV,
                                                 VPBuilder &VectorPHBuilder,
                                                 VPValue *VectorTC) {
  // Truncated wide inductions resume from the last lane of their vector value
  // in the last vector iteration, which is handled elsewhere.
  if (auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV))
    if (IntOrFpIV->getTruncInst())
      return nullptr;

  // The canonical wide IV ends at the trip count itself.
  VPValue *End = VectorTC;
  if (!match(WideIV, m_CanonicalWidenIV())) {
    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPIRValue *Start = WideIV->getStartValue();
    VPValue *Step = WideIV->getStepValue();
    End = VectorPHBuilder.createDerivedIV(
        ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
        Start, VectorTC, Step);
  }

  // The end value is derived from the vector trip count (which has the same
  // type as the widest induction), so it may be wider than this induction.
  Type *IVTy = WideIV->getScalarType();
  if (IVTy == End->getScalarType())
    return End;

  End = VectorPHBuilder.createScalarCast(Instruction::Trunc, End, IVTy,
                                         WideIV->getDebugLoc());
  return End;
}


/// Try to compute the exit value of an induction for a user in the exit block
/// reached from the latch of the original scalar loop. \p Op must extract the
/// last lane of the last part of a wide induction or of its increment.
/// \p EndValues must already hold the end value of that induction. Returns
/// the replacement value, or nullptr if the pattern does not apply.
static VPValue *
optimizeLatchExitInductionUser(VPlan &Plan, VPValue *Op,
                               DenseMap<VPValue *, VPValue *> &EndValues,
                               PredicatedScalarEvolution &PSE) {
  VPValue *Incoming = nullptr;
  if (!match(Op, m_ExtractLastLaneOfLastPart(m_VPValue(Incoming))))
    return nullptr;

  VPWidenInductionRecipe *WideIV = getOptimizableIVOf(Incoming, PSE);
  if (!WideIV)
    return nullptr;

  VPValue *EndValue = EndValues.lookup(WideIV);
  assert(EndValue && "Must have computed the end value up front");

  // getOptimizableIVOf() hands back the pre-incremented IV. If that differs
  // from the incoming value, the exit uses the incremented value, which is
  // the end value as is.
  const bool UsesIncrementedIV = Incoming != WideIV;
  if (UsesIncrementedIV)
    return EndValue;

  // The exit uses the pre-incremented value: take one step back from the end
  // value, with the operation depending on the induction's scalar type.
  auto *ExtractR = cast<VPInstruction>(Op);
  VPBuilder B(ExtractR);
  VPValue *Step = WideIV->getStepValue();
  Type *IVTy = WideIV->getScalarType();

  if (IVTy->isIntegerTy())
    return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");

  if (IVTy->isPointerTy()) {
    // Pointers step back by adding the negated step.
    auto *Zero = Plan.getZero(Step->getScalarType());
    auto *NegatedStep = B.createSub(Zero, Step);
    return B.createPtrAdd(EndValue, NegatedStep, DebugLoc::getUnknown(),
                          "ind.escape");
  }

  if (IVTy->isFloatingPointTy()) {
    // Undo the induction's FAdd with an FSub and vice versa, reusing the
    // fast-math flags of the induction's binary operator.
    const auto &ID = WideIV->getInductionDescriptor();
    auto *InductionOp = ID.getInductionBinOp();
    auto UndoOpcode = InductionOp->getOpcode() == Instruction::FAdd
                          ? Instruction::FSub
                          : Instruction::FAdd;
    return B.createNaryOp(UndoOpcode, {EndValue, Step},
                          {InductionOp->getFastMathFlags()});
  }

  llvm_unreachable("all possible induction types must be handled");
  return nullptr;
}


void VPlanTransforms::optimizeInductionLiveOutUsers(
    VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
  // Step 1: compute end values for all wide inductions in the header. The
  // recipes are created at the start of the block preceding the vector loop
  // region; with tail folding the full trip count is used instead of the
  // vector trip count.
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  auto *RegionPred = cast<VPBasicBlock>(LoopRegion->getSinglePredecessor());
  VPBuilder PredBuilder(RegionPred, RegionPred->begin());

  VPValue *ResumeTC;
  if (FoldTail)
    ResumeTC = Plan.getTripCount();
  else
    ResumeTC = &Plan.getVectorTripCount();

  DenseMap<VPValue *, VPValue *> EndValues;
  for (VPRecipeBase &HeaderPhi : LoopRegion->getEntryBasicBlock()->phis()) {
    if (auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&HeaderPhi))
      if (VPValue *End =
              tryToComputeEndValueForInduction(WideIV, PredBuilder, ResumeTC))
        EndValues[WideIV] = End;
  }

  // Step 2: replace exiting-IV-value recipes in the middle block by the
  // pre-computed end value, where one is available.
  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
  for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
    VPValue *ExitingIV = nullptr;
    if (!match(&R, m_ExitingIVValue(m_VPValue(ExitingIV))))
      continue;
    auto *WideIV = cast<VPWidenInductionRecipe>(ExitingIV);
    VPValue *End = EndValues.lookup(WideIV);
    if (!End)
      continue;
    R.getVPSingleValue()->replaceAllUsesWith(End);
    R.eraseFromParent();
  }

  // Step 3: optimize exit block users. Operands coming from the middle block
  // are latch exits, all other predecessors are treated as early exits.
  for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
    for (VPRecipeBase &R : ExitVPBB->phis()) {
      auto *ExitPhi = cast<VPIRPhi>(&R);

      for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
        VPValue *ExitOp = ExitPhi->getOperand(Idx);
        VPValue *Escape =
            PredVPBB == MiddleVPBB
                ? optimizeLatchExitInductionUser(Plan, ExitOp, EndValues, PSE)
                : optimizeEarlyExitInductionUser(Plan, ExitOp, PSE);
        if (Escape)
          ExitPhi->setOperand(Idx, Escape);
      }
    }
  }
}


/// Deduplicate VPExpandSCEVRecipes in the entry block of \p Plan: the first
/// recipe expanding a given SCEV is kept, and every later recipe expanding
/// the same SCEV is replaced by it and erased. If an erased recipe was the
/// plan's trip count, the trip count is reset to the kept recipe.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
  // First expansion seen for each SCEV expression.
  DenseMap<const SCEV *, VPValue *> FirstExpansion;

  for (VPRecipeBase &R :
       make_early_inc_range(*Plan.getEntry()->getEntryBasicBlock())) {
    auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
    if (!ExpR)
      continue;

    auto [Entry, IsFirst] = FirstExpansion.try_emplace(ExpR->getSCEV(), ExpR);
    if (IsFirst)
      continue;

    // A previous recipe already expands this SCEV; reuse it.
    VPValue *Existing = Entry->second;
    ExpR->replaceAllUsesWith(Existing);
    if (ExpR == Plan.getTripCount())
      Plan.resetTripCount(Existing);

    ExpR->eraseFromParent();
  }
}


/// Erase the recipe defining \p V if it is dead, then continue with the
/// recipes defining its operands, and so on. Each value is examined at most
/// once; values without a defining recipe are skipped.
static void recursivelyDeleteDeadRecipes(VPValue *V) {
  SmallPtrSet<VPValue *, 8> Visited;
  SmallVector<VPValue *> Pending;
  Pending.push_back(V);

  do {
    VPValue *Candidate = Pending.pop_back_val();
    if (!Visited.insert(Candidate).second)
      continue;

    VPRecipeBase *DefR = Candidate->getDefiningRecipe();
    if (!DefR || !isDeadRecipe(*DefR))
      continue;

    // Queue the operands before erasing: they may become dead once this
    // recipe is gone.
    append_range(Pending, DefR->operands());
    DefR->eraseFromParent();
  } while (!Pending.empty());
}


/// Extract the opcode or intrinsic ID carried by recipe \p R, if it has one.
/// The returned pair holds a flag that is true for intrinsic IDs and false
/// for opcodes, followed by the ID or opcode itself. Recipes of kinds not
/// handled here yield std::nullopt.
static std::optional<std::pair<bool, unsigned>>
getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
  using OpcodeOrIID = std::optional<std::pair<bool, unsigned>>;
  return TypeSwitch<const VPSingleDefRecipe *, OpcodeOrIID>(R)
      // Recipes that expose an opcode directly.
      .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe, VPWidenGEPRecipe,
            VPReplicateRecipe>([](auto *WithOpcode) {
        return std::make_pair(false, WithOpcode->getOpcode());
      })
      // Widened intrinsic calls report their vector intrinsic ID.
      .Case([](const VPWidenIntrinsicRecipe *IntrinsicR) {
        return std::make_pair(true, IntrinsicR->getVectorIntrinsicID());
      })
      // Recipes that do not directly map to LLVM IR instructions get an
      // opcode past the last VPInstruction opcode (which is also past the
      // last IR Instruction opcode), offset by their VPRecipeID.
      .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe, VPScalarIVStepsRecipe>(
          [](auto *NonIRRecipe) {
            return std::make_pair(false, VPInstruction::OpsEnd + 1 +
                                             NonIRRecipe->getVPRecipeID());
          })
      .Default([](auto *) { return std::nullopt; });
}


/// Attempt to fold \p R to a live-in value with InstSimplifyFolder. This only
/// succeeds if \p R carries a handled opcode or intrinsic ID and every entry
/// of \p Operands is a live-in with an underlying IR value. Returns the
/// folded value registered as a live-in of the plan containing \p R, or
/// nullptr if nothing could be folded. \p DL configures the folder and \p Ctx
/// provides the i8 type used for pointer adds.
static VPIRValue *tryToFoldLiveIns(VPSingleDefRecipe &R,
                                   ArrayRef<VPValue *> Operands,
                                   const DataLayout &DL, LLVMContext &Ctx) {
  auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
  if (!OpcodeOrIID)
    return nullptr;
  const bool IsIntrinsic = OpcodeOrIID->first;
  const unsigned Code = OpcodeOrIID->second;

  // Gather the IR values behind the operands; give up on the first operand
  // that is not a live-in or has no underlying value.
  SmallVector<Value *, 4> Ops;
  for (VPValue *Operand : Operands) {
    Value *IRV =
        match(Operand, m_LiveIn()) ? Operand->getUnderlyingValue() : nullptr;
    if (!IRV)
      return nullptr;
    Ops.push_back(IRV);
  }

  auto SimplifyToIR = [&]() -> Value * {
    InstSimplifyFolder Folder(DL);

    // Intrinsics: only those with exactly two operands are folded.
    if (IsIntrinsic) {
      if (R.getNumOperands() != 2)
        return nullptr;
      return Folder.FoldBinaryIntrinsic(Code, Ops[0], Ops[1],
                                        R.getScalarType());
    }

    if (Instruction::isBinaryOp(Code))
      return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Code), Ops[0],
                              Ops[1]);
    if (Instruction::isCast(Code))
      return Folder.FoldCast(static_cast<Instruction::CastOps>(Code), Ops[0],
                             R.getVPSingleValue()->getScalarType());

    switch (Code) {
    case VPInstruction::LogicalAnd:
      // a && b is folded as select a, b, zero.
      return Folder.FoldSelect(Ops[0], Ops[1],
                               ConstantInt::getNullValue(Ops[1]->getType()));
    case VPInstruction::Not:
      // !a is folded as a xor all-ones.
      return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
                              Constant::getAllOnesValue(Ops[0]->getType()));
    case Instruction::Select:
      return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
    case Instruction::ICmp:
    case Instruction::FCmp:
      return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
                            Ops[1]);
    case Instruction::GetElementPtr: {
      // The source element type comes from the underlying IR GEP.
      auto &FlagsR = cast<VPRecipeWithIRFlags>(R);
      auto *IRGep = cast<GetElementPtrInst>(FlagsR.getUnderlyingInstr());
      return Folder.FoldGEP(IRGep->getSourceElementType(), Ops[0],
                            drop_begin(Ops), FlagsR.getGEPNoWrapFlags());
    }
    case VPInstruction::PtrAdd:
    case VPInstruction::WidePtrAdd:
      // Pointer adds are folded as i8 GEPs.
      return Folder.FoldGEP(IntegerType::getInt8Ty(Ctx), Ops[0], Ops[1],
                            cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
    // An extract of a live-in is an extract of a broadcast, so return the
    // broadcasted element.
    case Instruction::ExtractElement:
      assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
      return Ops[0];
    }
    return nullptr;
  };

  Value *Folded = SimplifyToIR();
  if (!Folded)
    return nullptr;
  return R.getParent()->getPlan()->getOrAddLiveIn(Folded);
}


/// Try to simplify logical and bitwise recipes in \p Def.
///
/// The folds below are tried in order and the first one that matches is
/// applied. Most folds redirect the users of \p Def to a simpler value and
/// leave \p Def itself in place; the first fold also erases \p Def, and the
/// "select !c" fold rewrites the operands of \p Def in place.
///
/// \param Def The recipe to simplify.
/// \param Builder Used to create replacement recipes for folds that need
/// them.
/// \param CanCreateNewRecipe Gates several (not all) of the folds that create
/// new recipes.
/// \returns true if a fold was applied, false otherwise.
static bool simplifyLogicalRecipe(VPSingleDefRecipe *Def, VPBuilder &Builder,
                                  bool CanCreateNewRecipe) {
  VPlan *Plan = Def->getParent()->getPlan();

  // Simplify (X && Y) | (X && !Y) -> X.
  // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
  // && (Y | Z) and (X | !X) into true. This requires queuing newly created
  // recipes to be visited during simplification.
  VPValue *X, *Y, *Z;
  if (match(Def,
            m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
                         m_LogicalAnd(m_Deferred(X), m_Not(m_Deferred(Y)))))) {
    Def->replaceAllUsesWith(X);
    // Unlike the folds below, this one erases Def right away.
    Def->eraseFromParent();
    return true;
  }

  // x | AllOnes -> AllOnes
  if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) {
    Def->replaceAllUsesWith(Plan->getAllOnesValue(Def->getScalarType()));
    return true;
  }

  // x | 0 -> x
  if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt()))) {
    Def->replaceAllUsesWith(X);
    return true;
  }

  // x | !x -> AllOnes
  if (match(Def, m_c_BinaryOr(m_VPValue(X), m_Not(m_Deferred(X))))) {
    Def->replaceAllUsesWith(Plan->getAllOnesValue(Def->getScalarType()));
    return true;
  }

  // x & 0 -> 0
  if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt()))) {
    Def->replaceAllUsesWith(Plan->getZero(Def->getScalarType()));
    return true;
  }

  // x & AllOnes -> x
  if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes()))) {
    Def->replaceAllUsesWith(X);
    return true;
  }

  // x && false -> false
  if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False()))) {
    Def->replaceAllUsesWith(Plan->getFalse());
    return true;
  }

  // x && true -> x
  if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True()))) {
    Def->replaceAllUsesWith(X);
    return true;
  }

  // (x && y) | (x && z) -> x && (y | z)
  if (CanCreateNewRecipe &&
      match(Def, m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
                              m_LogicalAnd(m_Deferred(X), m_VPValue(Z)))) &&
      // Simplify only if one of the operands has one use to avoid creating an
      // extra recipe.
      (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
       !Def->getOperand(1)->hasMoreThanOneUniqueUser())) {
    Def->replaceAllUsesWith(
        Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
    return true;
  }

  // x && (x && y) -> x && y
  // The inner logical-and (operand 1 of Def) already is the result.
  if (match(Def, m_LogicalAnd(m_VPValue(X),
                              m_LogicalAnd(m_Deferred(X), m_VPValue())))) {
    Def->replaceAllUsesWith(Def->getOperand(1));
    return true;
  }

  // x && (y && x) -> x && y
  // NOTE(review): this fold creates a new recipe without checking
  // CanCreateNewRecipe, unlike the other recipe-creating folds in this
  // function - confirm this is intended.
  if (match(Def, m_LogicalAnd(m_VPValue(X),
                              m_LogicalAnd(m_VPValue(Y), m_Deferred(X))))) {
    Def->replaceAllUsesWith(Builder.createLogicalAnd(X, Y));
    return true;
  }

  // x && !x -> 0
  if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) {
    Def->replaceAllUsesWith(Plan->getFalse());
    return true;
  }

  // select c, x, x -> x
  if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X)))) {
    Def->replaceAllUsesWith(X);
    return true;
  }

  // select c, false, true -> not c
  VPValue *C;
  if (CanCreateNewRecipe &&
      match(Def, m_Select(m_VPValue(C), m_False(), m_True()))) {
    Def->replaceAllUsesWith(Builder.createNot(C));
    return true;
  }

  // select !c, x, y -> select c, y, x
  // Done in place by rewriting the operands of Def.
  if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
    Def->setOperand(0, C);
    Def->setOperand(1, Y);
    Def->setOperand(2, X);
    return true;
  }

  // select x, (i1 y | z), y -> y | (x && z)
  // Only applied when the or has a single use and y is an i1.
  if (CanCreateNewRecipe &&
      match(Def, m_Select(m_VPValue(X),
                          m_OneUse(m_c_BinaryOr(m_VPValue(Y), m_VPValue(Z))),
                          m_Deferred(Y))) &&
      Y->getScalarType()->isIntegerTy(1)) {
    Def->replaceAllUsesWith(
        Builder.createOr(Y, Builder.createLogicalAnd(X, Z)));
    return true;
  }

  // No fold applied.
  return false;
}


/// Try to simplify VPSingleDefRecipe \p Def.
///
/// Applies a sequence of local pattern-based rewrites to \p Def. Most rewrites
/// replace the uses of \p Def with a simpler value (an existing operand, a
/// constant, or a newly built recipe inserted before \p Def) and return
/// immediately; a few only update operands of \p Def or of its users in place.
/// \p Def itself is never erased here. The rewrites in the second half of the
/// function are only attempted once the plan has been unrolled.
static void simplifyRecipe(VPSingleDefRecipe *Def) {
  VPlan *Plan = Def->getParent()->getPlan();

  // Simplification of live-in IR values for SingleDef recipes using
  // InstSimplifyFolder.
  const DataLayout &DL = Plan->getDataLayout();
  if (VPValue *V =
          tryToFoldLiveIns(*Def, Def->operands(), DL, Plan->getContext()))
    return Def->replaceAllUsesWith(V);

  // Fold PredPHI LiveIn -> LiveIn.
  // Note: there is no early return here, so the remaining simplifications are
  // still attempted on Def afterwards.
  if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
    VPValue *Op = PredPHI->getOperand(0);
    if (isa<VPIRValue>(Op))
      PredPHI->replaceAllUsesWith(Op);
  }

  // New recipes created by the rewrites below are inserted before Def.
  VPBuilder Builder(Def);

  // Avoid replacing VPInstructions with underlying values with new
  // VPInstructions, as we would fail to create widen/replicate recipes from
  // the new VPInstructions without an underlying value, and miss out on some
  // transformations that only apply to widened/replicated recipes later, by
  // doing so.
  // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
  // VPInstructions without underlying values, as those will get skipped during
  // cost computation.
  bool CanCreateNewRecipe =
      !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();

  // trunc (zext/sext A): drop the cast pair if the types already agree,
  // otherwise replace the pair with a single widened extend or truncate of A.
  // This block falls through (except for the non-widened-cast bail-out), so
  // the later patterns are still tried.
  VPValue *A;
  if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
    Type *TruncTy = Def->getScalarType();
    Type *ATy = A->getScalarType();
    if (TruncTy == ATy) {
      Def->replaceAllUsesWith(A);
    } else {
      // Don't replace a non-widened cast recipe with a widened cast.
      if (!isa<VPWidenCastRecipe>(Def))
        return;
      if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {

        // A is narrower than the result: extend A directly to TruncTy, using
        // the same kind of extension (sext or zext) as the original operand.
        unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
                                 ? Instruction::SExt
                                 : Instruction::ZExt;
        auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
                                            TruncTy);
        if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
          // UnderlyingExt has distinct return type, used to retain legacy cost.
          Ext->setUnderlyingValue(UnderlyingExt);
        }
        Def->replaceAllUsesWith(Ext);
      } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
        // A is wider than the result: truncate A directly to TruncTy.
        auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
        Def->replaceAllUsesWith(Trunc);
      }
    }
  }

  // Logical and/or/select simplifications are handled by a dedicated helper;
  // stop if it changed anything.
  if (simplifyLogicalRecipe(Def, Builder, CanCreateNewRecipe))
    return;

  VPValue *X, *Y, *C;
  // A + 0 -> A
  if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
    return Def->replaceAllUsesWith(A);

  // A * 1 -> A
  if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
    return Def->replaceAllUsesWith(A);

  // A * 0 -> 0
  if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
    return Def->replaceAllUsesWith(Plan->getZero(Def->getScalarType()));

  // A * -1 -> 0 - A
  if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_AllOnes()))) {
    // Preserve nsw from the Mul on the new Sub.
    VPIRFlags::WrapFlagsTy NW = {
        false, cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap()};
    return Def->replaceAllUsesWith(Builder.createSub(
        Plan->getZero(A->getScalarType()), A, Def->getDebugLoc(), "", NW));
  }

  // X + (0 - Y) -> X - Y
  if (CanCreateNewRecipe &&
      match(Def, m_c_Add(m_VPValue(X), m_Sub(m_ZeroInt(), m_VPValue(Y))))) {
    // Preserve nsw from the Add and the Sub, if it's present on both, on the
    // new Sub.
    // The index expression selects the operand of Def that is not X, i.e. the
    // matched Sub: operand 1 if operand 0 is X, operand 0 otherwise.
    VPIRFlags::WrapFlagsTy NW = {
        false,
        cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap() &&
            cast<VPRecipeWithIRFlags>(Def->getOperand(Def->getOperand(0) == X))
                ->hasNoSignedWrap()};
    return Def->replaceAllUsesWith(
        Builder.createSub(X, Y, Def->getDebugLoc(), "", NW));
  }

  // A * 2^k -> A << k, passing Def's IR flags to the new recipe.
  const APInt *APC;
  if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
      APC->isPowerOf2())
    return Def->replaceAllUsesWith(Builder.createNaryOp(
        Instruction::Shl,
        {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
        *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));

  // A udiv 2^k -> A lshr k, passing Def's IR flags to the new recipe.
  if (CanCreateNewRecipe && match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) &&
      APC->isPowerOf2())
    return Def->replaceAllUsesWith(Builder.createNaryOp(
        Instruction::LShr,
        {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
        *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));

  if (match(Def, m_Not(m_VPValue(A)))) {
    // not (not A) -> A. The inner match rebinds A to the innermost value.
    if (match(A, m_Not(m_VPValue(A))))
      return Def->replaceAllUsesWith(A);

    // Try to fold Not into compares by adjusting the predicate in-place.
    // This is only done if every user of the compare is either a negation of
    // it or a select using it as condition, so that all users can be updated
    // consistently.
    CmpPredicate Pred;
    if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
      auto *Cmp = cast<VPRecipeWithIRFlags>(A);
      if (all_of(Cmp->users(),
                 match_fn(m_CombineOr(
                     m_Not(m_Specific(Cmp)),
                     m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
        Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
        // Iterate over a copy of the user list, as the loop body rewrites
        // users of Cmp.
        for (VPUser *U : to_vector(Cmp->users())) {
          auto *R = cast<VPSingleDefRecipe>(U);
          if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
            // select (cmp pred), x, y -> select (cmp inv_pred), y, x
            R->setOperand(1, Y);
            R->setOperand(2, X);
          } else {
            // not (cmp pred) -> cmp inv_pred
            assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
            R->replaceAllUsesWith(Cmp);
          }
        }
        // If Cmp doesn't have a debug location, use the one from the negation,
        // to preserve the location.
        if (!Cmp->getDebugLoc() && Def->getDebugLoc())
          Cmp->setDebugLoc(Def->getDebugLoc());
      }
    }
  }

  // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
  //      any-of (fcmp uno %A, %B), ...
  if (match(Def, m_AnyOf())) {
    SmallVector<VPValue *, 4> NewOps;
    // Self-compare operand seen so far that has not yet been paired up with a
    // second one.
    VPRecipeBase *UnpairedCmp = nullptr;
    for (VPValue *Op : Def->operands()) {
      VPValue *X;
      // Operands with more than one user, or that are not an
      // fcmp uno of a value with itself, are kept unchanged.
      if (Op->getNumUsers() > 1 ||
          !match(Op, m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(X),
                                   m_Deferred(X)))) {
        NewOps.push_back(Op);
      } else if (!UnpairedCmp) {
        UnpairedCmp = Op->getDefiningRecipe();
      } else {
        // Combine the pending self-compare with the current one.
        NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
                                            UnpairedCmp->getOperand(0), X));
        UnpairedCmp = nullptr;
      }
    }

    // A left-over self-compare without a partner is kept as is.
    if (UnpairedCmp)
      NewOps.push_back(UnpairedCmp->getVPSingleValue());

    // Only create a new AnyOf if at least one pair was combined.
    if (NewOps.size() < Def->getNumOperands()) {
      VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
      return Def->replaceAllUsesWith(NewAnyOf);
    }
  }

  // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
  // This is useful for fmax/fmin without fast-math flags, where we need to
  // check if any operand is NaN.
  if (CanCreateNewRecipe &&
      match(Def, m_BinaryOr(m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(X),
                                          m_Deferred(X)),
                            m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(Y),
                                          m_Deferred(Y))))) {
    VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
    return Def->replaceAllUsesWith(NewCmp);
  }

  // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
  // In both cases the replacement is operand 1 of Def; it is only used if its
  // scalar type matches that of Def.
  if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
       match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
      Def->getOperand(1)->getScalarType() == Def->getScalarType())
    return Def->replaceAllUsesWith(Def->getOperand(1));

  // WideIVStep X, 1 -> X, truncated to the type of Def if the types differ.
  if (match(Def, m_VPInstruction<VPInstruction::WideIVStep>(m_VPValue(X),
                                                            m_One()))) {
    Type *WideStepTy = Def->getScalarType();
    if (X->getScalarType() != WideStepTy)
      X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
    Def->replaceAllUsesWith(X);
    return;
  }

  // For i1 vp.merges produced by AnyOf reductions:
  // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
  if (match(Def, m_Intrinsic<Intrinsic::vp_merge>(m_True(), m_VPValue(A),
                                                  m_VPValue(X), m_VPValue())) &&
      match(A, m_c_BinaryOr(m_Specific(X), m_VPValue(Y))) &&
      Def->getScalarType()->isIntegerTy(1)) {
    // Move the 'true' mask operand into the second position first, then set
    // the mask to y.
    Def->setOperand(1, Def->getOperand(0));
    Def->setOperand(0, Y);
    return;
  }

  // Simplify MaskedCond with no block mask to its single operand.
  if (match(Def, m_VPInstruction<VPInstruction::MaskedCond>()) &&
      !cast<VPInstruction>(Def)->isMasked())
    return Def->replaceAllUsesWith(Def->getOperand(0));

  // Look through ExtractLastLane.
  if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
    // The last lane of a BuildVector is its last operand.
    if (match(A, m_BuildVector())) {
      auto *BuildVector = cast<VPInstruction>(A);
      Def->replaceAllUsesWith(
          BuildVector->getOperand(BuildVector->getNumOperands() - 1));
      return;
    }

    // Extracting from a broadcast yields the broadcasted value.
    if (match(A, m_Broadcast(m_VPValue(X))))
      return Def->replaceAllUsesWith(X);

    if (vputils::isSingleScalar(A))
      return Def->replaceAllUsesWith(A);

    if (Plan->hasScalarVFOnly())
      return Def->replaceAllUsesWith(A);
  }

  // Look through ExtractPenultimateElement (BuildVector ....).
  if (match(Def, m_ExtractPenultimateElement(m_BuildVector()))) {
    auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
    Def->replaceAllUsesWith(
        BuildVector->getOperand(BuildVector->getNumOperands() - 2));
    return;
  }

  // ExtractElement (BuildVector ...), constant Idx -> operand Idx of the
  // BuildVector.
  uint64_t Idx;
  if (match(Def, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) {
    auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
    Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
    return;
  }

  // BuildVector with all-equal operands -> Broadcast of that operand.
  if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
    Def->replaceAllUsesWith(
        Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
    return;
  }

  // Look through broadcast of single-scalar when used as select conditions; in
  // that case the scalar condition can be used directly.
  if (match(Def,
            m_Select(m_Broadcast(m_VPValue(C)), m_VPValue(), m_VPValue()))) {
    assert(vputils::isSingleScalar(C) &&
           "broadcast operand must be single-scalar");
    Def->setOperand(0, C);
    return;
  }

  // For a Broadcast, users that only use scalars of it can use the
  // broadcasted value directly; other users keep using the Broadcast.
  if (match(Def, m_Broadcast(m_VPValue(X))))
    return Def->replaceUsesWithIf(
        X, [Def](const VPUser &U, unsigned) { return U.usesScalars(Def); });

  // Phi-like recipes: a phi with a single operand is replaced by that
  // operand; a first-order recurrence phi whose incoming values are all equal
  // is replaced by its first operand. No other simplification is attempted on
  // phis.
  if (isa<VPPhi, VPWidenPHIRecipe, VPHeaderPHIRecipe>(Def)) {
    if (Def->getNumOperands() == 1) {
      Def->replaceAllUsesWith(Def->getOperand(0));
      return;
    }
    if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
      if (all_equal(Phi->incoming_values()))
        Phi->replaceAllUsesWith(Phi->getOperand(0));
    }
    return;
  }

  // ComputeReductionResult with a single VPIRValue operand -> that operand.
  VPIRValue *IRV;
  if (Def->getNumOperands() == 1 &&
      match(Def, m_ComputeReductionResult(m_VPIRValue(IRV))))
    return Def->replaceAllUsesWith(IRV);

  // Some simplifications can only be applied after unrolling. Perform them
  // below.
  if (!Plan->isUnrolled())
    return;

  // After unrolling, extract-lane may be used to extract values from multiple
  // scalar sources. Only simplify when extracting from a single scalar source.
  VPValue *LaneToExtract;
  if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
    // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
    if (vputils::isSingleScalar(A))
      return Def->replaceAllUsesWith(A);

    // Replace extract-lane(0, canonical-WIDEN-INDUCTION) with the region's
    // scalar canonical IV.
    VPWidenIntOrFpInductionRecipe *WidenIV;
    if (match(LaneToExtract, m_ZeroInt()) &&
        match(A, m_CanonicalWidenIV(WidenIV)))
      return Def->replaceAllUsesWith(WidenIV->getRegion()->getCanonicalIV());

    // Simplify extract-lane with single source to extract-element.
    Def->replaceAllUsesWith(Builder.createNaryOp(
        Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
    return;
  }

  // Look for cycles where Def is of the form:
  //  X = phi(0, IVInc)  ; used only by IVInc, or by IVInc and Inc = X + Y
  //  IVInc = X + Step   ; used by X and Def
  //  Def = IVInc + Y
  // Fold the increment Y into the phi's start value, replace Def with IVInc,
  // and if Inc exists, replace it with X.
  if (match(Def, m_Add(m_Add(m_VPValue(X), m_VPValue()), m_VPValue(Y))) &&
      isa<VPIRValue>(Y) &&
      match(X, m_VPPhi(m_ZeroInt(), m_Specific(Def->getOperand(0))))) {
    auto *Phi = cast<VPPhi>(X);
    auto *IVInc = Def->getOperand(0);
    if (IVInc->getNumUsers() == 2) {
      // If Phi has a second user (besides IVInc's defining recipe), it must
      // be Inc = Phi + Y for the fold to apply.
      auto *Inc = dyn_cast_or_null<VPSingleDefRecipe>(
          findUserOf(Phi, m_Add(m_Specific(Phi), m_Specific(Y))));
      if (Phi->getNumUsers() == 1 || (Phi->getNumUsers() == 2 && Inc)) {
        Def->replaceAllUsesWith(IVInc);
        if (Inc)
          Inc->replaceAllUsesWith(Phi);
        // The phi now starts at Y instead of 0.
        Phi->setOperand(0, Y);
        return;
      }
    }
  }

  // Simplify unrolled VectorPointer without offset, or with zero offset, to
  // just the pointer operand.
  if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
    if (!VPR->getVFxPart() || match(VPR->getVFxPart(), m_ZeroInt()))
      return VPR->replaceAllUsesWith(VPR->getOperand(0));

  // VPScalarIVSteps after unrolling can be replaced by their start value, if
  // the start index is zero and only the first lane 0 is demanded.
  if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
    if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
      Steps->replaceAllUsesWith(Steps->getOperand(0));
      return;
    }
  }
  // Simplify redundant ReductionStartVector recipes after unrolling.
  // Only uses by reduction phis for which isInLoop() holds are replaced with
  // the start value; all other uses are left untouched.
  VPValue *StartV;
  if (match(Def, m_VPInstruction<VPInstruction::ReductionStartVector>(
                     m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
    Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
      auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
      return PhiR && PhiR->isInLoop();
    });
    return;
  }

  // With a concrete UF of 1, ExtractLastPart A -> A.
  if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
    return Def->replaceAllUsesWith(A);
}


/// Run simplifyRecipe on every single-def recipe of \p Plan, visiting all
/// basic blocks reachable from the plan's entry in reverse post-order.
void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
  using DeepWrapperTy = VPBlockDeepTraversalWrapper<VPBlockBase *>;
  ReversePostOrderTraversal<DeepWrapperTy> Order(Plan.getEntry());

  for (VPBasicBlock *Block : VPBlockUtils::blocksOnly<VPBasicBlock>(Order)) {
    // Use an early-increment range so the iteration does not depend on the
    // recipe currently being simplified.
    for (VPRecipeBase &Recipe : make_early_inc_range(*Block)) {
      auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&Recipe);
      if (!SingleDef)
        continue;
      simplifyRecipe(SingleDef);
    }
  }
}


/// Rewrite (headermask && x) && y into headermask && (x && y), so that the
/// header mask ends up as a direct operand and can be simplified further when
/// tail folding, e.g. in optimizeEVLMasks. Does nothing if \p Plan has no
/// header mask.
static void reassociateHeaderMask(VPlan &Plan) {
  VPValue *HeaderMask = vputils::findHeaderMask(Plan);
  if (!HeaderMask)
    return;

  // Seed the worklist with the users of every (headermask && _) recipe.
  SmallVector<VPUser *> Worklist;
  for (VPUser *MaskUser : HeaderMask->users()) {
    if (!match(MaskUser, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
      continue;
    auto *InnerAnd = cast<VPSingleDefRecipe>(MaskUser);
    append_range(Worklist, InnerAnd->users());
  }

  while (!Worklist.empty()) {
    auto *OuterAnd = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
    if (!OuterAnd)
      continue;

    VPValue *Inner, *Outer;
    if (!match(OuterAnd,
               m_LogicalAnd(
                   m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Inner)),
                   m_VPValue(Outer))))
      continue;

    // Queue the users before rewriting, as they may now match the pattern
    // themselves once they use the reassociated value.
    append_range(Worklist, OuterAnd->users());

    VPBuilder Builder(OuterAnd);
    VPValue *Tail = Builder.createLogicalAnd(Inner, Outer);
    VPValue *Reassociated = Builder.createLogicalAnd(HeaderMask, Tail);
    OuterAnd->replaceAllUsesWith(Reassociated);
  }
}


/// Map the llvm.masked.{u,s}{div,rem} intrinsic \p ID to the corresponding
/// UDiv/SDiv/URem/SRem binary opcode. Returns std::nullopt for any other
/// intrinsic ID.
static std::optional<Instruction::BinaryOps>
getUnmaskedDivRemOpcode(Intrinsic::ID ID) {
  if (ID == Intrinsic::masked_udiv)
    return Instruction::UDiv;
  if (ID == Intrinsic::masked_sdiv)
    return Instruction::SDiv;
  if (ID == Intrinsic::masked_urem)
    return Instruction::URem;
  if (ID == Intrinsic::masked_srem)
    return Instruction::SRem;
  return std::nullopt;
}


/// Replace widened or replicating recipes in \p Plan by single-scalar
/// VPReplicateRecipes (or, for masked div/rem intrinsics, by a plain binary
/// op) where possible. Does nothing if the plan only has a scalar VF.
///
/// Three cases are handled, in this order, for each candidate recipe:
///  1. A replicating store whose operand 1 is single-scalar is replaced by a
///     single-scalar store of the extracted last lane of the stored value.
///  2. A llvm.masked.{u,s}{div,rem} intrinsic of which only the first lane is
///     used is replaced by the unmasked operation with a safe divisor.
///  3. Any other candidate that is single-scalar is replaced by a
///     single-scalar VPReplicateRecipe, unless that is expected to only
///     introduce extra broadcasts.
static void narrowToSingleScalarRecipes(VPlan &Plan) {
  if (Plan.hasScalarVFOnly())
    return;

  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    // Recipes are visited in reverse order within each block; the
    // early-increment range allows erasing the current recipe.
    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
      if (!isa<VPWidenRecipe, VPWidenGEPRecipe, VPReplicateRecipe,
               VPWidenIntrinsicRecipe>(&R))
        continue;
      // Replicate recipes that are already single-scalar, or that are
      // predicated, are left alone.
      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
      if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
        continue;

      auto *RepOrWidenR = cast<VPRecipeWithIRFlags>(&R);
      // Case 1: replicating store with a single-scalar operand 1.
      if (RepR && RepR->getOpcode() == Instruction::Store &&
          vputils::isSingleScalar(RepR->getOperand(1))) {
        auto *Clone = new VPReplicateRecipe(
            RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
            true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
            *RepR /*Metadata*/, RepR->getDebugLoc());
        Clone->insertBefore(RepOrWidenR);
        // The extracts are inserted before the new store and replace its
        // operand 0.
        VPBuilder Builder(Clone);
        VPValue *ExtractOp = Clone->getOperand(0);
        // ExtractLastPart is only added if operand 1 is uniform across VFs
        // and UFs; ExtractLastLane is always added.
        if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
          ExtractOp =
              Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
        ExtractOp =
            Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
        Clone->setOperand(0, ExtractOp);
        RepR->eraseFromParent();
        continue;
      }

      // Narrow llvm.masked.{u,s}{div,rem} intrinsics with a safe divisor.
      // Case 2. Note that any widened intrinsic recipe ends processing here,
      // whether or not it was narrowed.
      if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(RepOrWidenR)) {
        if (!vputils::onlyFirstLaneUsed(IntrR))
          continue;
        auto Opc = getUnmaskedDivRemOpcode(IntrR->getVectorIntrinsicID());
        if (!Opc)
          continue;
        VPBuilder Builder(IntrR);
        // Select operand 1 where operand 2 is true and the constant 1
        // otherwise, and use the result as divisor of the unmasked operation.
        VPValue *SafeDivisor = Builder.createSelect(
            IntrR->getOperand(2), IntrR->getOperand(1),
            Plan.getConstantInt(IntrR->getScalarType(), 1));
        VPValue *Clone = Builder.createNaryOp(
            *Opc, {IntrR->getOperand(0), SafeDivisor},
            VPIRFlags::getDefaultFlags(*Opc), IntrR->getDebugLoc());
        IntrR->replaceAllUsesWith(Clone);
        IntrR->eraseFromParent();
        continue;
      }

      // Skip recipes that aren't single scalars.
      if (!vputils::isSingleScalar(RepOrWidenR))
        continue;

      // Predicate to check if a user of Op introduces extra broadcasts.
      // ExtractLastLane, ExtractLastPart and ExtractPenultimateElement users
      // never count; any other user counts if it does not use scalars of Op.
      auto IntroducesBCastOf = [](const VPValue *Op) {
        return [Op](const VPUser *U) {
          if (auto *VPI = dyn_cast<VPInstruction>(U)) {
            if (is_contained({VPInstruction::ExtractLastLane,
                              VPInstruction::ExtractLastPart,
                              VPInstruction::ExtractPenultimateElement},
                             VPI->getOpcode()))
              return false;
          }
          return !U->usesScalars(Op);
        };
      };

      // Case 3 is skipped if narrowing would require a broadcast for at least
      // one user of the recipe, unless some operand of the recipe (a) has no
      // other user introducing a broadcast of it and (b) is either a
      // non-constant live-in or a single-scalar replicate recipe.
      if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
          none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
            if (any_of(
                    make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
                    IntroducesBCastOf(Op)))
              return false;
            // Non-constant live-ins require broadcasts, while constants do not
            // need explicit broadcasts.
            auto *IRV = dyn_cast<VPIRValue>(Op);
            bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
            auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
            return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
          }))
        continue;

      // Replace the recipe by a single-scalar replicate recipe and erase the
      // original if it is dead afterwards.
      auto *Clone = new VPReplicateRecipe(
          RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
          true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
      Clone->insertBefore(RepOrWidenR);
      RepOrWidenR->replaceAllUsesWith(Clone);
      if (isDeadRecipe(*RepOrWidenR))
        RepOrWidenR->eraseFromParent();
    }
  }
}


/// If every mask of \p Blend has the form (Common && Rest) with the same
/// first operand Common, replace each mask by its Rest operand. Normalized
/// blends are left untouched.
static void removeCommonBlendMask(VPBlendRecipe *Blend) {
  if (Blend->isNormalized())
    return;

  // The candidate common value is the first operand of the first mask.
  VPValue *SharedMask;
  if (!match(Blend->getMask(0),
             m_LogicalAnd(m_VPValue(SharedMask), m_VPValue())))
    return;

  const unsigned NumIncoming = Blend->getNumIncomingValues();

  // Bail out unless all masks are a logical-and with the candidate as first
  // operand.
  for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
    bool HasSharedMask =
        match(Blend->getMask(Idx),
              m_LogicalAnd(m_Specific(SharedMask), m_VPValue()));
    if (!HasSharedMask)
      return;
  }

  // Strip the common value by using the second operand of each logical-and.
  for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
    VPValue *OldMask = Blend->getMask(Idx);
    Blend->setMask(Idx, OldMask->getDefiningRecipe()->getOperand(1));
  }
}


/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
/// to make sure the masks are simplified.
///
/// For each blend in the blocks reached by a shallow depth-first walk from
/// the entry of the vector loop region of \p Plan, this
///  - strips a value common to all masks (see removeCommonBlendMask),
///  - replaces the blend by its single incoming value if all incoming values
///    with a mask other than false are the same, and otherwise
///  - replaces a non-normalized blend by a normalized one.
static void simplifyBlends(VPlan &Plan) {
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      auto *Blend = dyn_cast<VPBlendRecipe>(&R);
      if (!Blend)
        continue;

      removeCommonBlendMask(Blend);

      // Try to remove redundant blend recipes.
      // Collect the distinct incoming values, ignoring those whose mask is
      // false. getMask(0) is only queried for non-normalized blends.
      SmallPtrSet<VPValue *, 4> UniqueValues;
      if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
        UniqueValues.insert(Blend->getIncomingValue(0));
      for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
        if (!match(Blend->getMask(I), m_False()))
          UniqueValues.insert(Blend->getIncomingValue(I));

      if (UniqueValues.size() == 1) {
        Blend->replaceAllUsesWith(*UniqueValues.begin());
        Blend->eraseFromParent();
        continue;
      }

      if (Blend->isNormalized())
        continue;

      // Normalize the blend so its first incoming value is used as the initial
      // value with the others blended into it.

      // Pick the incoming value that becomes the initial value; defaults to
      // the first one.
      unsigned StartIndex = 0;
      for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
        // If a value's mask is used only by the blend then it can be
        // dead-coded.
        // TODO: Find the most expensive mask that can be deadcoded, or a mask
        // that's used by multiple blends where it can be removed from them all.
        VPValue *Mask = Blend->getMask(I);
        if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
          StartIndex = I;
          break;
        }
      }

      // Operand list of the new blend: the initial value without a mask,
      // followed by (value, mask) pairs for all other incoming values.
      SmallVector<VPValue *, 4> OperandsWithMask;
      OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));

      for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
        if (I == StartIndex)
          continue;
        OperandsWithMask.push_back(Blend->getIncomingValue(I));
        OperandsWithMask.push_back(Blend->getMask(I));
      }

      auto *NewBlend =
          new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
                            OperandsWithMask, *Blend, Blend->getDebugLoc());
      NewBlend->insertBefore(&R);

      // Remember the mask dropped from the new blend before erasing the old
      // blend, so it can be cleaned up if it became dead.
      VPValue *DeadMask = Blend->getMask(StartIndex);
      Blend->replaceAllUsesWith(NewBlend);
      Blend->eraseFromParent();
      recursivelyDeleteDeadRecipes(DeadMask);

      /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
      VPValue *NewMask;
      if (NewBlend->getNumOperands() == 3 &&
          match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
        VPValue *Inc0 = NewBlend->getOperand(0);
        VPValue *Inc1 = NewBlend->getOperand(1);
        VPValue *OldMask = NewBlend->getOperand(2);
        NewBlend->setOperand(0, Inc1);
        NewBlend->setOperand(1, Inc0);
        NewBlend->setOperand(2, NewMask);
        // Erase the negation if the blend was its only user.
        if (OldMask->getNumUsers() == 0)
          cast<VPInstruction>(OldMask)->eraseFromParent();
      }
    }
  }
}


/// Optimize the width of vector induction variables in \p Plan based on a known
/// constant Trip Count, \p BestVF and \p BestUF.
///
/// Only applies if \p Plan still has a vector loop region, \p BestVF is fixed
/// and the trip count is a constant. Returns true if any induction was
/// narrowed.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
                                                     ElementCount BestVF,
                                                     unsigned BestUF) {
  // Only proceed if we have not completely removed the vector region.
  if (!Plan.getVectorLoopRegion())
    return false;

  const APInt *TC;
  if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
    return false;

  // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
  // and UF. Returns at least 8.
  auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
    // Round TC up to the next multiple of Align; the largest value to
    // represent is one less than that.
    APInt AlignedTC =
        Align * APIntOps::RoundingUDiv(TC, APInt(TC.getBitWidth(), Align),
                                       APInt::Rounding::UP);
    APInt MaxVal = AlignedTC - 1;
    return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
  };
  unsigned NewBitWidth =
      ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);

  LLVMContext &Ctx = Plan.getContext();
  auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);

  bool MadeChange = false;

  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    // Currently only handle canonical IVs as it is trivial to replace the start
    // and stop values, and we currently only perform the optimization when the
    // IV has a single use.
    VPWidenIntOrFpInductionRecipe *WideIV;
    if (!match(&Phi, m_CanonicalWidenIV(WideIV)))
      continue;
    // Nothing to do either if the IV already has the computed type.
    if (WideIV->hasMoreThanOneUniqueUser() ||
        NewIVTy == WideIV->getScalarType())
      continue;

    // Currently only handle cases where the single user is a header-mask
    // comparison with the backedge-taken-count.
    VPUser *SingleUser = WideIV->getSingleUser();
    if (!SingleUser ||
        !match(SingleUser,
               m_ICmp(m_Specific(WideIV),
                      m_Broadcast(m_Specific(Plan.getBackedgeTakenCount())))))
      continue;

    // Update IV operands and comparison bound to use new narrower type.
    assert(!WideIV->getTruncInst() &&
           "canonical IV is not expected to have a truncation");
    // The new IV starts at 0 with step 1 in the new type and is inserted
    // before the existing one.
    auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
        WideIV->getPHINode(), Plan.getZero(NewIVTy),
        Plan.getConstantInt(NewIVTy, 1), WideIV->getVFValue(),
        WideIV->getInductionDescriptor(), *WideIV, WideIV->getDebugLoc());
    NewWideIV->insertBefore(WideIV);

    // Truncate the backedge-taken count to the new type in the vector
    // preheader.
    auto *NewBTC = new VPWidenCastRecipe(
        Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
        nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
    Plan.getVectorPreheader()->appendRecipe(NewBTC);
    // Replace the compare by one with the same predicate on the new IV and
    // the truncated bound.
    auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
    Cmp->replaceAllUsesWith(
        VPBuilder(Cmp).createICmp(Cmp->getPredicate(), NewWideIV, NewBTC));

    MadeChange = true;
  }

  return MadeChange;
}


/// Return true if \p Cond is known to be true for the given \p BestVF and
/// \p BestUF. A binary-or is known true if any of its operands is; otherwise
/// \p Cond must be the compare of the canonical IV increment against the
/// vector trip count.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
                                      ElementCount BestVF, unsigned BestUF,
                                      PredicatedScalarEvolution &PSE) {
  // For an or, it is sufficient that one of the operands is known to be true.
  if (match(Cond, m_BinaryOr(m_VPValue(), m_VPValue()))) {
    for (VPValue *Disjunct : Cond->getDefiningRecipe()->operands())
      if (isConditionTrueViaVFAndUF(Disjunct, Plan, BestVF, BestUF, PSE))
        return true;
    return false;
  }

  // Otherwise only handle (CanIV + VFxUF) == vector trip count.
  auto *CanonicalIV = Plan.getVectorLoopRegion()->getCanonicalIV();
  const bool MatchesIVIncEqVecTC = match(
      Cond,
      m_SpecificICmp(
          CmpInst::ICMP_EQ,
          m_c_Add(m_Specific(CanonicalIV), m_Specific(&Plan.getVFxUF())),
          m_Specific(&Plan.getVectorTripCount())));
  if (!MatchesIVIncEqVecTC)
    return false;

  // Use the SCEV of the vector trip count if it can be computed; otherwise
  // fall back to the SCEV of the original trip count. With the fallback the
  // check is stricter than necessary, as it only succeeds if the original
  // trip count itself equals VF * UF.
  const SCEV *TripCountExpr =
      vputils::getSCEVExprForVPValue(&Plan.getVectorTripCount(), PSE);
  if (isa<SCEVCouldNotCompute>(TripCountExpr))
    TripCountExpr = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
  assert(!isa<SCEVCouldNotCompute>(TripCountExpr) &&
         "Trip count SCEV must be computable");

  // The condition is known true if the trip count is known to equal VF * UF.
  ScalarEvolution &SE = *PSE.getSE();
  const SCEV *VFTimesUF = SE.getElementCount(
      TripCountExpr->getType(), BestVF.multiplyCoefficientBy(BestUF));
  return SE.isKnownPredicate(CmpInst::ICMP_EQ, TripCountExpr, VFTimesUF);
}


/// Try to replace multiple active lane masks used for control flow with
/// a single, wide active lane mask instruction followed by multiple
/// extract subvector intrinsics. This applies to the active lane mask
/// instructions both in the loop and in the preheader.
/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
/// new extracts from the first active lane mask, which has its last
/// operand (multiplier) set to UF.
///
/// Returns false without changing \p Plan if wide active lane masks are not
/// enabled, \p VF is not a vector VF, \p UF is 1, or the terminator of the
/// exiting block of the vector loop region is not a
/// BranchOnCond(Not(ActiveLaneMask)). Returns true otherwise.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
                                       unsigned UF) {
  if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
    return false;

  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
  auto *Term = &ExitingVPBB->back();

  using namespace llvm::VPlanPatternMatch;
  if (!match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
                       m_VPValue(), m_VPValue(), m_VPValue())))))
    return false;

  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
  LLVMContext &Ctx = Plan.getContext();

  // Create one vector_extract per unroll part from ALM, at offset
  // VF.getKnownMinValue() * Part, and store it in Extracts[Part]. Extracts
  // must already have at least UF elements. Each extract is inserted directly
  // after ALM.
  auto ExtractFromALM = [&](VPInstruction *ALM,
                            SmallVectorImpl<VPValue *> &Extracts) {
    DebugLoc DL = ALM->getDebugLoc();
    for (unsigned Part = 0; Part < UF; ++Part) {
      SmallVector<VPValue *> Ops;
      Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
      auto *Ext =
          new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
                                     IntegerType::getInt1Ty(Ctx), {}, {}, DL);
      Extracts[Part] = Ext;
      Ext->insertAfter(ALM);
    }
  };

  // Create a list of each active lane mask phi, ordered by unroll part.
  SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
  for (VPRecipeBase &R : Header->phis()) {
    auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R);
    if (!Phi)
      continue;
    // The unroll part of a phi is derived from the index operand of the
    // ActiveLaneMask feeding its backedge.
    VPValue *Index = nullptr;
    match(Phi->getBackedgeValue(),
          m_ActiveLaneMask(m_VPValue(Index), m_VPValue(), m_VPValue()));
    assert(Index && "Expected index from ActiveLaneMask instruction");

    uint64_t Part;
    if (match(Index,
              m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
                  m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
      Phis[Part] = Phi;
    else {
      // Anything other than a CanonicalIVIncrementForPart is part 0
      assert(!match(
          Index,
          m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>()));
      Phis[0] = Phi;
    }
  }

  assert(all_of(Phis, not_equal_to(nullptr)) &&
         "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");

  // The active lane masks of part 0 are the ones that get widened.
  auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
  auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());

  assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
          LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
         "Expected incoming values of Phi to be ActiveLaneMasks");

  // When using wide lane masks, the return type of the get.active.lane.mask
  // intrinsic is VF x UF (last operand).
  VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
  EntryALM->setOperand(2, ALMMultiplier);
  LoopALM->setOperand(2, ALMMultiplier);

  // Create UF x extract vectors and insert into preheader.
  SmallVector<VPValue *> EntryExtracts(UF);
  ExtractFromALM(EntryALM, EntryExtracts);

  // Create UF x extract vectors and insert before the loop compare & branch,
  // updating the compare to use the first extract.
  SmallVector<VPValue *> LoopExtracts(UF);
  ExtractFromALM(LoopALM, LoopExtracts);
  VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
  Not->setOperand(0, LoopExtracts[0]);

  // Update the incoming values of active lane mask phis.
  for (unsigned Part = 0; Part < UF; ++Part) {
    Phis[Part]->setStartValue(EntryExtracts[Part]);
    Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
  }

  return true;
}


/// Try to turn the terminator of the exiting block of \p Plan's vector loop
/// region into an exit taken in the first iteration. On success the resulting
/// plan may only be valid for \p BestVF and \p BestUF.
///
/// Two terminator shapes are handled:
///  * BranchOnCount on the canonical IV increment (optionally with an added
///    live-in offset, as used by epilogue loops) and
///    BranchOnCond(Not(ActiveLaneMask)): simplified when SCEV can prove that
///    the (vector) trip count is ULE BestVF * BestUF.
///  * Any other BranchOnCond, or the second condition of BranchOnTwoConds:
///    simplified when isConditionTrueViaVFAndUF succeeds for the condition.
///
/// \returns true if the terminator was changed.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                              unsigned BestUF,
                                              PredicatedScalarEvolution &PSE) {
  VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
  auto *Term = &LatchVPBB->back();

  // The branch either compares the canonical IV increment (main loop) or the
  // increment plus a live-in offset (epilogue loop), or it tests the negated
  // active lane mask.
  auto CanIVIncM = m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF()));
  const bool IsTripCountControlled =
      match(Term,
            m_BranchOnCount(
                m_CombineOr(CanIVIncM, m_c_Add(CanIVIncM, m_LiveIn())),
                m_VPValue())) ||
      match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
                      m_VPValue(), m_VPValue(), m_VPValue()))));

  if (IsTripCountControlled) {
    // The loop runs at most once if the trip count is <= VF * UF. Prefer the
    // vector trip count and fall back to the scalar trip count if no SCEV is
    // available for it.
    const SCEV *TC =
        vputils::getSCEVExprForVPValue(&Plan.getVectorTripCount(), PSE);
    if (isa<SCEVCouldNotCompute>(TC))
      TC = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
    assert(!isa<SCEVCouldNotCompute>(TC) &&
           "Trip count SCEV must be computable");

    ScalarEvolution &SE = *PSE.getSE();
    const SCEV *MaxElts = SE.getElementCount(
        TC->getType(), BestVF.multiplyCoefficientBy(BestUF));
    if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, TC, MaxElts))
      return false;
  } else {
    // Generic conditional branch: the condition itself has to be provably
    // true for the chosen VF and UF.
    VPValue *Cond;
    if (!match(Term, m_BranchOnCond(m_VPValue(Cond))) &&
        !match(Term, m_BranchOnTwoConds(m_VPValue(), m_VPValue(Cond))))
      return false;
    if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
      return false;
  }

  // From here on the vector loop region is known to execute only once, so
  // make the terminator exit in the first iteration.
  if (match(Term, m_BranchOnTwoConds())) {
    // Keep the two-condition branch and just force its second condition.
    Term->setOperand(1, Plan.getTrue());
    return true;
  }

  // Otherwise swap the terminator for an always-taken BranchOnCond.
  LatchVPBB->appendRecipe(new VPInstruction(
      VPInstruction::BranchOnCond, Plan.getTrue(), {}, {}, Term->getDebugLoc()));
  Term->eraseFromParent();
  return true;
}


/// Replace an explicit-vector-length recipe by its AVL operand when the AVL is
/// known not to exceed \p VF. This follows from the definition of
/// llvm.experimental.get.vector.length:
/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
///
/// All blocks reachable from the entry of \p Plan are scanned; the first
/// recipe for which the bound can be proven via \p PSE is rewritten and the
/// function returns immediately.
///
/// \returns true if a recipe was replaced, false otherwise.
bool VPlanTransforms::simplifyKnownEVL(VPlan &Plan, ElementCount VF,
                                       PredicatedScalarEvolution &PSE) {
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    for (VPRecipeBase &R : *VPBB) {
      VPValue *AVL;
      if (!match(&R, m_EVL(m_VPValue(AVL))))
        continue;

      // Without a SCEV for the AVL nothing can be proven about its range.
      const SCEV *AVLExpr = vputils::getSCEVExprForVPValue(AVL, PSE);
      if (isa<SCEVCouldNotCompute>(AVLExpr))
        continue;

      ScalarEvolution &SE = *PSE.getSE();
      Type *AVLTy = AVLExpr->getType();
      const SCEV *VFExpr = SE.getElementCount(AVLTy, VF);
      if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLExpr, VFExpr))
        continue;

      // The replaced recipe is substituted by the AVL converted to i32.
      VPValue *NewEVL = VPBuilder(&R).createScalarZExtOrTrunc(
          AVL, Type::getInt32Ty(Plan.getContext()), AVLTy, R.getDebugLoc());
      if (NewEVL != AVL) {
        // A conversion recipe was created; try to fold it away if its
        // operands are live-ins.
        auto *CastR = cast<VPSingleDefRecipe>(NewEVL);
        const DataLayout &Layout = Plan.getDataLayout();
        if (VPValue *Folded = tryToFoldLiveIns(*CastR, CastR->operands(),
                                               Layout, Plan.getContext()))
          NewEVL = Folded;
      }

      R.getVPSingleValue()->replaceAllUsesWith(NewEVL);
      return true;
    }
  }
  return false;
}


/// Run the transforms in this file that specialize \p Plan for the chosen
/// \p BestVF and \p BestUF. All three transforms are always attempted, in
/// order; if any of them changed the plan, the plan's VF is set to \p BestVF.
void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                         unsigned BestUF,
                                         PredicatedScalarEvolution &PSE) {
  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
  assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");

  // Each step runs unconditionally; the results are only combined afterwards.
  const bool WidenedALM = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
  const bool SimplifiedBranch =
      simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
  const bool NarrowedIV =
      optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);

  if (!WidenedALM && !SimplifiedBranch && !NarrowedIV)
    return;

  // The plan was changed based on BestVF/BestUF, so record the VF on it.
  Plan.setVF(BestVF);
  assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
}


/// For every reduction phi in the header of \p Plan's vector loop region whose
/// recurrence kind is Add, Mul, Sub or AddChainWithSubs, drop the
/// poison-generating flags of all flag-carrying recipes returned by
/// collectUsersRecursively for that phi.
void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
  VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &R : Header->phis()) {
    auto *RedPhi = dyn_cast<VPReductionPHIRecipe>(&R);
    if (!RedPhi)
      continue;

    // Only these recurrence kinds are processed; all others are left alone.
    const RecurKind Kind = RedPhi->getRecurrenceKind();
    const bool IsHandledKind =
        Kind == RecurKind::Add || Kind == RecurKind::Mul ||
        Kind == RecurKind::Sub || Kind == RecurKind::AddChainWithSubs;
    if (!IsHandledKind)
      continue;

    for (VPUser *U : collectUsersRecursively(RedPhi)) {
      auto *WithFlags = dyn_cast<VPRecipeWithIRFlags>(U);
      if (WithFlags)
        WithFlags->dropPoisonGeneratingFlags();
    }
  }
}


namespace {
/// DenseMap key traits used by CSE: VPSingleDefRecipes are hashed and compared
/// by the data they carry (recipe ID, opcode/intrinsic ID, GEP source element
/// type, scalar type, single-scalar-ness, operands, predicate, induction
/// opcode) instead of by pointer identity.
struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
  /// Returns true if \p Def is the reserved empty key, which must never be
  /// dereferenced.
  /// NOTE(review): only the empty key is checked here; confirm that no other
  /// reserved key value can reach isEqual().
  static bool isSentinel(const VPSingleDefRecipe *Def) {
    return getEmptyKey() == Def;
  }

  /// If recipe \p R will lower to a GEP with a non-i8 source element type,
  /// return that source element type; otherwise return nullptr.
  static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
    // VPInstructions lowering to GEPs are PtrAdds and therefore always use
    // the i8 source element type, so they need no entry here.
    return TypeSwitch<const VPSingleDefRecipe *, Type *>(R)
        .Case([](const VPReplicateRecipe *RepR) -> Type * {
          auto *GEP = dyn_cast<GetElementPtrInst>(RepR->getUnderlyingValue());
          return GEP ? GEP->getSourceElementType() : nullptr;
        })
        .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
            [](auto *PtrR) { return PtrR->getSourceElementType(); })
        .Default([](auto *) { return nullptr; });
  }

  /// Returns true if recipe \p Def can be safely handled by CSE.
  ///
  /// The set of handled recipes may be extended, as long as any data embedded
  /// in them is also taken into account by getHashValue() and isEqual().
  static bool canHandle(const VPSingleDefRecipe *Def) {
    auto OpcodeOrIID = getOpcodeOrIntrinsicID(Def);
    if (!OpcodeOrIID)
      return false;

    // The index of an (Insert|Extract)Value is not a proper operand in LLVM
    // IR, and hence not in VPlan either, so such recipes cannot be compared
    // by their operands.
    if (!OpcodeOrIID->first &&
        (OpcodeOrIID->second == Instruction::InsertValue ||
         OpcodeOrIID->second == Instruction::ExtractValue))
      return false;

    // Recipes reading memory are excluded: a write to memory between two
    // otherwise identical reads would make merging them incorrect.
    return !Def->mayReadFromMemory();
  }

  /// Hash the underlying data of \p Def.
  static unsigned getHashValue(const VPSingleDefRecipe *Def) {
    hash_code Base = hash_combine(
        Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
        getGEPSourceElementType(Def), Def->getScalarType(),
        vputils::isSingleScalar(Def), hash_combine_range(Def->operands()));

    // Mix in the extra data carried by compares and scalar IV steps.
    const auto *Flags = dyn_cast<VPRecipeWithIRFlags>(Def);
    if (Flags && Flags->hasPredicate())
      return hash_combine(Base, Flags->getPredicate());
    if (const auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def))
      return hash_combine(Base, Steps->getInductionOpcode());
    return Base;
  }

  /// Check equality of the underlying data of \p L and \p R.
  static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
    // Sentinels only ever compare equal to themselves.
    if (isSentinel(L) || isSentinel(R))
      return L == R;

    if (L->getVPRecipeID() != R->getVPRecipeID())
      return false;
    if (getOpcodeOrIntrinsicID(L) != getOpcodeOrIntrinsicID(R))
      return false;
    if (getGEPSourceElementType(L) != getGEPSourceElementType(R))
      return false;
    if (vputils::isSingleScalar(L) != vputils::isSingleScalar(R))
      return false;
    if (!equal(L->operands(), R->operands()))
      return false;

    assert(getOpcodeOrIntrinsicID(L) && getOpcodeOrIntrinsicID(R) &&
           "must have valid opcode info for both recipes");

    // Same recipe ID from here on, so R can be cast to the same class as L.
    if (const auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L)) {
      if (LFlags->hasPredicate() &&
          LFlags->getPredicate() !=
              cast<VPRecipeWithIRFlags>(R)->getPredicate())
        return false;
    }
    if (const auto *LSteps = dyn_cast<VPScalarIVStepsRecipe>(L)) {
      if (LSteps->getInductionOpcode() !=
          cast<VPScalarIVStepsRecipe>(R)->getInductionOpcode())
        return false;
    }

    // Recipes in replicate regions implicitly depend on the region's
    // predicate. If either recipe lives in a replicate region, they are only
    // equal when they share the same parent block.
    const VPRegionBlock *RegionL = L->getRegion();
    const VPRegionBlock *RegionR = R->getRegion();
    const bool AnyInReplicator = (RegionL && RegionL->isReplicator()) ||
                                 (RegionR && RegionR->isReplicator());
    if (AnyInReplicator && L->getParent() != R->getParent())
      return false;

    return L->getScalarType() == R->getScalarType();
  }
};
} // end anonymous namespace


/// Perform common-subexpression elimination of VPSingleDefRecipes on
/// \p Plan.
///
/// Blocks are visited in reverse post-order. The first occurrence of each
/// handled recipe is recorded; a later equivalent recipe has its uses
/// redirected to the recorded one, provided the recorded recipe's block
/// dominates the later recipe's block.
void VPlanTransforms::cse(VPlan &Plan) {
  VPDominatorTree VPDT(Plan);
  DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, VPCSEDenseMapInfo> CSEMap;

  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    for (VPRecipeBase &R : *VPBB) {
      auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
      if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
        continue;

      VPSingleDefRecipe *Existing = CSEMap.lookup(Def);
      if (!Existing) {
        // First time this expression is seen: remember it as the leader.
        CSEMap[Def] = Def;
        continue;
      }

      // The leader can only stand in for Def if it dominates Def.
      if (!VPDT.dominates(Existing->getParent(), VPBB))
        continue;

      // The merged recipe may only keep flags present on both recipes.
      if (auto *LeaderFlags = dyn_cast<VPRecipeWithIRFlags>(Existing))
        LeaderFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
      Def->replaceAllUsesWith(Existing);
    }
  }
}


/// Return true if we do not know how to (mechanically) hoist or sink recipe
/// \p R out of a loop region.
///
/// Replicate recipes that may read from memory are checked against the
/// blocks between \p FirstBB and \p LastBB via
/// canHoistOrSinkWithNoAliasCheck; every other recipe is forwarded to
/// vputils::cannotHoistOrSinkRecipe.
static bool cannotHoistOrSinkRecipe(VPRecipeBase &R, VPBasicBlock *FirstBB,
                                    VPBasicBlock *LastBB) {
  const bool IsMemReadingReplicate =
      isa<VPReplicateRecipe>(R) && R.mayReadFromMemory();
  if (!IsMemReadingReplicate)
    return vputils::cannotHoistOrSinkRecipe(R);

  // The load must have a known memory location, and that location must not
  // alias with stores between FirstBB and LastBB.
  auto MemLoc = vputils::getMemoryLocation(R);
  if (!MemLoc)
    return true;
  return !canHoistOrSinkWithNoAliasCheck(*MemLoc, FirstBB, LastBB);
}


/// Move loop-invariant recipes out of the vector loop region in \p Plan.
///
/// Two phases are performed:
///  1. Hoisting: recipes in the top-level blocks of the vector loop region
///     whose operands are all defined outside loop regions are moved to the
///     end of the vector preheader.
///  2. Sinking: recipes whose users all live in one block that directly
///     follows the loop region are moved to the start of that block.
static void licm(VPlan &Plan) {
  VPBasicBlock *Preheader = Plan.getVectorPreheader();
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  assert(Preheader->getSingleSuccessor() == LoopRegion &&
         "Expected vector prehader's successor to be the vector loop region");

  // Phase 1: hoisting. The traversal is shallow so that recipes in replicate
  // regions are excluded. The top-level blocks of the vector loop region are
  // guaranteed to execute if the vector preheader does, so speculation safety
  // does not need to be checked.
  auto IsDefinedOutsideLoop = [](VPValue *Op) {
    return Op->isDefinedOutsideLoopRegions();
  };
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_shallow(LoopRegion->getEntry()))) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      if (cannotHoistOrSinkRecipe(R, LoopRegion->getEntryBasicBlock(),
                                  LoopRegion->getExitingBasicBlock()))
        continue;
      if (!all_of(R.operands(), IsDefinedOutsideLoop))
        continue;
      R.moveBefore(*Preheader, Preheader->end());
    }
  }

#ifndef NDEBUG
  VPDominatorTree VPDT(Plan);
#endif
  // Phase 2: sinking. A recipe without users inside the vector loop region is
  // sunk if all of its users are in the same exit block of the region.
  // TODO: Extend to sink recipes from inner loops.
  PostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> POT(
      LoopRegion->getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(POT)) {
    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
      if (vputils::cannotHoistOrSinkRecipe(R, /*Sinking=*/true))
        continue;

      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        assert(!RepR->isPredicated() &&
               "Expected prior transformation of predicated replicates to "
               "replicate regions");
        // narrowToSingleScalarRecipes should already have narrowed replicates
        // to single-scalar replicates wherever possible.
        // TODO: When unrolling, replicateByVF doesn't handle sunk
        // non-single-scalar replicates correctly.
        if (!RepR->isSingleScalar())
          continue;
      }

      // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
      // support recipes with multiple defined values (e.g., interleaved loads).
      auto *Def = cast<VPSingleDefRecipe>(&R);

      // Determine the single block holding all users. Sinking is not possible
      // if users are spread over several blocks, if a user is a phi, if a user
      // sits inside a loop region, or if the user's block is not a direct
      // successor of the vector loop region.
      VPBasicBlock *SinkBB = nullptr;
      bool CanSink = true;
      for (VPUser *U : Def->users()) {
        auto *UserR = cast<VPRecipeBase>(U);
        VPBasicBlock *UserBB = UserR->getParent();
        // TODO: Support sinking when users are in multiple blocks.
        if (SinkBB && SinkBB != UserBB) {
          CanSink = false;
          break;
        }
        SinkBB = UserBB;
        // TODO: If the user is a PHI node, we should check the block of
        // incoming value. Support PHI node users if needed.
        if (UserR->isPhi() || UserBB->getEnclosingLoopRegion() ||
            UserBB->getSinglePredecessor() != LoopRegion) {
          CanSink = false;
          break;
        }
      }
      if (!CanSink)
        continue;

      // Without any users, fall back to the region's single successor.
      if (!SinkBB)
        SinkBB = cast<VPBasicBlock>(LoopRegion->getSingleSuccessor());

      // TODO: This will need to be a check instead of a assert after
      // conditional branches in vectorized loops are supported.
      assert(VPDT.properlyDominates(VPBB, SinkBB) &&
             "Defining block must dominate sink block");
      // TODO: Clone the recipe if users are on multiple exit paths, instead of
      // just moving.
      Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
    }
  }
}


/// Shrink widened integer recipes in the vector loop region of \p Plan to the
/// bit widths recorded in \p MinBWs (keyed by the recipe's underlying
/// instruction).
///
/// VPWidenRecipes are re-created on truncated operands and their result is
/// zero-extended back to the original type. For wide loads and intrinsics the
/// recipe itself is kept and only a zero-extend of its result is inserted.
/// Nothing is done for plans that only have a scalar VF.
void VPlanTransforms::truncateToMinimalBitwidths(
    VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
  if (Plan.hasScalarVFOnly())
    return;

  // Truncates created so far, keyed by the truncated value, so that they can
  // be re-used. RAUW cannot be used after creating a truncate, because other
  // users of the value would then see operands of a different type and become
  // invalidly typed.
  DenseMap<VPValue *, VPWidenCastRecipe *> TruncCache;
  VPBasicBlock *Preheader = Plan.getVectorPreheader();

  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
               VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(&R))
        continue;

      // A missing entry (lookup yields 0) means there is nothing to shrink.
      VPValue *Result = R.getVPSingleValue();
      auto *UI = cast_or_null<Instruction>(Result->getUnderlyingValue());
      unsigned NarrowBits = MinBWs.lookup(UI);
      if (NarrowBits == 0)
        continue;

      // Values that were not vectorized must keep their original scalar type.
      // Casts need no explicit handling either, as redundant casts are removed
      // during recipe simplification.
      if (isa<VPReplicateRecipe, VPWidenCastRecipe>(&R))
        continue;

      Type *WideTy = Result->getScalarType();
      unsigned WideBits = WideTy->getScalarSizeInBits();
      assert(WideTy->isIntegerTy() && "only integer types supported");
      const bool NeedsExtend = WideBits != NarrowBits;

      auto *NarrowTy = IntegerType::get(Plan.getContext(), NarrowBits);

      // Wrapping introduced by shrinking the operation must not be treated as
      // undefined behavior, so the wrapping flags cannot be kept.
      if (auto *WithFlags = dyn_cast<VPRecipeWithIRFlags>(&R))
        WithFlags->dropPoisonGeneratingFlags();

      assert((NeedsExtend || match(&R, m_ICmp(m_VPValue(), m_VPValue()))) &&
             "Only ICmps should not need extending the result.");
      assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");

      // Loads and intrinsics are not re-created: only a ZExt of the result to
      // the original type is inserted right after them.
      if (isa<VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(&R)) {
        if (NeedsExtend) {
          auto *Ext = VPBuilder::getToInsertAfter(&R).createWidenCast(
              Instruction::ZExt, Result, WideTy);
          Result->replaceAllUsesWith(Ext);
          // RAUW also rewrote Ext's own operand; point it back at the result.
          Ext->setOperand(0, Result);
        }
        continue;
      }

      // Truncate the operands where needed. The condition operand of a
      // select (operand 0) is left untouched.
      const unsigned FirstNarrowedOp =
          match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
      SmallVector<VPValue *> NarrowOps(R.operands());
      for (VPValue *&Op : drop_begin(NarrowOps, FirstNarrowedOp)) {
        unsigned OpBits = Op->getScalarType()->getScalarSizeInBits();
        if (OpBits == NarrowBits)
          continue;
        assert(OpBits > NarrowBits && "nothing to truncate");

        auto [CacheIt, IsNew] = TruncCache.try_emplace(Op);
        if (IsNew) {
          // Truncates of VPIRValues go to the preheader, all others are
          // placed directly in front of the recipe being narrowed.
          VPBuilder Builder;
          if (isa<VPIRValue>(Op))
            Builder.setInsertPoint(Preheader);
          else
            Builder.setInsertPoint(&R);
          CacheIt->second =
              Builder.createWidenCast(Instruction::Trunc, Op, NarrowTy);
        }
        Op = CacheIt->second;
      }

      auto *Narrowed = cast<VPWidenRecipe>(&R)->cloneWithOperands(NarrowOps);
      Narrowed->insertBefore(&R);

      // Extend the narrowed result back to the original type for the existing
      // users, unless the width did not change (ICmp).
      VPValue *NewResult = Narrowed->getVPSingleValue();
      if (NeedsExtend)
        NewResult = VPBuilder::getToInsertAfter(Narrowed)
                        .createWidenCast(Instruction::ZExt, NewResult, WideTy)
                        ->getVPSingleValue();
      Result->replaceAllUsesWith(NewResult);
      R.eraseFromParent();
    }
  }
}


/// Remove BranchOnCond terminators in \p Plan whose condition is a constant
/// true or false, disconnecting the successor that is never taken. Blocks
/// that become unreachable from the entry are detached from their successors
/// and emptied.
///
/// If \p OnlyLatches is set, only blocks for which VPBlockUtils::isLatch
/// holds are considered.
void VPlanTransforms::removeBranchOnConst(VPlan &Plan, bool OnlyLatches) {
  // The dominator tree is only needed for the latch query.
  std::optional<VPDominatorTree> VPDT;
  if (OnlyLatches)
    VPDT.emplace(Plan);

  // Snapshot the blocks before touching the CFG, so that blocks becoming
  // unreachable can still be found afterwards.
  SmallVector<VPBlockBase *> BlocksBefore(
      vp_depth_first_shallow(Plan.getEntry()));

  for (VPBasicBlock *VPBB :
       VPBlockUtils::blocksOnly<VPBasicBlock>(BlocksBefore)) {
    // Only blocks terminated by BranchOnCond are of interest.
    if (VPBB->empty())
      continue;
    VPValue *Cond;
    if (!match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
      continue;

    if (OnlyLatches && !VPBlockUtils::isLatch(VPBB, *VPDT))
      continue;

    assert(VPBB->getNumSuccessors() == 2 &&
           "Two successors expected for BranchOnCond");

    // A true condition removes successor 1, a false condition successor 0.
    const bool IsTrue = match(Cond, m_True());
    if (!IsTrue && !match(Cond, m_False()))
      continue;
    const unsigned DeadIdx = IsTrue ? 1 : 0;

    auto *DeadSucc = cast<VPBasicBlock>(VPBB->getSuccessors()[DeadIdx]);
    assert(count(DeadSucc->getPredecessors(), VPBB) == 1 &&
           "There must be a single edge between VPBB and its successor");

    // Phis in the removed successor lose their incoming value from VPBB.
    for (VPRecipeBase &PhiR : DeadSucc->phis())
      cast<VPPhiAccessors>(&PhiR)->removeIncomingValueFor(VPBB);

    // Cut the edge and drop the now redundant terminator.
    VPBlockUtils::disconnectBlocks(VPBB, DeadSucc);
    VPBB->back().eraseFromParent();
  }

  // Blocks still reachable from the entry after the edges were removed.
  SmallPtrSet<VPBlockBase *, 16> StillReachable(
      llvm::from_range, vp_depth_first_shallow(Plan.getEntry()));

  // Detach every unreachable block from its successors (fixing up their phis)
  // and erase its recipes. Remaining uses of erased values are redirected to
  // a placeholder value.
  VPSymbolicValue Placeholder(nullptr);
  for (VPBlockBase *B : BlocksBefore) {
    if (StillReachable.contains(B))
      continue;

    // Iterate over a copy, as disconnecting modifies the successor list.
    for (VPBlockBase *Succ : to_vector(B->successors())) {
      if (auto *SuccBB = dyn_cast<VPBasicBlock>(Succ)) {
        for (VPRecipeBase &PhiR : SuccBB->phis())
          cast<VPPhiAccessors>(&PhiR)->removeIncomingValueFor(B);
      }
      VPBlockUtils::disconnectBlocks(B, Succ);
    }

    for (VPBasicBlock *DeadBB :
         VPBlockUtils::blocksOnly<VPBasicBlock>(vp_depth_first_deep(B))) {
      for (VPRecipeBase &DeadR : make_early_inc_range(*DeadBB)) {
        for (VPValue *Def : DeadR.definedValues())
          Def->replaceAllUsesWith(&Placeholder);
        DeadR.eraseFromParent();
      }
    }
  }
}


/// Apply the VPlan-to-VPlan optimizations listed below to \p Plan, one after
/// the other in the order written. Each transform is invoked through the
/// RUN_VPLAN_PASS macro (defined outside this function).
void VPlanTransforms::optimize(VPlan &Plan) {

  RUN_VPLAN_PASS(removeRedundantInductionCasts, Plan);


  RUN_VPLAN_PASS(reassociateHeaderMask, Plan);

  RUN_VPLAN_PASS(simplifyRecipes, Plan);

  RUN_VPLAN_PASS(removeDeadRecipes, Plan);

  RUN_VPLAN_PASS(simplifyBlends, Plan);

  RUN_VPLAN_PASS(legalizeAndOptimizeInductions, Plan);

  RUN_VPLAN_PASS(narrowToSingleScalarRecipes, Plan);

  RUN_VPLAN_PASS(removeRedundantExpandSCEVRecipes, Plan);

  // reassociateHeaderMask and simplifyRecipes are run a second time here,
  // after the induction and narrowing transforms above.
  RUN_VPLAN_PASS(reassociateHeaderMask, Plan);

  RUN_VPLAN_PASS(simplifyRecipes, Plan);

  // Constant branches are removed on all blocks, not only on latches.
  RUN_VPLAN_PASS(removeBranchOnConst, Plan, /*OnlyLatches=*/false);

  RUN_VPLAN_PASS(removeDeadRecipes, Plan);


  // Region creation, block merging and LICM run last.
  RUN_VPLAN_PASS(createAndOptimizeReplicateRegions, Plan);

  RUN_VPLAN_PASS(mergeBlocksIntoPredecessors, Plan);

  RUN_VPLAN_PASS(licm, Plan);

}


// Introduce a VPActiveLaneMaskPHIRecipe together with the recipes feeding it
// into \p Plan, and replace the loop terminator by a branch-on-cond on the
// negated active-lane-mask. This makes the loop uncountable. Apart from the
// replaced terminator, existing recipes and their users are left untouched;
// the only other change is that poison-generating flags are dropped from the
// canonical IV increment. Returns the newly created
// VPActiveLaneMaskPHIRecipe.
//
// The following recipes are added:
//
// vector.ph:
//   %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
//   %EntryALM = active-lane-mask %EntryInc, TC
//
// vector.body:
//   ...
//   %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
//   ...
//   %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
//   %ALM = active-lane-mask %InLoopInc, TC
//   %Negated = Not %ALM
//   branch-on-cond %Negated
//
static VPActiveLaneMaskPHIRecipe *
addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *ExitingVPBB = LoopRegion->getExitingBasicBlock();
  VPValue *IVStart = Plan.getZero(LoopRegion->getCanonicalIVType());
  auto *CanIVInc = LoopRegion->getOrCreateCanonicalIVIncrement();
  // TODO: Check if dropping the flags is needed.
  LoopRegion->clearCanonicalIVNUW(CanIVInc);
  DebugLoc DL = CanIVInc->getDebugLoc();

  // IVStart cannot be fed to the ActiveLaneMask VPInstruction directly,
  // because unrolling has to be taken into account: each part needs to start
  // at
  //   Part * VF
  auto *Preheader = Plan.getVectorPreheader();
  VPBuilder Builder(Preheader);

  VPValue *TripCount = Plan.getTripCount();
  VPValue *VF = &Plan.getVF();

  // Start index for the mask on loop entry.
  auto *EntryInc = Builder.createOverflowingOp(
      VPInstruction::CanonicalIVIncrementForPart, {IVStart, VF},
      {false, false}, DL, "index.part.next");

  // Active lane mask for the first iteration, placed in the preheader.
  VPValue *ALMMultiplier =
      Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
  auto *EntryMask = Builder.createNaryOp(
      VPInstruction::ActiveLaneMask, {EntryInc, TripCount, ALMMultiplier}, DL,
      "active.lane.mask.entry");

  // The phi goes to the very start of the header and initially only has the
  // preheader mask as incoming value.
  auto *MaskPhi =
      new VPActiveLaneMaskPHIRecipe(EntryMask, DebugLoc::getUnknown());
  auto *Header = LoopRegion->getEntryBasicBlock();
  MaskPhi->insertBefore(*Header, Header->begin());

  // Compute the mask for the next iteration right before the original
  // terminator and feed it back into the phi.
  VPRecipeBase *OldTerm = ExitingVPBB->getTerminator();
  Builder.setInsertPoint(OldTerm);
  auto *LoopInc = Builder.createOverflowingOp(
      VPInstruction::CanonicalIVIncrementForPart, {CanIVInc, VF},
      {false, false}, DL);
  auto *NextMask = Builder.createNaryOp(
      VPInstruction::ActiveLaneMask, {LoopInc, TripCount, ALMMultiplier}, DL,
      "active.lane.mask.next");
  MaskPhi->addBackedgeValue(NextMask);

  // Swap the original terminator for a BranchOnCond. The mask is negated
  // because a true condition means leaving the loop.
  auto *Negated = Builder.createNot(NextMask, DL);
  Builder.createNaryOp(VPInstruction::BranchOnCond, {Negated}, DL);
  OldTerm->eraseFromParent();
  return MaskPhi;
}


/// Replace the header mask of \p Plan by an active-lane-mask.
///
/// If \p UseActiveLaneMaskForControlFlow is set, the mask is an
/// active-lane-mask phi and the loop's exit branch is rewritten to use it
/// (see addVPLaneMaskPhiAndUpdateExitBranch). Otherwise an ActiveLaneMask
/// VPInstruction on the widened canonical IV and the trip count is created.
void VPlanTransforms::addActiveLaneMask(VPlan &Plan,
                                        bool UseActiveLaneMaskForControlFlow) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  auto *WidenedCanIV =
      findUserOf<VPWidenCanonicalIVRecipe>(LoopRegion->getCanonicalIV());
  assert(WidenedCanIV &&
         "Must have widened canonical IV when tail folding!");
  VPSingleDefRecipe *OldHeaderMask = vputils::findHeaderMask(Plan);

  VPSingleDefRecipe *NewMask = nullptr;
  if (!UseActiveLaneMaskForControlFlow) {
    // Data-only mask: compute it right after the widened canonical IV.
    VPBuilder MaskBuilder = VPBuilder::getToInsertAfter(WidenedCanIV);
    VPValue *ALMMultiplier =
        Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
    NewMask = MaskBuilder.createNaryOp(
        VPInstruction::ActiveLaneMask,
        {WidenedCanIV, Plan.getTripCount(), ALMMultiplier}, nullptr,
        "active.lane.mask");
  } else {
    NewMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
  }

  // Redirect all users of the previous header mask to the active-lane-mask
  // and erase the old mask, so that only a single header mask remains.
  OldHeaderMask->replaceAllUsesWith(NewMask);
  OldHeaderMask->eraseFromParent();
}


/// Matcher recognizing either the mask \c In itself or a logical-and of
/// \c In with some other value. On a match, \c Out is set to nullptr in the
/// first case and bound to the other operand of the logical-and in the second.
template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
  Op0_t In;
  Op1_t &Out;

  RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}

  template <typename OpTy> bool match(OpTy *V) const {
    // Not the bare mask: try (logical-and In, Out) instead.
    if (!m_Specific(In).match(V))
      return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);

    // V is exactly In, so nothing remains once the mask is removed.
    Out = nullptr;
    return true;
  }
};


/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
/// On a match, \p Out receives the remaining part of the combination, or
/// nullptr if the matched value is exactly \p In.
template <typename Op0_t, typename Op1_t>
static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
                                                          Op1_t &Out) {
  RemoveMask_match<Op0_t, Op1_t> Matcher(In, Out);
  return Matcher;
}


/// Map the masked division/remainder intrinsic \p IntrID (masked_udiv,
/// masked_sdiv, masked_urem, masked_srem) to the vector-predication intrinsic
/// of the same operation (vp_udiv, vp_sdiv, vp_urem, vp_srem).
/// \returns std::nullopt for any other intrinsic ID.
static std::optional<Intrinsic::ID> getVPDivRemIntrinsic(Intrinsic::ID IntrID) {
  if (IntrID == Intrinsic::masked_udiv)
    return Intrinsic::vp_udiv;
  if (IntrID == Intrinsic::masked_sdiv)
    return Intrinsic::vp_sdiv;
  if (IntrID == Intrinsic::masked_urem)
    return Intrinsic::vp_urem;
  if (IntrID == Intrinsic::masked_srem)
    return Intrinsic::vp_srem;
  return std::nullopt;
}


/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
/// recipe could be created.
/// \p HeaderMask  Header Mask.
/// \p CurRecipe   Recipe to be transformed.
/// \p EVL         The explicit vector length parameter of vector-predication
/// intrinsics.
///
/// The returned recipe is newly created and is NOT inserted by this function;
/// the caller is expected to place it. Helper recipes created along the way
/// (reversed masks/values, adjusted end pointers, EVL casts, the EVL load of a
/// reversed load) are inserted by this function, either before \p CurRecipe or
/// through a VPBuilder positioned at the relevant recipe.
static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
                                       VPRecipeBase &CurRecipe, VPValue &EVL) {
  VPlan *Plan = CurRecipe.getParent()->getPlan();
  DebugLoc DL = CurRecipe.getDebugLoc();
  // Out-parameters bound by the pattern matches below. Mask is the part of the
  // recipe's mask that remains once HeaderMask is removed (nullptr if none).
  VPValue *Addr, *Mask, *EndPtr;

  /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
  /// Works on a clone inserted before CurRecipe; the original end pointer is
  /// left untouched.
  auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
    auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
    EVLEndPtr->insertBefore(&CurRecipe);
    // Cast EVL (i32) to match the VF operand's type.
    VPValue *EVLAsVF = VPBuilder(EVLEndPtr).createScalarZExtOrTrunc(
        &EVL, EVLEndPtr->getOperand(1)->getScalarType(), EVL.getScalarType(),
        DebugLoc::getUnknown());
    EVLEndPtr->setOperand(1, EVLAsVF);
    return EVLEndPtr;
  };

  // Create an all-true-masked vp.reverse of V with length EVL and insert it
  // before CurRecipe. A null V (no remaining mask) is passed through as null.
  auto GetVPReverse = [&CurRecipe, &EVL, Plan,
                       DL](VPValue *V) -> VPWidenIntrinsicRecipe * {
    if (!V)
      return nullptr;
    auto *Reverse = new VPWidenIntrinsicRecipe(
        Intrinsic::experimental_vp_reverse, {V, Plan->getTrue(), &EVL},
        V->getScalarType(), {}, {}, DL);
    Reverse->insertBefore(&CurRecipe);
    return Reverse;
  };

  // Masked load whose mask is HeaderMask or (HeaderMask && Mask): replace with
  // an EVL load carrying only the remaining Mask.
  if (match(&CurRecipe,
            m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))))
    return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
                                    EVL, Mask);

  // Reverse of a masked load from a vector end pointer (based on VF) whose mask
  // is a reversed header mask: emit an EVL load from an EVL-based end pointer
  // with a vp.reverse'd remaining mask, and return a vp.reverse of that load.
  VPValue *ReversedVal;
  if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
      match(ReversedVal,
            m_MaskedLoad(m_VPValue(EndPtr),
                         m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
      match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
    Mask = GetVPReverse(Mask);
    Addr = AdjustEndPtr(EndPtr);
    auto *LoadR = new VPWidenLoadEVLRecipe(
        *cast<VPWidenLoadRecipe>(ReversedVal), Addr, EVL, Mask);
    LoadR->insertBefore(&CurRecipe);
    return new VPWidenIntrinsicRecipe(Intrinsic::experimental_vp_reverse,
                                      {LoadR, Plan->getTrue(), &EVL},
                                      LoadR->getScalarType(), {}, {}, DL);
  }

  // vp.strided.load whose mask contains the header mask and whose length
  // operand is VF (possibly truncated): clone it with the remaining mask (or
  // all-true) as operand 2 and EVL as operand 3.
  VPValue *Stride;
  if (match(&CurRecipe, m_Intrinsic<Intrinsic::experimental_vp_strided_load>(
                            m_VPValue(Addr), m_VPValue(Stride),
                            m_RemoveMask(HeaderMask, Mask),
                            m_TruncOrSelf(m_Specific(&Plan->getVF()))))) {
    if (!Mask)
      Mask = Plan->getTrue();
    auto *NewLoad = cast<VPWidenMemIntrinsicRecipe>(&CurRecipe)->clone();
    NewLoad->setOperand(2, Mask);
    NewLoad->setOperand(3, &EVL);
    return NewLoad;
  }

  // Masked store: same treatment as the masked load above.
  VPValue *StoredVal;
  if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
                                      m_RemoveMask(HeaderMask, Mask))))
    return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
                                     StoredVal, EVL, Mask);

  // Masked store of a reversed value to a vector end pointer with a reversed
  // header mask: reverse mask and stored value with vp.reverse and store
  // through an EVL-based end pointer.
  if (match(&CurRecipe,
            m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
                          m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
      match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
    Mask = GetVPReverse(Mask);
    Addr = AdjustEndPtr(EndPtr);
    StoredVal = GetVPReverse(ReversedVal);
    return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
                                     StoredVal, EVL, Mask);
  }

  // Conditional reduction whose condition contains the header mask.
  if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
    if (Rdx->isConditional() &&
        match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
      return new VPReductionEVLRecipe(*Rdx, EVL, Mask);

  // Masked interleave group whose mask contains the header mask.
  if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
    if (Interleave->getMask() &&
        match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
      return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);

  // select (headermask [&& mask]), lhs, rhs -> vp.merge mask-or-true, lhs,
  // rhs, evl
  VPValue *LHS, *RHS;
  if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
                                 m_VPValue(RHS))))
    return new VPWidenIntrinsicRecipe(
        Intrinsic::vp_merge, {Mask ? Mask : Plan->getTrue(), LHS, RHS, &EVL},
        LHS->getScalarType(), {}, {}, DL);

  // last-active-lane(headermask) -> EVL - 1, with EVL zero-extended or
  // truncated to the result type of CurRecipe.
  if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
    Type *Ty = CurRecipe.getVPSingleValue()->getScalarType();
    VPValue *ZExt =
        VPBuilder(&CurRecipe)
            .createScalarZExtOrTrunc(&EVL, Ty, EVL.getScalarType(), DL);
    return new VPInstruction(
        Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
        VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
  }

  // lhs | (headermask && rhs) -> vp.merge rhs, true, lhs, evl
  if (match(&CurRecipe,
            m_c_BinaryOr(m_VPValue(LHS),
                         m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(RHS)))))
    return new VPWidenIntrinsicRecipe(Intrinsic::vp_merge,
                                      {RHS, Plan->getTrue(), LHS, &EVL},
                                      LHS->getScalarType(), {}, {}, DL);

  // Masked udiv/sdiv/urem/srem intrinsic whose mask (operand 2) contains the
  // header mask -> the corresponding vp.* intrinsic with the remaining mask
  // (or all-true) and EVL.
  if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(&CurRecipe))
    if (auto VPID = getVPDivRemIntrinsic(IntrR->getVectorIntrinsicID()))
      if (match(IntrR->getOperand(2), m_RemoveMask(HeaderMask, Mask)))
        return new VPWidenIntrinsicRecipe(*VPID,
                                          {IntrR->getOperand(0),
                                           IntrR->getOperand(1),
                                           Mask ? Mask : Plan->getTrue(), &EVL},
                                          IntrR->getScalarType(), {}, {}, DL);

  // No known pattern applies.
  return nullptr;
}


/// Rewrite users of the EVL-based header mask into recipes based on VP
/// intrinsics so that the mask is no longer needed by them. Every rewrite done
/// here has to keep the original semantics.
void VPlanTransforms::optimizeEVLMasks(VPlan &Plan) {
  // Look in the loop header for the EVL-based header mask, which has the form
  // icmp ult step-vector, EVL. Nothing to do if there is none.
  VPValue *HeaderMask = nullptr, *EVL = nullptr;
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &Candidate : *HeaderVPBB) {
    if (!match(&Candidate, m_SpecificICmp(CmpInst::ICMP_ULT, m_StepVector(),
                                          m_VPValue(EVL))))
      continue;
    if (!match(EVL, m_EVL(m_VPValue())))
      continue;
    HeaderMask = Candidate.getVPSingleValue();
    break;
  }
  if (!HeaderMask)
    return;

  // First pass: convert every (transitive) user that has an EVL-based
  // equivalent, and remember the replaced recipe for later removal.
  SmallVector<VPRecipeBase *> ToErase;
  for (VPUser *User : collectUsersRecursively(HeaderMask)) {
    auto *UserR = cast<VPRecipeBase>(User);
    VPRecipeBase *EVLRecipe = optimizeMaskToEVL(HeaderMask, *UserR, *EVL);
    if (!EVLRecipe)
      continue;

    EVLRecipe->insertBefore(UserR);
    for (auto [OldDef, NewDef] :
         zip_equal(UserR->definedValues(), EVLRecipe->definedValues()))
      OldDef->replaceAllUsesWith(NewDef);
    ToErase.push_back(UserR);
  }

  // Second pass: whatever (HeaderMask && Mask) is still around becomes
  // vp.merge (True, Mask, False, EVL).
  for (VPUser *User : collectUsersRecursively(HeaderMask)) {
    VPValue *Mask;
    if (!match(User, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask))))
      continue;

    auto *AndR = cast<VPInstruction>(User);
    auto *Merge = new VPWidenIntrinsicRecipe(
        Intrinsic::vp_merge, {Plan.getTrue(), Mask, Plan.getFalse(), EVL},
        Mask->getScalarType(), {}, {}, AndR->getDebugLoc());
    Merge->insertBefore(AndR);
    AndR->replaceAllUsesWith(Merge);
    ToErase.push_back(AndR);
  }

  // Erase the replaced recipes in reverse order of collection, then clean up
  // any operands that became dead as a result.
  for (VPRecipeBase *Dead : reverse(ToErase)) {
    SmallVector<VPValue *> Operands(Dead->operands());
    Dead->eraseFromParent();
    for (VPValue *Op : Operands)
      recursivelyDeleteDeadRecipes(Op);
  }
}


/// After replacing the canonical IV with an EVL-based IV, fixup recipes that
/// use VF to use the EVL instead to avoid incorrect updates on the penultimate
/// iteration.
///
/// \p Plan  The plan being transformed.
/// \p EVL   The explicit vector length value. Its defining recipe anchors the
///          insertion of the EVL casts and of the EVL-based header mask.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();

  // EVL is i32 but VF/VFxUF are IdxTy. Convert as needed.
  VPValue *EVLAsIdx =
      VPBuilder::getToInsertAfter(EVL.getDefiningRecipe())
          .createScalarZExtOrTrunc(&EVL, Plan.getVF().getScalarType(),
                                   EVL.getScalarType(), DebugLoc::getUnknown());

  // Sanity check: every user of VF (directly, or through a trunc of VF) must be
  // one of the recipe kinds listed below.
  assert(all_of(Plan.getVF().users(),
                [&Plan](VPUser *U) {
                  auto IsAllowedUser =
                      IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
                              VPWidenIntOrFpInductionRecipe,
                              VPWidenMemIntrinsicRecipe>;
                  if (match(U, m_Trunc(m_Specific(&Plan.getVF()))))
                    return all_of(cast<VPSingleDefRecipe>(U)->users(),
                                  IsAllowedUser);
                  return IsAllowedUser(U);
                }) &&
         "User of VF that we can't transform to EVL.");
  // Only widened int/fp inductions and scalar IV steps have their VF use
  // replaced here; other users of VF keep using VF.
  Plan.getVF().replaceUsesWithIf(EVLAsIdx, [](VPUser &U, unsigned Idx) {
    return isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe>(U);
  });

  assert(all_of(Plan.getVFxUF().users(),
                match_fn(m_CombineOr(
                    m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
                            m_Specific(&Plan.getVFxUF())),
                    m_Isa<VPWidenPointerInductionRecipe>()))) &&
         "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
         "increment of the canonical induction.");
  Plan.getVFxUF().replaceUsesWithIf(EVLAsIdx, [](VPUser &U, unsigned Idx) {
    // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
    // canonical induction must not be updated.
    return isa<VPWidenPointerInductionRecipe>(U);
  });

  // Create a scalar phi to track the previous EVL if fixed-order recurrence is
  // contained.
  bool ContainsFORs =
      any_of(Header->phis(), IsaPred<VPFirstOrderRecurrencePHIRecipe>);
  if (ContainsFORs) {
    // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
    VPValue *MaxEVL = &Plan.getVF();
    // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
    VPBuilder Builder(LoopRegion->getPreheaderVPBB());
    MaxEVL = Builder.createScalarZExtOrTrunc(
        MaxEVL, Type::getInt32Ty(Plan.getContext()), MaxEVL->getScalarType(),
        DebugLoc::getUnknown());

    // prev.evl takes MaxEVL as its first incoming value and the current EVL as
    // its second.
    Builder.setInsertPoint(Header, Header->getFirstNonPhi());
    VPValue *PrevEVL = Builder.createScalarPhi(
        {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");

    // Replace every FirstOrderRecurrenceSplice in the loop region with a
    // vp.splice using offset -1, an all-true mask, and the previous and
    // current EVL.
    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
             vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
      for (VPRecipeBase &R : *VPBB) {
        VPValue *V1, *V2;
        if (!match(&R,
                   m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
                       m_VPValue(V1), m_VPValue(V2))))
          continue;
        VPValue *Imm = Plan.getOrAddLiveIn(
            ConstantInt::getSigned(Type::getInt32Ty(Plan.getContext()), -1));
        VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(
            Intrinsic::experimental_vp_splice,
            {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
            R.getVPSingleValue()->getScalarType(), {}, {}, R.getDebugLoc());
        VPSplice->insertBefore(&R);
        // R is not erased here; it is only left without users.
        R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
      }
    }
  }

  // Without a header mask there is nothing left to rewrite.
  VPValue *HeaderMask = vputils::findHeaderMask(Plan);
  if (!HeaderMask)
    return;

  // Ensure that any reduction that uses a select to mask off tail lanes does so
  // in the vector loop, not the middle block, since EVL tail folding can have
  // tail elements in the penultimate iteration.
  assert(all_of(*Plan.getMiddleBlock(), [&Plan, HeaderMask](VPRecipeBase &R) {
    if (match(&R, m_ComputeReductionResult(m_Select(m_Specific(HeaderMask),
                                                    m_VPValue(), m_VPValue()))))
      return R.getOperand(0)->getDefiningRecipe()->getRegion() ==
             Plan.getVectorLoopRegion();
    return true;
  }));

  // Replace header masks with a mask equivalent to predicating by EVL:
  //
  // icmp ule widen-canonical-iv backedge-taken-count
  // ->
  // icmp ult step-vector, EVL
  VPRecipeBase *EVLR = EVL.getDefiningRecipe();
  // Insert the new mask right after the recipe defining EVL.
  VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
  Type *EVLType = EVL.getScalarType();
  VPValue *EVLMask = Builder.createICmp(
      CmpInst::ICMP_ULT,
      Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
  HeaderMask->replaceAllUsesWith(EVLMask);
}


/// Turn a tail folded vector loop region into one that advances by
/// VPInstruction::ExplicitVectorLength elements per iteration rather than by
/// VF elements. Plans with only a scalar VF are left untouched.
///
/// - A VPCurrentIterationPHIRecipe and its supporting recipes are added to
///   \p Plan, and every use of the canonical IV other than the canonical IV
///   increment is redirected to the VPCurrentIterationPHIRecipe. Afterwards the
///   canonical IV is only used for counting loop iterations.
///
/// - The header mask is replaced with a header mask based on the EVL.
///
/// - For plans with FORs, a phi tracking the EVL of the previous iteration is
///   added and the first-order recurrence splices are replaced with
///   @llvm.vp.splice.
///
/// - The unroll factor of \p Plan is set to 1.
///
/// Definitions used below:
///  %StartV is the canonical induction start value.
///
/// Recipes added by the function:
///
/// vector.ph:
/// ...
///
/// vector.body:
/// ...
/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
///                                      [ %NextIter, %vector.body ]
/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
/// ...
/// %OpEVL = cast i32 %VPEVL to IVSize
/// %NextIter = add IVSize %OpEVL, %CurrentIter
/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
/// ...
///
/// Recipes added when \p MaxSafeElements is provided:
/// vector.ph:
/// ...
///
/// vector.body:
/// ...
/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
///                                      [ %NextIter, %vector.body ]
/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
/// %cmp = cmp ult %AVL, MaxSafeElements
/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
/// ...
/// %OpEVL = cast i32 %VPEVL to IVSize
/// %NextIter = add IVSize %OpEVL, %CurrentIter
/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
/// ...
///
void VPlanTransforms::addExplicitVectorLength(
    VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
  if (Plan.hasScalarVFOnly())
    return;

  VPRegionBlock *Region = Plan.getVectorLoopRegion();
  VPBasicBlock *HeaderVPBB = Region->getEntryBasicBlock();

  auto *CanIV = Region->getCanonicalIV();
  auto *IVTy = Region->getCanonicalIVType();
  VPValue *StartV = Plan.getZero(IVTy);
  auto *CanIVInc = Region->getOrCreateCanonicalIVIncrement();

  // The CurrentIteration phi becomes the very first recipe of the header.
  auto *IterPhi =
      new VPCurrentIterationPHIRecipe(StartV, DebugLoc::getUnknown());
  IterPhi->insertBefore(*HeaderVPBB, HeaderVPBB->begin());

  // The AVL (application vector length) counts down from the trip count to 0
  // in steps of EVL.
  VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
  VPPhi *AVLPhi = Builder.createScalarPhi(
      {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");

  // With a maximum safe distance, the value fed to EXPLICIT-VECTOR-LENGTH is
  // the smaller of the AVL and that distance.
  VPValue *AVL = AVLPhi;
  if (MaxSafeElements) {
    VPValue *SafeBound = Plan.getConstantInt(IVTy, *MaxSafeElements);
    VPValue *IsBelowBound =
        Builder.createICmp(ICmpInst::ICMP_ULT, AVL, SafeBound);
    AVL = Builder.createSelect(IsBelowBound, AVL, SafeBound,
                               DebugLoc::getUnknown(), "safe_avl");
  }
  auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
                                     DebugLoc::getUnknown(), "evl");

  // The remaining recipes go right before the canonical IV increment.
  Builder.setInsertPoint(CanIVInc);
  DebugLoc IncDL = CanIVInc->getDebugLoc();

  // EVL is i32; bring it to the canonical IV type before doing arithmetic.
  VPValue *EVLAsIVTy = Builder.createScalarZExtOrTrunc(
      VPEVL, IVTy, Type::getInt32Ty(Plan.getContext()), IncDL);

  auto *IterNext =
      Builder.createAdd(EVLAsIVTy, IterPhi, IncDL, "current.iteration.next",
                        CanIVInc->getNoWrapFlags());
  IterPhi->addBackedgeValue(IterNext);

  VPValue *AVLNext =
      Builder.createSub(AVLPhi, EVLAsIVTy, DebugLoc::getCompilerGenerated(),
                        "avl.next", {/*NUW=*/true, /*NSW=*/false});
  AVLPhi->addIncoming(AVLNext);

  fixupVFUsersForEVL(Plan, *VPEVL);
  removeDeadRecipes(Plan);

  // Redirect all canonical IV uses to the CurrentIteration phi, then restore
  // the canonical IV as operand 0 of its own increment.
  CanIV->replaceAllUsesWith(IterPhi);
  CanIVInc->setOperand(0, CanIV);
  // TODO: support unroll factor > 1.
  Plan.setUF(1);
}


/// If \p Plan contains a VPCurrentIterationPHIRecipe, lower it to a concrete
/// scalar phi and, when the canonical IV has an increment by VFxUF, replace
/// that increment with the CurrentIteration increment. Plans without a
/// VPCurrentIterationPHIRecipe are left unchanged.
void VPlanTransforms::convertToVariableLengthStep(VPlan &Plan) {
  // Scan the phis of all blocks reachable from the plan entry for the
  // VPCurrentIterationPHIRecipe. At most one is expected in the whole plan.
  VPCurrentIterationPHIRecipe *IterPhi = nullptr;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksAs<VPBasicBlock>(
           vp_depth_first_shallow(Plan.getEntry()))) {
    for (VPRecipeBase &Phi : VPBB->phis()) {
      auto *Candidate = dyn_cast<VPCurrentIterationPHIRecipe>(&Phi);
      if (!Candidate)
        continue;
      assert(!IterPhi &&
             "Found multiple CurrentIteration. Only one expected");
      IterPhi = Candidate;
    }
  }

  // Not a variable-length stepping loop.
  if (!IterPhi)
    return;

  // Grab everything needed from the phi before it is erased.
  VPBasicBlock *HeaderVPBB = IterPhi->getParent();
  VPValue *IterNext = IterPhi->getBackedgeValue();

  // Swap the abstract phi for a concrete scalar phi with the same incoming
  // values.
  VPBuilder PhiBuilder(IterPhi);
  auto *ConcretePhi =
      PhiBuilder.createScalarPhi({IterPhi->getStartValue(), IterNext},
                                 IterPhi->getDebugLoc(),
                                 "current.iteration.iv");
  IterPhi->replaceAllUsesWith(ConcretePhi);
  IterPhi->eraseFromParent();

  // If the first header recipe has an increment by VFxUF, substitute the
  // CurrentIteration increment for it.
  auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
  auto *CanIVInc = findUserOf(
      CanonicalIV, m_c_Add(m_VPValue(), m_Specific(&Plan.getVFxUF())));
  if (!CanIVInc)
    return;
  cast<VPInstruction>(CanIVInc)->replaceAllUsesWith(IterNext);
  CanIVInc->eraseFromParent();
}


/// For an EVL tail folded loop, rewrite the latch exit condition from comparing
/// the canonical IV increment against the vector trip count to comparing the
/// next AVL against zero. Plans that are not recognised as EVL tail folded, or
/// whose latch branches on a constant true condition, are left unchanged.
void VPlanTransforms::convertEVLExitCond(VPlan &Plan) {
  VPRegionBlock *Region = Plan.getVectorLoopRegion();
  if (!Region)
    return;

  // The EVL IV is always at the beginning of the header.
  VPBasicBlock *HeaderVPBB = Region->getEntryBasicBlock();
  auto *IterPhi =
      HeaderVPBB->empty()
          ? nullptr
          : dyn_cast<VPCurrentIterationPHIRecipe>(&HeaderVPBB->front());
  if (!IterPhi)
    return;

  // An EVL tail folded loop advances the phi by the (possibly extended) EVL.
  // Bail out otherwise.
  VPValue *AVL;
  if (!match(IterPhi->getBackedgeValue(),
             m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(IterPhi))))
    return;

  // Look through a cap of the AVL to a safe distance.
  VPValue *Cap, *Uncapped;
  if (match(AVL, m_Select(m_SpecificICmp(CmpInst::ICMP_ULT, m_VPValue(Uncapped),
                                         m_VPValue(Cap)),
                          m_Deferred(Uncapped), m_Deferred(Cap))))
    AVL = Uncapped;

  // The AVL is a phi of the trip count and its next value.
  VPValue *NextAVL;
  [[maybe_unused]] bool FoundAVLNext =
      match(AVL, m_VPInstruction<Instruction::PHI>(
                     m_Specific(Plan.getTripCount()), m_VPValue(NextAVL)));
  assert(FoundAVLNext && "Didn't find AVL backedge?");

  // Leave a latch that branches on a constant true condition as it is.
  auto *LatchBr = cast<VPInstruction>(
      Region->getExitingBasicBlock()->getTerminator());
  if (match(LatchBr, m_BranchOnCond(m_True())))
    return;

  VPValue *CanIVInc;
  [[maybe_unused]] bool FoundIncrement = match(
      LatchBr,
      m_BranchOnCond(m_SpecificCmp(CmpInst::ICMP_EQ, m_VPValue(CanIVInc),
                                   m_Specific(&Plan.getVectorTripCount()))));
  assert(FoundIncrement &&
         match(CanIVInc, m_Add(m_Specific(Region->getCanonicalIV()),
                               m_Specific(&Plan.getVFxUF()))) &&
         "Expected BranchOnCond with ICmp comparing CanIV + VFxUF with vector "
         "trip count");

  // Exit once the next AVL reaches zero.
  VPValue *Zero = Plan.getZero(NextAVL->getScalarType());
  VPBuilder Builder(LatchBr);
  VPValue *ExitCond = Builder.createICmp(CmpInst::ICMP_EQ, NextAVL, Zero);
  LatchBr->setOperand(0, ExitCond);
}


/// Replace VPValues of symbolic strides that predicated scalar evolution knows
/// to be constant with that constant, restricted to users in blocks dominated
/// by the vector preheader (i.e. guarded by the runtime checks). Sign/zero
/// extensions of such strides that are live-ins are handled as well, and
/// VPExpandSCEVRecipes in the entry block are rewritten accordingly.
///
/// \p Plan        Plan to update; must not have loop regions yet.
/// \p PSE         Predicated scalar evolution used to query the strides.
/// \p StridesMap  Map whose values are the symbolic strides to consider.
void VPlanTransforms::replaceSymbolicStrides(
    VPlan &Plan, PredicatedScalarEvolution &PSE,
    const DenseMap<Value *, const SCEV *> &StridesMap) {
  using namespace SCEVPatternMatch;

  assert(!Plan.getVectorLoopRegion() &&
         "expected to run before loop regions are created");

  // A use may only be versioned when its block is dominated by the vector
  // preheader.
  VPDominatorTree VPDT(Plan);
  VPBlockBase *Preheader = Plan.getEntry()->getSuccessors()[1];
  auto IsGuardedUse = [&VPDT, Preheader](VPUser &U, unsigned) {
    VPBlockBase *UserBlock = cast<VPRecipeBase>(&U)->getParent();
    return VPDT.dominates(Preheader, UserBlock);
  };

  ValueToSCEVMapTy RewriteMap;
  for (const SCEV *Stride : StridesMap.values()) {
    auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();

    // Only handle constant strides for now.
    const APInt *StrideConst;
    if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
      continue;

    auto *ConstVPV = Plan.getConstantInt(*StrideConst);
    if (VPValue *LiveIn = Plan.getLiveIn(StrideV))
      LiveIn->replaceUsesWithIf(ConstVPV, IsGuardedUse);

    // The versioned value may not be used in the loop directly but through a
    // sext/zext. Replace the live-ins of those extensions too.
    for (Value *Ext : StrideV->users()) {
      if (!isa<SExtInst, ZExtInst>(Ext))
        continue;
      VPValue *ExtLiveIn = Plan.getLiveIn(Ext);
      if (!ExtLiveIn)
        continue;

      unsigned Width = Ext->getType()->getScalarSizeInBits();
      APInt Extended = isa<SExtInst>(Ext) ? StrideConst->sext(Width)
                                          : StrideConst->zext(Width);
      VPValue *ExtConstVPV = Plan.getConstantInt(Extended);
      ExtLiveIn->replaceUsesWithIf(ExtConstVPV, IsGuardedUse);
    }
    RewriteMap[StrideV] = PSE.getSCEV(StrideV);
  }

  // Re-expand SCEVs in the entry block that change once the strides are
  // substituted, keeping the plan's trip count in sync.
  for (VPRecipeBase &R : *Plan.getEntry()) {
    auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
    if (!ExpandR)
      continue;

    const SCEV *OldExpr = ExpandR->getSCEV();
    auto *NewExpr =
        SCEVParameterRewriter::rewrite(OldExpr, *PSE.getSE(), RewriteMap);
    if (NewExpr == OldExpr)
      continue;

    VPValue *NewVPV = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewExpr);
    ExpandR->replaceAllUsesWith(NewVPV);
    if (Plan.getTripCount() == ExpandR)
      Plan.resetTripCount(NewVPV);
  }
}


/// Drop poison-generating flags from the recipes that compute the address of
/// consecutive widened memory accesses and of interleave groups, when those
/// accesses have a mask that is not the header mask. Disjoint ORs found on the
/// way are replaced with an equivalent Add instead of having the flag dropped.
void VPlanTransforms::dropPoisonGeneratingRecipes(VPlan &Plan) {
  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  // Visited is shared across all roots so each recipe is processed only once.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.pop_back_val();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      // The search is also not continued through interleave groups, scalar IV
      // steps and header phis.
      if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
              VPHeaderPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. If the underlying instruction has poison-generating flags,
      // drop them directly.
      if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
        VPValue *A, *B;
        // Dropping disjoint from an OR may yield incorrect results, as some
        // analysis may have converted it to an Add implicitly (e.g. SCEV used
        // for dependence analysis). Instead, replace it with an equivalent Add.
        // This is possible as all users of the disjoint OR only access lanes
        // where the operands are disjoint or poison otherwise.
        if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
            RecWithFlags->isDisjoint()) {
          VPBuilder Builder(RecWithFlags);
          VPInstruction *New =
              Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
          New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
          RecWithFlags->replaceAllUsesWith(New);
          RecWithFlags->eraseFromParent();
          // Continue the traversal from the replacement, since the OR has been
          // erased.
          CurRec = New;
        } else
          RecWithFlags->dropPoisonGeneratingFlags();
      } else {
        // Recipes without IR flags must not wrap an instruction that carries
        // poison-generating flags; this is only checked in assert builds.
        Instruction *Instr = dyn_cast_or_null<Instruction>(
            CurRec->getVPSingleValue()->getUnderlyingValue());
        (void)Instr;
        assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
               "found instruction with poison generating flags not covered by "
               "VPRecipeWithIRFlags");
      }

      // Add new definitions to the worklist.
      for (VPValue *Operand : CurRec->operands())
        if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
          Worklist.push_back(OpDef);
    }
  });

  // We want to exclude the tail folding case, as we don't need to drop flags
  // for operations computing the first lane in this case: the first lane of the
  // header mask must always be true.
  // Note that unmasked accesses (null mask) are excluded as well.
  auto IsNotHeaderMask = [&Plan](VPValue *Mask) {
    return Mask && !vputils::isHeaderMask(Mask, Plan);
  };

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a VPWidenRecipe or
  // VPInterleaveRecipe.
  auto Iter =
      vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntryBasicBlock());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
        // Only consecutive accesses whose address is defined by a recipe are
        // considered.
        VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
        if (AddrDef && WidenRec->isConsecutive() &&
            IsNotHeaderMask(WidenRec->getMask()))
          CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
        if (AddrDef && IsNotHeaderMask(InterleaveRec->getMask()))
          CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
      }
    }
  }
}


/// Replace the widened memory recipes belonging to each interleave group in
/// \p InterleaveGroups with a single VPInterleaveRecipe per group, inserted at
/// the recipe of the group's insert position.
///
/// \p Plan              Plan to transform.
/// \p InterleaveGroups  Groups to materialize; nothing happens if empty.
/// \p EpilogueAllowed   If false, groups that require a scalar epilogue get a
///                      mask for their gaps instead.
void VPlanTransforms::createInterleaveGroups(
    VPlan &Plan,
    const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
        &InterleaveGroups,
    const bool &EpilogueAllowed) {
  if (InterleaveGroups.empty())
    return;

  // Map each IR memory instruction to the widened memory recipe created for it,
  // looking only at the blocks reachable from the loop region's entry.
  DenseMap<Instruction *, VPWidenMemoryRecipe *> IRMemberToRecipe;
  for (VPBasicBlock *VPBB :
       VPBlockUtils::blocksOnly<VPBasicBlock>(vp_depth_first_shallow(
           Plan.getVectorLoopRegion()->getEntryBasicBlock())))
    for (VPRecipeBase &R : make_filter_range(*VPBB, [](VPRecipeBase &R) {
           return isa<VPWidenMemoryRecipe>(&R);
         })) {
      auto *MemR = cast<VPWidenMemoryRecipe>(&R);
      IRMemberToRecipe[&MemR->getIngredient()] = MemR;
    }

  // Interleave memory: for each Interleave Group we marked earlier as relevant
  // for this VPlan, replace the Recipes widening its memory instructions with a
  // single VPInterleaveRecipe at its insertion point.
  VPDominatorTree VPDT(Plan);
  for (const auto *IG : InterleaveGroups) {
    // Skip interleave groups where members don't have recipes. This can happen
    // when removeDeadRecipes removes recipes that are part of interleave groups
    // but have no users.
    if (llvm::any_of(IG->members(), [&IRMemberToRecipe](Instruction *Member) {
          return !IRMemberToRecipe.contains(Member);
        }))
      continue;

    // Gather the stored values of all store members (in member order) and
    // intersect the metadata of all members, starting from member 0.
    auto *Start = IRMemberToRecipe.lookup(IG->getMember(0));
    VPIRMetadata InterleaveMD(*Start);
    SmallVector<VPValue *, 4> StoredValues;
    if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start->getAsRecipe()))
      StoredValues.push_back(StoreR->getStoredValue());
    for (unsigned I = 1; I < IG->getFactor(); ++I) {
      Instruction *MemberI = IG->getMember(I);
      // Gaps in the group have no member.
      if (!MemberI)
        continue;
      VPWidenMemoryRecipe *MemoryR = IRMemberToRecipe.lookup(MemberI);
      if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR->getAsRecipe()))
        StoredValues.push_back(StoreR->getStoredValue());
      InterleaveMD.intersect(*MemoryR);
    }

    // Gaps need masking when the group would require a scalar epilogue that is
    // not allowed, or when it is a store group that is not full.
    bool NeedsMaskForGaps =
        (IG->requiresScalarEpilogue() && !EpilogueAllowed) ||
        (!StoredValues.empty() && !IG->isFull());

    Instruction *IRInsertPos = IG->getInsertPos();
    auto *InsertPos = IRMemberToRecipe.lookup(IRInsertPos);
    VPRecipeBase *InsertPosR = InsertPos->getAsRecipe();

    // Take the no-wrap flags from the insert position's GEP (if its pointer
    // operand is one), without the no-unsigned-wrap flag.
    GEPNoWrapFlags NW = GEPNoWrapFlags::none();
    if (auto *Gep = dyn_cast<GetElementPtrInst>(
            getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
      NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();

    // Get or create the start address for the interleave group.
    VPValue *Addr = Start->getAddr();
    VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
    if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPosR)) {
      // We cannot re-use the address of member zero because it does not
      // dominate the insert position. Instead, use the address of the insert
      // position and create a PtrAdd adjusting it to the address of member
      // zero.
      // TODO: Hoist Addr's defining recipe (and any operands as needed) to
      // InsertPos or sink loads above zero members to join it.
      assert(IG->getIndex(IRInsertPos) != 0 &&
             "index of insert position shouldn't be zero");
      auto &DL = IRInsertPos->getDataLayout();
      // Byte distance from member zero to the insert position; it is negated
      // below to step back from the insert position's address.
      APInt Offset(32,
                   DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
                       IG->getIndex(IRInsertPos),
                   /*IsSigned=*/true);
      VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
      VPBuilder B(InsertPosR);
      Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
    }
    // If the group is reverse, adjust the index to refer to the last vector
    // lane instead of the first. We adjust the index from the first vector
    // lane, rather than directly getting the pointer for lane VF - 1, because
    // the pointer operand of the interleaved access is supposed to be uniform.
    if (IG->isReverse()) {
      auto *ReversePtr = new VPVectorEndPointerRecipe(
          Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
          -(int64_t)IG->getFactor(), NW, InsertPosR->getDebugLoc());
      ReversePtr->insertBefore(InsertPosR);
      Addr = ReversePtr;
    }
    auto *VPIG = new VPInterleaveRecipe(
        IG, Addr, StoredValues, InsertPos->getMask(), NeedsMaskForGaps,
        InterleaveMD, InsertPosR->getDebugLoc());
    VPIG->insertBefore(InsertPosR);

    // Redirect users of each value-producing member to the matching result of
    // the interleave recipe, then erase the member's recipe. J counts only the
    // members whose IR instruction has a non-void type.
    unsigned J = 0;
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (Instruction *Member = IG->getMember(i)) {
        VPRecipeBase *MemberR = IRMemberToRecipe.lookup(Member)->getAsRecipe();
        if (!Member->getType()->isVoidTy()) {
          VPValue *OriginalV = MemberR->getVPSingleValue();
          OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
          J++;
        }
        MemberR->eraseFromParent();
      }
  }
}


/// Lower a VPWidenIntOrFpInductionRecipe into explicit recipes computing the
/// initial vector value, the widened phi and the backedge value. Starting
/// from:
///
///  vector.ph:
///  Successor(s): vector loop
///
///  <x1> vector loop: {
///    vector.body:
///      WIDEN-INDUCTION %i = phi %start, %step, %vf
///      ...
///      EMIT branch-on-count ...
///    No successors
///  }
///
/// the WIDEN-INDUCTION is rewritten to:
///
///  vector.ph:
///    ...
///    vp<%induction.start> = ...
///    vp<%induction.increment> = ...
///
///  Successor(s): vector loop
///
///  <x1> vector loop: {
///    vector.body:
///      ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
///      ...
///      vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
///      EMIT branch-on-count ...
///    No successors
///  }
///
/// All users of \p WidenIVR are redirected to the new phi; \p WidenIVR itself
/// is not erased here.
static void
expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR) {
  VPlan *Plan = WidenIVR->getParent()->getPlan();
  DebugLoc DL = WidenIVR->getDebugLoc();
  VPValue *StartV = WidenIVR->getStartValue();
  VPValue *StepV = WidenIVR->getStepValue();
  VPValue *VFV = WidenIVR->getVFValue();

  // Scalar type of the induction that is being widened.
  Type *IVTy = WidenIVR->getScalarType();

  // Integer inductions are built from add/mul. Any other induction kind uses
  // the opcode recorded in the descriptor for the add and fmul for the
  // multiply.
  const InductionDescriptor &Desc = WidenIVR->getInductionDescriptor();
  const bool IsIntIV = Desc.getKind() == InductionDescriptor::IK_IntInduction;
  const Instruction::BinaryOps AddOp =
      IsIntIV ? Instruction::Add : Desc.getInductionOpcode();
  const Instruction::BinaryOps MulOp =
      IsIntIV ? Instruction::Mul : Instruction::FMul;
  VPIRFlags Flags = *WidenIVR;

  // The builder starts out in the vector preheader. When the induction type
  // is narrower than the step type, both step and start are truncated first.
  VPBuilder B(Plan->getVectorPreheader());
  Type *StepTy = StepV->getScalarType();
  const bool NeedsTrunc =
      IVTy->getScalarSizeInBits() < StepTy->getScalarSizeInBits();
  if (NeedsTrunc) {
    assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
    StepV = B.createScalarCast(Instruction::Trunc, StepV, IVTy, DL);
    StartV = B.createScalarCast(Instruction::Trunc, StartV, IVTy, DL);
    StepTy = IVTy;
  }

  // Initial vector value: splat(start) AddOp (step-vector MulOp splat(step)).
  // The step vector is created with an integer type of the step's bit width
  // and converted with UIToFP when the step is floating point.
  Type *LaneIdxTy =
      IntegerType::get(Plan->getContext(), StepTy->getScalarSizeInBits());
  VPValue *Lanes = B.createNaryOp(VPInstruction::StepVector, {}, LaneIdxTy);
  if (StepTy->isFloatingPointTy())
    Lanes = B.createWidenCast(Instruction::UIToFP, Lanes, StepTy);

  VPValue *StartSplat = B.createNaryOp(VPInstruction::Broadcast, StartV);
  VPValue *StepSplat = B.createNaryOp(VPInstruction::Broadcast, StepV);

  VPValue *LaneOffsets = B.createNaryOp(MulOp, {Lanes, StepSplat}, Flags);
  VPValue *InitialIV =
      B.createNaryOp(AddOp, {StartSplat, LaneOffsets}, Flags,
                     DebugLoc::getUnknown(), "induction");

  // The widened phi is created through a builder positioned at the original
  // recipe.
  auto *VecPhi = VPBuilder(WidenIVR).createWidenPhi(InitialIV, DL, "vec.ind");

  // Work out the increment and the value it is added to on the backedge.
  VPValue *Increment = nullptr;
  VPValue *Previous = nullptr;
  if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
    // Unrolled: both the increment and the previous value are already
    // available as operands of the recipe.
    Increment = SplatVF;
    Previous = WidenIVR->getLastUnrolledPartOperand();
  } else {
    // If the VF is defined inside a loop (e.g. for EVL tail-folding), emit the
    // increment computation right after that definition.
    if (VPRecipeBase *VFDef = VFV->getDefiningRecipe();
        VFDef && VFDef->getParent()->getEnclosingLoopRegion())
      B.setInsertPoint(VFDef->getParent(), std::next(VFDef->getIterator()));

    // Bring the VF to the step type so that step * VF can be computed with
    // integer or floating-point arithmetic as appropriate.
    if (StepTy->isFloatingPointTy())
      VFV = B.createScalarCast(Instruction::CastOps::UIToFP, VFV, StepTy, DL);
    else
      VFV = B.createScalarZExtOrTrunc(VFV, StepTy, VFV->getScalarType(), DL);

    VPValue *ScalarInc = B.createNaryOp(MulOp, {StepV, VFV}, Flags);
    Increment = B.createNaryOp(VPInstruction::Broadcast, ScalarInc);
    Previous = VecPhi;
  }

  // Emit the backedge value just before the terminator of the loop's exiting
  // block and hook it up as the phi's second incoming value.
  VPBasicBlock *LoopExitingBB =
      Plan->getVectorLoopRegion()->getExitingBasicBlock();
  B.setInsertPoint(LoopExitingBB, LoopExitingBB->getTerminator()->getIterator());
  auto *NextIV = B.createNaryOp(AddOp, {Previous, Increment}, Flags, DL,
                                "vec.ind.next");
  VecPhi->addIncoming(NextIV);

  WidenIVR->replaceAllUsesWith(VecPhi);
}


/// Lower a VPWidenPointerInductionRecipe into explicit recipes: a scalar
/// pointer phi, a vector of per-lane addresses derived from it, and the
/// backedge update of the phi. Starting from:
///
///  <x1> vector loop: {
///    vector.body:
///      EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
///      ...
///      EMIT branch-on-count ...
///  }
///
/// the WIDEN-POINTER-INDUCTION is rewritten to:
///
///  <x1> vector loop: {
///    vector.body:
///      EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
///      EMIT %mul = mul %stepvector, %step
///      EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
///      ...
///      EMIT %ptr.ind = ptradd %pointer.phi, %vf
///      EMIT branch-on-count ...
///  }
///
/// All users of \p R are redirected to the wide pointer add; \p R itself is
/// not erased here.
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R) {
  auto *HeaderBB = R->getParent();
  VPlan *Plan = HeaderBB->getPlan();
  VPValue *StartPtr = R->getStartValue();
  VPValue *StepV = R->getStepValue();
  VPValue *VFV = R->getVFValue();

  assert(R->getInductionDescriptor().getKind() ==
             InductionDescriptor::IK_PtrInduction &&
         "Not a pointer induction according to InductionDescriptor!");
  assert(R->getScalarType()->isPointerTy() && "Unexpected type.");
  assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
         "Recipe should have been replaced");

  DebugLoc DL = R->getDebugLoc();
  Type *StepTy = StepV->getScalarType();
  VPBuilder B(R);

  // Scalar phi carrying the base pointer of the current vector iteration.
  VPPhi *BasePhi = B.createScalarPhi(StartPtr, DL, "pointer.phi");

  // Per-lane addresses: the phi is the base and <step*0, ..., step*N> the
  // offsets. These are emitted at the first non-phi position of the block.
  B.setInsertPoint(HeaderBB, HeaderBB->getFirstNonPhi());
  VPValue *LaneOffsets =
      B.createNaryOp(VPInstruction::StepVector, {}, StepTy);
  LaneOffsets = B.createOverflowingOp(Instruction::Mul, {LaneOffsets, StepV});
  VPValue *LaneAddrs =
      B.createWidePtrAdd(BasePhi, LaneOffsets, DL, "vector.gep");
  R->replaceAllUsesWith(LaneAddrs);

  // Backedge value of the phi: advance the base by step * VF, computed just
  // before the terminator of the loop's exiting block.
  VPBasicBlock *LoopExitingBB =
      Plan->getVectorLoopRegion()->getExitingBasicBlock();
  B.setInsertPoint(LoopExitingBB, LoopExitingBB->getTerminator()->getIterator());
  VFV = B.createScalarZExtOrTrunc(VFV, StepTy, VFV->getScalarType(), DL);
  VPValue *Stride = B.createOverflowingOp(Instruction::Mul, {StepV, VFV});

  VPValue *NextBase = B.createPtrAdd(BasePhi, Stride, DL, "ptr.ind");
  BasePhi->addIncoming(NextBase);
}


/// Lower a VPDerivedIVRecipe into explicit arithmetic on its start value,
/// step and index, and redirect all users of \p R to the result.
///
/// The index is first brought to the step type (sign-extend/truncate for an
/// integer step, SIToFP otherwise). Depending on the induction kind the
/// result is then:
///  - integer: start + index * step,
///  - pointer: ptradd(start, index * step),
///  - floating point: start <fadd|fsub> (step * index), using the fast-math
///    flags of the original FP binary operator.
/// For IK_NoInduction only the index conversion is emitted and no user is
/// rewritten. \p R itself is not erased here.
static void expandVPDerivedIV(VPDerivedIVRecipe *R) {
  VPBuilder B(R);
  VPIRValue *StartV = R->getStartValue();
  VPValue *StepV = R->getStepValue();
  VPValue *Idx = R->getIndex();
  Type *StepTy = StepV->getScalarType();
  Type *IdxTy = Idx->getScalarType();

  // Convert the index to the step type.
  if (StepTy->isIntegerTy())
    Idx = B.createScalarSExtOrTrunc(Idx, StepTy, IdxTy,
                                    DebugLoc::getCompilerGenerated());
  else
    Idx = B.createScalarCast(Instruction::SIToFP, Idx, StepTy,
                             DebugLoc::getCompilerGenerated());

  // Emits index * step; shared by the integer and pointer cases and only
  // invoked there.
  auto EmitScaledIndex = [&] {
    return B.createOverflowingOp(Instruction::Mul, {Idx, StepV});
  };

  switch (R->getInductionKind()) {
  case InductionDescriptor::IK_IntInduction: {
    assert(Idx->getScalarType() == StartV->getScalarType() &&
           "Index type does not match StartValue type");
    R->replaceAllUsesWith(B.createAdd(StartV, EmitScaledIndex()));
    return;
  }
  case InductionDescriptor::IK_PtrInduction: {
    R->replaceAllUsesWith(B.createPtrAdd(StartV, EmitScaledIndex()));
    return;
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(StepTy->isFloatingPointTy() && "Expected FP Step value");
    const FPMathOperator *FPBinOp = R->getFPBinOp();
    assert(FPBinOp &&
           (FPBinOp->getOpcode() == Instruction::FAdd ||
            FPBinOp->getOpcode() == Instruction::FSub) &&
           "Original BinOp should be defined for FP induction");
    FastMathFlags FMF = FPBinOp->getFastMathFlags();
    VPValue *Scaled = B.createNaryOp(Instruction::FMul, {StepV, Idx}, FMF);
    R->replaceAllUsesWith(
        B.createNaryOp(FPBinOp->getOpcode(), {StartV, Scaled}, FMF));
    return;
  }
  case InductionDescriptor::IK_NoInduction:
    return;
  }
  llvm_unreachable("Unhandled induction kind");
}


/// Replace every non-replicator region of \p Plan with explicit CFG by
/// calling dissolveToCFGLoop() on it.
void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
  // Gather the regions up front so that dissolveToCFGLoop() is only invoked
  // once the deep traversal of the plan has finished.
  SmallVector<VPRegionBlock *> ToDissolve;
  for (VPRegionBlock *Region : VPBlockUtils::blocksOnly<VPRegionBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    // Replicator regions are left as they are.
    if (Region->isReplicator())
      continue;
    ToDissolve.push_back(Region);
  }

  for (VPRegionBlock *Region : ToDissolve)
    Region->dissolveToCFGLoop();
}


/// Replace every BranchOnTwoConds terminator in \p Plan with single-condition
/// BranchOnCond terminators.
///
/// A block terminated by BranchOnTwoConds has two conditions and three
/// successors. When the first two successors are the same block, the two
/// conditions are OR'ed and a single BranchOnCond is emitted. Otherwise the
/// block gets a BranchOnCond on the first condition, targeting the first
/// successor and a new "<name>.interim" block, and the interim block gets a
/// BranchOnCond on the second condition, targeting the second and third
/// successors.
///
/// Expected to run after loop regions have been dissolved: the blocks are
/// found with a shallow traversal and the split path asserts that none of the
/// involved blocks has a parent region.
void VPlanTransforms::expandBranchOnTwoConds(VPlan &Plan) {
  // Collect all BranchOnTwoConds terminators first; the CFG is modified below.
  SmallVector<VPInstruction *> Pending;
  for (VPBasicBlock *BB : VPBlockUtils::blocksAs<VPBasicBlock>(
           vp_depth_first_shallow(Plan.getEntry()))) {
    if (BB->empty())
      continue;
    if (match(&BB->back(), m_BranchOnTwoConds()))
      Pending.push_back(cast<VPInstruction>(&BB->back()));
  }

  for (VPInstruction *TwoCondBr : Pending) {
    assert(TwoCondBr->getNumOperands() == 2 &&
           "BranchOnTwoConds must have exactly 2 conditions");
    DebugLoc DL = TwoCondBr->getDebugLoc();
    VPBasicBlock *SrcBB = TwoCondBr->getParent();

    // Snapshot the successor list before tearing down the outgoing edges.
    const auto OldSuccs = to_vector(SrcBB->getSuccessors());
    assert(OldSuccs.size() == 3 &&
           "BranchOnTwoConds must have exactly 3 successors");
    for (VPBlockBase *Dest : OldSuccs)
      VPBlockUtils::disconnectBlocks(SrcBB, Dest);

    VPValue *FirstCond = TwoCondBr->getOperand(0);
    VPValue *SecondCond = TwoCondBr->getOperand(1);
    VPBlockBase *FirstDest = OldSuccs[0];
    VPBlockBase *SecondDest = OldSuccs[1];
    VPBlockBase *LastDest = OldSuccs[2];

    // Both conditions lead to the same block: a single branch on their OR is
    // sufficient and no interim block is needed.
    if (FirstDest == SecondDest) {
      VPBuilder MergeBuilder(TwoCondBr);
      VPValue *EitherCond = MergeBuilder.createOr(FirstCond, SecondCond, DL);
      MergeBuilder.createNaryOp(VPInstruction::BranchOnCond, {EitherCond}, DL);
      VPBlockUtils::connectBlocks(SrcBB, FirstDest);
      VPBlockUtils::connectBlocks(SrcBB, LastDest);
      TwoCondBr->eraseFromParent();
      continue;
    }

    assert(!FirstDest->getParent() && !SecondDest->getParent() &&
           !LastDest->getParent() && !SrcBB->getParent() &&
           "regions must already be dissolved");

    VPBasicBlock *InterimBB =
        Plan.createVPBasicBlock(SrcBB->getName() + ".interim");

    // First branch: first condition selects between the first successor and
    // the interim block.
    VPBuilder SrcBuilder(SrcBB);
    SrcBuilder.createNaryOp(VPInstruction::BranchOnCond, {FirstCond}, DL);
    VPBlockUtils::connectBlocks(SrcBB, FirstDest);
    VPBlockUtils::connectBlocks(SrcBB, InterimBB);

    // Second branch: second condition selects between the second and third
    // successors.
    VPBuilder InterimBuilder(InterimBB);
    InterimBuilder.createNaryOp(VPInstruction::BranchOnCond, {SecondCond}, DL);
    VPBlockUtils::connectBlocks(InterimBB, SecondDest);
    VPBlockUtils::connectBlocks(InterimBB, LastDest);

    TwoCondBr->eraseFromParent();
  }
}


/// Lower abstract recipes in \p Plan to more concrete ones.
///
/// Visits every VPBasicBlock reachable from the plan's entry (deep traversal)
/// and rewrites in place:
///  - VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and
///    VPDerivedIVRecipe, through the expand*/scalarize* helpers,
///  - VPWidenCanonicalIVRecipe, into broadcasts, a step vector and adds,
///  - VPBlendRecipe, into a chain of selects,
///  - VPVectorEndPointerRecipe, by materializing a missing offset,
///  - VPExpressionRecipe, through decompose(),
///  - VPInstructions with opcode LastActiveLane, MaskedCond,
///    CanonicalIVIncrementForPart, BranchOnCount or WideIVStep.
/// Any other recipe is left untouched.
void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {

  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(

           vp_depth_first_deep(Plan.getEntry()))) {

    // Recipes are erased while the block is being walked, hence the
    // early-increment range.
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {

      // Builder positioned at the recipe currently being lowered.
      VPBuilder Builder(&R);

      if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {

        expandVPWidenIntOrFpInduction(WidenIVR);

        WidenIVR->eraseFromParent();

        continue;

      }


      if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {

        // If the recipe only generates scalars, scalarize it instead of

        // expanding it.

        if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {

          VPValue *PtrAdd =

              scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);

          WidenIVR->replaceAllUsesWith(PtrAdd);

          WidenIVR->eraseFromParent();

          continue;

        }

        expandVPWidenPointerInduction(WidenIVR);

        WidenIVR->eraseFromParent();

        continue;

      }


      if (auto *DerivedIVR = dyn_cast<VPDerivedIVRecipe>(&R)) {

        expandVPDerivedIV(DerivedIVR);

        DerivedIVR->eraseFromParent();

        continue;

      }


      // Expand VPWidenCanonicalIVRecipe into
      //   broadcast(CanIV) + (broadcast(Step) + step-vector).
      if (auto *WideCanIV = dyn_cast<VPWidenCanonicalIVRecipe>(&R)) {

        VPValue *CanIV = WideCanIV->getCanonicalIV();

        Type *CanIVTy = CanIV->getScalarType();

        VPValue *Step = WideCanIV->getStepValue();

        // Without an explicit step operand a zero step is used; per the
        // assert this is only expected for UF == 1.
        if (!Step) {

          assert(Plan.getConcreteUF() == 1 &&

                 "Expected unroller to have materialized step for UF != 1");

          Step = Plan.getZero(CanIVTy);

        }

        CanIV = Builder.createNaryOp(VPInstruction::Broadcast, CanIV);

        Step = Builder.createNaryOp(VPInstruction::Broadcast, Step);

        Step = Builder.createAdd(

            Step, Builder.createNaryOp(VPInstruction::StepVector, {}, CanIVTy));

        VPValue *CanVecIV =

            Builder.createAdd(CanIV, Step, WideCanIV->getDebugLoc(), "vec.iv",

                              WideCanIV->getNoWrapFlags());

        WideCanIV->replaceAllUsesWith(CanVecIV);

        WideCanIV->eraseFromParent();

        continue;

      }


      // Expand VPBlendRecipe into VPInstruction::Select.

      if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {

        // Start from incoming value 0 and wrap it in one select per further
        // incoming value I, built from mask I, incoming value I and the
        // result so far.
        VPValue *Select = Blend->getIncomingValue(0);

        for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)

          Select = Builder.createSelect(Blend->getMask(I),

                                        Blend->getIncomingValue(I), Select,

                                        R.getDebugLoc(), "predphi", *Blend);

        Blend->replaceAllUsesWith(Select);

        Blend->eraseFromParent();

        continue;

      }


      // VPVectorEndPointerRecipe is kept; only a missing offset is
      // materialized.
      if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {

        if (!VEPR->getOffset()) {

          assert(Plan.getConcreteUF() == 1 &&

                 "Expected unroller to have materialized offset for UF != 1");

          VEPR->materializeOffset();

        }

        continue;

      }


      // Decompose VPExpressionRecipe and drop the now unused wrapper.
      if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {

        Expr->decompose();

        Expr->eraseFromParent();

        continue;

      }


      // Expand LastActiveLane into Not + FirstActiveLane + Sub.

      auto *LastActiveL = dyn_cast<VPInstruction>(&R);

      if (LastActiveL &&

          LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {

        // Create Not(Mask) for all operands.

        SmallVector<VPValue *, 2> NotMasks;

        for (VPValue *Op : LastActiveL->operands()) {

          VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());

          NotMasks.push_back(NotMask);

        }


        // Create FirstActiveLane on the inverted masks.

        VPValue *FirstInactiveLane = Builder.createFirstActiveLane(

            NotMasks, LastActiveL->getDebugLoc(), "first.inactive.lane");


        // Subtract 1 to get the last active lane.

        VPValue *One =

            Plan.getConstantInt(FirstInactiveLane->getScalarType(), 1);

        VPValue *LastLane =

            Builder.createSub(FirstInactiveLane, One,

                              LastActiveL->getDebugLoc(), "last.active.lane");


        LastActiveL->replaceAllUsesWith(LastLane);

        LastActiveL->eraseFromParent();

        continue;

      }


      // Lower MaskedCond with block mask to LogicalAnd.

      if (match(&R, m_VPInstruction<VPInstruction::MaskedCond>())) {

        auto *VPI = cast<VPInstruction>(&R);

        assert(VPI->isMasked() &&

               "Unmasked MaskedCond should be simplified earlier");

        VPI->replaceAllUsesWith(Builder.createNaryOp(

            VPInstruction::LogicalAnd, {VPI->getMask(), VPI->getOperand(0)}));

        VPI->eraseFromParent();

        continue;

      }


      // Lower CanonicalIVIncrementForPart to plain Add.

      if (match(

              &R,

              m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>())) {

        auto *VPI = cast<VPInstruction>(&R);

        // The Add takes over the operands, no-wrap flags and debug location
        // of the original instruction.
        VPValue *Add = Builder.createOverflowingOp(

            Instruction::Add, VPI->operands(), VPI->getNoWrapFlags(),

            VPI->getDebugLoc());

        VPI->replaceAllUsesWith(Add);

        VPI->eraseFromParent();

        continue;

      }


      // Lower BranchOnCount to ICmp + BranchOnCond.

      VPValue *IV, *TC;

      if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {

        auto *BranchOnCountInst = cast<VPInstruction>(&R);

        DebugLoc DL = BranchOnCountInst->getDebugLoc();

        VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);

        Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);

        BranchOnCountInst->eraseFromParent();

        continue;

      }


      // Everything that is not a two-operand WideIVStep is left as is.
      VPValue *VectorStep;

      VPValue *ScalarStep;

      if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>(

                         m_VPValue(VectorStep), m_VPValue(ScalarStep))))

        continue;


      // Expand WideIVStep.

      auto *VPI = cast<VPInstruction>(&R);

      Type *IVTy = VPI->getScalarType();

      // Bring the vector step to the IV type: UIToFP for a floating-point IV,
      // Trunc otherwise.
      if (VectorStep->getScalarType() != IVTy) {

        Instruction::CastOps CastOp = IVTy->isFloatingPointTy()

                                          ? Instruction::UIToFP

                                          : Instruction::Trunc;

        VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);

      }


      assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");

      // A scalar step of a different type is truncated to the IV type.
      if (ScalarStep->getScalarType() != IVTy) {

        ScalarStep =

            Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);

      }


      // Floating-point multiplies take the fast-math flags of the original
      // instruction; integer multiplies use the default flags for Mul.
      VPIRFlags Flags;

      unsigned MulOpc;

      if (IVTy->isFloatingPointTy()) {

        MulOpc = Instruction::FMul;

        Flags = VPI->getFastMathFlags();

      } else {

        MulOpc = Instruction::Mul;

        Flags = VPIRFlags::getDefaultFlags(MulOpc);

      }


      VPInstruction *Mul = Builder.createNaryOp(

          MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());

      VectorStep = Mul;

      VPI->replaceAllUsesWith(VectorStep);

      VPI->eraseFromParent();

    }

  }

}


/// Describes one uncountable early exit: the exiting block, the exit block it
/// branches to, and the condition under which the exit is taken.
///
/// All members default to nullptr so that a default-constructed instance never
/// carries indeterminate pointers. The struct remains an aggregate and can
/// still be brace-initialized in member order.
struct EarlyExitInfo {
  /// Predecessor of the exit block whose terminator branches to it.
  VPBasicBlock *EarlyExitingVPBB = nullptr;

  /// Exit block reached when the early exit is taken.
  VPIRBasicBlock *EarlyExitVPBB = nullptr;

  /// Value that is true when the exit is taken.
  VPValue *CondToExit = nullptr;
};


/// Update \p Plan to mask memory operations in the loop based on whether the

/// early exit is taken or not.
///
/// \p Exits lists the uncountable early exits. Every exit whose exiting block
/// is not \p LatchVPBB loses its terminator and its edge to the exit block.
/// \p HeaderVPBB receives the exit-condition recipes (if they need to move)
/// and the new mask; \p MiddleVPBB receives a new BranchOnCond comparing the
/// exit IV against the trip count. \p TheLoop, \p PSE, \p DT and \p AC feed
/// the SCEV-based dereferenceability check of the load used by the exit
/// condition; \p VPDT answers dominance queries on the plan.
///
/// \returns true on success and false when the transform cannot be applied:
/// vputils::getRecipesForUncountableExit finds no condition, the condition
/// load is not known to be dereferenceable and aligned in the loop, a
/// collected GEP is not of the form (live-in, widened induction with start 0
/// and step 1), a memory operation's block does not dominate the latch, or
/// the scalar preheader does not hold exactly one phi. \p Plan may already
/// have been modified when false is returned.


static bool handleUncountableExitsWithSideEffects(

    VPlan &Plan, SmallVectorImpl<EarlyExitInfo> &Exits,

    VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB,

    Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT,

    AssumptionCache *AC, VPDominatorTree &VPDT) {


  // Disconnect early exiting blocks from successors, remove branches. We

  // currently don't support multiple uses for recipes involved in creating

  // the uncountable exit condition.

  for (auto &Exit : Exits) {

    // An exit taken from the latch keeps its terminator and edge.
    if (Exit.EarlyExitingVPBB == LatchVPBB)

      continue;


    for (VPRecipeBase &R : Exit.EarlyExitVPBB->phis())

      cast<VPIRPhi>(&R)->removeIncomingValueFor(Exit.EarlyExitingVPBB);

    Exit.EarlyExitingVPBB->getTerminator()->eraseFromParent();

    VPBlockUtils::disconnectBlocks(Exit.EarlyExitingVPBB, Exit.EarlyExitVPBB);

  }


  // We can abandon a VPlan entirely if we return false here, so we shouldn't

  // crash if some earlier assumptions on scalar IR don't hold for the vplan

  // version of the loop.

  SmallVector<VPInstruction *, 2> GEPs;

  SmallVector<VPInstruction *, 8> ConditionRecipes;


  std::optional<VPValue *> Cond =

      vputils::getRecipesForUncountableExit(ConditionRecipes, GEPs, LatchVPBB);

  if (!Cond)

    return false;


  // Find load contributing to condition.

  VPRecipeBase *CondLoad = nullptr;

  for (auto *Recipe : ConditionRecipes) {

    if (match(Recipe, m_VPInstruction<Instruction::Load>(m_VPValue()))) {

      // TODO: Support more than one load. Needs legality updates too.

      assert(CondLoad == nullptr && "Too many condition loads");

      CondLoad = Recipe;

    }

  }

  assert(CondLoad && "Couldn't find load");


  // Ensure that we are guaranteed to be able to dereference the memory used

  // for determining the uncountable exit for the maximum possible number of

  // scalar iterations of the loop.

  //

  // TODO: Support first-faulting loads in cases where we don't know whether

  //       all possible addresses are dereferenceable.

  {

    SmallVector<const SCEVPredicate *, 4> Predicates;

    VPSingleDefRecipe *Load = cast<VPSingleDefRecipe>(CondLoad);

    VPValue *Ptr = Load->getOperand(0);

    const SCEV *PtrSCEV = vputils::getSCEVExprForVPValue(Ptr, PSE, TheLoop);

    const DataLayout &DL = Plan.getDataLayout();

    // Store size of the loaded type, as a constant of the pointer's index
    // width.
    APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getScalarType()),

                  DL.getTypeStoreSize(Load->getScalarType()).getFixedValue());

    if (!isDereferenceableAndAlignedInLoop(

            PtrSCEV, cast<LoadInst>(Load->getUnderlyingInstr())->getAlign(),

            PSE.getSE()->getConstant(EltSize), TheLoop, *PSE.getSE(), DT, AC,

            &Predicates))

      return false;

  }


  // Check GEPs to see if we can link them to a widen IV recipe with a step of

  // 1; we're only interested in contiguous accesses for the condition load

  // right now.

  for (auto *GEP : GEPs) {

    VPValue *MaybeIV = nullptr;

    if (!match(GEP, m_VPInstruction<Instruction::GetElementPtr>(

                        m_LiveIn(), m_VPValue(MaybeIV))))

      return false;


    auto *WIV = dyn_cast<VPWidenInductionRecipe>(MaybeIV);

    if (!WIV)

      return false;


    if (!match(WIV->getStartValue(), m_SpecificInt(0)) ||

        !match(WIV->getStepValue(), m_SpecificInt(1)))

      return false;

  }


  // Find an insertion point. Default to the end of the header but override

  // if we find a memory op that needs masking before the condition load.

  auto InsertIt = HeaderVPBB->end();

  VPRecipeBase *CondR = (*Cond)->getDefiningRecipe();

  bool CondMoveNeeded = CondR->getParent() != HeaderVPBB;

  // Only the first memory-accessing recipe of the header, other than the
  // condition load, is inspected (note the unconditional break).
  for (VPRecipeBase &R : *HeaderVPBB) {

    if (&R == CondLoad)

      continue;


    if (R.mayReadOrWriteMemory()) {

      if (!VPDT.properlyDominates(CondR, &R)) {

        CondMoveNeeded = true;

        InsertIt = R.getIterator();

      }

      break;

    }

  }


  // If another memory operation would take place before the comparison to

  // determine whether to exit early or the comparison doesn't take place in

  // the header, move the comparison (and supporting recipes).

  // NOTE(review): every recipe is moved to the same position, so the resulting
  // order in the header presumably is the reverse of the visiting order here;
  // confirm against the order produced by getRecipesForUncountableExit.
  if (CondMoveNeeded)

    for (auto *Recipe : reverse(ConditionRecipes))

      Recipe->moveBefore(*HeaderVPBB, InsertIt);


  // Create a mask to represent all lanes that fully execute in the vector loop,

  // stopping short of any early exit.

  VPBuilder MaskBuilder(HeaderVPBB, InsertIt);

  VPValue *FirstActive = MaskBuilder.createFirstActiveLane(*Cond);

  // NOTE(review): assumes the first recipe of the header defines the IV;
  // confirm.
  VPValue *IV = cast<VPSingleDefRecipe>(&HeaderVPBB->front());

  Type *IVScalarTy = IV->getScalarType();

  Type *FirstActiveTy = FirstActive->getScalarType();

  VPValue *ALMMultiplier = Plan.getConstantInt(IVScalarTy, 1);

  VPValue *Zero = Plan.getZero(IVScalarTy);

  // The lane index is converted to the IV's scalar type before it is used as
  // an ActiveLaneMask operand and, later, added to the scalar IV.
  FirstActive = MaskBuilder.createScalarZExtOrTrunc(FirstActive, IVScalarTy,

                                                    FirstActiveTy, DebugLoc());

  VPValue *Mask = MaskBuilder.createNaryOp(VPInstruction::ActiveLaneMask,

                                           {Zero, FirstActive, ALMMultiplier},

                                           DebugLoc(), "uncountable.exit.mask");


  // Convert all other memory operations to use the mask.

  for (VPBasicBlock *VPBB : vp_rpo_plain_cfg_loop_body(HeaderVPBB))

    for (VPRecipeBase &R : *VPBB)

      if (R.mayReadOrWriteMemory() && &R != CondLoad) {

        // TODO: Handle conditional memory operations in the loop.

        if (!VPDT.dominates(R.getParent(), LatchVPBB))

          return false;

        cast<VPInstruction>(&R)->addMask(Mask);

      }


  // Update middle block branch to compare (IV + however many lanes were active)

  // against the full trip count, since we may be exiting the vector loop early.

  // If we didn't take an early exit, we should get the equivalent of VF from

  // the FirstActiveLane.

  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->end());

  VPValue *ScalarIV = MiddleBuilder.createNaryOp(VPInstruction::ExtractLane,

                                                 {Zero, IV}, DebugLoc());

  VPValue *ExitIV = MiddleBuilder.createAdd(ScalarIV, FirstActive);

  VPValue *FullTC =

      MiddleBuilder.createICmp(CmpInst::ICMP_EQ, ExitIV, Plan.getTripCount());

  MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {FullTC});


  // Update resume phi in scalar.ph.

  VPBasicBlock *ScalarPH = Plan.getScalarPreheader();

  auto Phis = ScalarPH->phis();

  // TODO: Handle more than one Phi; re-derive from IV.

  // TODO: Handle reductions.

  if (range_size(Phis) != 1)

    return false;

  // The single resume phi has its first operand replaced by the exit IV.
  VPPhi *ContinueIV = cast<VPPhi>(Phis.begin());

  ContinueIV->setOperand(0, ExitIV);

  return true;

}


bool VPlanTransforms::handleUncountableEarlyExits(

    VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB,

    VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE,

    DominatorTree &DT, AssumptionCache *AC, UncountableExitStyle Style) {

  VPDominatorTree VPDT(Plan);

  VPBuilder LatchBuilder(LatchVPBB->getTerminator());

  SmallVector<EarlyExitInfo> Exits;

  for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {

    for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {

      if (Pred == MiddleVPBB)

        continue;

      // Collect condition for this early exit.

      auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);

      VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];

      VPValue *CondOfEarlyExitingVPBB;

      [[maybe_unused]] bool Matched =

          match(EarlyExitingVPBB->getTerminator(),

                m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));

      assert(Matched && "Terminator must be BranchOnCond");


      // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds

      // the correct block mask.

      VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());

      auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(

          VPInstruction::MaskedCond,

          TrueSucc == ExitBlock

              ? CondOfEarlyExitingVPBB

              : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));

      assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||

              !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||

              VPDT.properlyDominates(

                  CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),

                  LatchVPBB)) &&

             "exit condition must dominate the latch");

      Exits.push_back({

          EarlyExitingVPBB,

          ExitBlock,

          CondToEarlyExit,

      });

    }

  }


  assert(!Exits.empty() && "must have at least one early exit");

  // Sort exits by RPO order to get correct program order. RPO gives a

  // topological ordering of the CFG, ensuring upstream exits are checked

  // before downstream exits in the dispatch chain.

  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(

      HeaderVPBB);

  DenseMap<VPBlockBase *, unsigned> RPOIdx;

  for (const auto &[Num, VPB] : enumerate(RPOT))

    RPOIdx[VPB] = Num;

  llvm::sort(Exits, [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {

    return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];

  });

#ifndef NDEBUG

  // After RPO sorting, verify that for any pair where one exit dominates

  // another, the dominating exit comes first. This is guaranteed by RPO

  // (topological order) and is required for the dispatch chain correctness.

  for (unsigned I = 0; I + 1 < Exits.size(); ++I)

    for (unsigned J = I + 1; J < Exits.size(); ++J)

      assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,

                                     Exits[I].EarlyExitingVPBB) &&

             "RPO sort must place dominating exits before dominated ones");

#endif


  // Build the AnyOf condition for the latch terminator using logical OR

  // to avoid poison propagation from later exit conditions when an earlier

  // exit is taken.

  VPValue *Combined = Exits[0].CondToExit;

  for (const EarlyExitInfo &Info : drop_begin(Exits))

    Combined = LatchBuilder.createLogicalOr(Combined, Info.CondToExit);


  VPValue *IsAnyExitTaken =

      LatchBuilder.createNaryOp(VPInstruction::AnyOf, {Combined});


  // Create a comparison for the latch exit condition and replace the

  // BranchOnCond with a BranchOnTwoConds. The original BranchOnCond's condition

  // is used as the latch-exit condition; canonical IV recipes have not been

  // introduced yet, so there is no BranchOnCount to derive the condition from.

  auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());

  assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCond &&

         "Unexpected terminator");

  VPValue *IsLatchExitTaken = LatchExitingBranch->getOperand(0);

  DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();

  LatchExitingBranch->eraseFromParent();

  LatchBuilder.setInsertPoint(LatchVPBB);

  LatchBuilder.createNaryOp(VPInstruction::BranchOnTwoConds,

                            {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);

  LatchVPBB->clearSuccessors();


  if (Style == UncountableExitStyle::MaskedHandleExitInScalarLoop) {

    // If handling the exiting lane in the scalar loop, combine the exit

    // conditions into a single BranchOnCond.

    LatchVPBB->setSuccessors({MiddleVPBB, MiddleVPBB, HeaderVPBB});

    MiddleVPBB->clearPredecessors();

    MiddleVPBB->setPredecessors({LatchVPBB, LatchVPBB});

    return handleUncountableExitsWithSideEffects(Plan, Exits, HeaderVPBB,

                                                 LatchVPBB, MiddleVPBB, TheLoop,

                                                 PSE, DT, AC, VPDT);

  }


  // Create the vector.early.exit blocks.

  SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());

  for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {

    Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);

    VPBasicBlock *VectorEarlyExitVPBB =

        Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);

    VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;

  }


  // Create the dispatch block (or reuse the single exit block if only one

  // exit). The dispatch block computes the first active lane of the combined

  // condition and, for multiple exits, chains through conditions to determine

  // which exit to take.

  VPBasicBlock *DispatchVPBB =

      Exits.size() == 1 ? VectorEarlyExitVPBBs[0]

                        : Plan.createVPBasicBlock("vector.early.exit.check");

  DispatchVPBB->setPredecessors({LatchVPBB});

  LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});

  VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());

  VPValue *FirstActiveLane = DispatchBuilder.createFirstActiveLane(

      {Combined}, DebugLoc::getUnknown(), "first.active.lane");


  // For each early exit, disconnect the original exiting block

  // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a

  // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their

  // values at the first active lane:

  //

  // Input:

  //  early.exiting.I:

  //     ...

  //    EMIT branch-on-cond vp<%cond.I>

  //  Successor(s): in.loop.succ, ir-bb<exit.I>

  //

  //  ir-bb<exit.I>:

  //    IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...

  //

  // Output:

  //  early.exiting.I:

  //    ...

  //  Successor(s): in.loop.succ

  //

  //  vector.early.exit.I:

  //    EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>

  //  Successor(s): ir-bb<exit.I>

  //

  //  ir-bb<exit.I>:

  //    IR %phi = phi ... (extra operand: vp<%exit.val> from

  //                                      vector.early.exit.I)

  //

  for (auto [Exit, VectorEarlyExitVPBB] :

       zip_equal(Exits, VectorEarlyExitVPBBs)) {

    auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;

    // Adjust the phi nodes in EarlyExitVPBB.

    //   1. remove incoming values from EarlyExitingVPBB,

    //   2. extract the incoming value at FirstActiveLane

    //   3. add back the extracts as last operands for the phis

    // Then adjust the CFG, removing the edge between EarlyExitingVPBB and

    // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and

    // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming

    // values from VectorEarlyExitVPBB.

    for (VPRecipeBase &R : EarlyExitVPBB->phis()) {

      auto *ExitIRI = cast<VPIRPhi>(&R);

      VPValue *IncomingVal =

          ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);

      VPValue *NewIncoming = IncomingVal;

      if (!isa<VPIRValue>(IncomingVal)) {

        VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);

        NewIncoming = EarlyExitBuilder.createNaryOp(

            VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},

            DebugLoc::getUnknown(), "early.exit.value");

      }

      ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);

      ExitIRI->addIncoming(NewIncoming);

    }


    EarlyExitingVPBB->getTerminator()->eraseFromParent();

    VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);

    VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);

  }


  // Chain through exits: for each exit, check if its condition is true at

  // the first active lane. If so, take that exit; otherwise, try the next.

  // The last exit needs no check since it must be taken if all others fail.

  //

  // For 3 exits (cond.0, cond.1, cond.2), this creates:

  //

  // latch:

  //   ...

  //   EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>

  //   ...

  //

  // vector.early.exit.check:

  //   EMIT vp<%first.lane> = first-active-lane vp<%combined>

  //   EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>

  //   EMIT branch-on-cond vp<%at.cond.0>

  // Successor(s): vector.early.exit.0, vector.early.exit.check.0

  //

  // vector.early.exit.check.0:

  //   EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>

  //   EMIT branch-on-cond vp<%at.cond.1>

  // Successor(s): vector.early.exit.1, vector.early.exit.2

  VPBasicBlock *CurrentBB = DispatchVPBB;

  for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {

    VPValue *LaneVal = DispatchBuilder.createNaryOp(

        VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},

        DebugLoc::getUnknown(), "exit.cond.at.lane");


    // For the last dispatch, branch directly to the last exit on false;

    // otherwise, create a new check block.

    bool IsLastDispatch = (I + 2 == Exits.size());

    VPBasicBlock *FalseBB =

        IsLastDispatch ? VectorEarlyExitVPBBs.back()

                       : Plan.createVPBasicBlock(

                             Twine("vector.early.exit.check.") + Twine(I));


    DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});

    CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});

    VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});

    FalseBB->setPredecessors({CurrentBB});


    CurrentBB = FalseBB;

    DispatchBuilder.setInsertPoint(CurrentBB);

  }


  return true;

}


/// Try to bundle an in-loop reduction whose vector operand is a widened
/// zero- or sign-extend, i.e. reduce(ext(A)), into a VPExpressionRecipe.
/// The profitability decision is taken per VF through
/// LoopVectorizationPlanner::getDecisionAndClampRange, which may clamp
/// \p Range. The created recipe must be decomposed to its constituent
/// recipes before execution.
///
/// \returns the newly created recipe (this function does not insert it into
/// a block), or nullptr if the pattern does not match or the bundled form is
/// not considered cheaper.
static VPExpressionRecipe *
tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
                                     VFRange &Range) {
  assert(!Red->isPartialReduction() &&
         "This path does not support partial reductions");

  // Match reduce(ext(ExtSrc)); anything else is not handled here.
  VPValue *VecOp = Red->getVecOp();
  VPValue *ExtSrc;
  if (!match(VecOp, m_Isa<VPWidenCastRecipe>(m_ZExtOrSExt(m_VPValue(ExtSrc)))))
    return nullptr;

  auto *Ext = cast<VPWidenCastRecipe>(VecOp);
  Type *RedTy = Red->getScalarType();
  Type *SrcTy = ExtSrc->getScalarType();
  const unsigned Opcode =
      RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
  const bool IsUnsigned = Ext->getOpcode() == Instruction::CastOps::ZExt;

  // The extended reduction is used for a VF only if the target reports a
  // valid cost that is strictly lower than the cost of the separate extend
  // and reduction recipes.
  auto IsCheaperForVF = [&](ElementCount VF) {
    auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
    InstructionCost ExtCost = Ext->computeCost(VF, Ctx);
    InstructionCost RedCost = Red->computeCost(VF, Ctx);

    assert(!RedTy->isFloatingPointTy() &&
           "getExtendedReductionCost only supports integer types");
    InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost(
        Opcode, IsUnsigned, RedTy, SrcVecTy, Red->getFastMathFlags(),
        TTI::TCK_RecipThroughput);
    return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(IsCheaperForVF,
                                                          Range))
    return nullptr;

  return new VPExpressionRecipe(Ext, Red);
}


/// This function tries to convert multiply-accumulate in-loop reductions to
/// VPExpressionRecipe and clamp the \p Range if it is beneficial
/// and valid. The created VPExpressionRecipe must be decomposed to its
/// constituent recipes before execution. Patterns of the
/// VPExpressionRecipe:
///   reduce.add(mul(...)),
///   reduce.add(mul(ext(A), ext(B))),
///   reduce.add(ext(mul(ext(A), ext(B)))).
///   reduce.fadd(fmul(ext(A), ext(B)))
///
/// NOTE(review): FAdd passes the opcode filter below, but reductions with a
/// floating-point scalar type are rejected before any pattern is matched, so
/// the fadd/fmul pattern listed above does not appear to be formed by this
/// function as written - confirm.
///
/// Side effects besides clamping \p Range: when a mul has an extended operand
/// and a suitable constant operand, the constant is rewritten in place to
/// ext(trunc(const)) (see ExtendAndReplaceConstantOp); for the
/// ext(mul(ext, ext)) pattern, wider extends and a new mul are inserted and
/// the old outer extend and mul are erased.
///
/// \returns the newly created recipe (this function does not insert it into
/// a block), or nullptr if nothing matched or the bundled form is not
/// considered cheaper.
static VPExpressionRecipe *
tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
                                          VPCostContext &Ctx, VFRange &Range) {
  // Only Add, Sub and FAdd reductions are considered.
  unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
  if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
      Opcode != Instruction::FAdd)
    return nullptr;

  assert(!Red->isPartialReduction() &&
         "This path does not support partial reductions");
  Type *RedTy = Red->getScalarType();

  // Clamp the range if using multiply-accumulate-reduction is profitable.
  // Ext0/Ext1 are the (optional) extends feeding Mul, OuterExt the (optional)
  // extend of Mul's result. The decision for a VF is true only if the target
  // reports a valid mul-acc reduction cost that is strictly lower than the
  // summed cost of the separate extend(s), mul and reduction recipes.
  auto IsMulAccValidAndClampRange =
      [&](VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
          VPWidenCastRecipe *OuterExt) -> bool {
    return LoopVectorizationPlanner::getDecisionAndClampRange(
        [&](ElementCount VF) {
          TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
          // Source type is the type being extended when Ext0 is given,
          // otherwise the reduction type itself.
          Type *SrcTy = Ext0 ? Ext0->getOperand(0)->getScalarType() : RedTy;
          InstructionCost MulAccCost;

          // getMulAccReductionCost for in-loop reductions does not support
          // mixed or floating-point extends.
          if (Ext0 && Ext1 &&
              (Ext0->getOpcode() != Ext1->getOpcode() ||
               Ext0->getOpcode() == Instruction::CastOps::FPExt))
            return false;

          // Without an extend the query is made as if zero-extended.
          bool IsZExt =
              !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
          auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
          MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
                                                      SrcVecTy, CostKind);

          InstructionCost MulCost = Mul->computeCost(VF, Ctx);
          InstructionCost RedCost = Red->computeCost(VF, Ctx);
          InstructionCost ExtCost = 0;
          if (Ext0)
            ExtCost += Ext0->computeCost(VF, Ctx);
          if (Ext1)
            ExtCost += Ext1->computeCost(VF, Ctx);
          if (OuterExt)
            ExtCost += OuterExt->computeCost(VF, Ctx);

          return MulAccCost.isValid() &&
                 MulAccCost < ExtCost + MulCost + RedCost;
        },
        Range);
  };

  VPValue *VecOp = Red->getVecOp();
  VPRecipeBase *Sub = nullptr;
  VPValue *A, *B;
  VPValue *Tmp = nullptr;

  // Floating-point reduction types are rejected here, after the opcode filter
  // above let FAdd through.
  if (RedTy->isFloatingPointTy())
    return nullptr;

  // Sub reductions could have a sub between the add reduction and vec op.
  // If the vector operand is (0 - Tmp), remember the sub recipe and continue
  // matching on Tmp.
  if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
    Sub = VecOp->getDefiningRecipe();
    VecOp = Tmp;
  }

  // If ValB is a constant and can be safely extended, truncate it to the same
  // type as ExtA's operand, then extend it to the same type as ExtA. This
  // creates two uniform extends that can more easily be matched by the rest of
  // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
  // replaced with the new extend of the constant.
  // Does nothing unless ExtA is set, ExtB is not, and ValB is a VPIRValue.
  auto ExtendAndReplaceConstantOp = [](VPWidenCastRecipe *ExtA,
                                       VPWidenCastRecipe *&ExtB, VPValue *&ValB,
                                       VPWidenRecipe *Mul) {
    if (!ExtA || ExtB || !isa<VPIRValue>(ValB))
      return;
    Type *NarrowTy = ExtA->getOperand(0)->getScalarType();
    Instruction::CastOps ExtOpc = ExtA->getOpcode();
    const APInt *Const;
    if (!match(ValB, m_APInt(Const)) ||
        !llvm::canConstantBeExtended(
            Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
      return;
    // The truncate ensures that the type of each extended operand is the
    // same, and it's been proven that the constant can be extended from
    // NarrowTy safely. Necessary since ExtA's extended operand would be
    // e.g. an i8, while the const will likely be an i32. This will be
    // elided by later optimisations.
    VPBuilder Builder(Mul);
    auto *Trunc =
        Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
    Type *WideTy = ExtA->getScalarType();
    ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
    Mul->setOperand(1, ExtB);
  };

  // Try to match reduce.add(mul(...)).
  if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
    auto *RecipeA = dyn_cast<VPWidenCastRecipe>(A);
    auto *RecipeB = dyn_cast<VPWidenCastRecipe>(B);
    auto *Mul = cast<VPWidenRecipe>(VecOp);

    // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
    // Note: this runs before any cost check, so the casts it inserts and the
    // rewritten mul operand remain even if no expression recipe is created.
    ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);

    // Match reduce.add/sub(mul(ext, ext)).
    if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
        match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
        IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
      // With a negation in between, the sub recipe is bundled as well.
      if (Sub)
        return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
                                      cast<VPWidenRecipe>(Sub), Red);
      return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
    }
    // Plain reduce.add(mul(A, B)) without extends; not done when negated.
    // TODO: Add an expression type for this variant with a negated mul
    if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
      return new VPExpressionRecipe(Mul, Red);
  }
  // TODO: Add an expression type for negated versions of other expression
  // variants.
  if (Sub)
    return nullptr;

  // Match reduce.add(ext(mul(A, B))).
  if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
    auto *Ext = cast<VPWidenCastRecipe>(VecOp);
    auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
    auto *Ext0 = dyn_cast<VPWidenCastRecipe>(A);
    auto *Ext1 = dyn_cast<VPWidenCastRecipe>(B);

    // reduce.add(ext(mul(ext, const)))
    // -> reduce.add(ext(mul(ext, ext(const))))
    ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);

    // reduce.add(ext(mul(ext(A), ext(B))))
    // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
    // The inner extends must either have the same opcode as the outer extend or
    // be the same, in which case the multiply can never result in a negative
    // value and the outer extend can be folded away by doing wider
    // extends for the operands of the mul.
    // NOTE(review): Mul->hasOneUse() is evaluated after
    // IsMulAccValidAndClampRange, so Range may already have been clamped when
    // the one-use check fails and no recipe is created - confirm intended.
    if (Ext0 && Ext1 &&
        (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
        Ext0->getOpcode() == Ext1->getOpcode() &&
        IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
      // Re-create the first inner extend so that it produces the outer
      // extend's result type directly; the original Ext0 is left in place.
      auto *NewExt0 = new VPWidenCastRecipe(
          Ext0->getOpcode(), Ext0->getOperand(0), Ext->getScalarType(), nullptr,
          *Ext0, *Ext0, Ext0->getDebugLoc());
      NewExt0->insertBefore(Ext0);

      // When both mul operands are the same extend, the single widened extend
      // serves as both operands; otherwise widen the second extend as well.
      VPWidenCastRecipe *NewExt1 = NewExt0;
      if (Ext0 != Ext1) {
        NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
                                        Ext->getScalarType(), nullptr, *Ext1,
                                        *Ext1, Ext1->getDebugLoc());
        NewExt1->insertBefore(Ext1);
      }
      // Replace ext(mul(...)) by a mul of the widened extends, then drop the
      // now-unused outer extend and original mul.
      auto *NewMul = Mul->cloneWithOperands({NewExt0, NewExt1});
      NewMul->insertBefore(Mul);
      Ext->replaceAllUsesWith(NewMul);
      Ext->eraseFromParent();
      Mul->eraseFromParent();
      return new VPExpressionRecipe(NewExt0, NewExt1, NewMul, Red);
    }
  }
  return nullptr;
}


/// Try to replace the in-loop reduction \p Red by an abstract
/// VPExpressionRecipe, for use by later optimizations and cost estimation.
/// A multiply-accumulate expression is attempted first, then an extended
/// reduction. On success the expression is inserted at the position right
/// after where \p Red was found and takes over all of \p Red's uses; on
/// failure nothing is inserted.
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
                                               VPCostContext &Ctx,
                                               VFRange &Range) {
  // Creation of VPExpressions for partial reductions is entirely handled in
  // transformToPartialReduction.
  assert(!Red->isPartialReduction() &&
         "This path does not support partial reductions");

  // Record the insertion position before running the matchers.
  // NOTE(review): presumably needed because building the expression recipe
  // may detach Red from its block - confirm.
  auto *ParentVPBB = Red->getParent();
  auto InsertPt = std::next(Red->getIterator());

  VPExpressionRecipe *Expr =
      tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range);
  if (!Expr)
    Expr = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range);

  // Cannot create abstract inloop reduction recipes.
  if (!Expr)
    return;

  Expr->insertBefore(*ParentVPBB, InsertPt);
  Red->replaceAllUsesWith(Expr);
}


/// Visit every VPBasicBlock reachable (deep traversal) from the vector loop
/// region of \p Plan and try to turn each VPReductionRecipe found there into
/// an abstract expression recipe. \p Range may be clamped in the process.
void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
                                               VFRange &Range) {
  auto LoopVPBBs = VPBlockUtils::blocksOnly<VPBasicBlock>(
      vp_depth_first_deep(Plan.getVectorLoopRegion()));
  for (VPBasicBlock *VPBB : LoopVPBBs) {
    // Early-increment iteration: the conversion may add or remove recipes
    // around the current one.
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      auto *Red = dyn_cast<VPReductionRecipe>(&R);
      if (!Red)
        continue;
      tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
    }
  }
}


/// Insert explicit VPInstruction::Broadcast recipes for values that are
/// available before the loop but have users that do not use them as scalars.
/// Candidates are the backedge-taken count (if present), the plan's live-ins
/// and all values defined in the plan's entry block. Nothing is done for
/// plans that only have a scalar VF.
void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
  if (Plan.hasScalarVFOnly())
    return;

#ifndef NDEBUG
  // Only needed for the dominance assertion below.
  VPDominatorTree VPDT(Plan);
#endif

  SmallVector<VPValue *> Candidates;
  if (VPValue *BTC = Plan.getBackedgeTakenCount())
    Candidates.push_back(BTC);
  append_range(Candidates, Plan.getLiveIns());
  for (VPRecipeBase &EntryR : *Plan.getEntry())
    append_range(Candidates, EntryR.definedValues());

  auto *VectorPH = Plan.getVectorPreheader();
  for (VPValue *V : Candidates) {
    // No broadcast is created when only scalar values are used, or when the
    // value is an IR constant.
    if (vputils::onlyScalarValuesUsed(V))
      continue;
    if (isa<VPIRValue>(V) && isa<Constant>(V->getLiveInIRValue()))
      continue;

    // Add explicit broadcast at the insert point that dominates all users:
    // the end of the vector preheader by default, or its beginning as soon as
    // one of the vector users lives in the preheader itself.
    VPBasicBlock::iterator InsertPt = VectorPH->end();
    for (VPUser *U : V->users()) {
      if (U->usesScalars(V))
        continue;
      VPBasicBlock *UserVPBB = cast<VPRecipeBase>(U)->getParent();
      if (UserVPBB == VectorPH) {
        InsertPt = VectorPH->begin();
        continue;
      }
      assert(VPDT.dominates(VectorPH, UserVPBB) &&
             "All users must be in the vector preheader or dominated by it");
    }

    VPBuilder Builder(VectorPH, InsertPt);
    auto *Bcast = Builder.createNaryOp(VPInstruction::Broadcast, {V});
    // Redirect every non-scalar use to the broadcast, except the broadcast's
    // own use of the original value.
    V->replaceUsesWithIf(Bcast, [V, Bcast](VPUser &U, unsigned) {
      return Bcast != &U && !U.usesScalars(V);
    });
  }
}


// Compute the metadata common to a group of replicate recipes: start from a
// copy of the first recipe's metadata and intersect it with the metadata of
// every remaining recipe. \p Recipes must not be empty.
static VPIRMetadata getCommonMetadata(ArrayRef<VPReplicateRecipe *> Recipes) {
  VPIRMetadata Shared = *Recipes.front();
  for (size_t Idx = 1, End = Recipes.size(); Idx < End; ++Idx)
    Shared.intersect(*Recipes[Idx]);
  return Shared;
}


// Collect groups of predicated replicate loads (Opcode == Load) or stores
// (Opcode == Store) that contain at least one pair of complementary masks.
//
// Candidate lists come from collectGroupedReplicateMemOps, restricted to
// predicated recipes; lists with fewer than two entries are ignored. Within a
// list, the first still-unclaimed recipe acts as seed and claims every other
// unclaimed recipe of the list. The resulting group is kept only if the mask
// of at least one claimed recipe is the negation of the seed's mask, or the
// other way round.
template <unsigned Opcode>
static SmallVector<SmallVector<VPReplicateRecipe *, 4>>
collectComplementaryPredicatedMemOps(VPlan &Plan,
                                     PredicatedScalarEvolution &PSE,
                                     const Loop *L) {
  static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
                "Only Load and Store opcodes supported");
  [[maybe_unused]] constexpr bool IsLoad = (Opcode == Instruction::Load);

  SmallVector<SmallVector<VPReplicateRecipe *, 4>> Result;
  auto CandidateLists = collectGroupedReplicateMemOps<Opcode>(
      Plan, PSE, L,
      [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });

  // Each list is deliberately iterated by value: entries of the local copy
  // are set to nullptr once they have been claimed by a group.
  for (auto Candidates : CandidateLists) {
    if (Candidates.size() < 2)
      continue;

    assert(all_equal(map_range(Candidates,
                               bind_back<getLoadStoreValueType>(IsLoad))) &&
           "Expected all recipes in group to have the same load-store type");

    for (VPReplicateRecipe *&Seed : Candidates) {
      if (!Seed)
        continue;

      VPValue *SeedMask = Seed->getMask();
      SmallVector<VPReplicateRecipe *, 4> Group;
      Group.push_back(Seed);
      Seed = nullptr;

      // Claim all remaining recipes, noting whether any of them has a mask
      // complementary to the seed's, that is M1 == NOT(M2) or M2 == NOT(M1).
      bool SawComplement = false;
      for (VPReplicateRecipe *&Other : Candidates) {
        if (!Other)
          continue;

        VPValue *OtherMask = Other->getMask();
        if (match(SeedMask, m_Not(m_Specific(OtherMask))) ||
            match(OtherMask, m_Not(m_Specific(SeedMask))))
          SawComplement = true;
        Group.push_back(Other);
        Other = nullptr;
      }

      if (!SawComplement)
        continue;
      assert(Group.size() >= 2 && "must have at least 2 entries");
      Result.push_back(std::move(Group));
    }
  }

  return Result;
}


// Return the recipe of \p Group whose underlying instruction (of type
// InstType) has the smallest alignment. If several recipes share the smallest
// alignment, the first of them in \p Group is returned. \p Group must not be
// empty.
template <typename InstType>
static VPReplicateRecipe *
findRecipeWithMinAlign(ArrayRef<VPReplicateRecipe *> Group) {
  auto AlignOf = [](VPReplicateRecipe *R) {
    return cast<InstType>(R->getUnderlyingInstr())->getAlign();
  };

  VPReplicateRecipe *Best = Group.front();
  for (VPReplicateRecipe *Candidate : drop_begin(Group))
    if (AlignOf(Candidate) < AlignOf(Best))
      Best = Candidate;
  return Best;
}


/// Replace each group of predicated loads returned by
/// collectComplementaryPredicatedMemOps with a single unpredicated load
/// placed right before the first load of the group. A group is left untouched
/// if its memory location cannot be determined or the no-alias check between
/// the blocks of its first and last member fails.
void VPlanTransforms::hoistPredicatedLoads(VPlan &Plan,
                                           PredicatedScalarEvolution &PSE,
                                           const Loop *L) {
  auto LoadGroups =
      collectComplementaryPredicatedMemOps<Instruction::Load>(Plan, PSE, L);

  for (auto &Loads : LoadGroups) {
    // The first member is treated as the earliest (most dominating) load and
    // becomes the anchor for the replacement.
    VPReplicateRecipe *Earliest = Loads.front();

    // Check that the load doesn't alias with stores between first and last.
    auto Loc = vputils::getMemoryLocation(*Earliest);
    if (!Loc)
      continue;
    if (!canHoistOrSinkWithNoAliasCheck(*Loc, Earliest->getParent(),
                                        Loads.back()->getParent()))
      continue;

    const bool IsSingleScalar = Earliest->isSingleScalar();
    assert(all_of(Loads,
                  [IsSingleScalar](VPReplicateRecipe *R) {
                    return R->isSingleScalar() == IsSingleScalar;
                  }) &&
           "all members in group must agree on IsSingleScalar");

    // The replacement keeps only the metadata common to all members and is
    // based on the underlying instruction with the smallest alignment.
    VPIRMetadata SharedMD = getCommonMetadata(Loads);
    VPReplicateRecipe *MinAlignLoad = findRecipeWithMinAlign<LoadInst>(Loads);

    // Build the unpredicated load from the earliest load's address operand.
    auto *Hoisted = new VPReplicateRecipe(
        MinAlignLoad->getUnderlyingInstr(), {Earliest->getOperand(0)},
        IsSingleScalar, /*Mask=*/nullptr, *Earliest, SharedMD);
    Hoisted->insertBefore(Earliest);

    // Every load of the group is now redundant.
    for (VPReplicateRecipe *Load : Loads) {
      Load->replaceAllUsesWith(Hoisted);
      Load->eraseFromParent();
    }
  }
}


// Return true if the stores in \p StoresToSink can be sunk according to
// canHoistOrSinkWithNoAliasCheck, queried for the memory location of the
// first store over the range between the first and the last store's blocks.
// Returns false if that location is unknown or carries no AATags.Scope
// metadata. \p StoresToSink must not be empty.
static bool
canSinkStoreWithNoAliasCheck(ArrayRef<VPReplicateRecipe *> StoresToSink,
                             PredicatedScalarEvolution &PSE, const Loop &L) {
  VPReplicateRecipe *First = StoresToSink.front();
  auto Loc = vputils::getMemoryLocation(*First);
  if (!Loc)
    return false;
  if (!Loc->AATags.Scope)
    return false;

  // When sinking a group of stores, all members of the group alias each other.
  // Hand them to the alias check so that they can be skipped.
  SmallPtrSet<VPRecipeBase *, 4> GroupMembers(StoresToSink.begin(),
                                              StoresToSink.end());
  SinkStoreInfo Info(GroupMembers, *First, PSE, L);
  return canHoistOrSinkWithNoAliasCheck(*Loc, First->getParent(),
                                        StoresToSink.back()->getParent(), Info);
}


/// Replace each group of predicated stores returned by
/// collectComplementaryPredicatedMemOps with a single unpredicated store at
/// the position of the group's last store. The stored value is chosen by a
/// chain of selects over the members' masks. Groups that fail
/// canSinkStoreWithNoAliasCheck are left untouched.
void VPlanTransforms::sinkPredicatedStores(VPlan &Plan,
                                           PredicatedScalarEvolution &PSE,
                                           const Loop *L) {
  auto StoreGroups =
      collectComplementaryPredicatedMemOps<Instruction::Store>(Plan, PSE, L);

  for (auto &Stores : StoreGroups) {
    if (!canSinkStoreWithNoAliasCheck(Stores, PSE, *L))
      continue;

    // The last (most dominated) store determines where the unconditional
    // store goes and which address operand it uses.
    VPReplicateRecipe *FirstStore = Stores.front();
    VPReplicateRecipe *LastStore = Stores.back();
    VPBasicBlock *SinkVPBB = LastStore->getParent();

    // Only alias metadata common to all stores of the group is kept.
    VPIRMetadata SharedMD = getCommonMetadata(Stores);

    // Build the select chain right before the last store: start with the
    // first store's value and, for each later store, pick that store's value
    // where its mask is set.
    VPBuilder Builder(SinkVPBB, LastStore->getIterator());
    const bool IsSingleScalar = FirstStore->isSingleScalar();
    VPValue *ValueToStore = FirstStore->getOperand(0);
    for (VPReplicateRecipe *Store : drop_begin(Stores)) {
      assert(IsSingleScalar == Store->isSingleScalar() &&
             "all members in group must agree on IsSingleScalar");
      ValueToStore =
          Builder.createSelect(Store->getMask(), Store->getOperand(0),
                               ValueToStore, Store->getDebugLoc());
    }

    // The new store is based on the underlying instruction with the smallest
    // alignment in the group.
    VPReplicateRecipe *MinAlignStore =
        findRecipeWithMinAlign<StoreInst>(Stores);
    auto *SunkStore = new VPReplicateRecipe(
        MinAlignStore->getUnderlyingInstr(),
        {ValueToStore, LastStore->getOperand(1)}, IsSingleScalar,
        /*Mask=*/nullptr, *LastStore, SharedMD);
    SunkStore->insertBefore(*SinkVPBB, LastStore->getIterator());

    // The predicated stores are now all covered by the unconditional one.
    for (VPReplicateRecipe *Store : Stores)
      Store->eraseFromParent();
  }
}


/// If the trip count of \p Plan is a constant, compute the vector trip count
/// for \p BestVF and \p BestUF with ScalarEvolution and, when that folds to a
/// constant as well, set it as the underlying value of the plan's vector trip
/// count. Does nothing if the trip count has no users or in the cases skipped
/// below.
void VPlanTransforms::materializeConstantVectorTripCount(
    VPlan &Plan, ElementCount BestVF, unsigned BestUF,
    PredicatedScalarEvolution &PSE) {
  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
  assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");

  VPValue *TC = Plan.getTripCount();
  if (TC->getNumUsers() == 0)
    return;

  // Skip cases for which the trip count may be non-trivial to materialize.
  // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
  // tail is required (the middle block's only successor is the scalar
  // preheader).
  if (!Plan.hasScalarTail())
    return;
  if (Plan.getMiddleBlock()->getSingleSuccessor() == Plan.getScalarPreheader())
    return;
  // Only trip counts that wrap an IR value can be handed to SCEV below.
  if (!isa<VPIRValue>(TC))
    return;

  // Materialize vector trip counts for constants early if it can simply
  // be computed as (Original TC udiv (VF * UF)) * (VF * UF).
  // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
  // tail-folded loops.
  ScalarEvolution &SE = *PSE.getSE();
  auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
  if (!isa<SCEVConstant>(TCScev))
    return;

  const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
  auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
  auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev);
  if (!ConstVecTC)
    return;
  Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
}


/// Replace all uses of the plan's backedge-taken count with an explicit
/// "trip count - 1" computation emitted at the start of \p VectorPH. Nothing
/// is emitted when the backedge-taken count has no users.
void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
                                                    VPBasicBlock *VectorPH) {
  VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
  if (BTC->getNumUsers() == 0)
    return;

  VPValue *TC = Plan.getTripCount();
  auto *One = Plan.getConstantInt(TC->getScalarType(), 1);

  VPBuilder Builder(VectorPH, VectorPH->begin());
  auto *TCMinusOne = Builder.createSub(
      TC, One, DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
  BTC->replaceAllUsesWith(TCMinusOne);
}


/// Make the conversions between per-lane scalar values and vector values
/// explicit in \p Plan: insert Build(Struct)Vector instructions after
/// replicating definitions that have users needing a vector, and Unpack
/// instructions after other definitions in the loop region that have scalar
/// users. Nothing is done for plans that only have a scalar VF.
void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
  if (Plan.hasScalarVFOnly())
    return;

  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  auto OuterVPBBs = VPBlockUtils::blocksOnly<VPBasicBlock>(
      vp_depth_first_shallow(Plan.getEntry()));
  auto LoopVPBBs = VPBlockUtils::blocksOnly<VPBasicBlock>(
      vp_depth_first_shallow(LoopRegion->getEntry()));

  // Step 1: materialize Build(Struct)Vector for all replicating
  // VPReplicateRecipes, VPScalarIVStepsRecipes and VPInstructions. Both
  // traversals are shallow, so recipes inside replicate regions are not
  // visited; those are not materialized explicitly yet.
  // TODO: materialize build vectors for replicating recipes in replicating
  // regions.
  for (VPBasicBlock *VPBB : concat<VPBasicBlock *>(OuterVPBBs, LoopVPBBs)) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      if (!isa<VPScalarIVStepsRecipe, VPReplicateRecipe, VPInstruction>(&R))
        continue;
      auto *Def = cast<VPSingleDefRecipe>(&R);

      // Single-scalar replicate recipes are skipped.
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
          RepR && RepR->isSingleScalar())
        continue;

      // So are VPInstructions for which only the first lane is used or which
      // do not generate a value per lane.
      if (auto *VPI = dyn_cast<VPInstruction>(Def);
          VPI && (vputils::onlyFirstLaneUsed(Def) ||
                  !VPI->doesGeneratePerAllLanes()))
        continue;

      // A user needs the packed value if it does not use the scalars of Def,
      // or if it is located in a region other than the loop region.
      auto NeedsPackedValue = [Def, LoopRegion](VPUser *U) {
        VPRegionBlock *UserRegion = cast<VPRecipeBase>(U)->getRegion();
        return !U->usesScalars(Def) || UserRegion != LoopRegion;
      };
      if (none_of(Def->users(), NeedsPackedValue))
        continue;

      const bool IsStruct = Def->getScalarType()->isStructTy();
      unsigned PackOpcode = IsStruct ? VPInstruction::BuildStructVector
                                     : VPInstruction::BuildVector;
      auto *Pack = new VPInstruction(PackOpcode, {Def});
      Pack->insertAfter(Def);

      // Redirect all qualifying users to the pack, except the pack itself,
      // which must keep using Def.
      Def->replaceUsesWithIf(Pack,
                             [Pack, &NeedsPackedValue](VPUser &U, unsigned) {
                               return &U != Pack && NeedsPackedValue(&U);
                             });
    }
  }

  // Step 2: create explicit VPInstructions to convert vectors to scalars. The
  // current implementation is conservative - it may miss some cases that may
  // or may not be vector values.
  // TODO: introduce Unpacks speculatively - remove them later if they are
  // known to operate on scalar values.
  for (VPBasicBlock *VPBB : LoopVPBBs) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      if (isa<VPReplicateRecipe, VPInstruction, VPScalarIVStepsRecipe,
              VPDerivedIVRecipe>(&R))
        continue;

      for (VPValue *Def : R.definedValues()) {
        // Skip values that are single-scalar or only have their first lane
        // used.
        // TODO: The Defs skipped here may or may not be vector values.
        // Introduce Unpacks, and remove them later, if they are guaranteed to
        // produce scalar values.
        if (vputils::isSingleScalar(Def))
          continue;
        if (vputils::onlyFirstLaneUsed(Def))
          continue;

        // At the moment, unpacks are created only for scalar users outside
        // replicate regions. Recipes inside replicate regions still extract
        // the required lanes implicitly.
        // TODO: Remove once replicate regions are unrolled completely.
        auto WantsUnpackedValue = [Def](VPUser *U) {
          VPRegionBlock *UserRegion = cast<VPRecipeBase>(U)->getRegion();
          return U->usesScalars(Def) &&
                 (!UserRegion || !UserRegion->isReplicator());
        };
        if (none_of(Def->users(), WantsUnpackedValue))
          continue;

        auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
        // An unpack of a phi is placed at the first non-phi position of the
        // block; otherwise it directly follows the defining recipe.
        if (R.isPhi())
          Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
        else
          Unpack->insertAfter(&R);

        Def->replaceUsesWithIf(Unpack,
                               [&WantsUnpackedValue](VPUser &U, unsigned) {
                                 return WantsUnpackedValue(&U);
                               });
      }
    }
  }
}


/// Replace all uses of the symbolic vector trip count of \p Plan with an
/// explicit computation emitted in \p VectorPHVPBB.
///
/// \param TailByMasking If true, the trip count is first rounded up to a
///        multiple of \p Step (tail folding).
/// \param RequiresScalarEpilogue If true, a remainder of zero is replaced by
///        \p Step, so that the computed vector trip count is always smaller
///        than the trip count.
/// \param Step The step of the vector loop.
/// \param MaxRuntimeStep If set, the maximum possible runtime value of the
///        step; used to fold the vector trip count to the trip count itself
///        for suitable constant trip counts.
///
/// Does nothing if the vector trip count has no users or already has an
/// underlying IR value.
void VPlanTransforms::materializeVectorTripCount(
    VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking,
    bool RequiresScalarEpilogue, VPValue *Step,
    std::optional<uint64_t> MaxRuntimeStep) {
  VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
  // There's nothing to do if there are no users of the vector trip count or its
  // IR value has already been set.
  if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
    return;

  VPValue *TC = Plan.getTripCount();
  Type *TCTy = TC->getScalarType();
  VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
  if (auto *StepR = Step->getDefiningRecipe()) {
    assert(VPDominatorTree(Plan).dominates(StepR->getParent(), VectorPHVPBB) &&
           "Step VPBB must dominate VectorPHVPBB");
    // If Step is defined in VectorPHVPBB itself, insert after its definition
    // to maintain valid def-use ordering. If it is defined in another
    // (dominating) block, its iterator does not belong to VectorPHVPBB and
    // must not be used as insertion point there; the start of VectorPHVPBB is
    // already dominated by the definition.
    if (StepR->getParent() == VectorPHVPBB)
      InsertPt = std::next(StepR->getIterator());
  }
  VPBuilder Builder(VectorPHVPBB, InsertPt);

  // For scalable steps, if TC is a constant and is divisible by the maximum
  // possible runtime step, then TC % Step == 0 for all valid vscale values
  // and the vector trip count equals TC directly. APInt::urem is used so that
  // constants wider than 64 bits do not trip the getZExtValue() assertion.
  const APInt *TCVal;
  if (!RequiresScalarEpilogue && match(TC, m_APInt(TCVal)) && MaxRuntimeStep &&
      TCVal->urem(*MaxRuntimeStep) == 0) {
    VectorTC.replaceAllUsesWith(TC);
    return;
  }

  // If the tail is to be folded by masking, round the number of iterations N
  // up to a multiple of Step instead of rounding down. This is done by first
  // adding Step-1 and then rounding down. Note that it's ok if this addition
  // overflows: the vector induction variable will eventually wrap to zero given
  // that it starts at zero and its Step is a power of two; the loop will then
  // exit, with the last early-exit vector comparison also producing all-true.
  if (TailByMasking) {
    TC = Builder.createAdd(
        TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
        DebugLoc::getCompilerGenerated(), "n.rnd.up");
  }

  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).
  VPValue *R =
      Builder.createNaryOp(Instruction::URem, {TC, Step},
                           DebugLoc::getCompilerGenerated(), "n.mod.vf");

  // There are cases where we *must* run at least one iteration in the remainder
  // loop.  See the cost model for when this can happen.  If the step evenly
  // divides the trip count, we set the remainder to be equal to the step. If
  // the step does not evenly divide the trip count, no adjustment is necessary
  // since there will already be scalar iterations. Note that the minimum
  // iterations check ensures that N >= Step.
  if (RequiresScalarEpilogue) {
    assert(!TailByMasking &&
           "requiring scalar epilogue is not supported with tail folding");
    VPValue *IsZero =
        Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getZero(TCTy));
    R = Builder.createSelect(IsZero, Step, R);
  }

  VPValue *Res =
      Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
  VectorTC.replaceAllUsesWith(Res);
}


/// Replace the symbolic VF and VFxUF values of \p Plan with values computed at
/// the start of \p VectorPH for the element count \p VFEC and the plan's
/// concrete UF. Does nothing if VF has already been materialized.
void VPlanTransforms::materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH,
                                         ElementCount VFEC) {
  // VF and VFxUF are always materialized together, so if VF is done there is
  // nothing left to do.
  if (Plan.getVF().isMaterialized()) {
    assert(Plan.getVFxUF().isMaterialized() &&
           "VF and VFxUF must be materialized together");
    return;
  }

  VPBuilder PHBuilder(VectorPH, VectorPH->begin());
  Type *TripCountTy = Plan.getTripCount()->getScalarType();
  VPValue &SymbolicVF = Plan.getVF();
  VPValue &SymbolicVFxUF = Plan.getVFxUF();

  // Without users of the runtime VF, VFxUF is emitted directly as a single
  // element count, folding the multiplication of VF and UF.
  if (SymbolicVF.getNumUsers() == 0) {
    VPValue *FoldedVFxUF = PHBuilder.createElementCount(
        TripCountTy, VFEC * Plan.getConcreteUF());
    SymbolicVFxUF.replaceAllUsesWith(FoldedVFxUF);
    return;
  }

  // Otherwise emit the runtime VF as its own value and derive VFxUF from it
  // by multiplying with UF.
  VPValue *RuntimeVF = PHBuilder.createElementCount(TripCountTy, VFEC);

  // Users that do not use VF as a scalar get a broadcast of the runtime VF;
  // all remaining users get the scalar value.
  const bool HasVectorUsers = !vputils::onlyScalarValuesUsed(&SymbolicVF);
  if (HasVectorUsers) {
    VPValue *BroadcastVF =
        PHBuilder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
    auto IsVectorUse = [&SymbolicVF](VPUser &U, unsigned) {
      return !U.usesScalars(&SymbolicVF);
    };
    SymbolicVF.replaceUsesWithIf(BroadcastVF, IsVectorUse);
  }
  SymbolicVF.replaceAllUsesWith(RuntimeVF);

  VPValue *UFConst = Plan.getConstantInt(TripCountTy, Plan.getConcreteUF());
  VPValue *RuntimeVFxUF = PHBuilder.createOverflowingOp(
      Instruction::Mul, {RuntimeVF, UFConst}, {true, false});
  SymbolicVFxUF.replaceAllUsesWith(RuntimeVFxUF);
}


/// Create an IncomingAliasMask placeholder in the vector preheader of \p Plan
/// and replace every existing use of the header mask with
/// "HeaderMask & AliasMask".
void VPlanTransforms::attachAliasMaskToHeaderMask(VPlan &Plan) {
  VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
  // The header mask is dereferenced unconditionally below; fail with a clear
  // message instead of a null dereference if the plan has none.
  assert(HeaderMask && "Expected a header mask!");
  auto *HeaderMaskDef = HeaderMask->getDefiningRecipe();
  Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext());

  VPBuilder Builder(Plan.getVectorPreheader());
  auto *AliasMask = Builder.createNaryOp(
      VPInstruction::IncomingAliasMask, {}, nullptr, {}, {},
      DebugLoc::getUnknown(), "incoming.alias.mask", I1Ty);

  // The combined mask must be placed after the header mask definition; if the
  // header mask is a phi, place it at the first non-phi recipe of its block.
  if (HeaderMaskDef->isPhi())
    Builder = VPBuilder(&*HeaderMaskDef->getParent()->getFirstNonPhi());
  else
    Builder = VPBuilder::getToInsertAfter(HeaderMaskDef);

  // Update all existing users of the header mask to "HeaderMask & AliasMask".
  // The newly created 'and' itself must keep using the original header mask.
  auto *ClampedHeaderMask = Builder.createAnd(HeaderMask, AliasMask);
  HeaderMask->replaceUsesWithIf(ClampedHeaderMask, [&](VPUser &U, unsigned) {
    return &U != ClampedHeaderMask;
  });
}


/// Emit the alias mask for \p DiffChecks into \p AliasCheckVPBB and replace
/// all uses of the incoming alias mask placeholder of \p Plan with it. One
/// loop_dependence_war_mask is emitted per check and the masks are and-ed
/// together.
///
/// \returns the number of active lanes of the combined mask, converted to the
/// canonical IV type (the "clamped VF").
VPValue *
VPlanTransforms::materializeAliasMask(VPlan &Plan, VPBasicBlock *AliasCheckVPBB,
                                      ArrayRef<PointerDiffInfo> DiffChecks) {
  VPBuilder Builder(AliasCheckVPBB);
  Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext());

  VPValue *IncomingAliasMask = vputils::findIncomingAliasMask(Plan);
  assert(IncomingAliasMask && "Expected an alias mask!");

  VPValue *AliasMask = nullptr;
  for (const PointerDiffInfo &Check : DiffChecks) {
    VPValue *Src = vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SrcStart);
    VPValue *Sink =
        vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SinkStart);
    Type *AddrType = Src->getScalarType();

    // TODO: Only freeze the required pointer (not both src and sink).
    if (Check.NeedsFreeze) {
      Src = Builder.createScalarFreeze(Src, AddrType, DebugLoc::getUnknown());
      Sink = Builder.createScalarFreeze(Sink, AddrType, DebugLoc::getUnknown());
    }

    // TODO: Generate loop_dependence_raw_mask when there's a read-after-write
    // dependency between the source and the sink. This is not necessary for
    // correctness of the mask, but using the "raw" variant prevents loads
    // depending on the completion of stores.
    VPWidenIntrinsicRecipe *WARMask = Builder.insert(new VPWidenIntrinsicRecipe(
        Intrinsic::loop_dependence_war_mask,
        {Src, Sink, Plan.getConstantInt(AddrType, Check.AccessSize)}, I1Ty));

    // Accumulate the per-check masks with 'and'.
    if (AliasMask)
      AliasMask = Builder.createAnd(AliasMask, WARMask);
    else
      AliasMask = WARMask;
  }

  // With no checks there is no mask to count lanes of or to replace the
  // placeholder with; a null mask would be used as an operand below.
  assert(AliasMask && "Expected at least one pointer-difference check!");

  Type *IVTy = Plan.getVectorLoopRegion()->getCanonicalIVType();
  Type *IndexTy = Plan.getDataLayout().getIndexType(Plan.getContext(), 0);
  VPValue *NumActive = Builder.createNaryOp(
      VPInstruction::NumActiveLanes, {AliasMask}, nullptr, {}, {},
      DebugLoc::getUnknown(), "num.active.lanes", IndexTy);
  VPValue *ClampedVF = Builder.createScalarZExtOrTrunc(
      NumActive, IVTy, IndexTy, DebugLoc::getCompilerGenerated());

  IncomingAliasMask->replaceAllUsesWith(AliasMask);

  return ClampedVF;
}


/// Create the "vector.clamped.vf.check" block for \p Plan: materialize the
/// alias mask for \p DiffChecks in it, attach it as a check block whose
/// condition is "clamped VF <= 1 or stepping by the clamped VF may overflow",
/// and replace VF and VFxUF of the plan with the clamped VF.
void VPlanTransforms::materializeAliasMaskCheckBlock(
    VPlan &Plan, ArrayRef<PointerDiffInfo> DiffChecks, bool HasBranchWeights) {
  VPBasicBlock *CheckVPBB = Plan.createVPBasicBlock("vector.clamped.vf.check");
  VPValue *ClampedVF = materializeAliasMask(Plan, CheckVPBB, DiffChecks);

  VPBuilder CheckBuilder(CheckVPBB);
  DebugLoc DL = DebugLoc::getCompilerGenerated();
  Type *TripCountTy = Plan.getTripCount()->getScalarType();

  // The first part of the condition is true if the "ClampedVF" from the alias
  // mask is not larger than one.
  VPValue *IsScalar = CheckBuilder.createICmp(
      CmpInst::ICMP_ULE, ClampedVF, Plan.getConstantInt(TripCountTy, 1), DL,
      "vf.is.scalar");

  VPValue *TripCount = Plan.getTripCount();
  VPValue *UMax =
      Plan.getConstantInt(cast<IntegerType>(TripCountTy)->getMask());
  VPValue *DistanceToUMax = CheckBuilder.createSub(UMax, TripCount);

  // For tail-folding: don't execute the vector loop if (UMax - n) < ClampedVF.
  // Note: the ClampedVF may not be a power-of-two. This means the loop exit
  // condition (index.next == n.vec) may not be correct in the case of an
  // overflow. The issue is that `n.vec` could be zero due to an overflow, but
  // index.next is not guaranteed to overflow to zero, as the ClampedVF is not
  // a power-of-two.
  VPValue *StepMayOverflow =
      CheckBuilder.createICmp(ICmpInst::ICMP_ULT, DistanceToUMax, ClampedVF,
                              DL, "vf.step.overflow");

  VPValue *CheckCond = CheckBuilder.createOr(IsScalar, StepMayOverflow, DL);
  attachVPCheckBlock(Plan, CheckCond, CheckVPBB, HasBranchWeights);

  // Materialize the trip count early as this will add a use of (VFxUF) that
  // needs to be replaced with the ClampedVF.
  VPBasicBlock *VectorPH = Plan.getVectorPreheader();
  materializeVectorTripCount(Plan, VectorPH,
                             /*TailByMasking=*/true,
                             /*RequiresScalarEpilogue=*/false,
                             &Plan.getVFxUF());

  assert(Plan.getConcreteUF() == 1 &&
         "Clamped VF not supported with interleaving");
  // With UF == 1, VF and VFxUF are both replaced by the clamped VF.
  Plan.getVF().replaceAllUsesWith(ClampedVF);
  Plan.getVFxUF().replaceAllUsesWith(ClampedVF);
}


/// Expand the VPExpandSCEVRecipes in the entry block of \p Plan to
/// VPInstructions using VPSCEVExpander. Recipes without users and recipes the
/// expander cannot handle are left in place.
void VPlanTransforms::expandSCEVsToVPInstructions(VPlan &Plan,
                                                  ScalarEvolution &SE) {
  auto *EntryVPBB = Plan.getEntry();
  VPBuilder Builder(EntryVPBB, EntryVPBB->begin());
  VPSCEVExpander Expander(Builder, SE);

  // During the transition, unsupported VPExpandSCEVRecipes are skipped and
  // left for late expansion.
  for (VPRecipeBase &R : make_early_inc_range(*EntryVPBB)) {
    auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
    if (!ExpandR)
      continue;
    if (ExpandR->getNumUsers() == 0)
      continue;

    // Emit the expansion at the position of the recipe being replaced.
    Builder.setInsertPoint(ExpandR);
    VPValue *Replacement = Expander.tryToExpand(ExpandR->getSCEV());
    if (!Replacement)
      continue;

    ExpandR->replaceAllUsesWith(Replacement);
    // The trip count is tracked separately by the plan and needs to be
    // updated explicitly.
    if (Plan.getTripCount() == ExpandR)
      Plan.resetTripCount(Replacement);
    ExpandR->eraseFromParent();
  }
}


/// Expand all VPExpandSCEVRecipes in the entry block of \p Plan to IR
/// instructions using SCEVExpander, replace the recipes with live-ins for the
/// expanded values, and add the IR instructions of the entry IR basic block
/// that are not yet in the entry VPIRBasicBlock to it.
///
/// \returns a map from each expanded SCEV to the IR value it was expanded to.
DenseMap<const SCEV *, Value *>
VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
  SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);

  auto *EntryVPBB = cast<VPIRBasicBlock>(Plan.getEntry());
  BasicBlock *EntryIRBB = EntryVPBB->getIRBasicBlock();
  DenseMap<const SCEV *, Value *> ExpandedSCEVs;

  // Expand remaining VPExpandSCEVRecipes to IR instructions using
  // SCEVExpander; the code is emitted before the terminator of the entry IR
  // block.
  for (VPRecipeBase &R : make_early_inc_range(*EntryVPBB)) {
    auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
    if (!ExpandR)
      continue;

    const SCEV *Expr = ExpandR->getSCEV();
    Value *ExpandedIR = Expander.expandCodeFor(Expr, Expr->getType(),
                                               EntryIRBB->getTerminator());
    ExpandedSCEVs[Expr] = ExpandedIR;

    VPValue *LiveIn = Plan.getOrAddLiveIn(ExpandedIR);
    ExpandR->replaceAllUsesWith(LiveIn);
    if (Plan.getTripCount() == ExpandR)
      Plan.resetTripCount(LiveIn);
    ExpandR->eraseFromParent();
  }
  assert(none_of(*EntryVPBB, IsaPred<VPExpandSCEVRecipe>) &&
         "all VPExpandSCEVRecipes must have been expanded");

  // Walk the IR instructions of the entry block (except the last one) in
  // lock-step with the recipes of the VPIRBasicBlock. An IR instruction that
  // is not wrapped by the recipe at the current position gets a new
  // VPIRInstruction inserted at that position.
  auto RecipeIt = EntryVPBB->begin();
  for (Instruction &I : drop_end(*EntryIRBB)) {
    auto *WrapperR = RecipeIt != EntryVPBB->end()
                         ? dyn_cast<VPIRInstruction>(&*RecipeIt)
                         : nullptr;
    if (WrapperR && &WrapperR->getInstruction() == &I) {
      ++RecipeIt;
      continue;
    }
    VPIRInstruction::create(I)->insertBefore(*EntryVPBB, RecipeIt);
  }

  return ExpandedSCEVs;
}


/// Returns true if \p OpV can be narrowed together with the matching operand
/// of \p WideMember0. \p OpV is operand \p OpIdx of a wide recipe feeding a
/// store interleave group at index \p Idx; \p WideMember0 is the wide recipe
/// feeding the same group at index 0. The decision depends on what defines
/// operand \p OpIdx of \p WideMember0:
///  * no recipe, or an unmasked consecutive VPWidenLoadRecipe with a
///    non-scalable VF: \p OpV must be that same operand, i.e. it is shared by
///    the wide ops at all indices.
///  * a VPInterleaveRecipe: the group must be full and \p OpV must be the
///    value it defines at \p Idx.
/// Any other defining recipe cannot be narrowed.
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
                          VPValue *OpV, unsigned Idx, bool IsScalable) {
  VPValue *RefOp = WideMember0->getOperand(OpIdx);
  const bool IsSharedOp = RefOp == OpV;

  VPRecipeBase *RefDef = RefOp->getDefiningRecipe();
  if (!RefDef)
    return IsSharedOp;

  if (auto *WideLoad = dyn_cast<VPWidenLoadRecipe>(RefDef)) {
    // For scalable VFs, the narrowed plan processes vscale iterations at once,
    // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
    if (IsScalable)
      return false;
    return !WideLoad->getMask() && WideLoad->isConsecutive() && IsSharedOp;
  }

  if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(RefDef)) {
    if (!LoadGroup->getInterleaveGroup()->isFull())
      return false;
    return LoadGroup->getVPValue(Idx) == OpV;
  }

  return false;
}


/// Returns true if all values in \p Ops are defined by VPWidenRecipes or
/// VPWidenCastRecipes with the same opcode or intrinsic ID, and each of their
/// operand positions can be narrowed: either recursively via canNarrowOps, or
/// because every operand at that position passes canNarrowLoad.
/// \p IsScalable is forwarded to canNarrowLoad.
static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
  auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Ops[0]);
  if (!WideMember0)
    return false;

  // All members must be wide (cast) recipes performing the same operation as
  // the first member.
  for (VPValue *V : Ops) {
    if (!isa<VPWidenRecipe, VPWidenCastRecipe>(V))
      return false;
    auto *R = cast<VPSingleDefRecipe>(V);
    if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
      return false;
  }

  for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
    // Collect the operand at position Idx of every member.
    SmallVector<VPValue *> OpsI;
    for (VPValue *Op : Ops)
      OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));

    // The operands may themselves be a narrowable tree of wide recipes.
    if (canNarrowOps(OpsI, IsScalable))
      continue;

    // Otherwise each operand must be a narrowable load; the position of the
    // operand in OpsI is the index of the member it belongs to.
    if (any_of(enumerate(OpsI), [WideMember0, Idx, IsScalable](const auto &P) {
          const auto &[MemberIdx, OpV] = P;
          return !canNarrowLoad(WideMember0, Idx, OpV, MemberIdx, IsScalable);
        }))
      return false;
  }

  return true;
}


/// Returns the first VF from \p VFs for which \p InterleaveR is an unmasked
/// interleave group whose members all have the same scalar type, whose factor
/// equals both its number of members and the known minimum value of the VF,
/// and which accesses the full vector register width reported by \p TTI for
/// that VF. Returns std::nullopt if \p InterleaveR is null or no VF qualifies.
static std::optional<ElementCount>
isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
                             ArrayRef<ElementCount> VFs,
                             const TargetTransformInfo &TTI) {
  if (!InterleaveR || InterleaveR->getMask())
    return std::nullopt;

  // All members of the group must share a single scalar type. Groups without
  // stored values are checked via their defined values, other groups via
  // their stored values.
  Type *GroupElementTy = nullptr;
  auto HasOtherType = [&GroupElementTy](VPValue *Member) {
    return Member->getScalarType() != GroupElementTy;
  };
  if (InterleaveR->getStoredValues().empty()) {
    GroupElementTy = InterleaveR->getVPValue(0)->getScalarType();
    if (any_of(InterleaveR->definedValues(), HasOtherType))
      return std::nullopt;
  } else {
    GroupElementTy = InterleaveR->getStoredValues()[0]->getScalarType();
    if (any_of(InterleaveR->getStoredValues(), HasOtherType))
      return std::nullopt;
  }

  // The factor of the group must match its number of members.
  auto IG = InterleaveR->getInterleaveGroup();
  if (IG->getFactor() != IG->getNumMembers())
    return std::nullopt;

  // Known minimum register width in bits for the register kind matching VF.
  auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
    auto RegKind = VF.isFixed() ? TargetTransformInfo::RGK_FixedWidthVector
                                : TargetTransformInfo::RGK_ScalableVector;
    TypeSize Size = TTI.getRegisterBitWidth(RegKind);
    assert(Size.isScalable() == VF.isScalable() &&
           "if Size is scalable, VF must be scalable and vice versa");
    return Size.getKnownMinValue();
  };

  for (ElementCount VF : VFs) {
    unsigned MinVal = VF.getKnownMinValue();
    if (IG->getFactor() != MinVal)
      continue;
    unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
    if (GroupSize == GetVectorBitWidthForVF(VF))
      return VF;
  }
  return std::nullopt;
}


/// Returns true if \p VPV is already a narrow value, i.e. it is a VPIRValue
/// or is defined by a single-scalar VPReplicateRecipe.
static bool isAlreadyNarrow(VPValue *VPV) {
  if (isa<VPIRValue>(VPV))
    return true;
  if (auto *Replicate = dyn_cast<VPReplicateRecipe>(VPV))
    return Replicate->isSingleScalar();
  return false;
}


// Convert the wide recipes defining the VPValues in \p Members, which feed an
// interleave group, to a single narrow variant and return it. For wide (cast)
// recipes the first member is reused as the narrowed recipe and its operands
// are narrowed recursively. Newly created or accepted narrow loads are
// recorded in \p NarrowedOps so they are not narrowed again.
static VPValue *
narrowInterleaveGroupOp(ArrayRef<VPValue *> Members,
                        SmallPtrSetImpl<VPValue *> &NarrowedOps) {
  VPValue *Member0 = Members.front();
  auto *DefR = Member0->getDefiningRecipe();

  // Values without a defining recipe, values that were narrowed before and
  // values that are already narrow are used as-is.
  if (!DefR || NarrowedOps.contains(Member0) || isAlreadyNarrow(Member0))
    return Member0;

  if (isa<VPWidenRecipe, VPWidenCastRecipe>(DefR)) {
    auto *WideMember0 = cast<VPRecipeWithIRFlags>(DefR);
    // The reused recipe stands in for all members, so it may only keep the
    // IR flags common to all of them.
    for (VPValue *Other : Members.drop_front())
      WideMember0->intersectFlags(*cast<VPRecipeWithIRFlags>(Other));

    const unsigned NumOperands = WideMember0->getNumOperands();
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      // Gather operand OpIdx of every member and narrow them as a group.
      SmallVector<VPValue *> MemberOps;
      for (VPValue *Member : Members)
        MemberOps.push_back(Member->getDefiningRecipe()->getOperand(OpIdx));
      VPValue *NarrowOp = narrowInterleaveGroupOp(MemberOps, NarrowedOps);
      WideMember0->setOperand(OpIdx, NarrowOp);
    }
    return Member0;
  }

  if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(DefR)) {
    // Narrow interleave group to wide load, as transformed VPlan will only
    // process one original iteration.
    auto *InsertPosLoad =
        cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
    auto *NarrowLoad = new VPWidenLoadRecipe(
        *InsertPosLoad, LoadGroup->getAddr(), LoadGroup->getMask(),
        /*Consecutive=*/true, *LoadGroup, LoadGroup->getDebugLoc());
    NarrowLoad->insertBefore(LoadGroup);
    NarrowedOps.insert(NarrowLoad);
    return NarrowLoad;
  }

  if (auto *Replicate = dyn_cast<VPReplicateRecipe>(DefR)) {
    assert(Replicate->isSingleScalar() &&
           Replicate->getOpcode() == Instruction::Load &&
           "must be a single scalar load");
    NarrowedOps.insert(Replicate);
    return Replicate;
  }

  // The only remaining supported case is a wide load. Narrow it to a uniform
  // scalar load, as transformed VPlan will only process one original
  // iteration. If the address is a VPVectorPointerRecipe, its first operand
  // is used as the address instead.
  auto *WideLoad = cast<VPWidenLoadRecipe>(DefR);
  VPValue *Addr = WideLoad->getAddr();
  if (auto *VectorPtr = dyn_cast<VPVectorPointerRecipe>(Addr))
    Addr = VectorPtr->getOperand(0);

  auto *ScalarLoad = new VPReplicateRecipe(&WideLoad->getIngredient(), {Addr},
                                           /*IsUniform*/ true,
                                           /*Mask*/ nullptr, {}, *WideLoad);
  ScalarLoad->insertBefore(WideLoad);
  NarrowedOps.insert(ScalarLoad);
  return ScalarLoad;
}


/// Try to narrow the interleave groups of \p Plan for a single VF at which all
/// groups are consecutive and access the full vector width (as reported via
/// \p TTI). On success, store interleave groups are replaced by wide stores of
/// narrowed operands and the step of the vector loop is adjusted accordingly.
///
/// \returns nullptr if nothing was changed, or if \p Plan only had a single
/// VF. Otherwise \p Plan is restricted to the narrowed VF and a clone of the
/// original plan holding the remaining VFs is returned.
std::unique_ptr<VPlan>
VPlanTransforms::narrowInterleaveGroups(VPlan &Plan,
                                        const TargetTransformInfo &TTI) {
  VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
  if (!VectorLoop)
    return nullptr;

  // Only handle single-block loops for now.
  VPBasicBlock *LoopBody = VectorLoop->getEntryBasicBlock();
  VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
  if (LoopBody != Exiting)
    return nullptr;

  // Skip plans when we may not be able to properly narrow.
  if (!match(&Exiting->back(), m_BranchOnCount()))
    return nullptr;

  assert(match(&Exiting->back(),
               m_BranchOnCount(m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF())),
                               m_Specific(&Plan.getVectorTripCount()))) &&
         "unexpected branch-on-count");

  // True if any user of V is located outside the vector loop region.
  auto HasUserOutsideLoop = [VectorLoop](VPValue *V) {
    return any_of(V->users(), [VectorLoop](VPUser *U) {
      auto *UserR = cast<VPRecipeBase>(U);
      return UserR->getParent()->getParent() != VectorLoop;
    });
  };

  // True if the stored value Op is defined by a full load interleave group at
  // the same member index it is stored at.
  auto IsMatchingLoadGroupMember = [](auto Op) {
    VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
    if (!DefR)
      return false;
    auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
    return IR && IR->getInterleaveGroup()->isFull() &&
           IR->getVPValue(Op.index()) == Op.value();
  };

  SmallVector<VPInterleaveRecipe *> StoreGroups;
  std::optional<ElementCount> VFToOptimize;
  for (auto &R : *LoopBody) {
    if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe>(&R) &&
        vputils::onlyFirstLaneUsed(cast<VPSingleDefRecipe>(&R)))
      continue;

    // Bail out on recipes not supported at the moment:
    //  * phi recipes,
    //  * recipes writing to memory except interleave groups.
    if (R.isPhi())
      return nullptr;

    auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
    if (!InterleaveR && R.mayWriteToMemory())
      return nullptr;

    // Bail out if any recipe defines a value used outside the vector loop
    // region.
    if (any_of(R.definedValues(), HasUserOutsideLoop))
      return nullptr;

    // All other ops are allowed, but we reject uses that cannot be converted
    // when checking all allowed consumers (store interleave groups) below.
    if (!InterleaveR)
      continue;

    // Try to find a single VF, where all interleave groups are consecutive and
    // saturate the full vector width. If we already have a candidate VF, check
    // if it is applicable for the current InterleaveR, otherwise look for a
    // suitable VF across the Plan's VFs.
    SmallVector<ElementCount> CandidateVFs =
        VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
                     : to_vector(Plan.vectorFactors());
    std::optional<ElementCount> NarrowedVF =
        isConsecutiveInterleaveGroup(InterleaveR, CandidateVFs, TTI);
    if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
      return nullptr;
    VFToOptimize = NarrowedVF;

    // Skip read interleave groups.
    if (InterleaveR->getStoredValues().empty())
      continue;

    // Case 1: all stored values are the same, already narrow value.
    auto *Member0 = InterleaveR->getStoredValues()[0];
    const bool StoresSingleNarrowValue =
        isAlreadyNarrow(Member0) &&
        all_of(InterleaveR->getStoredValues(), equal_to(Member0));

    // Case 2: the group stores the members of full load interleave groups at
    // matching indices.
    // Case 3: all stored values are matching wide recipes with operands that
    // can be narrowed.
    // Anything else is not supported.
    if (!StoresSingleNarrowValue &&
        !all_of(enumerate(InterleaveR->getStoredValues()),
                IsMatchingLoadGroupMember) &&
        !canNarrowOps(InterleaveR->getStoredValues(),
                      VFToOptimize->isScalable()))
      return nullptr;

    StoreGroups.push_back(InterleaveR);
  }

  if (StoreGroups.empty())
    return nullptr;

  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
  const unsigned NumMiddleSuccs = MiddleVPBB->getNumSuccessors();
  bool RequiresScalarEpilogue =
      NumMiddleSuccs == 1 &&
      MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
  // Bail out for tail-folding (middle block with a single successor to exit).
  if (NumMiddleSuccs != 2 && !RequiresScalarEpilogue)
    return nullptr;

  // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
  // original Plan into 2: a) a new clone which contains all VFs of Plan, except
  // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
  // TODO: Handle cases where only some interleave groups can be narrowed.
  std::unique_ptr<VPlan> NewPlan;
  if (size(Plan.vectorFactors()) != 1) {
    NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
    Plan.setVF(*VFToOptimize);
    NewPlan->removeVF(*VFToOptimize);
  }

  // Narrow the operation tree rooted at each store group and replace the
  // group with a single consecutive wide store of the narrowed value.
  SmallPtrSet<VPValue *, 4> NarrowedOps;
  for (auto *StoreGroup : StoreGroups) {
    VPValue *NarrowedValue =
        narrowInterleaveGroupOp(StoreGroup->getStoredValues(), NarrowedOps);
    auto *InsertPosStore =
        cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
    auto *WideStore = new VPWidenStoreRecipe(
        *InsertPosStore, StoreGroup->getAddr(), NarrowedValue, nullptr,
        /*Consecutive=*/true, *StoreGroup, StoreGroup->getDebugLoc());
    WideStore->insertBefore(StoreGroup);
    StoreGroup->eraseFromParent();
  }

  // Adjust induction to reflect that the transformed plan only processes one
  // original iteration.
  VPInstruction *CanIVInc = vputils::findCanonicalIVIncrement(Plan);
  Type *CanIVTy = VectorLoop->getCanonicalIVType();
  VPBasicBlock *VectorPH = Plan.getVectorPreheader();
  VPBuilder PHBuilder(VectorPH, VectorPH->begin());

  VPValue *UF = &Plan.getUF();
  VPValue *Step = nullptr;
  if (VFToOptimize->isScalable()) {
    // Scalable VF: VF becomes vscale and the step becomes vscale * UF.
    VPValue *VScale =
        PHBuilder.createElementCount(CanIVTy, ElementCount::getScalable(1));
    Step = PHBuilder.createOverflowingOp(Instruction::Mul, {VScale, UF},
                                         {true, false});
    Plan.getVF().replaceAllUsesWith(VScale);
  } else {
    // Fixed VF: VF becomes 1 and the step becomes UF.
    Step = UF;
    Plan.getVF().replaceAllUsesWith(Plan.getConstantInt(CanIVTy, 1));
  }

  // Materialize vector trip count with the narrowed step.
  materializeVectorTripCount(Plan, VectorPH, /*TailByMasking=*/false,
                             RequiresScalarEpilogue, Step);

  CanIVInc->setOperand(1, Step);
  Plan.getVFxUF().replaceAllUsesWith(Step);

  removeDeadRecipes(Plan);
  assert(none_of(*VectorLoop->getEntryBasicBlock(),
                 IsaPred<VPVectorPointerRecipe>) &&
         "All VPVectorPointerRecipes should have been removed");
  return NewPlan;
}


/// Attach branch weight metadata to the terminator of the middle block of
/// \p Plan, if that terminator is a VPInstruction (which must then be a
/// BranchOnCond). The weights are {1, VectorStep - 1}, where VectorStep is
/// UF times the known minimum of \p VF, additionally scaled by
/// \p VScaleForTuning for scalable VFs when it is provided.
void VPlanTransforms::addBranchWeightToMiddleTerminator(
    VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
  // Only add branch metadata if there is a (conditional) terminator.
  auto *Terminator =
      dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator());
  if (!Terminator)
    return;

  assert(Terminator->getOpcode() == VPInstruction::BranchOnCond &&
         "must have a BranchOnCond");

  // Assume that `TripCount % VectorStep` is equally distributed.
  unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
  if (VF.isScalable() && VScaleForTuning.has_value())
    VectorStep *= *VScaleForTuning;
  assert(VectorStep > 0 && "trip count should not be zero");

  MDBuilder MDB(Plan.getContext());
  MDNode *Weights =
      MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
  Terminator->setMetadata(LLVMContext::MD_prof, Weights);
}


/// Second phase of vectorizing first-order recurrences: for every
/// first-order recurrence phi in the vector loop header of \p Plan, redirect
/// exit-phi users of the "last lane of last part" extract of the recurrence
/// splice (in the middle block) to a newly created extract of the penultimate
/// element. \p Range may be clamped as a side effect of the VF query below.
void VPlanTransforms::adjustFirstOrderRecurrenceMiddleUsers(VPlan &Plan,
                                                            VFRange &Range) {
  auto *MiddleVPBB = Plan.getMiddleBlock();
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());

  // True for the VF `vscale x 1`.
  auto IsVScaleByOne = [](ElementCount VF) -> bool {
    return VF == ElementCount::getScalable(1);
  };
  // True for users whose recipe is not nested in any region.
  auto IsOutsideAnyRegion = [](VPUser *U) {
    return !cast<VPRecipeBase>(U)->getRegion();
  };

  for (auto &HeaderPhi : LoopRegion->getEntryBasicBlock()->phis()) {
    auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
    if (!FOR)
      continue;

    assert(LoopRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
           "Cannot handle loops with uncountable early exits");

    // The splice for this FOR was created in createHeaderPhiRecipes, where
    // all uses of FOR were already replaced by it; the splice itself is the
    // only remaining reference to FOR.
    auto *Splice = findUserOf<VPInstruction::FirstOrderRecurrenceSplice>(FOR);
    assert(Splice && "expected FirstOrderRecurrenceSplice");

    // For VF vscale x 1, if vscale = 1, the penultimate value of the
    // recurrence cannot be extracted. In that case keep relying on the
    // existing extract of the last element of the splice result.
    // TODO: Consider vscale_range info and UF.
    if (any_of(Splice->users(), IsOutsideAnyRegion)) {
      if (LoopVectorizationPlanner::getDecisionAndClampRange(IsVScaleByOne,
                                                             Range))
        return;
    }

    // Overview of the transformation. Given a loop with a use of the last
    // a[i-1] after the loop,
    //
    //   for (int i = 0; i < n; ++i) {
    //     t = a[i - 1];
    //     b[i] = a[i] - t;
    //   }
    //   use t;
    //
    // there is a first-order recurrence on "a". In shorthand scalar IR:
    //
    //   scalar.ph:
    //     s.init = a[-1]
    //     br scalar.body
    //
    //   scalar.body:
    //     i = phi [0, scalar.ph], [i+1, scalar.body]
    //     s1 = phi [s.init, scalar.ph], [s2, scalar.body]
    //     s2 = a[i]
    //     b[i] = s2 - s1
    //     br cond, scalar.body, exit.block
    //
    //   exit.block:
    //     use = lcssa.phi [s1, scalar.body]
    //
    // s1 is a recurrence because its value depends on the previous
    // iteration. The first phase created a VPFirstOrderRecurrencePHIRecipe
    // v1 for s1. This phase creates the extracts for users in the scalar
    // preheader and exit block:
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3]
    //     v1' = splice(v1(3), v2(0, 1, 2))
    //     b[i, i+1, i+2, i+3] = v2 - v1'
    //     br cond, vector.body, middle.block
    //
    //   middle.block:
    //     vector.recur.extract.for.phi = v2(2)
    //     vector.recur.extract = v2(3)
    //     br cond, scalar.ph, exit.block
    //
    //   scalar.ph:
    //     scalar.recur.init = phi [vector.recur.extract, middle.block],
    //                             [s.init, otherwise]
    //     br scalar.body
    //
    //   scalar.body:
    //     i = phi [0, scalar.ph], [i+1, scalar.body]
    //     s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
    //     s2 = a[i]
    //     b[i] = s2 - s1
    //     br cond, scalar.body, exit.block
    //
    //   exit.block:
    //     lo = lcssa.phi [s1, scalar.body],
    //                    [vector.recur.extract.for.phi, middle.block]
    //
    // Rewrite the middle-block extracts of the splice that feed exit phis so
    // that they read the penultimate element of the recurrence.
    for (VPRecipeBase &R : make_early_inc_range(
             make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
      if (!match(&R, m_ExtractLastLaneOfLastPart(m_Specific(Splice))))
        continue;

      auto *LastLaneExtract = cast<VPInstruction>(&R);
      VPValue *Penultimate = MiddleBuilder.createNaryOp(
          VPInstruction::ExtractPenultimateElement, Splice->getOperand(1), {},
          "vector.recur.extract.for.phi");
      // Iterate over a copy of the user list, since it is modified below.
      for (VPUser *U : to_vector(LastLaneExtract->users()))
        if (auto *ExitPhi = dyn_cast<VPIRPhi>(U))
          ExitPhi->replaceUsesOfWith(LastLaneExtract, Penultimate);
    }
  }
}


/// If \p V is a widened binary operation (excluding integer division and
/// remainder) with one widened int/fp induction operand and one operand
/// defined outside all loop regions, return the induction recipe. Return
/// nullptr in every other case.
static VPWidenIntOrFpInductionRecipe *getExpressionIV(VPValue *V) {
  auto *Widen = dyn_cast<VPWidenRecipe>(V);
  if (!Widen)
    return nullptr;

  const unsigned Opc = Widen->getOpcode();
  if (!Instruction::isBinaryOp(Opc) || Instruction::isIntDivRem(Opc))
    return nullptr;

  // Assume the IV is operand 0 unless operand 0 is not a widened induction,
  // in which case operand 1 becomes the candidate.
  VPValue *LHS = Widen->getOperand(0);
  VPValue *RHS = Widen->getOperand(1);
  const bool LHSIsIV = isa<VPWidenIntOrFpInductionRecipe>(LHS);
  VPValue *IVCandidate = LHSIsIV ? LHS : RHS;
  VPValue *OtherOperand = LHSIsIV ? RHS : LHS;

  if (!OtherOperand->isDefinedOutsideLoopRegions())
    return nullptr;

  return dyn_cast<VPWidenIntOrFpInductionRecipe>(IVCandidate);
}


/// Clone the binary operation \p BinOp, substitute \p ScalarIV for its
/// \p WidenIV operand in the clone, and insert the clone right after the
/// recipe defining \p ScalarIV. Returns the clone.
static VPValue *cloneBinOpForScalarIV(VPWidenRecipe *BinOp, VPValue *ScalarIV,
                                      VPWidenIntOrFpInductionRecipe *WidenIV) {
  assert(Instruction::isBinaryOp(BinOp->getOpcode()) &&
         BinOp->getNumOperands() == 2 && "BinOp must have 2 operands");

  auto *Clone = BinOp->clone();
  // The IV sits at operand 0 or, failing that, must sit at operand 1.
  const unsigned IVOperandIdx = Clone->getOperand(0) == WidenIV ? 0 : 1;
  assert(Clone->getOperand(IVOperandIdx) == WidenIV &&
         "one operand must be WideIV");
  Clone->setOperand(IVOperandIdx, ScalarIV);

  Clone->insertAfter(ScalarIV->getDefiningRecipe());
  return Clone;
}


/// Rewrite find-last reductions over integer induction expressions in
/// \p Plan into min/max-based reductions of kind RecurKind::FindIV.
///
/// For each header reduction phi with a find-last recurrence kind (pointer
/// and floating-point phis are skipped), the selected expression must have an
/// affine AddRec SCEV with a known non-zero step. The IV values are then
/// reduced with a signed/unsigned min or max, and the final result is chosen
/// in one of two ways:
///  * Sentinel: if the IV range provably excludes the min (for max) or max
///    (for min) value of its type, that value is used as the phi's start value
///    and compared against the reduced IV to tell whether the condition was
///    ever true.
///  * AnyOf: otherwise, provided the AddRec has a no-wrap flag, a separate
///    boolean Or-reduction tracks whether the condition was ever true.
/// If the selected expression is a binary op of a widened IV and a
/// loop-invariant value, the IV itself may be reduced and the binary op
/// re-applied ("sunk") to the reduced scalar.
///
/// \p PSE and \p L are used to compute SCEV expressions for VPValues.
void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
                                               PredicatedScalarEvolution &PSE,
                                               Loop &L) {
  ScalarEvolution &SE = *PSE.getSE();
  VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();

  // Helper lambda to check if the IV range excludes the sentinel value. Try
  // signed first, then unsigned. Return an excluded sentinel if found,
  // otherwise return std::nullopt.
  auto CheckSentinel = [&SE](const SCEV *IVSCEV,
                             bool UseMax) -> std::optional<APSInt> {
    unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
    for (bool Signed : {true, false}) {
      // For a max reduction the sentinel is the type's minimum value, for a
      // min reduction it is the type's maximum value.
      APSInt Sentinel = UseMax ? APSInt::getMinValue(BW, /*Unsigned=*/!Signed)
                               : APSInt::getMaxValue(BW, /*Unsigned=*/!Signed);

      ConstantRange IVRange =
          Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
      if (!IVRange.contains(Sentinel))
        return Sentinel;
    }
    return std::nullopt;
  };

  VPValue *HeaderMask = vputils::findHeaderMask(Plan);
  // Phis are erased and inserted while iterating, hence the early-inc range.
  for (VPRecipeBase &Phi :
       make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
    if (!PhiR || !RecurrenceDescriptor::isFindLastRecurrenceKind(
                     PhiR->getRecurrenceKind()))
      continue;

    Type *PhiTy = PhiR->getScalarType();
    if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
      continue;

    // If there's a header mask, the backedge select will not be the find-last
    // select.
    VPValue *BackedgeVal = PhiR->getBackedgeValue();
    auto *FindLastSelect = cast<VPSingleDefRecipe>(BackedgeVal);
    // With a header mask, FindLastSelect is re-bound to the inner select of
    // select(HeaderMask, <inner>, PhiR); bail out if that shape is not found.
    if (HeaderMask &&
        !match(BackedgeVal,
               m_Select(m_Specific(HeaderMask),
                        m_VPSingleDefRecipe(FindLastSelect), m_Specific(PhiR))))
      continue;

    // Get the find-last expression from the find-last select of the reduction
    // phi. The find-last select should be a select between the phi and the
    // find-last expression.
    VPValue *Cond, *FindLastExpression;
    if (!match(FindLastSelect, m_Select(m_VPValue(Cond), m_Specific(PhiR),
                                        m_VPValue(FindLastExpression))) &&
        !match(FindLastSelect,
               m_Select(m_VPValue(Cond), m_VPValue(FindLastExpression),
                        m_Specific(PhiR))))
      continue;

    // Check if FindLastExpression is a simple expression of a widened IV. If
    // so, we can track the underlying IV instead and sink the expression.
    auto *IVOfExpressionToSink = getExpressionIV(FindLastExpression);
    const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(
        IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression, PSE,
        &L);
    const SCEV *Step;
    if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
      assert(!match(vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L),
                    m_scev_AffineAddRec(m_SCEV(), m_SCEV())) &&
             "IVOfExpressionToSink not being an AddRec must imply "
             "FindLastExpression not being an AddRec.");
      continue;
    }

    // Determine direction from SCEV step.
    if (!SE.isKnownNonZero(Step))
      continue;

    // Positive step means we need UMax/SMax to find the last IV value, and
    // UMin/SMin otherwise.
    // NOTE(review): a step that is known non-zero but not known positive is
    // treated as negative here; confirm the sign is always known at this
    // point.
    bool UseMax = SE.isKnownPositive(Step);
    std::optional<APSInt> SentinelVal = CheckSentinel(IVSCEV, UseMax);
    bool UseSigned = SentinelVal && SentinelVal->isSigned();

    // Sinking an expression will disable epilogue vectorization. Only use it,
    // if FindLastExpression cannot be vectorized via a sentinel. Sinking may
    // also prevent vectorizing using a sentinel (e.g., if the expression is a
    // multiply or divide by large constant, respectively), which also makes
    // sinking undesirable.
    if (IVOfExpressionToSink) {
      const SCEV *FindLastExpressionSCEV =
          vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L);
      // Note: a successful match below overwrites Step with the step of the
      // full expression.
      if (match(FindLastExpressionSCEV,
                m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
        bool NewUseMax = SE.isKnownPositive(Step);
        if (auto NewSentinel =
                CheckSentinel(FindLastExpressionSCEV, NewUseMax)) {
          // The original expression already has a sentinel, so prefer not
          // sinking to keep epilogue vectorization possible.
          SentinelVal = *NewSentinel;
          UseSigned = NewSentinel->isSigned();
          UseMax = NewUseMax;
          IVSCEV = FindLastExpressionSCEV;
          IVOfExpressionToSink = nullptr;
        }
      }
    }

    // If no sentinel was found, fall back to a boolean AnyOf reduction to track
    // if the condition was ever true. Requires the IV to not wrap, otherwise we
    // cannot use min/max.
    if (!SentinelVal) {
      auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
      if (AR->hasNoSignedWrap())
        UseSigned = true;
      else if (AR->hasNoUnsignedWrap())
        UseSigned = false;
      else
        continue;
    }

    // The existing ComputeReductionResult reachable from the backedge value;
    // it is replaced and erased further below.
    VPInstruction *RdxResult = cast<VPInstruction>(vputils::findRecipe(
        BackedgeVal,
        match_fn(m_VPInstruction<VPInstruction::ComputeReductionResult>())));

    VPValue *NewFindLastSelect = BackedgeVal;
    VPValue *SelectCond = Cond;
    if (!SentinelVal || IVOfExpressionToSink) {
      // When we need to create a new select, normalize the condition so that
      // PhiR is the last operand and include the header mask if needed.
      DebugLoc DL = FindLastSelect->getDefiningRecipe()->getDebugLoc();
      VPBuilder LoopBuilder(FindLastSelect->getDefiningRecipe());
      if (FindLastSelect->getDefiningRecipe()->getOperand(1) == PhiR)
        SelectCond = LoopBuilder.createNot(SelectCond);

      // When tail folding, mask the condition with the header mask to prevent
      // propagating poison from inactive lanes in the last vector iteration.
      if (HeaderMask)
        SelectCond = LoopBuilder.createLogicalAnd(HeaderMask, SelectCond);

      if (SelectCond != Cond || IVOfExpressionToSink) {
        NewFindLastSelect = LoopBuilder.createSelect(
            SelectCond,
            IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression,
            PhiR, DL);
      }
    }

    // Create the reduction result in the middle block using sentinel directly.
    RecurKind MinMaxKind =
        UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
               : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
    VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
                    FastMathFlags());
    DebugLoc ExitDL = RdxResult->getDebugLoc();
    // New recipes are inserted before the old reduction result.
    VPBuilder MiddleBuilder(RdxResult);
    VPValue *ReducedIV =
        MiddleBuilder.createNaryOp(VPInstruction::ComputeReductionResult,
                                   NewFindLastSelect, Flags, ExitDL);

    // If IVOfExpressionToSink is an expression to sink, sink it now.
    VPValue *VectorRegionExitingVal = ReducedIV;
    if (IVOfExpressionToSink)
      VectorRegionExitingVal =
          cloneBinOpForScalarIV(cast<VPWidenRecipe>(FindLastExpression),
                                ReducedIV, IVOfExpressionToSink);

    VPValue *NewRdxResult;
    VPValue *StartVPV = PhiR->getStartValue();
    if (SentinelVal) {
      // Sentinel-based approach: reduce IVs with min/max, compare against
      // sentinel to detect if condition was ever true, select accordingly.
      VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
      auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
                                           Sentinel, ExitDL);
      NewRdxResult = MiddleBuilder.createSelect(Cmp, VectorRegionExitingVal,
                                                StartVPV, ExitDL);
      // The new phi starts at the sentinel rather than the original start.
      StartVPV = Sentinel;
    } else {
      // Introduce a boolean AnyOf reduction to track if the condition was ever
      // true in the loop. Use it to select the initial start value, if it was
      // never true.
      auto *AnyOfPhi = new VPReductionPHIRecipe(
          /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
          RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
      AnyOfPhi->insertAfter(PhiR);

      VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
      VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, SelectCond);
      // Replace the placeholder backedge operand with the actual Or.
      AnyOfPhi->setOperand(1, OrVal);

      NewRdxResult = MiddleBuilder.createAnyOfReduction(
          OrVal, VectorRegionExitingVal, StartVPV, ExitDL);

      // Initialize the IV reduction phi with the neutral element, not the
      // original start value, to ensure correct min/max reduction results.
      StartVPV = Plan.getOrAddLiveIn(
          getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
    }
    RdxResult->replaceAllUsesWith(NewRdxResult);
    RdxResult->eraseFromParent();

    // Swap the find-last phi for a FindIV phi with the adjusted start value
    // and backedge select.
    auto *NewPhiR = new VPReductionPHIRecipe(
        cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
        *NewFindLastSelect, RdxUnordered{1}, {},
        PhiR->hasUsesOutsideReductionChain());
    NewPhiR->insertBefore(PhiR);
    PhiR->replaceAllUsesWith(NewPhiR);
    PhiR->eraseFromParent();
  }
}


namespace {


/// Shorthand for the extend kinds understood by the partial-reduction cost
/// hook.
using ExtendKind = TTI::PartialReductionExtendKind;
/// Describes one extend feeding a partial reduction.
struct ReductionExtend {
  /// The type the extend converts from; nullptr when there is no extend.
  Type *SrcType = nullptr;
  /// The kind of extend; PR_None when there is no extend.
  ExtendKind Kind = ExtendKind::PR_None;
};


/// Describes the extends used to compute the extended reduction operand.
/// ExtendB is optional. If ExtendB is present, ExtendsUser is a binary
/// operation.
struct ExtendedReductionOperand {
  /// The recipe that consumes the extends.
  VPWidenRecipe *ExtendsUser = nullptr;
  /// Extend descriptions (inputs to getPartialReductionCost). An ExtendB with
  /// Kind == PR_None means there is no second extend.
  ReductionExtend ExtendA, ExtendB;
};


/// A chain of recipes that form a partial reduction. Matches either
///   reduction_bin_op (extended op, accumulator), or
///   reduction_bin_op (accumulator, extended op).
/// The possible forms of the "extended op" are listed in
/// matchExtendedReductionOperand.
struct VPPartialReductionChain {
  /// The top-level binary operation that forms the reduction to a scalar
  /// after the loop body.
  VPWidenRecipe *ReductionBinOp = nullptr;
  /// The user of the extends that is then reduced.
  ExtendedReductionOperand ExtendedOp;
  /// The recurrence kind for the entire partial reduction chain.
  /// This allows distinguishing between Sub and AddWithSub recurrences,
  /// when the ReductionBinOp is a Instruction::Sub.
  /// Note: no default initializer; must be set by whoever builds the chain.
  RecurKind RK;
  /// The index of the accumulator operand of ReductionBinOp. The extended op
  /// is `1 - AccumulatorOpIdx`.
  /// Note: no default initializer; must be set by whoever builds the chain.
  unsigned AccumulatorOpIdx;
  /// The VF scale factor of the partial reduction. It is passed as the
  /// VFScaleFactor of the created reduction recipe and set on the reduction
  /// phi. Note: no default initializer; must be set by whoever builds the
  /// chain.
  unsigned ScaleFactor;
};


/// Rewrite the extended operand \p Op of a partial reduction into a form
/// whose extends can be folded into the partial reduction. Three shapes are
/// handled; anything else is returned untouched. Returns the recipe to use as
/// the extended operand afterwards: either \p Op (possibly modified in place)
/// or a newly created recipe.
static VPSingleDefRecipe *
optimizeExtendsForPartialReduction(VPSingleDefRecipe *Op) {
  // Shape 1:
  //   reduce.add(mul(ext(A), C))
  //   -> reduce.add(mul(ext(A), ext(trunc(C))))
  const APInt *MulConst;
  if (match(Op, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(MulConst)))) {
    auto *LHSExt = cast<VPWidenCastRecipe>(Op->getOperand(0));
    Instruction::CastOps ExtOpc = LHSExt->getOpcode();
    Type *NarrowTy = LHSExt->getOperand(0)->getScalarType();

    // Only rewrite a single-use multiply whose constant survives the
    // trunc/extend round trip.
    const bool CanRewrite =
        Op->hasOneUse() &&
        llvm::canConstantBeExtended(
            MulConst, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc));
    if (!CanRewrite)
      return Op;

    VPBuilder Builder(Op);
    auto *NarrowConst = Builder.createWidenCast(Instruction::CastOps::Trunc,
                                                Op->getOperand(1), NarrowTy);
    Type *WideTy = LHSExt->getScalarType();
    auto *ReExtended = Builder.createWidenCast(ExtOpc, NarrowConst, WideTy);
    Op->setOperand(1, ReExtended);
    return Op;
  }

  // Shape 2:
  //   reduce.add(abs(sub(ext(A), ext(B))))
  //   -> reduce.add(ext(absolute-difference(A, B)))
  VPValue *X, *Y;
  if (match(Op, m_WidenIntrinsic<Intrinsic::abs>(m_Sub(
                    m_ZExtOrSExt(m_VPValue(X)), m_ZExtOrSExt(m_VPValue(Y)))))) {
    auto *Sub = Op->getOperand(0)->getDefiningRecipe();
    auto *Ext = cast<VPWidenCastRecipe>(Sub->getOperand(0));
    assert(Ext->getOpcode() ==
               cast<VPWidenCastRecipe>(Sub->getOperand(1))->getOpcode() &&
           "Expected both the LHS and RHS extends to be the same");
    bool IsSigned = Ext->getOpcode() == Instruction::SExt;
    // Signedness of the extends picks the min/max flavour.
    Intrinsic::ID MaxID = IsSigned ? Intrinsic::smax : Intrinsic::umax;
    Intrinsic::ID MinID = IsSigned ? Intrinsic::smin : Intrinsic::umin;

    VPBuilder Builder(Op);
    Type *SrcTy = X->getScalarType();
    // Both inputs are frozen once and the frozen values are shared by the
    // max and min below.
    auto *FrozenX = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {X}));
    auto *FrozenY = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {Y}));
    auto *Max = Builder.insert(
        new VPWidenIntrinsicRecipe(MaxID, {FrozenX, FrozenY}, SrcTy));
    auto *Min = Builder.insert(
        new VPWidenIntrinsicRecipe(MinID, {FrozenX, FrozenY}, SrcTy));
    // absolute-difference(A, B) == max(A, B) - min(A, B)
    auto *AbsDiff =
        Builder.insert(new VPWidenRecipe(Instruction::Sub, {Max, Min}));
    return Builder.createWidenCast(Instruction::CastOps::ZExt, AbsDiff,
                                   Op->getScalarType());
  }

  // Shape 3:
  //   reduce.add(ext(mul(ext(A), ext(B))))
  //   -> reduce.add(mul(wider_ext(A), wider_ext(B)))
  // TODO: Support this optimization for float types.
  if (match(Op, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),
                                   m_ZExtOrSExt(m_VPValue()))))) {
    auto *OuterExt = cast<VPWidenCastRecipe>(Op);
    auto *Mul = cast<VPWidenRecipe>(OuterExt->getOperand(0));
    auto *LHSExt = cast<VPWidenCastRecipe>(Mul->getOperand(0));
    auto *RHSExt = cast<VPWidenCastRecipe>(Mul->getOperand(1));

    const bool SameOperand = LHSExt == RHSExt;
    const bool OuterKindMismatch =
        OuterExt->getOpcode() != LHSExt->getOpcode() && !SameOperand;
    const bool InnerKindMismatch = LHSExt->getOpcode() != RHSExt->getOpcode();
    if (!Mul->hasOneUse() || OuterKindMismatch || InnerKindMismatch)
      return Op;

    Type *WideTy = OuterExt->getScalarType();
    VPBuilder Builder(Mul);
    auto *NewLHS = Builder.createWidenCast(LHSExt->getOpcode(),
                                           LHSExt->getOperand(0), WideTy);
    // Reuse the widened LHS when the multiply squares a single extend.
    auto *NewRHS = SameOperand
                       ? NewLHS
                       : Builder.createWidenCast(RHSExt->getOpcode(),
                                                 RHSExt->getOperand(0), WideTy);
    auto *NewMul = Mul->cloneWithOperands({NewLHS, NewRHS});
    Builder.insert(NewMul);
    Op->replaceAllUsesWith(NewMul);
    // Erase the outer extend before the multiply it uses.
    Op->eraseFromParent();
    Mul->eraseFromParent();
    return NewMul;
  }

  return Op;
}


/// Bundle the partial reduction \p Red together with the recipes computing
/// its vector operand into a new VPExpressionRecipe. The vector operand must
/// have one of the shapes listed below; any other shape is unreachable.
static VPExpressionRecipe *
createPartialReductionExpression(VPReductionRecipe *Red) {
  VPValue *Operand = Red->getVecOp();
  auto AsExt = [](VPValue *V) { return cast<VPWidenCastRecipe>(V); };
  auto AsWiden = [](VPValue *V) { return cast<VPWidenRecipe>(V); };

  // reduce.[f]add(ext(op))
  //  -> VPExpressionRecipe(op, red)
  if (match(Operand, m_WidenAnyExtend(m_VPValue())))
    return new VPExpressionRecipe(AsExt(Operand), Red);

  // reduce.[f]add(neg(ext(op)))
  // -> VPExpressionRecipe(op, sub/neg, red)
  if (match(Operand, m_AnyNeg(m_WidenAnyExtend(m_VPValue())))) {
    auto *Neg = AsWiden(Operand);
    // The extend is the last operand of the negation.
    auto *Ext = AsExt(Neg->getOperand(Neg->getNumOperands() - 1));
    return new VPExpressionRecipe(Ext, Neg, Red);
  }

  // reduce.[f]add([f]mul(ext(a), ext(b)))
  //  -> VPExpressionRecipe(a, b, mul, red)
  if (match(Operand, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue()))) ||
      match(Operand,
            m_Mul(m_ZExtOrSExt(m_VPValue()), m_ZExtOrSExt(m_VPValue())))) {
    auto *Mul = AsWiden(Operand);
    auto *LHSExt = AsExt(Mul->getOperand(0));
    auto *RHSExt = AsExt(Mul->getOperand(1));
    return new VPExpressionRecipe(LHSExt, RHSExt, Mul, Red);
  }

  // reduce.fadd(fneg(fmul(fpext(a), fpext(b))))
  //  -> VPExpressionRecipe(a, b, fmul, fsub, red)
  if (match(Operand,
            m_FNeg(m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue()))))) {
    auto *FNeg = AsWiden(Operand);
    auto *FMul = AsWiden(FNeg->getOperand(0));
    auto *LHSExt = AsExt(FMul->getOperand(0));
    auto *RHSExt = AsExt(FMul->getOperand(1));
    return new VPExpressionRecipe(LHSExt, RHSExt, FMul, FNeg, Red);
  }

  // reduce.add(neg(mul(ext(a), ext(b))))
  //  -> VPExpressionRecipe(a, b, mul, sub, red)
  if (match(Operand, m_Sub(m_ZeroInt(), m_Mul(m_ZExtOrSExt(m_VPValue()),
                                              m_ZExtOrSExt(m_VPValue()))))) {
    auto *Sub = AsWiden(Operand);
    auto *Mul = AsWiden(Sub->getOperand(1));
    auto *LHSExt = AsExt(Mul->getOperand(0));
    auto *RHSExt = AsExt(Mul->getOperand(1));
    return new VPExpressionRecipe(LHSExt, RHSExt, Mul, Sub, Red);
  }

  llvm_unreachable("Unsupported expression");
}


// Helper to transform a partial reduction chain into a partial reduction
// recipe. Assumes profitability has been checked.
//
// \p Chain describes one link (the reduction binary op, its accumulator index,
// recurrence kind and scale factor). \p RdxPhi is the reduction phi of the
// chain. A VPReductionRecipe wrapped in a VPExpressionRecipe is inserted
// before the link's binary op and takes over its uses. When the link is the
// last one in the chain, \p RdxPhi and its ReductionStartVector are scaled by
// the chain's scale factor, and for sub-reductions the final result is
// rewritten to `start - reduced`.
static void transformToPartialReduction(const VPPartialReductionChain &Chain,
                                        VPlan &Plan,
                                        VPReductionPHIRecipe *RdxPhi) {
  VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
  assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");

  VPValue *Accumulator = WidenRecipe->getOperand(Chain.AccumulatorOpIdx);
  auto *ExtendedOp = cast<VPSingleDefRecipe>(
      WidenRecipe->getOperand(1 - Chain.AccumulatorOpIdx));

  // FIXME: Do these transforms before invoking the cost-model.
  ExtendedOp = optimizeExtendsForPartialReduction(ExtendedOp);

  // Sub-reductions can be implemented in two ways:
  // (1) negate the operand in the vector loop (the default way).
  // (2) subtract the reduced value from the init value in the middle block.
  // Both ways keep the reduction itself as an 'add' reduction.
  //
  // The ISD nodes for partial reductions don't support folding the
  // sub/negation into its operands because the following is not a valid
  // transformation:
  //      sub(0, mul(ext(a), ext(b)))
  //   -> mul(ext(a), ext(sub(0, b)))
  //
  // It's therefore better to choose option (2) such that the partial
  // reduction is always positive (starting at '0') and to do a final
  // subtract in the middle block.
  //
  // The negation below implements option (1); it is only used for a
  // [f]sub link whose chain is not itself a [F]Sub recurrence.
  if ((WidenRecipe->getOpcode() == Instruction::Sub &&
       Chain.RK != RecurKind::Sub) ||
      (WidenRecipe->getOpcode() == Instruction::FSub &&
       Chain.RK != RecurKind::FSub)) {
    VPBuilder Builder(WidenRecipe);
    Type *ElemTy = ExtendedOp->getScalarType();
    VPWidenRecipe *NegRecipe;
    if (WidenRecipe->getOpcode() == Instruction::FSub) {
      NegRecipe =
          new VPWidenRecipe(Instruction::FNeg, {ExtendedOp}, VPIRFlags(),
                            VPIRMetadata(), DebugLoc::getUnknown());
    } else {
      auto *Zero = Plan.getZero(ElemTy);
      NegRecipe =
          new VPWidenRecipe(Instruction::Sub, {Zero, ExtendedOp}, VPIRFlags(),
                            VPIRMetadata(), DebugLoc::getUnknown());
    }
    Builder.insert(NegRecipe);
    ExtendedOp = NegRecipe;
  }

  // Check if WidenRecipe is the final result of the reduction. If so look
  // through selects for predicated reductions.
  VPValue *Cond = nullptr;
  VPValue *ExitValue = cast_or_null<VPInstruction>(
      findUserOf(WidenRecipe, m_Select(m_VPValue(Cond), m_Specific(WidenRecipe),
                                       m_Specific(RdxPhi))));
  // m_VPValue(Cond) may have captured the condition of a select user that
  // did not match the full pattern. Only keep Cond when a matching select
  // was actually found, so the reduction is never predicated on an unrelated
  // condition and ExitValue is never dereferenced while null.
  if (!ExitValue)
    Cond = nullptr;
  bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
                       RdxPhi->getBackedgeValue() == ExitValue;
  assert((!ExitValue || IsLastInChain) &&
         "if we found ExitValue, it must match RdxPhi's backedge value");

  Type *PhiType = RdxPhi->getScalarType();
  RecurKind RdxKind =
      PhiType->isFloatingPointTy() ? RecurKind::FAdd : RecurKind::Add;
  auto *PartialRed = new VPReductionRecipe(
      RdxKind,
      RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlags()
                                 : FastMathFlags(),
      WidenRecipe->getUnderlyingInstr(), Accumulator, ExtendedOp, Cond,
      RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
  PartialRed->insertBefore(WidenRecipe);

  // The predicating select (if any) is subsumed by the conditional reduction.
  if (ExitValue)
    ExitValue->replaceAllUsesWith(PartialRed);
  WidenRecipe->replaceAllUsesWith(PartialRed);

  // For cost-model purposes, fold this into a VPExpression.
  VPExpressionRecipe *E = createPartialReductionExpression(PartialRed);
  E->insertBefore(WidenRecipe);
  PartialRed->replaceAllUsesWith(E);

  // We only need to update the PHI node once, which is when we find the
  // last reduction in the chain.
  if (!IsLastInChain)
    return;

  // Scale the PHI and ReductionStartVector by the VFScaleFactor
  assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
  RdxPhi->setVFScaleFactor(Chain.ScaleFactor);

  auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
  assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
  auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
  StartInst->setOperand(2, NewScaleFactor);

  // If this is the last value in a sub-reduction chain, then update the PHI
  // node to start at `0` and update the reduction-result to subtract from
  // the PHI's start value.
  if (Chain.RK != RecurKind::Sub && Chain.RK != RecurKind::FSub)
    return;

  // Operand 0 is replaced by operand 1 of the start instruction.
  // NOTE(review): operand 1 is presumably the reduction's neutral value;
  // confirm against the ReductionStartVector operand layout.
  VPValue *OldStartValue = StartInst->getOperand(0);
  StartInst->setOperand(0, StartInst->getOperand(1));

  // Replace reduction_result by 'sub (startval, reductionresult)'.
  VPInstruction *RdxResult = vputils::findComputeReductionResult(RdxPhi);
  assert(RdxResult && "Could not find reduction result");

  VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
  unsigned SubOpc = Chain.RK == RecurKind::FSub ? Instruction::BinaryOps::FSub
                                                : Instruction::BinaryOps::Sub;
  VPInstruction *NewResult = Builder.createNaryOp(
      SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
      RdxPhi->getDebugLoc());
  // Redirect every user of the old result except the new subtract itself.
  RdxResult->replaceUsesWithIf(
      NewResult, [NewResult](VPUser &U, unsigned) { return &U != NewResult; });
}


/// Queries TTI for the partial-reduction cost of the chain link \p Link at
/// vectorization factor \p VF, using the cost kind from \p CostCtx.
static InstructionCost
getPartialReductionLinkCost(VPCostContext &CostCtx,
                            const VPPartialReductionChain &Link,
                            ElementCount VF) {
  const ExtendedReductionOperand &ExtOp = Link.ExtendedOp;
  VPWidenRecipe *RdxBinOp = Link.ReductionBinOp;
  Type *RdxType = RdxBinOp->getScalarType();

  // A second extend means the user of the extends is a binary operation whose
  // opcode is part of the cost query.
  std::optional<unsigned> BinOpc;
  if (ExtOp.ExtendB.Kind != ExtendKind::PR_None)
    BinOpc = ExtOp.ExtendsUser->getOpcode();

  // Fast-math flags are only passed for floating-point reductions.
  std::optional<llvm::FastMathFlags> Flags;
  if (RdxType->isFloatingPointTy())
    Flags = RdxBinOp->getFastMathFlags();

  // [F]Sub recurrences are costed as [F]Add; every other kind uses the
  // opcode of the link's reduction operation.
  unsigned LinkOpcode;
  if (Link.RK == RecurKind::Sub)
    LinkOpcode = Instruction::Add;
  else if (Link.RK == RecurKind::FSub)
    LinkOpcode = Instruction::FAdd;
  else
    LinkOpcode = RdxBinOp->getOpcode();

  return CostCtx.TTI.getPartialReductionCost(
      LinkOpcode, ExtOp.ExtendA.SrcType, ExtOp.ExtendB.SrcType, RdxType, VF,
      ExtOp.ExtendA.Kind, ExtOp.ExtendB.Kind, BinOpc, CostCtx.CostKind, Flags);
}


/// Maps the opcode of the widened cast \p Cast to the corresponding
/// partial-reduction extend kind.
static ExtendKind getPartialReductionExtendKind(VPWidenCastRecipe *Cast) {
  const Instruction::CastOps CastOpc = Cast->getOpcode();
  return TTI::getPartialReductionExtendKind(CastOpc);
}


/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
/// operand. This is an operand where the source of the value (e.g. a load) has
/// been extended (sext, zext, or fpext) before it is used in the reduction.
///
/// Possible forms matched by this function (wherever `mul` appears, both
/// integer Mul and FMul opcodes are accepted):
///  - UpdateR(PrevValue, ext(...))
///  - UpdateR(PrevValue, mul(ext(...), ext(...)))
///  - UpdateR(PrevValue, mul(ext(...), Constant))
///  - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
///  - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
///  - UpdateR(PrevValue, abs(sub(ext(...), ext(...))))
///
/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
///
/// \returns a description of the matched operand (the recipe using the
/// extends plus the source type and extend kind of each extend), or
/// std::nullopt if \p Op has none of the forms above.
static std::optional<ExtendedReductionOperand>
matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op) {
  assert(is_contained(UpdateR->operands(), Op) &&
         "Op should be operand of UpdateR");

  // Try matching an absolute difference operand of the form
  // `abs(sub(ext(A), ext(B)))`. This will be later transformed into
  // `ext(absolute-difference(A, B))`. This allows us to perform the absolute
  // difference on a wider type and get the extend for "free" from the partial
  // reduction.
  // Both the abs and the sub must have a single use.
  VPValue *X, *Y;
  if (Op->hasOneUse() &&
      match(Op, m_WidenIntrinsic<Intrinsic::abs>(
                    m_OneUse(m_Sub(m_WidenAnyExtend(m_VPValue(X)),
                                   m_WidenAnyExtend(m_VPValue(Y))))))) {
    auto *Abs = cast<VPWidenIntrinsicRecipe>(Op);
    auto *Sub = cast<VPWidenRecipe>(Abs->getOperand(0));
    auto *LHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(0));
    auto *RHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(1));
    Type *LHSInputType = X->getScalarType();
    Type *RHSInputType = Y->getScalarType();
    // Both inputs must have the same source type and be extended with the same
    // cast opcode; otherwise give up on this operand entirely.
    if (LHSInputType != RHSInputType ||
        LHSExt->getOpcode() != RHSExt->getOpcode())
      return std::nullopt;
    // Note: This is essentially the same as matching ext(...) as we will
    // rewrite this operand to ext(absolute-difference(A, B)).
    return ExtendedReductionOperand{
        Sub,
        /*ExtendA=*/{LHSInputType, getPartialReductionExtendKind(LHSExt)},
        /*ExtendB=*/{}};
  }

  // Set only when Op itself is an extend; used below to require that the inner
  // extends of an ext(mul(...)) operand have the same kind as the outer one.
  std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
  if (match(Op, m_WidenAnyExtend(m_VPValue()))) {
    auto *CastRecipe = cast<VPWidenCastRecipe>(Op);
    VPValue *CastSource = CastRecipe->getOperand(0);
    OuterExtKind = getPartialReductionExtendKind(CastRecipe);
    if (match(CastSource, m_Mul(m_VPValue(), m_VPValue())) ||
        match(CastSource, m_FMul(m_VPValue(), m_VPValue()))) {
      // Match: ext(mul(...))
      // Record the outer extend kind and set `Op` to the mul. We can then match
      // this as a binary operation. Note: We can optimize out the outer extend
      // by widening the inner extends to match it. See
      // optimizeExtendsForPartialReduction.
      Op = CastSource;
    } else {
      // Plain ext(...): the reduction update itself is the user of the extend
      // and there is no second extend.
      return ExtendedReductionOperand{
          UpdateR,
          /*ExtendA=*/{CastSource->getScalarType(), *OuterExtKind},
          /*ExtendB=*/{}};
    }
  }

  // From here on `Op` is either the original operand or the mul found under an
  // outer extend; in both cases it must have exactly one use.
  if (!Op->hasOneUse())
    return std::nullopt;

  VPWidenRecipe *MulOp = dyn_cast<VPWidenRecipe>(Op);
  if (!MulOp ||
      !is_contained({Instruction::Mul, Instruction::FMul}, MulOp->getOpcode()))
    return std::nullopt;

  // The rest of the matching assumes `Op` is a (possibly extended) mul
  // operation.

  VPValue *LHS = MulOp->getOperand(0);
  VPValue *RHS = MulOp->getOperand(1);

  // The LHS of the operation must always be an extend.
  if (!match(LHS, m_WidenAnyExtend(m_VPValue())))
    return std::nullopt;

  auto *LHSCast = cast<VPWidenCastRecipe>(LHS);
  Type *LHSInputType = LHSCast->getOperand(0)->getScalarType();
  ExtendKind LHSExtendKind = getPartialReductionExtendKind(LHSCast);

  // The RHS of the operation can be an extend or a constant integer.
  // A constant is only accepted if canConstantBeExtended approves it for the
  // LHS source type and extend kind.
  const APInt *RHSConst = nullptr;
  VPWidenCastRecipe *RHSCast = nullptr;
  if (match(RHS, m_WidenAnyExtend(m_VPValue())))
    RHSCast = cast<VPWidenCastRecipe>(RHS);
  else if (!match(RHS, m_APInt(RHSConst)) ||
           !canConstantBeExtended(RHSConst, LHSInputType, LHSExtendKind))
    return std::nullopt;

  // The outer extend kind must match the inner extends for folding.
  // RHSCast is null for a constant RHS and is skipped by the `Cast &&` test.
  for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast})
    if (Cast && OuterExtKind &&
        getPartialReductionExtendKind(Cast) != OuterExtKind)
      return std::nullopt;

  // A constant RHS is described as if it were extended exactly like the LHS;
  // a real RHS extend overrides these defaults.
  Type *RHSInputType = LHSInputType;
  ExtendKind RHSExtendKind = LHSExtendKind;
  if (RHSCast) {
    RHSInputType = RHSCast->getOperand(0)->getScalarType();
    RHSExtendKind = getPartialReductionExtendKind(RHSCast);
  }

  return ExtendedReductionOperand{
      MulOp, {LHSInputType, LHSExtendKind}, {RHSInputType, RHSExtendKind}};
}


/// Walks the reduction chain of \p RedPhiR backwards, from the value feeding
/// its ComputeReductionResult to the PHI itself, and records one
/// VPPartialReductionChain link per update recipe.
///
/// Every update must be a widened binary operation with one extended operand
/// (see matchExtendedReductionOperand) whose source type size is a known
/// scalar factor of the PHI type size. Returns std::nullopt if any link fails
/// these checks or if no ComputeReductionResult is found; otherwise returns
/// the links in program order. No cost or VF-range decisions are made here.
static std::optional<SmallVector<VPPartialReductionChain>>
getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPCostContext &CostCtx,
                    VFRange &Range) {
  auto *ReductionResult = vputils::findComputeReductionResult(RedPhiR);
  if (!ReductionResult)
    return std::nullopt;

  // For predicated reductions the result is fed by a select; in that case
  // continue from the select's second operand instead.
  VPValue *ChainEnd = ReductionResult->getOperand(0);
  match(ChainEnd, m_Select(m_VPValue(), m_VPValue(ChainEnd), m_VPValue()));

  const RecurKind Kind = RedPhiR->getRecurrenceKind();
  const TypeSize PhiBits = RedPhiR->getScalarType()->getPrimitiveSizeInBits();

  SmallVector<VPPartialReductionChain> Links;
  for (VPValue *Cursor = ChainEnd; Cursor != RedPhiR;) {
    auto *UpdateR = dyn_cast<VPWidenRecipe>(Cursor);
    if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
      return std::nullopt;

    // Assume operand 1 is the extended input and operand 0 the accumulator;
    // if that does not match, try the operands the other way around.
    VPValue *Accumulator = UpdateR->getOperand(0);
    VPValue *Extended = UpdateR->getOperand(1);
    std::optional<ExtendedReductionOperand> ExtInfo =
        matchExtendedReductionOperand(UpdateR, Extended);
    if (!ExtInfo) {
      ExtInfo = matchExtendedReductionOperand(UpdateR, Accumulator);
      if (!ExtInfo)
        return std::nullopt;
      std::swap(Extended, Accumulator);
    }

    // The scale factor is only defined when the PHI size is a known multiple
    // of the extend's source size.
    const TypeSize SrcBits = ExtInfo->ExtendA.SrcType->getPrimitiveSizeInBits();
    if (!PhiBits.hasKnownScalarFactor(SrcBits))
      return std::nullopt;

    const unsigned AccumulatorIdx =
        Accumulator == UpdateR->getOperand(0) ? 0U : 1U;
    const auto Scale =
        static_cast<unsigned>(PhiBits.getKnownScalarFactor(SrcBits));
    VPPartialReductionChain Link(
        {UpdateR, *ExtInfo, Kind, AccumulatorIdx, Scale});
    Links.push_back(Link);

    // Step to the previous link of the chain.
    Cursor = Accumulator;
  }

  // Links were gathered from the exit value towards the PHI; flip them so the
  // result is in program order.
  std::reverse(Links.begin(), Links.end());
  return Links;
}

} // namespace


/// Forms partial reductions in \p Plan for reduction chains where doing so is
/// both valid and cheaper than a regular reduction.
///
/// Steps:
///  1. Collect, per reduction PHI in the vector loop header, the candidate
///     chain returned by getScaledReductions.
///  2. Drop a chain if the PHI's reduction result feeds a store replicate
///     recipe, if any extend feeding a link has a user that is not a
///     candidate partial-reduction operation, or if a link's reduction
///     operation has a user that is neither a link with the same scale factor,
///     the reduction result, the matching tail-folding select, nor the PHI.
///  3. Drop a chain if it is not profitable at the start of \p Range, clamping
///     \p Range so that the decision is uniform across it.
///  4. Transform every link of each surviving chain.
///
/// \param Plan    The plan to transform.
/// \param CostCtx Cost context used for both the partial and regular costs.
/// \param Range   VF range; may be clamped by the profitability query.
void VPlanTransforms::createPartialReductions(VPlan &Plan,
                                              VPCostContext &CostCtx,
                                              VFRange &Range) {
  // Find all possible valid partial reductions, grouping chains by their PHI.
  // This grouping allows invalidating the whole chain, if any link is not a
  // valid partial reduction.
  MapVector<VPReductionPHIRecipe *, SmallVector<VPPartialReductionChain>>
      ChainsByPhi;
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &R : HeaderVPBB->phis()) {
    auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
    if (!RedPhiR)
      continue;

    if (auto Chains = getScaledReductions(RedPhiR, CostCtx, Range))
      ChainsByPhi.try_emplace(RedPhiR, std::move(*Chains));
  }

  if (ChainsByPhi.empty())
    return;

  // Build set of partial reduction operations for extend user validation and
  // a map of reduction bin ops to their scale factors for scale validation.
  SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
  DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
  for (const auto &[_, Chains] : ChainsByPhi)
    for (const VPPartialReductionChain &Chain : Chains) {
      PartialReductionOps.insert(Chain.ExtendedOp.ExtendsUser);
      ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
    }

  // A partial reduction is invalid if any of its extends are used by
  // something that isn't another partial reduction. This is because the
  // extends are intended to be lowered along with the reduction itself.
  auto ExtendUsersValid = [&](VPValue *Ext) {
    return !isa<VPWidenCastRecipe>(Ext) || all_of(Ext->users(), [&](VPUser *U) {
      return PartialReductionOps.contains(cast<VPRecipeBase>(U));
    });
  };

  auto IsProfitablePartialReductionChainForVF =
      [&](ArrayRef<VPPartialReductionChain> Chain, ElementCount VF) -> bool {
    InstructionCost PartialCost = 0, RegularCost = 0;

    // The chain is a profitable partial reduction chain if the cost of handling
    // the entire chain is cheaper when using partial reductions than when
    // handling the entire chain using regular reductions.
    for (const VPPartialReductionChain &Link : Chain) {
      const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
      InstructionCost LinkCost = getPartialReductionLinkCost(CostCtx, Link, VF);
      if (!LinkCost.isValid())
        return false;

      PartialCost += LinkCost;
      RegularCost += Link.ReductionBinOp->computeCost(VF, CostCtx);
      // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
      if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
        RegularCost += ExtendedOp.ExtendsUser->computeCost(VF, CostCtx);
      for (VPValue *Op : ExtendedOp.ExtendsUser->operands())
        if (auto *Extend = dyn_cast<VPWidenCastRecipe>(Op))
          RegularCost += Extend->computeCost(VF, CostCtx);
    }
    return PartialCost.isValid() && PartialCost < RegularCost;
  };

  // Returns true if the compute-reduction-result of \p Phi is used by a store
  // replicate recipe (a sunk store). This depends only on the PHI, so it is
  // evaluated once per chain rather than once per link.
  auto FeedsSunkStore = [](VPReductionPHIRecipe *Phi) {
    auto *RdxResult = vputils::findComputeReductionResult(Phi);
    return RdxResult && any_of(RdxResult->users(), [](VPUser *U) {
             auto *RepR = dyn_cast<VPReplicateRecipe>(U);
             return RepR && RepR->getOpcode() == Instruction::Store;
           });
  };

  // Validate chains: check that extends are only used by partial reductions,
  // and that reduction bin ops are only used by other partial reductions with
  // matching scale factors, are outside the loop region or the select
  // introduced by tail-folding. Otherwise we would create users of scaled
  // reductions where the types of the other operands don't match.
  for (auto &[RedPhiR, Chains] : ChainsByPhi) {
    // Nothing to validate or cost for a PHI without any links.
    if (Chains.empty())
      continue;

    // TODO: Also form partial reductions when the result feeds a sunk store.
    if (FeedsSunkStore(RedPhiR)) {
      Chains.clear();
      continue;
    }

    // Note: structured bindings cannot be captured by a lambda in C++17, hence
    // the explicit init-capture of RedPhiR.
    auto LinkIsValid = [&, RedPhiR = RedPhiR](
                           const VPPartialReductionChain &Chain) {
      if (!all_of(Chain.ExtendedOp.ExtendsUser->operands(), ExtendUsersValid))
        return false;

      auto UseIsValid = [&](VPUser *U) {
        if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
          return PhiR == RedPhiR;
        auto *R = cast<VPSingleDefRecipe>(U);
        return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
               match(R, m_ComputeReductionResult(
                            m_Specific(Chain.ReductionBinOp))) ||
               match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
                                 m_Specific(RedPhiR)));
      };
      return all_of(Chain.ReductionBinOp->users(), UseIsValid);
    };

    // A single invalid link invalidates the whole chain.
    if (!all_of(Chains, LinkIsValid)) {
      Chains.clear();
      continue;
    }

    // Clear the chain if it is not profitable. Only chains that survived
    // validation are costed.
    if (!LoopVectorizationPlanner::getDecisionAndClampRange(
            [&, &Chains = Chains](ElementCount VF) {
              return IsProfitablePartialReductionChainForVF(Chains, VF);
            },
            Range))
      Chains.clear();
  }

  for (auto &[Phi, Chains] : ChainsByPhi)
    for (const VPPartialReductionChain &Chain : Chains)
      transformToPartialReduction(Chain, Plan, Phi);
}


/// Replaces every load/store VPInstruction with an underlying value in the
/// vector loop region of \p Plan by the recipe chosen through
/// \p RecipeBuilder. \p Range may be clamped by the decisions taken here.
void VPlanTransforms::makeMemOpWideningDecisions(
    VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder) {
  // Gather the memory operations up front: the replacement below erases
  // recipes, and simpler decisions are taken before the ones that are
  // potentially guided by or dependent on them.
  SmallVector<VPInstruction *> Worklist;
  VPBasicBlock *LoopEntry = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_shallow(LoopEntry))) {
    for (VPRecipeBase &R : *VPBB) {
      auto *VPI = dyn_cast<VPInstruction>(&R);
      if (!VPI || !VPI->getUnderlyingValue())
        continue;
      if (is_contained({Instruction::Load, Instruction::Store},
                       VPI->getOpcode()))
        Worklist.push_back(VPI);
    }
  }

  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
  VPBuilder FinalRedStoresBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());

  auto IsScalarVF = [](ElementCount VF) { return VF.isScalar(); };
  for (VPInstruction *MemOp : Worklist) {
    // Reduction stores are handled first; this must happen for a scalar VPlan
    // as well.
    if (RecipeBuilder.replaceWithFinalIfReductionStore(MemOp,
                                                       FinalRedStoresBuilder))
      continue;

    // All remaining memory operations are left untouched in a scalar VPlan.
    if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalarVF, Range))
      continue;

    // Pick the first applicable lowering: histogram, widened memory access,
    // and finally replication.
    VPRecipeBase *Lowered = RecipeBuilder.widenIfHistogram(MemOp);
    if (!Lowered)
      Lowered = RecipeBuilder.tryToWidenMemory(MemOp, Range);
    if (!Lowered)
      Lowered = RecipeBuilder.handleReplication(MemOp, Range);

    // Splice the new recipe in; only loads produce a value whose users need to
    // be redirected.
    Lowered->insertBefore(MemOp);
    if (MemOp->getOpcode() == Instruction::Load)
      MemOp->replaceAllUsesWith(Lowered->getVPSingleValue());
    MemOp->eraseFromParent();
  }
}


/// Replaces VPInstructions of which only the first lane is used by
/// single-scalar VPReplicateRecipes. Nothing is done when the start of
/// \p Range is a scalar VF; \p Range is clamped by that query.
void VPlanTransforms::makeScalarizationDecisions(VPlan &Plan, VFRange &Range) {
  auto IsScalarVF = [&](ElementCount VF) { return VF.isScalar(); };
  if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalarVF, Range))
    return;

  // Returns true if \p VPI may be turned into a single-scalar replicate.
  auto CanNarrowToSingleScalar = [](VPInstruction *VPI) {
    // If executing other lanes produces side-effects we can't avoid them.
    if (VPI->mayHaveSideEffects())
      return false;

    // The mask operand gets dropped, which is only fine if the instruction is
    // safe to execute speculatively.
    if (VPI->isMasked() && !VPI->isSafeToSpeculativelyExecute())
      return false;

    // Leave IV increments alone, rewriting them interferes with
    // `removeRedundantCanonicalIVs`.
    const bool IsIVIncrement =
        VPI->getOpcode() == Instruction::Add &&
        any_of(VPI->operands(), IsaPred<VPWidenIntOrFpInductionRecipe>);
    if (IsIVIncrement)
      return false;

    // If other lanes are needed they cannot be dropped.
    return vputils::onlyFirstLaneUsed(VPI);
  };

  PostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> POT(
      Plan.getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(POT)) {
    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
      auto *VPI = dyn_cast<VPInstruction>(&R);
      if (!VPI)
        continue;

      // Without an underlying instruction no `VPReplicateRecipe` can be
      // created.
      auto *Underlying = cast_or_null<Instruction>(VPI->getUnderlyingValue());
      if (!Underlying || !CanNarrowToSingleScalar(VPI))
        continue;

      auto *SingleScalar = new VPReplicateRecipe(
          Underlying, VPI->operandsWithoutMask(), /*IsSingleScalar=*/true,
          /*Mask=*/nullptr, *VPI, *VPI, VPI->getDebugLoc());
      SingleScalar->insertBefore(VPI);
      VPI->replaceAllUsesWith(SingleScalar);
      VPI->eraseFromParent();
    }
  }
}


/// Returns true if \p Info's parameter kinds are compatible with \p Args.


static bool areVFParamsOk(const VFInfo &Info, ArrayRef<VPValue *> Args,

                          PredicatedScalarEvolution &PSE, const Loop *L) {

  ScalarEvolution *SE = PSE.getSE();

  return all_of(Info.Shape.Parameters, [&](VFParameter Param) {

    switch (Param.ParamKind) {

    case VFParamKind::Vector:

    case VFParamKind::GlobalPredicate:

      return true;

    case VFParamKind::OMP_Uniform:

      return SE->isSCEVable(Args[Param.ParamPos]->getScalarType()) &&

             SE->isLoopInvariant(

                 vputils::getSCEVExprForVPValue(Args[Param.ParamPos], PSE, L),

                 L);

    case VFParamKind::OMP_Linear:

      return match(vputils::getSCEVExprForVPValue(Args[Param.ParamPos], PSE, L),

                   m_scev_AffineAddRec(

                       m_SCEV(), m_scev_SpecificSInt(Param.LinearStepOrPos),

                       m_SpecificLoop(L)));

    default:

      return false;

    }

  });

}


/// Looks up a vector variant of \p CI for \p VF whose parameter kinds are
/// compatible with \p Args. When \p MaskRequired is set only masked variants
/// qualify. Masked variants are assumed to take the mask as a trailing
/// parameter.
///
/// \returns the function named by the first matching mapping, as found in the
/// call's module, or nullptr if the call is nobuiltin or nothing matches.
static Function *findVectorVariant(CallInst *CI, ArrayRef<VPValue *> Args,
                                   ElementCount VF, bool MaskRequired,
                                   PredicatedScalarEvolution &PSE,
                                   const Loop *L) {
  if (CI->isNoBuiltin())
    return nullptr;

  auto Mappings = VFDatabase::getMappings(*CI);
  for (const VFInfo &Candidate : Mappings) {
    if (Candidate.Shape.VF != VF)
      continue;
    if (MaskRequired && !Candidate.isMasked())
      continue;
    if (!areVFParamsOk(Candidate, Args, PSE, L))
      continue;
    return CI->getModule()->getFunction(Candidate.VectorName);
  }
  return nullptr;
}


namespace {
/// Describes how a call is going to be widened at a particular VF.
struct CallWideningDecision {
  /// The available lowering strategies.
  enum class KindTy { Scalarize, Intrinsic, VectorVariant };

  /// Intentionally implicit so that a bare KindTy converts to a decision.
  CallWideningDecision(KindTy Kind, Function *Variant = nullptr)
      : Kind(Kind), Variant(Variant) {}

  /// The selected strategy.
  KindTy Kind;

  /// The vector function to call; set when Kind == VectorVariant.
  Function *Variant;

  /// Two decisions are equal if both the strategy and the variant agree.
  bool operator==(const CallWideningDecision &Other) const {
    if (Kind != Other.Kind)
      return false;
    return Variant == Other.Variant;
  }
};
} // namespace


/// Chooses, for the call \p VPI at \p VF, between scalarizing, using a vector
/// intrinsic, and calling a vector library variant, based on their costs.
///
/// \param VPI     The call VPInstruction; its underlying instruction must be a
///                CallInst.
/// \param Ops     The call arguments.
/// \param VF      The vectorization factor to decide for.
/// \param CostCtx Cost context providing TLI, PSE, the loop and cost queries.
static CallWideningDecision decideCallWidening(VPInstruction &VPI,
                                               ArrayRef<VPValue *> Ops,
                                               ElementCount VF,
                                               VPCostContext &CostCtx) {
  using KindTy = CallWideningDecision::KindTy;
  auto *Call = cast<CallInst>(VPI.getUnderlyingInstr());

  // Scalar VFs and calls forced or known to scalarize always replicate.
  if (VF.isScalar() || CostCtx.willBeScalarized(Call, VF))
    return KindTy::Scalarize;

  // The callee is the last operand before the optional mask.
  VPValue *CalleeOp = VPI.getOperand(VPI.getNumOperandsWithoutMask() - 1);
  auto *Callee = cast<Function>(CalleeOp->getLiveInIRValue());
  Type *RetTy = VPI.getScalarType();
  Intrinsic::ID IID = getVectorIntrinsicIDForCall(Call, &CostCtx.TLI);
  bool NeedsMask = CostCtx.isMaskRequired(Call);

  // Pseudo intrinsics (assume, lifetime, ...) are always scalarized.
  if (IID && VPCostContext::isFreeScalarIntrinsic(IID))
    return KindTy::Scalarize;

  InstructionCost ReplicateCost =
      VPReplicateRecipe::computeCallCost(Callee, RetTy, Ops,
                                         /*IsSingleScalar=*/false, VF, CostCtx);

  // The variant cost stays invalid when no vector variant is available.
  InstructionCost VariantCost = InstructionCost::getInvalid();
  Function *Variant =
      findVectorVariant(Call, Ops, VF, NeedsMask, CostCtx.PSE, CostCtx.L);
  if (Variant)
    VariantCost = VPWidenCallRecipe::computeCallCost(Variant, CostCtx);

  // The intrinsic wins if its cost is valid and it is no more expensive than
  // scalarizing and than any available vector variant.
  if (IID) {
    InstructionCost IntrinCost =
        VPWidenIntrinsicRecipe::computeCallCost(IID, Ops, VPI, VF, CostCtx);
    const bool NoWorseThanVariant = !Variant || VariantCost >= IntrinCost;
    if (IntrinCost.isValid() && ReplicateCost >= IntrinCost &&
        NoWorseThanVariant)
      return KindTy::Intrinsic;
  }

  // Next choice: a vector library variant that is no more expensive than
  // scalarizing.
  if (Variant && ReplicateCost >= VariantCost)
    return {KindTy::VectorVariant, Variant};

  return KindTy::Scalarize;
}


/// Replaces every call VPInstruction with an underlying value in the vector
/// loop region of \p Plan by a widened intrinsic, a widened call to a vector
/// variant, or a replicate recipe, as chosen by decideCallWidening for the
/// start of \p Range. \p Range is clamped to the VFs sharing that decision.
void VPlanTransforms::makeCallWideningDecisions(VPlan &Plan, VFRange &Range,
                                                VPRecipeBuilder &RecipeBuilder,
                                                VPCostContext &CostCtx) {
  using KindTy = CallWideningDecision::KindTy;

  for (VPBasicBlock *VPBB : VPBlockUtils::blocksAs<VPBasicBlock>(
           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      auto *VPI = dyn_cast<VPInstruction>(&R);
      if (!VPI || !VPI->getUnderlyingValue())
        continue;
      if (VPI->getOpcode() != Instruction::Call)
        continue;

      // The leading operands of the VPInstruction are the call arguments.
      auto *Call = cast<CallInst>(VPI->getUnderlyingInstr());
      SmallVector<VPValue *, 4> Ops(VPI->op_begin(),
                                    VPI->op_begin() + Call->arg_size());

      // Decide for the first VF and restrict the range to the VFs that agree.
      CallWideningDecision Decision =
          decideCallWidening(*VPI, Ops, Range.Start, CostCtx);
      auto AgreesWithDecision = [&](ElementCount VF) {
        return Decision == decideCallWidening(*VPI, Ops, VF, CostCtx);
      };
      LoopVectorizationPlanner::getDecisionAndClampRange(AgreesWithDecision,
                                                         Range);

      VPSingleDefRecipe *NewRecipe = nullptr;
      if (Decision.Kind == KindTy::Intrinsic) {
        Intrinsic::ID IID = getVectorIntrinsicIDForCall(Call, &CostCtx.TLI);
        Type *RetTy = VPI->getScalarType();
        NewRecipe = new VPWidenIntrinsicRecipe(*Call, IID, Ops, RetTy, *VPI,
                                               *VPI, VPI->getDebugLoc());
      } else if (Decision.Kind == KindTy::VectorVariant) {
        // A variant with more parameters than the call has arguments is a
        // masked variant taking the mask as trailing parameter; pass the
        // call's mask, or an all-true mask if it has none.
        if (Decision.Variant->arg_size() > Ops.size()) {
          VPValue *Mask = VPI->isMasked() ? VPI->getMask() : Plan.getTrue();
          Ops.push_back(Mask);
        }
        // The callee operand goes last.
        Ops.push_back(VPI->getOperand(VPI->getNumOperandsWithoutMask() - 1));
        NewRecipe = new VPWidenCallRecipe(Call, Decision.Variant, Ops, *VPI,
                                          *VPI, VPI->getDebugLoc());
      } else {
        // KindTy::Scalarize.
        NewRecipe = RecipeBuilder.handleReplication(VPI, Range);
      }

      NewRecipe->insertBefore(VPI);
      VPI->replaceAllUsesWith(NewRecipe);
      VPI->eraseFromParent();
    }
  }
}


/// Converts non-consecutive widened loads in the vector loop region of \p Plan
/// into `experimental_vp_strided_load` memory intrinsics.
///
/// A load is a candidate if its address is a VPWidenGEPRecipe whose SCEV is an
/// affine add-recurrence in \p L with a constant step. The conversion is done
/// only if the target reports strided accesses as legal for the data type and
/// alignment, and the strided load is strictly cheaper than the current load
/// at the start of \p Range. \p Range is clamped so the decision holds for all
/// its VFs.
///
/// \param Plan  The plan to transform; nothing is done for scalar-only plans.
/// \param PSE   Used to compute the SCEV of the load address.
/// \param L     The loop the add-recurrence must belong to.
/// \param Ctx   Cost context providing TTI and the legacy widening decisions.
/// \param Range VF range; may be clamped by the profitability query.
void VPlanTransforms::convertToStridedAccesses(VPlan &Plan,
                                               PredicatedScalarEvolution &PSE,
                                               Loop &L, VPCostContext &Ctx,
                                               VFRange &Range) {
  if (Plan.hasScalarVFOnly())
    return;

  VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
  // VF converted to i32, created lazily in the vector preheader and shared by
  // all strided loads created below.
  VPValue *I32VF = nullptr;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_shallow(VectorLoop->getEntry()))) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      auto *LoadR = dyn_cast<VPWidenLoadRecipe>(&R);
      // TODO: Support strided store.
      // TODO: Transform reverse access into strided access with -1 stride.
      // TODO: Transform gather/scatter with uniform address into strided access
      // with 0 stride.
      // TODO: Transform interleave access into multiple strided accesses.
      if (!LoadR || LoadR->isConsecutive())
        continue;

      // Only addresses computed by a widened GEP are handled.
      auto *Ptr = dyn_cast<VPWidenGEPRecipe>(LoadR->getAddr());
      if (!Ptr)
        continue;

      // Check if this is a strided access by analyzing the address SCEV for an
      // affine addRec.
      const SCEV *PtrSCEV = vputils::getSCEVExprForVPValue(Ptr, PSE, &L);
      const SCEV *Start;
      const SCEVConstant *Step;
      // TODO: Support non-constant loop invariant stride.
      if (!match(PtrSCEV,
                 m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step),
                                     m_SpecificLoop(&L))))
        continue;

      Type *LoadTy = LoadR->getScalarType();
      Align Alignment = LoadR->getAlign();
      // Profitable means: legal for the target and strictly cheaper than the
      // load recipe as it currently is.
      auto IsProfitable = [&](ElementCount VF) {
        Type *DataTy = toVectorTy(LoadTy, VF);
        if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment))
          return false;
        const InstructionCost CurrentCost = LoadR->computeCost(VF, Ctx);
        const InstructionCost StridedLoadStoreCost =
            VPWidenMemIntrinsicRecipe::computeMemIntrinsicCost(
                Intrinsic::experimental_vp_strided_load, DataTy,
                LoadR->isMasked(), Alignment, Ctx);
        return StridedLoadStoreCost < CurrentCost;
      };

      if (!LoopVectorizationPlanner::getDecisionAndClampRange(IsProfitable,
                                                              Range))
        continue;

      // Invalidate the legacy widening decision so the cost of replaced load is
      // not counted during precomputeCosts.
      // TODO: Remove once the legacy exit cost computation is retired.
      for (ElementCount VF : Range)
        Ctx.invalidateWideningDecision(&LoadR->getIngredient(), VF);

      // Get VF as i32 for the vector length operand.
      if (!I32VF) {
        VPBuilder Builder(Plan.getVectorPreheader());
        I32VF = Builder.createScalarZExtOrTrunc(
            &Plan.getVF(), Type::getInt32Ty(Plan.getContext()),
            Plan.getVF().getScalarType(), DebugLoc::getUnknown());
      }

      // All new recipes for this load are inserted right before it.
      VPBuilder Builder(LoadR);
      // Create the base pointer of strided access.
      // The base is Start + CanonicalIV * Stride, with the stride converted
      // (sign-extended or truncated) to the canonical IV type.
      VPValue *StartVPV = vputils::getOrCreateVPValueForSCEVExpr(Plan, Start);
      VPValue *StrideInBytes = Plan.getOrAddLiveIn(Step->getValue());
      Type *IndexTy = Plan.getDataLayout().getIndexType(Ptr->getScalarType());
      assert(IndexTy == StrideInBytes->getScalarType() &&
             "Stride type from SCEV must match the index type");
      VPValue *CanIVTyStride = Builder.createScalarSExtOrTrunc(
          StrideInBytes, VectorLoop->getCanonicalIVType(), IndexTy,
          DebugLoc::getUnknown());
      auto *AddRecPtr = cast<SCEVAddRecExpr>(PtrSCEV);
      // The no-wrap flags of the add-recurrence are passed on to the offset
      // multiply and, for nuw, to the pointer add.
      auto *Offset = Builder.createOverflowingOp(
          Instruction::Mul, {VectorLoop->getCanonicalIV(), CanIVTyStride},
          {AddRecPtr->hasNoUnsignedWrap(), AddRecPtr->hasNoSignedWrap()});
      auto *BasePtr = Builder.createNoWrapPtrAdd(
          StartVPV, Offset,
          AddRecPtr->hasNoUnsignedWrap() ? GEPNoWrapFlags::noUnsignedWrap()
                                         : GEPNoWrapFlags::none());

      // Create a new vector pointer for strided access.
      // It uses an i8 element type together with the byte stride.
      VPValue *NewPtr = Builder.createVectorPointer(
          BasePtr, Type::getInt8Ty(Plan.getContext()), StrideInBytes,
          Ptr->getGEPNoWrapFlags(), Ptr->getDebugLoc());

      // An unmasked load gets an all-true mask.
      VPValue *Mask = LoadR->getMask();
      if (!Mask)
        Mask = Plan.getTrue();
      auto *StridedLoad = Builder.createWidenMemIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {NewPtr, StrideInBytes, Mask, I32VF}, LoadTy, Alignment, *LoadR,
          LoadR->getDebugLoc());
      // The original load recipe is not erased here, only its uses are
      // redirected.
      // NOTE(review): presumably it is cleaned up by a later dead-recipe
      // removal - confirm.
      LoadR->replaceAllUsesWith(StridedLoad);
    }
  }
}


FMAInstKind::Accumulator
@ Accumulator
Definition AArch64InstrInfo.cpp:8579

assert
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

Select
AMDGPU Register Bank Select
Definition AMDGPURegBankSelect.cpp:68

APInt.h
This file implements a class to represent arbitrary precision integral constant values and operations...

DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition ARMSLSHardening.cpp:73

isEqual
static bool isEqual(const Function &Caller, const Function &Callee)
Definition Attributes.cpp:2607

getParent
static const Function * getParent(const Value *V)
Definition BasicAliasAnalysis.cpp:894

X
#define X(NUM, ENUM, NAME)
Definition ELF.h:853

A
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")

E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")

B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

Casting.h

CostKind
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))

IntrinsicCost
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))

isSentinel
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
Definition DWARFAcceleratorTable.cpp:503

Default
@ Default
Definition DwarfDebug.cpp:86

GEP
Hexagon Common GEP
Definition HexagonCommonGEP.cpp:164

_
#define _
Definition HexagonMCCodeEmitter.cpp:46

IVDescriptors.h

Users
iv Induction Variable Users
Definition IVUsers.cpp:48

users
iv users
Definition IVUsers.cpp:48

InlinePriorityMode::Size
@ Size
Definition InlineOrder.cpp:25

InstSimplifyFolder.h

getMask
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
Definition InterleavedAccessPass.cpp:586

Intrinsics.h

Ops
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Definition ItaniumDemangle.h:3391

licm
licm
Definition LICM.cpp:383

IR
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81

Loads.h

LoopInfo.h

LoopUtils.h

I
#define I(x, y, z)
Definition MD5.cpp:57

MDBuilder.h

getDebugLoc
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Definition MachineInstrBundle.cpp:104

MemoryLocation.h
This file provides utility analysis objects describing memory locations.

Metadata.h
This file contains the declarations for metadata subclasses.

OpIdx
MachineInstr unsigned OpIdx
Definition NVPTXPrologEpilogPass.cpp:56

Range
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))

P
#define P(N)

PostOrderIterator.h
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.

Merge
R600 Clause Merge
Definition R600ClauseMergePass.cpp:70

Cond
const SmallVectorImpl< MachineOperand > & Cond
Definition RISCVRedundantCopyElimination.cpp:73

Opc
auto Opc
Definition RISCVRedundantCopyElimination.cpp:77

dominates
static bool dominates(InstrPosIndexes &PosIndexes, const MachineInstr &A, const MachineInstr &B)
Definition RegAllocFast.cpp:504

STLExtras.h
This file contains some templates that are useful if you are working with the STL at all.

ScalarEvolutionExpander.h

ScalarEvolutionPatternMatch.h

ScopedNoAliasAA.h
This is the interface for a metadata-based scoped no-alias analysis.

SetOperations.h
This file defines generic set operations that may be used on sets of different types,...

SetVector.h
This file implements a set that has insertion order iteration characteristics.

SmallPtrSet.h
This file defines the SmallPtrSet class.

Y
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")

getType
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39

TypeSize.h

TypeSwitch.h
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...

VPRecipeBuilder.h

VPlanAnalysis.h

VPlanCFG.h

VPlanDominatorTree.h
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.

VPlanHelpers.h
This file contains the declarations of different VPlan-related auxiliary helpers.

VPlanPatternMatch.h

collectComplementaryPredicatedMemOps
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Definition VPlanTransforms.cpp:4908

removeCommonBlendMask
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
Definition VPlanTransforms.cpp:1981

tryToCreateAbstractReductionRecipe
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
Definition VPlanTransforms.cpp:4819

findRecipeWithMinAlign
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
Definition VPlanTransforms.cpp:4966

decideCallWidening
static CallWideningDecision decideCallWidening(VPInstruction &VPI, ArrayRef< VPValue * > Ops, ElementCount VF, VPCostContext &CostCtx)
Pick the cheapest widening for the call VPI at VF among scalarization, vector intrinsic,...
Definition VPlanTransforms.cpp:7040

areVFParamsOk
static bool areVFParamsOk(const VFInfo &Info, ArrayRef< VPValue * > Args, PredicatedScalarEvolution &PSE, const Loop *L)
Returns true if Info's parameter kinds are compatible with Args.
Definition VPlanTransforms.cpp:6978

simplifyLogicalRecipe
static bool simplifyLogicalRecipe(VPSingleDefRecipe *Def, VPBuilder &Builder, bool CanCreateNewRecipe)
Try to simplify logical and bitwise recipes in Def.
Definition VPlanTransforms.cpp:1336

sinkScalarOperands
static bool sinkScalarOperands(VPlan &Plan)
Definition VPlanTransforms.cpp:300

simplifyBranchConditionForVFAndUF
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
Definition VPlanTransforms.cpp:2286

cloneBinOpForScalarIV
static VPValue * cloneBinOpForScalarIV(VPWidenRecipe *BinOp, VPValue *ScalarIV, VPWidenIntOrFpInductionRecipe *WidenIV)
Create a scalar version of BinOp, with its WidenIV operand replaced by ScalarIV, and place it after S...
Definition VPlanTransforms.cpp:6027

getExpressionIV
static VPWidenIntOrFpInductionRecipe * getExpressionIV(VPValue *V)
Check if V is a binary expression of a widened IV and a loop-invariant value.
Definition VPlanTransforms.cpp:6008

removeRedundantInductionCasts
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
Definition VPlanTransforms.cpp:636

isConditionTrueViaVFAndUF
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
Definition VPlanTransforms.cpp:2153

tryToFoldLiveIns
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, LLVMContext &Ctx)
Try to fold R using InstSimplifyFolder.
Definition VPlanTransforms.cpp:1266

tryToReplaceALMWithWideALM
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
Definition VPlanTransforms.cpp:2192

getOpcodeOrIntrinsicID
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
Definition VPlanTransforms.cpp:1243

tryToMatchAndCreateExtendedReduction
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
Definition VPlanTransforms.cpp:4600

m_RemoveMask
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
Definition VPlanTransforms.cpp:2943

isConsecutiveInterleaveGroup
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI)
Returns VF from VFs if IR is a full interleave group with factor and number of members both equal to ...
Definition VPlanTransforms.cpp:5577

getLoadStoreValueType
static Type * getLoadStoreValueType(VPReplicateRecipe *R, bool IsLoad)
Get the value type of the replicate load or store.
Definition VPlanTransforms.cpp:256

getCommonMetadata
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
Definition VPlanTransforms.cpp:4899

getPredicatedMask
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
Definition VPlanTransforms.cpp:394

mergeReplicateRegionsIntoSuccessors
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
Definition VPlanTransforms.cpp:426

findVectorVariant
static Function * findVectorVariant(CallInst *CI, ArrayRef< VPValue * > Args, ElementCount VF, bool MaskRequired, PredicatedScalarEvolution &PSE, const Loop *L)
Find a vector variant of CI for VF, respecting MaskRequired.
Definition VPlanTransforms.cpp:7005

createScalarIVSteps
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
Definition VPlanTransforms.cpp:672

getOptimizableIVOf
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
Definition VPlanTransforms.cpp:961

fixupVFUsersForEVL
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with an EVL-based IV, fix up recipes that use VF to use the EVL instea...
Definition VPlanTransforms.cpp:3151

handleUncountableExitsWithSideEffects
static bool handleUncountableExitsWithSideEffects(VPlan &Plan, SmallVectorImpl< EarlyExitInfo > &Exits, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT, AssumptionCache *AC, VPDominatorTree &VPDT)
Update Plan to mask memory operations in the loop based on whether the early exit is taken or not.
Definition VPlanTransforms.cpp:4213

canNarrowLoad
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx, bool IsScalable)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
Definition VPlanTransforms.cpp:5526

simplifyRecipe
static void simplifyRecipe(VPSingleDefRecipe *Def)
Try to simplify VPSingleDefRecipe Def.
Definition VPlanTransforms.cpp:1464

isDeadRecipe
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
Definition VPlanTransforms.cpp:788

legalizeAndOptimizeInductions
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
Definition VPlanTransforms.cpp:874

addReplicateRegions
static void addReplicateRegions(VPlan &Plan)
Definition VPlanTransforms.cpp:557

collectGroupedReplicateMemOps
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectGroupedReplicateMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L, function_ref< bool(VPReplicateRecipe *)> FilterFn)
Collect either replicated Loads or Stores grouped by their address SCEV and their load-store type,...
Definition VPlanTransforms.cpp:265

tryToComputeEndValueForInduction
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPValue *VectorTC)
Compute the end value for WideIV, unless it is truncated.
Definition VPlanTransforms.cpp:1068

getVPDivRemIntrinsic
static std::optional< Intrinsic::ID > getVPDivRemIntrinsic(Intrinsic::ID IntrID)
Definition VPlanTransforms.cpp:2948

removeRedundantExpandSCEVRecipes
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant ExpandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
Definition VPlanTransforms.cpp:1199

optimizeEarlyExitInductionUser
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
Definition VPlanTransforms.cpp:1018

scalarizeVPWidenPointerInduction
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
Definition VPlanTransforms.cpp:849

narrowInterleaveGroupOp
static VPValue * narrowInterleaveGroupOp(ArrayRef< VPValue * > Members, SmallPtrSetImpl< VPValue * > &NarrowedOps)
Definition VPlanTransforms.cpp:5632

collectUsersRecursively
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
Definition VPlanTransforms.cpp:835

optimizeLatchExitInductionUser
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
Definition VPlanTransforms.cpp:1102

recursivelyDeleteDeadRecipes
static void recursivelyDeleteDeadRecipes(VPValue *V)
Definition VPlanTransforms.cpp:1220

reassociateHeaderMask
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
Definition VPlanTransforms.cpp:1842

addVPLaneMaskPhiAndUpdateExitBranch
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)
Definition VPlanTransforms.cpp:2841

expandVPDerivedIV
static void expandVPDerivedIV(VPDerivedIVRecipe *R)
Expand a VPDerivedIVRecipe into executable recipes.
Definition VPlanTransforms.cpp:3905

getPredicatedThenBlock
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
Definition VPlanTransforms.cpp:404

canHoistOrSinkWithNoAliasCheck
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations using scoped noalias metadata,...
Definition VPlanTransforms.cpp:224

createReplicateRegion
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPRegionBlock *ParentRegion, VPlan &Plan)
Definition VPlanTransforms.cpp:515

simplifyBlends
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
Definition VPlanTransforms.cpp:1998

getUnmaskedDivRemOpcode
static std::optional< Instruction::BinaryOps > getUnmaskedDivRemOpcode(Intrinsic::ID ID)
Definition VPlanTransforms.cpp:1867

isAlreadyNarrow
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
Definition VPlanTransforms.cpp:5621

cannotHoistOrSinkRecipe
static bool cannotHoistOrSinkRecipe(VPRecipeBase &R, VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Return true if we do not know how to (mechanically) hoist or sink a non-memory or memory recipe R out...
Definition VPlanTransforms.cpp:2531

canNarrowOps
static bool canNarrowOps(ArrayRef< VPValue * > Ops, bool IsScalable)
Definition VPlanTransforms.cpp:5542

optimizeVectorInductionWidthForTCAndVFUF
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
Definition VPlanTransforms.cpp:2079

tryToMatchAndCreateMulAccumulateReduction
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
Definition VPlanTransforms.cpp:4652

canSinkStoreWithNoAliasCheck
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L)
Definition VPlanTransforms.cpp:5023

expandVPWidenIntOrFpInduction
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
Definition VPlanTransforms.cpp:3755

optimizeMaskToEVL
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
Definition VPlanTransforms.cpp:2970

expandVPWidenPointerInduction
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
Definition VPlanTransforms.cpp:3864

narrowToSingleScalarRecipes
static void narrowToSingleScalarRecipes(VPlan &Plan)
Definition VPlanTransforms.cpp:1882

VPlanTransforms.h
This file provides utility VPlan to VPlan transformations.

RUN_VPLAN_PASS
#define RUN_VPLAN_PASS(PASS,...)
Definition VPlanTransforms.h:84

VPlanUtils.h

VPlanVerifier.h
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...

VPlan.h
This file contains the declarations of the Vectorization Plan base classes:

VectorUtils.h

Groups
static const X86InstrFMA3Group Groups[]
Definition X86InstrFMA3Info.cpp:81

RHS
Value * RHS
Definition X86PartialReduction.cpp:81

LHS
Value * LHS
Definition X86PartialReduction.cpp:80

Mul
BinaryOperator * Mul
Definition X86PartialReduction.cpp:75

IV
static const uint32_t IV[8]
Definition blake3_impl.h:83

Op1_t

SinkStoreInfo
Helper for extra no-alias checks via known-safe recipe and SCEV.
Definition VPlanTransforms.cpp:156

SinkStoreInfo::SinkStoreInfo
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L)
Definition VPlanTransforms.cpp:201

SinkStoreInfo::shouldSkip
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Definition VPlanTransforms.cpp:210

llvm::APInt
Class for arbitrary precision integers.
Definition APInt.h:78

llvm::APInt::zext
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1055

llvm::APInt::getZExtValue
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563

llvm::APInt::getActiveBits
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535

llvm::APInt::abs
APInt abs() const
Get the absolute value.
Definition APInt.h:1818

llvm::APInt::Rounding::UP
@ UP
Definition APInt.h:91

llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511

llvm::APInt::sext
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:1028

llvm::APInt::isPowerOf2
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441

llvm::APInt::uge
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228

llvm::APSInt
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24

llvm::APSInt::getMinValue
static APSInt getMinValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the minimum integer value with the given bit width and signedness.
Definition APSInt.h:310

llvm::APSInt::getMaxValue
static APSInt getMaxValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the maximum integer value with the given bit width and signedness.
Definition APSInt.h:302

llvm::AliasResult::NoAlias
@ NoAlias
The two locations do not alias at all.
Definition AliasAnalysis.h:98

llvm::ArrayRef
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40

llvm::ArrayRef::back
const T & back() const
Get the last element.
Definition ArrayRef.h:150

llvm::ArrayRef::drop_front
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:194

llvm::ArrayRef::front
const T & front() const
Get the first element.
Definition ArrayRef.h:144

llvm::ArrayRef::end
iterator end() const
Definition ArrayRef.h:130

llvm::ArrayRef::begin
iterator begin() const
Definition ArrayRef.h:129

llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition AssumptionCache.h:44

llvm::BasicBlock
LLVM Basic Block Representation.
Definition BasicBlock.h:62

llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213

llvm::BasicBlock::getTerminator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237

llvm::CallBase::isNoBuiltin
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition InstrTypes.h:1974

llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition Instructions.h:1531

llvm::CmpInst::ICMP_ULT
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765

llvm::CmpInst::ICMP_EQ
@ ICMP_EQ
equal
Definition InstrTypes.h:761

llvm::CmpInst::ICMP_NE
@ ICMP_NE
not equal
Definition InstrTypes.h:762

llvm::CmpInst::ICMP_ULE
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:766

llvm::CmpInst::FCMP_UNO
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750

llvm::CmpInst::getInversePredicate
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:852

llvm::CmpPredicate
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition CmpPredicate.h:23

llvm::ConstantInt::getSigned
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135

llvm::ConstantRange
This class represents a range of values.
Definition ConstantRange.h:48

llvm::ConstantRange::contains
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
Definition ConstantRange.cpp:527

llvm::Constant::getAllOnesValue
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
Definition Constants.cpp:419

llvm::Constant::getNullValue
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition Constants.cpp:363

llvm::DataLayout
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:64

llvm::DataLayout::getIndexType
LLVM_ABI IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
Definition DataLayout.cpp:1039

llvm::DebugLoc
A debug info location.
Definition DebugLoc.h:124

llvm::DebugLoc::getCompilerGenerated
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:152

llvm::DebugLoc::getUnknown
static DebugLoc getUnknown()
Definition DebugLoc.h:151

llvm::DenseMapBase::lookup
ValueT lookup(const_arg_type_t< KeyT > Val) const
Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition DenseMap.h:252

llvm::DenseMapBase::try_emplace
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:301

llvm::DenseMapBase::lookup_or
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:262

llvm::DenseMapBase::values
auto values()
Definition DenseMap.h:161

llvm::DenseMap
Definition DenseMap.h:834

llvm::DominatorTreeBase::dominates
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
Definition GenericDomTree.h:493

llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:155

llvm::ElementCount
Definition TypeSize.h:298

llvm::ElementCount::isVector
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324

llvm::ElementCount::getScalable
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312

llvm::ElementCount::isScalar
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320

llvm::FPMathOperator
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:202

llvm::FPMathOperator::getFastMathFlags
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:291

llvm::FastMathFlags
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23

llvm::Function
Definition Function.h:65

llvm::Function::arg_size
size_t arg_size() const
Definition Function.h:901

llvm::GEPNoWrapFlags
Represents flags for the getelementptr instruction/expression.
Definition GEPNoWrapFlags.h:26

llvm::GEPNoWrapFlags::noUnsignedWrap
static GEPNoWrapFlags noUnsignedWrap()
Definition GEPNoWrapFlags.h:56

llvm::GEPNoWrapFlags::withoutNoUnsignedWrap
GEPNoWrapFlags withoutNoUnsignedWrap() const
Definition GEPNoWrapFlags.h:73

llvm::GEPNoWrapFlags::none
static GEPNoWrapFlags none()
Definition GEPNoWrapFlags.h:46

llvm::GetElementPtrInst
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition Instructions.h:968

llvm::InductionDescriptor
A struct for saving information about induction variables.
Definition IVDescriptors.h:375

llvm::InductionDescriptor::getCanonicalIntInduction
static LLVM_ABI InductionDescriptor getCanonicalIntInduction(Type *Ty, ScalarEvolution &SE)
Returns the canonical integer induction for type Ty with start = 0 and step = 1.
Definition IVDescriptors.cpp:1420

llvm::InductionDescriptor::InductionKind
InductionKind
This enum represents the kinds of inductions that we support.
Definition IVDescriptors.h:378

llvm::InductionDescriptor::IK_NoInduction
@ IK_NoInduction
Not an induction variable.
Definition IVDescriptors.h:379

llvm::InductionDescriptor::IK_FpInduction
@ IK_FpInduction
Floating point induction variable.
Definition IVDescriptors.h:382

llvm::InductionDescriptor::IK_PtrInduction
@ IK_PtrInduction
Pointer induction var. Step = C.
Definition IVDescriptors.h:381

llvm::InductionDescriptor::IK_IntInduction
@ IK_IntInduction
Integer induction variable. Step = C.
Definition IVDescriptors.h:380

llvm::Init
Definition Record.h:286

llvm::InstSimplifyFolder
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
Definition InstSimplifyFolder.h:36

llvm::InstructionCost
Definition InstructionCost.h:30

llvm::InstructionCost::getInvalid
static InstructionCost getInvalid(CostType Val=0)
Definition InstructionCost.h:82

llvm::InstructionCost::isValid
bool isValid() const
Definition InstructionCost.h:88

llvm::Instruction
Definition Instruction.h:70

llvm::Instruction::isCast
bool isCast() const
Definition Instruction.h:353

llvm::Instruction::getModule
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
Definition Instruction.cpp:86

llvm::Instruction::isBinaryOp
bool isBinaryOp() const
Definition Instruction.h:349

llvm::Instruction::BinaryOps
BinaryOps
Definition Instruction.h:1056

llvm::Instruction::getDataLayout
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition Instruction.cpp:94

llvm::Instruction::isIntDivRem
bool isIntDivRem() const
Definition Instruction.h:350

llvm::Instruction::CastOps
CastOps
Definition Instruction.h:1070

llvm::IntegerType::get
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:350

llvm::InterleaveGroup
The group of interleaved loads/stores sharing the same stride and close to each other.
Definition VectorUtils.h:515

llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68

llvm::LoadInst
An instruction for reading from memory.
Definition Instructions.h:181

llvm::LoopVectorizationPlanner::getDecisionAndClampRange
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1666

llvm::Loop
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40

llvm::MDBuilder
Definition MDBuilder.h:37

llvm::MDBuilder::createBranchWeights
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38

llvm::MDNode
Metadata node.
Definition Metadata.h:1075

llvm::MapVector
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:38

llvm::MapVector::lookup
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:110

llvm::MapVector::try_emplace
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:118

llvm::MapVector::empty
bool empty() const
Definition MapVector.h:79

llvm::MemoryLocation
Representation for a specific memory location.
Definition MemoryLocation.h:217

llvm::MemoryLocation::AATags
AAMDNodes AATags
The metadata nodes which describe the aliasing of the location (each member is null if that kind of ...
Definition MemoryLocation.h:238

llvm::Module::getFunction
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition Module.cpp:235

llvm::Operator::getOpcode
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43

llvm::PostOrderTraversal
Post-order traversal of a graph.
Definition PostOrderIterator.h:205

llvm::PredicatedScalarEvolution
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
Definition ScalarEvolution.h:2621

llvm::PredicatedScalarEvolution::getSE
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
Definition ScalarEvolution.h:2671

llvm::PredicatedScalarEvolution::getSCEV
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
Definition ScalarEvolution.cpp:15524

llvm::RecurrenceDescriptor::getOpcode
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Definition IVDescriptors.cpp:1228

llvm::RecurrenceDescriptor::getOpcode
unsigned getOpcode() const
Definition IVDescriptors.h:230

llvm::RecurrenceDescriptor::isFindLastRecurrenceKind
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
Definition IVDescriptors.h:294

llvm::RegionBase::getParent
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362

llvm::Region
Definition RegionInfo.h:887

llvm::ReversePostOrderTraversal
Definition PostOrderIterator.h:290

llvm::SCEVConstant
This class represents a constant integer value.
Definition ScalarEvolutionExpressions.h:62

llvm::SCEVConstant::getValue
ConstantInt * getValue() const
Definition ScalarEvolutionExpressions.h:71

llvm::SCEVExpander
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Definition ScalarEvolutionExpander.h:64

llvm::SCEVExpander::expandCodeFor
LLVM_ABI Value * expandCodeFor(SCEVUse SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
Definition ScalarEvolutionExpander.cpp:1561

llvm::SCEVParameterRewriter::rewrite
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
Definition ScalarEvolutionExpressions.h:1000

llvm::SCEV
This class represents an analyzed expression in the program.
Definition ScalarEvolution.h:254

llvm::SCEV::getType
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Definition ScalarEvolution.cpp:463

llvm::ScalarEvolution
The main scalar evolution driver.
Definition ScalarEvolution.h:621

llvm::ScalarEvolution::getUDivExpr
LLVM_ABI const SCEV * getUDivExpr(SCEVUse LHS, SCEVUse RHS)
Get a canonical unsigned division expression, or something simpler if possible.
Definition ScalarEvolution.cpp:3563

llvm::ScalarEvolution::getDataLayout
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
Definition ScalarEvolution.h:1515

llvm::ScalarEvolution::getNegativeSCEV
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
Definition ScalarEvolution.cpp:4762

llvm::ScalarEvolution::isKnownNonZero
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
Definition ScalarEvolution.cpp:11289

llvm::ScalarEvolution::getConstant
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
Definition ScalarEvolution.cpp:552

llvm::ScalarEvolution::getSCEV
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
Definition ScalarEvolution.cpp:4740

llvm::ScalarEvolution::getMinusSCEV
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
Definition ScalarEvolution.cpp:4842

llvm::ScalarEvolution::getSignedRange
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
Definition ScalarEvolution.h:1214

llvm::ScalarEvolution::isKnownPositive
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
Definition ScalarEvolution.cpp:11277

llvm::ScalarEvolution::getElementCount
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Definition ScalarEvolution.cpp:590

llvm::ScalarEvolution::getUnsignedRange
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
Definition ScalarEvolution.h:1198

llvm::ScalarEvolution::getMulExpr
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
Definition ScalarEvolution.cpp:3231

llvm::ScalarEvolution::isKnownPredicate
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
Definition ScalarEvolution.cpp:11434

llvm::ScopedNoAliasAAResult::alias
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
Definition ScopedNoAliasAA.cpp:55

llvm::SetVector
A vector that has set insertion semantics.
Definition SetVector.h:57

llvm::SetVector::size
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103

llvm::SetVector::insert
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151

llvm::SmallDenseMap
Definition DenseMap.h:977

llvm::SmallPtrSetImplBase::size
size_type size() const
Definition SmallPtrSet.h:99

llvm::SmallPtrSetImplBase::empty
bool empty() const
Definition SmallPtrSet.h:98

llvm::SmallPtrSetImpl
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition SmallPtrSet.h:366

llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition SmallPtrSet.h:387

llvm::SmallPtrSetImpl::begin
iterator begin() const
Definition SmallPtrSet.h:484

llvm::SmallPtrSetImpl::contains
bool contains(ConstPtrType Ptr) const
Definition SmallPtrSet.h:467

llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
Definition SmallPtrSet.h:533

llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition SmallVector.h:581

llvm::SmallVectorImpl::pop_back_val
T pop_back_val()
Definition SmallVector.h:681

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition SmallVector.h:423

llvm::SmallVectorTemplateCommon::end
iterator end()
Definition SmallVector.h:278

llvm::SmallVectorTemplateCommon::size
size_t size() const
Definition SmallVector.h:83

llvm::SmallVectorTemplateCommon::front
reference front()
Definition SmallVector.h:308

llvm::SmallVectorTemplateCommon::begin
iterator begin()
Definition SmallVector.h:276

llvm::SmallVectorTemplateCommon::back
reference back()
Definition SmallVector.h:317

llvm::SmallVectorTemplateCommon::empty
bool empty() const
Definition SmallVector.h:86

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition SmallVector.h:1225

llvm::StoreInst
An instruction for storing to memory.
Definition Instructions.h:297

llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition TargetLibraryInfo.h:266

llvm::TargetTransformInfo
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Definition TargetTransformInfo.h:268

llvm::TargetTransformInfo::getPartialReductionExtendKind
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
Definition TargetTransformInfo.cpp:1053

llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition TargetTransformInfo.h:331

llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition TargetTransformInfo.h:332

llvm::TargetTransformInfo::RGK_FixedWidthVector
@ RGK_FixedWidthVector
Definition TargetTransformInfo.h:1348

llvm::TargetTransformInfo::RGK_ScalableVector
@ RGK_ScalableVector
Definition TargetTransformInfo.h:1348

llvm::TargetTransformInfo::PartialReductionExtendKind
PartialReductionExtendKind
Definition TargetTransformInfo.h:270

llvm::TargetTransformInfo::getPartialReductionCost
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
Definition TargetTransformInfo.cpp:915

llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition TargetTransformInfo.h:1245

llvm::Twine
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82

llvm::TypeSize
Definition TypeSize.h:332

llvm::TypeSwitch
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89

llvm::TypeSwitch::Case
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98

llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46

llvm::Type::getInt32Ty
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309

llvm::Type::isPointerTy
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282

llvm::Type::getInt8Ty
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307

llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368

llvm::Type::isStructTy
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:276

llvm::Type::getPrimitiveSizeInBits
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197

llvm::Type::getScalarSizeInBits
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232

llvm::Type::getInt1Ty
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:306

llvm::Type::isFloatingPointTy
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186

llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257

llvm::User
Definition User.h:44

llvm::User::operands
op_range operands()
Definition User.h:267

llvm::VFDatabase::getMappings
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:76

llvm::VPActiveLaneMaskPHIRecipe
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:4044

llvm::VPBasicBlock
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4399

llvm::VPBasicBlock::appendRecipe
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4474

llvm::VPBasicBlock::iterator
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4426

llvm::VPBasicBlock::end
iterator end()
Definition VPlan.h:4436

llvm::VPBasicBlock::begin
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4434

llvm::VPBasicBlock::phis
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4487

llvm::VPBasicBlock::getFirstNonPhi
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:266

llvm::VPBasicBlock::splitAt
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:560

llvm::VPBasicBlock::front
const VPRecipeBase & front() const
Definition VPlan.h:4446

llvm::VPBasicBlock::getTerminator
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:639

llvm::VPBasicBlock::back
const VPRecipeBase & back() const
Definition VPlan.h:4448

llvm::VPBlendRecipe
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2963

llvm::VPBlendRecipe::getMask
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:3013

llvm::VPBlendRecipe::getNumIncomingValues
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:3003

llvm::VPBlendRecipe::setMask
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:3019

llvm::VPBlendRecipe::isNormalized
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2999

llvm::VPBlockBase
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:94

llvm::VPBlockBase::setSuccessors
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccs as successor of this VPBlockBase.
Definition VPlan.h:315

llvm::VPBlockBase::getParent
VPRegionBlock * getParent()
Definition VPlan.h:186

llvm::VPBlockBase::getExitingBasicBlock
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:236

llvm::VPBlockBase::getNumSuccessors
size_t getNumSuccessors() const
Definition VPlan.h:237

llvm::VPBlockBase::setPredecessors
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:306

llvm::VPBlockBase::getPredecessors
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:222

llvm::VPBlockBase::getPlan
VPlan * getPlan()
Definition VPlan.cpp:211

llvm::VPBlockBase::getName
const std::string & getName() const
Definition VPlan.h:177

llvm::VPBlockBase::clearSuccessors
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:325

llvm::VPBlockBase::getSinglePredecessor
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:233

llvm::VPBlockBase::clearPredecessors
void clearPredecessors()
Remove all the predecessor of this block.
Definition VPlan.h:322

llvm::VPBlockBase::getEntryBasicBlock
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:216

llvm::VPBlockBase::getSingleHierarchicalPredecessor
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:279

llvm::VPBlockBase::getSingleSuccessor
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:227

llvm::VPBlockBase::getSuccessors
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:211

llvm::VPBlockUtils::blocksAs
static auto blocksAs(T &&Range)
Return an iterator range over Range with each block cast to BlockTy.
Definition VPlanUtils.h:342

llvm::VPBlockUtils::insertOnEdge
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:361

llvm::VPBlockUtils::isLatch
static bool isLatch(const VPBlockBase *VPB, const VPDominatorTree &VPDT)
Returns true if VPB is a loop latch, using isHeader().
Definition VPlanUtils.cpp:739

llvm::VPBlockUtils::insertTwoBlocksAfter
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:251

llvm::VPBlockUtils::connectBlocks
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:269

llvm::VPBlockUtils::disconnectBlocks
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:287

llvm::VPBlockUtils::blocksOnly
static auto blocksOnly(T &&Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:323

llvm::VPBlockUtils::transferSuccessors
static void transferSuccessors(VPBlockBase *Old, VPBlockBase *New)
Transfer successors from Old to New. New must have no successors.
Definition VPlanUtils.h:307

llvm::VPBlockUtils::blocksInSingleSuccessorChainBetween
static SmallVector< VPBasicBlock * > blocksInSingleSuccessorChainBetween(VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Returns the blocks between FirstBB and LastBB, where FirstBB to LastBB forms a single-successor chain.
Definition VPlanUtils.cpp:695

llvm::VPBranchOnMaskRecipe
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3495

llvm::VPBuilder::InsertPointGuard
RAII object that stores the current insertion point and restores it when the object is destroyed.
Definition LoopVectorizationPlanner.h:506

llvm::VPBuilder
VPlan-based builder utility analogous to IRBuilder.
Definition LoopVectorizationPlanner.h:99

llvm::VPBuilder::createFirstActiveLane
VPInstruction * createFirstActiveLane(ArrayRef< VPValue * > Masks, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Definition LoopVectorizationPlanner.h:238

llvm::VPBuilder::createAdd
VPInstruction * createAdd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false})
Definition LoopVectorizationPlanner.h:291

llvm::VPBuilder::createOr
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Definition LoopVectorizationPlanner.h:281

llvm::VPBuilder::createScalarZExtOrTrunc
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
Definition LoopVectorizationPlanner.h:440

llvm::VPBuilder::createLogicalOr
VPInstruction * createLogicalOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Definition LoopVectorizationPlanner.h:312

llvm::VPBuilder::createNot
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Definition LoopVectorizationPlanner.h:268

llvm::VPBuilder::createAnyOfReduction
VPInstruction * createAnyOfReduction(VPValue *ChainOp, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown())
Create an AnyOf reduction pattern: or-reduce ChainOp, freeze the result, then select between TrueVal ...
Definition VPlan.cpp:1653

llvm::VPBuilder::createLogicalAnd
VPInstruction * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Definition LoopVectorizationPlanner.h:306

llvm::VPBuilder::createDerivedIV
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step)
Convert the input value Current to the corresponding value of an induction with Start and Step values...
Definition LoopVectorizationPlanner.h:410

llvm::VPBuilder::createScalarCast
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
Definition LoopVectorizationPlanner.h:424

llvm::VPBuilder::createWidenPhi
VPWidenPHIRecipe * createWidenPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Definition LoopVectorizationPlanner.h:388

llvm::VPBuilder::getToInsertAfter
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
Definition LoopVectorizationPlanner.h:142

llvm::VPBuilder::createWidenCast
VPWidenCastRecipe * createWidenCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
Definition LoopVectorizationPlanner.h:467

llvm::VPBuilder::createICmp
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
Definition LoopVectorizationPlanner.h:329

llvm::VPBuilder::createScalarPhi
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={}, Type *ResultTy=nullptr)
Definition LoopVectorizationPlanner.h:380

llvm::VPBuilder::createSelect
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
Definition LoopVectorizationPlanner.h:318

llvm::VPBuilder::createNaryOp
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", Type *ResultTy=nullptr)
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Definition LoopVectorizationPlanner.h:206

llvm::VPBuilder::setInsertPoint
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
Definition LoopVectorizationPlanner.h:178

llvm::VPCurrentIterationPHIRecipe
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:4076

llvm::VPDef::getNumDefinedValues
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:561

llvm::VPDef::getVPSingleValue
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:534

llvm::VPDef::getVPValue
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:546

llvm::VPDef::definedValues
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:556

llvm::VPDerivedIVRecipe
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:4177

llvm::VPDominatorTree
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
Definition VPlanDominatorTree.h:38

llvm::VPDominatorTree::properlyDominates
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
Definition VPlanAnalysis.cpp:61

llvm::VPExpressionRecipe
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3540

llvm::VPHeaderPHIRecipe
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2437

llvm::VPHeaderPHIRecipe::getBackedgeValue
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2484

llvm::VPHeaderPHIRecipe::getStartValue
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2473

llvm::VPHistogramRecipe
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition VPlan.h:2156

llvm::VPIRBasicBlock
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4552

llvm::VPIRFlags
Class to record and manage LLVM IR flags.
Definition VPlan.h:694

llvm::VPIRFlags::getDefaultFlags
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
Definition VPlanRecipes.cpp:2374

llvm::VPIRFlags::getFastMathFlags
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
Definition VPlanRecipes.cpp:390

llvm::VPIRFlags::dropPoisonGeneratingFlags
void dropPoisonGeneratingFlags()
Drop all poison-generating flags.
Definition VPlan.h:891

llvm::VPIRInstruction::create
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode, otherwise create a VPIRInstruction.
Definition VPlanRecipes.cpp:1885

llvm::VPIRMetadata
Helper to manage IR metadata for recipes.
Definition VPlan.h:1170

llvm::VPIRMetadata::intersect
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
Definition VPlanRecipes.cpp:1997

llvm::VPInstruction
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1225

llvm::VPInstruction::getNumOperandsWithoutMask
unsigned getNumOperandsWithoutMask() const
Returns the number of operands, excluding the mask if the VPInstruction is masked.
Definition VPlan.h:1472

llvm::VPInstruction::BranchOnCond
@ BranchOnCond
Definition VPlan.h:1248

llvm::VPInstruction::PtrAdd
@ PtrAdd
Definition VPlan.h:1286

llvm::VPInstruction::ExtractLane
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1318

llvm::VPInstruction::Broadcast
@ Broadcast
Definition VPlan.h:1256

llvm::VPInstruction::LastActiveLane
@ LastActiveLane
Definition VPlan.h:1308

llvm::VPInstruction::BranchOnTwoConds
@ BranchOnTwoConds
Definition VPlan.h:1255

llvm::VPInstruction::ExtractLastPart
@ ExtractLastPart
Definition VPlan.h:1273

llvm::VPInstruction::ExtractPenultimateElement
@ ExtractPenultimateElement
Definition VPlan.h:1279

llvm::VPInstruction::Unpack
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1268

llvm::VPInstruction::ActiveLaneMask
@ ActiveLaneMask
Definition VPlan.h:1237

llvm::VPInstruction::ExplicitVectorLength
@ ExplicitVectorLength
Definition VPlan.h:1238

llvm::VPInstruction::ReductionStartVector
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1314

llvm::VPInstruction::BuildVector
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1263

llvm::VPInstruction::OpsEnd
@ OpsEnd
Definition VPlan.h:1349

llvm::VPInstruction::WidePtrAdd
@ WidePtrAdd
Definition VPlan.h:1289

llvm::VPInstruction::IncomingAliasMask
@ IncomingAliasMask
Definition VPlan.h:1241

llvm::VPInstruction::LogicalAnd
@ LogicalAnd
Definition VPlan.h:1280

llvm::VPInstruction::BuildStructVector
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed-width vectors each containing a ...
Definition VPlan.h:1260

llvm::VPInstruction::CanonicalIVIncrementForPart
@ CanonicalIVIncrementForPart
Definition VPlan.h:1244

llvm::VPInstruction::ComputeReductionResult
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1271

llvm::VPInstruction::Not
@ Not
Definition VPlan.h:1232

llvm::VPInstruction::StepVector
@ StepVector
Definition VPlan.h:1345

llvm::VPInstruction::MaskedCond
@ MaskedCond
Definition VPlan.h:1332

llvm::VPInstruction::ExtractLastLane
@ ExtractLastLane
Definition VPlan.h:1275

llvm::VPInstruction::AnyOf
@ AnyOf
Definition VPlan.h:1295

llvm::VPInstruction::NumActiveLanes
@ NumActiveLanes
Definition VPlan.h:1282

llvm::VPInterleaveBase::getInterleaveGroup
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:3115

llvm::VPInterleaveBase::getMask
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3107

llvm::VPInterleaveBase::getStoredValues
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:3136

llvm::VPInterleaveEVLRecipe
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:3188

llvm::VPInterleaveRecipe
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:3146

llvm::VPPhiAccessors::addIncoming
void addIncoming(VPValue *IncomingV)
Append IncomingV as an incoming value to the phi-like recipe.
Definition VPlan.h:1658

llvm::VPPredInstPHIRecipe
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3704

llvm::VPRecipeBase
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:402

llvm::VPRecipeBase::getParent
VPBasicBlock * getParent()
Definition VPlan.h:477

llvm::VPRecipeBase::getDebugLoc
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:555

llvm::VPRecipeBase::moveBefore
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
Definition VPlanRecipes.cpp:290

llvm::VPRecipeBase::insertBefore
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
Definition VPlanRecipes.cpp:253

llvm::VPRecipeBase::insertAfter
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
Definition VPlanRecipes.cpp:267

llvm::VPRecipeBase::eraseFromParent
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition VPlanRecipes.cpp:280

llvm::VPRecipeBuilder
Helper class to create VPRecipes from IR instructions.
Definition VPRecipeBuilder.h:24

llvm::VPRecipeBuilder::widenIfHistogram
VPHistogramRecipe * widenIfHistogram(VPInstruction *VPI)
If VPI represents a histogram operation (as determined by LoopVectorizationLegality) make that safe f...
Definition LoopVectorize.cpp:6315

llvm::VPRecipeBuilder::tryToWidenMemory
VPRecipeBase * tryToWidenMemory(VPInstruction *VPI, VFRange &Range)
Check if the load or store instruction VPI should be widened for Range.Start and potentially masked.
Definition LoopVectorize.cpp:6135

llvm::VPRecipeBuilder::replaceWithFinalIfReductionStore
bool replaceWithFinalIfReductionStore(VPInstruction *VPI, VPBuilder &FinalRedStoresBuilder)
If VPI is a store of a reduction into an invariant address, delete it.
Definition LoopVectorize.cpp:6345

llvm::VPRecipeBuilder::handleReplication
VPReplicateRecipe * handleReplication(VPInstruction *VPI, VFRange &Range)
Build a VPReplicateRecipe for VPI.
Definition LoopVectorize.cpp:6374

llvm::VPRecipeValue::getScalarType
Type * getScalarType() const
Returns the scalar type of this VPRecipeValue.
Definition VPlanValue.h:337

llvm::VPReductionEVLRecipe
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3359

llvm::VPReductionPHIRecipe
A recipe for handling reduction phis.
Definition VPlan.h:2865

llvm::VPReductionPHIRecipe::setVFScaleFactor
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2916

llvm::VPReductionPHIRecipe::getVFScaleFactor
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2909

llvm::VPReductionPHIRecipe::getRecurrenceKind
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2927

llvm::VPReductionRecipe
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3239

llvm::VPRegionBlock
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4609

llvm::VPRegionBlock::getEntry
const VPBlockBase * getEntry() const
Definition VPlan.h:4653

llvm::VPRegionBlock::isReplicator
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4685

llvm::VPRegionBlock::getOrCreateCanonicalIVIncrement
VPInstruction * getOrCreateCanonicalIVIncrement()
Get the canonical IV increment instruction if it exists.
Definition VPlan.cpp:857

llvm::VPRegionBlock::setExiting
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4670

llvm::VPRegionBlock::getCanonicalIVType
Type * getCanonicalIVType() const
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4729

llvm::VPRegionBlock::clearCanonicalIVNUW
void clearCanonicalIVNUW(VPInstruction *Increment)
Unsets NUW for the canonical IV increment Increment, for loop regions.
Definition VPlan.h:4737

llvm::VPRegionBlock::getCanonicalIV
VPRegionValue * getCanonicalIV()
Return the canonical induction variable of the region, null for replicating regions.
Definition VPlan.h:4721

llvm::VPRegionBlock::getExiting
const VPBlockBase * getExiting() const
Definition VPlan.h:4665

llvm::VPRegionBlock::getPreheaderVPBB
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4678

llvm::VPReplicateRecipe
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3404

llvm::VPReplicateRecipe::isSingleScalar
bool isSingleScalar() const
Definition VPlan.h:3460

llvm::VPReplicateRecipe::computeCallCost
static InstructionCost computeCallCost(Function *CalledFn, Type *ResultTy, ArrayRef< const VPValue * > ArgOps, bool IsSingleScalar, ElementCount VF, VPCostContext &Ctx)
Return the cost of scalarizing a call to CalledFn with argument operands ArgOps for a given VF.
Definition VPlanRecipes.cpp:3879

llvm::VPReplicateRecipe::isPredicated
bool isPredicated() const
Definition VPlan.h:3462

llvm::VPReplicateRecipe::getMask
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3479

llvm::VPSCEVExpander
Lightweight SCEV-to-VPlan expander.
Definition VPlanUtils.h:189

llvm::VPSCEVExpander::tryToExpand
VPValue * tryToExpand(const SCEV *S)
Try to expand S into recipes and live-ins using the builder.
Definition VPlanUtils.cpp:929

llvm::VPScalarIVStepsRecipe
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:4244

llvm::VPSingleDefRecipe
VPSingleDefRecipe is a base class for recipes that model a sequence of one or more output IR that def...
Definition VPlan.h:608

llvm::VPSingleDefRecipe::getUnderlyingInstr
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:679

llvm::VPSingleDefRecipe::clone
VPSingleDefRecipe * clone() override=0
Clone the current recipe.

llvm::VPUser
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:384

llvm::VPUser::operands
operand_range operands()
Definition VPlanValue.h:457

llvm::VPUser::setOperand
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:430

llvm::VPUser::getNumOperands
unsigned getNumOperands() const
Definition VPlanValue.h:424

llvm::VPUser::getOperand
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:425

llvm::VPValue
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:50

llvm::VPValue::getScalarType
Type * getScalarType() const
Returns the scalar type of this VPValue, dispatching based on the concrete subclass.
Definition VPlan.cpp:149

llvm::VPValue::getLiveInIRValue
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:143

llvm::VPValue::isDefinedOutsideLoopRegions
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1478

llvm::VPValue::getDefiningRecipe
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:130

llvm::VPValue::hasMoreThanOneUniqueUser
bool hasMoreThanOneUniqueUser() const
Returns true if the value has more than one unique user.
Definition VPlanValue.h:163

llvm::VPValue::getUnderlyingValue
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:75

llvm::VPValue::setUnderlyingValue
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:208

llvm::VPValue::getSingleUser
VPUser * getSingleUser()
Return the single user of this value, or nullptr if there is not exactly one user.
Definition VPlanValue.h:178

llvm::VPValue::replaceAllUsesWith
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1481

llvm::VPValue::getNumUsers
unsigned getNumUsers() const
Definition VPlanValue.h:115

llvm::VPValue::replaceUsesWithIf
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1487

llvm::VPValue::users
user_range users()
Definition VPlanValue.h:157

llvm::VPVectorEndPointerRecipe
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2267

llvm::VPWidenCallRecipe
A recipe for widening Call instructions using library calls.
Definition VPlan.h:2090

llvm::VPWidenCallRecipe::computeCallCost
static InstructionCost computeCallCost(Function *Variant, VPCostContext &Ctx)
Return the cost of widening a call using the vector function Variant.
Definition VPlanRecipes.cpp:2069

llvm::VPWidenCastRecipe
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1871

llvm::VPWidenCastRecipe::getOpcode
Instruction::CastOps getOpcode() const
Definition VPlan.h:1907

llvm::VPWidenGEPRecipe
A recipe for handling GEP instructions.
Definition VPlan.h:2199

llvm::VPWidenInductionRecipe
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2517

llvm::VPWidenInductionRecipe::getVFValue
VPValue * getVFValue()
Definition VPlan.h:2574

llvm::VPWidenInductionRecipe::getStartValue
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2565

llvm::VPWidenInductionRecipe::getPHINode
PHINode * getPHINode() const
Returns the underlying PHINode if one exists, or null otherwise.
Definition VPlan.h:2583

llvm::VPWidenInductionRecipe::getStepValue
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2568

llvm::VPWidenInductionRecipe::getInductionDescriptor
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2588

llvm::VPWidenIntOrFpInductionRecipe
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2624

llvm::VPWidenIntOrFpInductionRecipe::getStartValue
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2671

llvm::VPWidenIntOrFpInductionRecipe::getSplatVFValue
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2675

llvm::VPWidenIntOrFpInductionRecipe::getTruncInst
TruncInst * getTruncInst()
Returns the first defined value as TruncInst, if it is one or nullptr otherwise.
Definition VPlan.h:2686

llvm::VPWidenIntOrFpInductionRecipe::getLastUnrolledPartOperand
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2697

llvm::VPWidenIntrinsicRecipe
A recipe for widening vector intrinsics.
Definition VPlan.h:1918

llvm::VPWidenIntrinsicRecipe::computeCallCost
static InstructionCost computeCallCost(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost of a vector intrinsic with ID and Operands.
Definition VPlanRecipes.cpp:2173

llvm::VPWidenMemIntrinsicRecipe::computeMemIntrinsicCost
static InstructionCost computeMemIntrinsicCost(Intrinsic::ID IID, Type *Ty, bool IsMasked, Align Alignment, VPCostContext &Ctx)
Helper function for computing the cost of vector memory intrinsic.
Definition VPlanRecipes.cpp:2264

llvm::VPWidenMemoryRecipe
A common mixin class for widening memory operations.
Definition VPlan.h:3740

llvm::VPWidenMemoryRecipe::getAsRecipe
virtual VPRecipeBase * getAsRecipe()=0
Return a VPRecipeBase* to the current object.

llvm::VPWidenPHIRecipe
A recipe for widened phis.
Definition VPlan.h:2755

llvm::VPWidenPointerInductionRecipe
Definition VPlan.h:2709

llvm::VPWidenRecipe
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1810

llvm::VPWidenRecipe::computeCost
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
Definition VPlanRecipes.cpp:2733

llvm::VPWidenRecipe::clone
VPWidenRecipe * clone() override
Clone the current recipe.
Definition VPlan.h:1831

llvm::VPWidenRecipe::getOpcode
unsigned getOpcode() const
Definition VPlan.h:1850

llvm::VPlan
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4757

llvm::VPlan::getLiveIn
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:5082

llvm::VPlan::hasVF
bool hasVF(ElementCount VF) const
Definition VPlan.h:4980

llvm::VPlan::getDataLayout
const DataLayout & getDataLayout() const
Definition VPlan.h:4962

llvm::VPlan::getContext
LLVMContext & getContext() const
Definition VPlan.h:4958

llvm::VPlan::getEntry
VPBasicBlock * getEntry()
Definition VPlan.h:4853

llvm::VPlan::hasScalableVF
bool hasScalableVF() const
Definition VPlan.h:4981

llvm::VPlan::getTripCount
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4916

llvm::VPlan::getOrCreateBackedgeTakenCount
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4937

llvm::VPlan::vectorFactors
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4987

llvm::VPlan::getFalse
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:5053

llvm::VPlan::getVFxUF
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4956

llvm::VPlan::getAllOnesValue
VPIRValue * getAllOnesValue(Type *Ty)
Return a VPIRValue wrapping the AllOnes value of type Ty.
Definition VPlan.h:5059

llvm::VPlan::createReplicateRegion
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:5131

llvm::VPlan::getLiveIns
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:5085

llvm::VPlan::hasUF
bool hasUF(unsigned UF) const
Definition VPlan.h:5005

llvm::VPlan::getExitBlocks
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4906

llvm::VPlan::getVectorTripCount
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4946

llvm::VPlan::getBackedgeTakenCount
VPValue * getBackedgeTakenCount() const
Definition VPlan.h:4943

llvm::VPlan::getOrAddLiveIn
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:5030

llvm::VPlan::getZero
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:5056

llvm::VPlan::setVF
void setVF(ElementCount VF)
Definition VPlan.h:4968

llvm::VPlan::isUnrolled
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:5021

llvm::VPlan::getVectorLoopRegion
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1068

llvm::VPlan::getConcreteUF
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:5008

llvm::VPlan::resetTripCount
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4930

llvm::VPlan::getMiddleBlock
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4882

llvm::VPlan::createVPBasicBlock
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:5108

llvm::VPlan::getTrue
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:5050

llvm::VPlan::getVectorPreheader
VPBasicBlock * getVectorPreheader() const
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4858

llvm::VPlan::getUF
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4953

llvm::VPlan::hasScalarVFOnly
bool hasScalarVFOnly() const
Definition VPlan.h:4998

llvm::VPlan::getScalarPreheader
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4896

llvm::VPlan::getVF
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4949

llvm::VPlan::setUF
void setUF(unsigned UF)
Definition VPlan.h:5013

llvm::VPlan::hasScalarTail
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop, i.e.
Definition VPlan.h:5163

llvm::VPlan::duplicate
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1224

llvm::VPlan::getConstantInt
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:5064

llvm::Value
LLVM Value Representation.
Definition Value.h:75

llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255

llvm::Value::users
iterator_range< user_iterator > users()
Definition Value.h:426

llvm::Value::hasName
bool hasName() const
Definition Value.h:261

llvm::Value::getName
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318

llvm::VectorType::get
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.

llvm::details::FixedOrScalableQuantity::hasKnownScalarFactor
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269

llvm::details::FixedOrScalableQuantity::getFixedValue
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200

llvm::details::FixedOrScalableQuantity::getKnownScalarFactor
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277

llvm::details::FixedOrScalableQuantity< ElementCount, unsigned >::isKnownLT
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216

llvm::details::FixedOrScalableQuantity::isScalable
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168

llvm::details::FixedOrScalableQuantity::multiplyCoefficientBy
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256

llvm::details::FixedOrScalableQuantity::isFixed
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171

llvm::details::FixedOrScalableQuantity::getKnownMinValue
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165

llvm::function_ref
An efficient, type-erasing, non-owning reference to a callable.
Definition STLFunctionalExtras.h:37

llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition ilist_node.h:123

uint64_t

unsigned

Changed
Changed
Definition ObjCARCOpts.cpp:2366

llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition ErrorHandling.h:164

llvm::APIntOps::RoundingUDiv
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2815

llvm::CallingConv::ID
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24

llvm::CallingConv::C
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34

llvm::Check
Definition FileCheck.h:50

llvm::Intrinsic::not_intrinsic
@ not_intrinsic
Definition Intrinsics.h:49

llvm::Intrinsic::ID
unsigned ID
Definition GenericSSAContext.h:28

llvm::Loc
Definition DwarfDebug.h:130

llvm::Loc::Variant
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition DwarfDebug.h:190

llvm::M68k::MemAddrModeKind::U
@ U
Definition M68kBaseInfo.h:60

llvm::M68k::MemAddrModeKind::L
@ L
Definition M68kBaseInfo.h:69

llvm::MIPatternMatch::m_ZeroInt
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
Definition MIPatternMatch.h:278

llvm::MIPatternMatch::m_Not
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
Definition MIPatternMatch.h:943

llvm::MIPatternMatch::m_OneUse
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
Definition MIPatternMatch.h:56

llvm::NVPTX::Const
@ Const
Definition NVPTX.h:206

llvm::PatternMatchHelpers::m_Isa
match_isa< To... > m_Isa()
Definition PatternMatchHelpers.h:73

llvm::PatternMatchHelpers::m_CombineOr
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
Definition PatternMatchHelpers.h:56

llvm::PatternMatch::m_AllOnes
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
Definition PatternMatch.h:492

llvm::PatternMatch::m_Cmp
auto m_Cmp()
Matches any compare instruction and ignores it.
Definition PatternMatch.h:144

llvm::PatternMatch::m_Add
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
Definition PatternMatch.h:1154

llvm::PatternMatch::m_MaskedStore
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
Definition PatternMatch.h:2871

llvm::PatternMatch::m_TruncOrSelf
match_combine_or< CastInst_match< OpTy, TruncInst >, OpTy > m_TruncOrSelf(const OpTy &Op)
Definition PatternMatch.h:2215

llvm::PatternMatch::m_APInt
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition PatternMatch.h:261

llvm::PatternMatch::m_Trunc
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
Definition PatternMatch.h:2195

llvm::PatternMatch::m_LogicalAnd
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
Definition PatternMatch.h:3351

llvm::PatternMatch::m_SpecificInt
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition PatternMatch.h:1032

llvm::PatternMatch::m_FMul
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
Definition PatternMatch.h:1226

llvm::PatternMatch::m_ZExtOrSelf
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
Definition PatternMatch.h:2238

llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition PatternMatch.h:53

llvm::PatternMatch::m_Deferred
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
Definition PatternMatch.h:951

llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition PatternMatch.h:943

llvm::PatternMatch::match_fn
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
Definition PatternMatch.h:60

llvm::PatternMatch::m_MaskedLoad
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
Definition PatternMatch.h:2864

llvm::PatternMatch::m_One
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition PatternMatch.h:562

llvm::PatternMatch::m_Intrinsic
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
Definition PatternMatch.h:2848

llvm::PatternMatch::m_Select
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
Definition PatternMatch.h:1900

llvm::PatternMatch::m_SpecificCmp
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
Definition PatternMatch.h:1783

llvm::PatternMatch::m_Mul
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
Definition PatternMatch.h:1220

llvm::PatternMatch::m_FPExt
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
Definition PatternMatch.h:2391

llvm::PatternMatch::m_SpecificICmp
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
Definition PatternMatch.h:1789

llvm::PatternMatch::m_UDiv
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
Definition PatternMatch.h:1232

llvm::PatternMatch::m_c_Add
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
Definition PatternMatch.h:3078

llvm::PatternMatch::m_ICmp
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
Definition PatternMatch.h:1726

llvm::PatternMatch::m_ZExtOrSExt
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
Definition PatternMatch.h:2258

llvm::PatternMatch::m_FNeg
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
Definition PatternMatch.h:1208

llvm::PatternMatch::m_c_FAdd
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
Definition PatternMatch.h:3183

llvm::PatternMatch::m_c_LogicalAnd
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
Definition PatternMatch.h:3362

llvm::PatternMatch::m_LogicalAnd
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
Definition PatternMatch.h:3357

llvm::PatternMatch::m_SExt
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
Definition PatternMatch.h:2221

llvm::PatternMatch::m_c_Mul
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
Definition PatternMatch.h:3085

llvm::PatternMatch::m_Sub
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
Definition PatternMatch.h:1166

llvm::PatternMatch::m_ConstantInt
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition PatternMatch.h:179

llvm::RISCVExceptFlags::UF
@ UF
Definition RISCVBaseInfo.h:601

llvm::RISCVFenceField::R
@ R
Definition RISCVBaseInfo.h:490

llvm::SCEVPatternMatch
Definition ScalarEvolutionPatternMatch.h:36

llvm::SCEVPatternMatch::m_scev_APInt
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
Definition ScalarEvolutionPatternMatch.h:157

llvm::SCEVPatternMatch::m_SpecificLoop
specificloop_ty m_SpecificLoop(const Loop *L)
Definition ScalarEvolutionPatternMatch.h:379

llvm::SCEVPatternMatch::match
bool match(const SCEV *S, const Pattern &P)
Definition ScalarEvolutionPatternMatch.h:38

llvm::SCEVPatternMatch::m_scev_AffineAddRec
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
Definition ScalarEvolutionPatternMatch.h:385

llvm::SCEVPatternMatch::m_SCEVConstant
auto m_SCEVConstant()
Definition ScalarEvolutionPatternMatch.h:82

llvm::SCEVPatternMatch::m_SCEV
auto m_SCEV()
Definition ScalarEvolutionPatternMatch.h:81

llvm::SI
Definition SIInstrInfo.h:1926

llvm::VPlanPatternMatch
Definition VPlanPatternMatch.h:24

llvm::VPlanPatternMatch::m_ExtractLastLaneOfLastPart
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
Definition VPlanPatternMatch.h:423

llvm::VPlanPatternMatch::m_c_BinaryAnd
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
Definition VPlanPatternMatch.h:653

llvm::VPlanPatternMatch::m_BinaryOr
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
Definition VPlanPatternMatch.h:663

llvm::VPlanPatternMatch::m_AnyOf
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
Definition VPlanPatternMatch.h:449

llvm::VPlanPatternMatch::m_c_BinaryOr
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Definition VPlanPatternMatch.h:669

llvm::VPlanPatternMatch::m_ComputeReductionResult
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
Definition VPlanPatternMatch.h:480

llvm::VPlanPatternMatch::m_WidenAnyExtend
auto m_WidenAnyExtend(const Op0_t &Op0)
Definition VPlanPatternMatch.h:554

llvm::VPlanPatternMatch::m_VPIRValue
match_bind< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
Definition VPlanPatternMatch.h:207

llvm::VPlanPatternMatch::m_StepVector
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
Definition VPlanPatternMatch.h:501

llvm::VPlanPatternMatch::m_VPPhi
auto m_VPPhi(const Op0_t &Op0, const Op1_t &Op1)
Definition VPlanPatternMatch.h:1107

llvm::VPlanPatternMatch::m_BranchOnTwoConds
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
Definition VPlanPatternMatch.h:367

llvm::VPlanPatternMatch::m_Binary
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
Definition VPlanPatternMatch.h:573

llvm::VPlanPatternMatch::m_LastActiveLane
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
Definition VPlanPatternMatch.h:467

llvm::VPlanPatternMatch::m_WidenIntrinsic
auto m_WidenIntrinsic(const T &...Ops)
Definition VPlanPatternMatch.h:1031

llvm::VPlanPatternMatch::m_CanonicalWidenIV
canonical_widen_iv_match m_CanonicalWidenIV()
Definition VPlanPatternMatch.h:851

llvm::VPlanPatternMatch::m_ExitingIVValue
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
Definition VPlanPatternMatch.h:507

llvm::VPlanPatternMatch::m_ExtractElement
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
Definition VPlanPatternMatch.h:397

llvm::VPlanPatternMatch::m_False
specific_intval< 1 > m_False()
Definition VPlanPatternMatch.h:120

llvm::VPlanPatternMatch::m_ExtractLastLane
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
Definition VPlanPatternMatch.h:391

llvm::VPlanPatternMatch::m_ActiveLaneMask
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
Definition VPlanPatternMatch.h:435

llvm::VPlanPatternMatch::m_VPSingleDefRecipe
match_bind< VPSingleDefRecipe > m_VPSingleDefRecipe(VPSingleDefRecipe *&V)
Match a VPSingleDefRecipe, capturing if we match.
Definition VPlanPatternMatch.h:211

llvm::VPlanPatternMatch::m_BranchOnCount
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
Definition VPlanPatternMatch.h:439

llvm::VPlanPatternMatch::m_GetElementPtr
auto m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
Definition VPlanPatternMatch.h:772

llvm::VPlanPatternMatch::m_LiveIn
auto m_LiveIn()
Definition VPlanPatternMatch.h:1035

llvm::VPlanPatternMatch::m_True
specific_intval< 1 > m_True()
Definition VPlanPatternMatch.h:124

llvm::VPlanPatternMatch::m_VPValue
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
Definition VPlanPatternMatch.h:51

llvm::VPlanPatternMatch::m_VecEndPtr
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
Definition VPlanPatternMatch.h:925

llvm::VPlanPatternMatch::m_ExtractLastPart
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
Definition VPlanPatternMatch.h:415

llvm::VPlanPatternMatch::m_Broadcast
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
Definition VPlanPatternMatch.h:379

llvm::VPlanPatternMatch::m_EVL
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
Definition VPlanPatternMatch.h:385

llvm::VPlanPatternMatch::m_BuildVector
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, w/o matching its operands as the number of operands is not fi...
Definition VPlanPatternMatch.h:339

llvm::VPlanPatternMatch::m_ExtractPenultimateElement
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
Definition VPlanPatternMatch.h:429

llvm::VPlanPatternMatch::m_VPInstruction
match_bind< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
Definition VPlanPatternMatch.h:216

llvm::VPlanPatternMatch::m_FirstActiveLane
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
Definition VPlanPatternMatch.h:461

llvm::VPlanPatternMatch::m_DerivedIV
auto m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
Definition VPlanPatternMatch.h:867

llvm::VPlanPatternMatch::m_BranchOnCond
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
Definition VPlanPatternMatch.h:356

llvm::VPlanPatternMatch::findUserOf
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
Definition VPlanPatternMatch.h:1115

llvm::VPlanPatternMatch::m_ExtractLane
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
Definition VPlanPatternMatch.h:409

llvm::VPlanPatternMatch::m_AnyNeg
auto m_AnyNeg(const Op0_t &Op0)
Definition VPlanPatternMatch.h:558

llvm::VPlanPatternMatch::m_Reverse
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
Definition VPlanPatternMatch.h:497

llvm::codeview::ClassOptions::Intrinsic
@ Intrinsic
Definition CodeView.h:198

llvm::codeview::PublicSymFlags::Function
@ Function
Definition CodeView.h:408

llvm::dwarf_linker::DebugSectionKind::DebugLoc
@ DebugLoc
Definition DWARFLinkerBase.h:34

llvm::lltok::Kind
Kind
Definition LLToken.h:18

llvm::logicalview::LVAttributeKind::Zero
@ Zero
Definition LVOptions.h:130

llvm::ms_demangle::QualifierMangleMode::Result
@ Result
Definition MicrosoftDemangle.h:132

llvm::pdb::DbgHeaderType::Max
@ Max
Definition RawConstants.h:101

llvm::rdf::Def
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384

llvm::sframe::Flags
Flags
Definition SFrame.h:39

llvm::vputils::isSingleScalar
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
Definition VPlanUtils.cpp:404

llvm::vputils::getOrCreateVPValueForSCEVExpr
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
Definition VPlanUtils.cpp:40

llvm::vputils::cannotHoistOrSinkRecipe
bool cannotHoistOrSinkRecipe(const VPRecipeBase &R, bool Sinking=false)
Return true if we do not know how to (mechanically) hoist or sink R.
Definition VPlanUtils.cpp:516

llvm::vputils::findComputeReductionResult
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
Definition VPlanUtils.cpp:837

llvm::vputils::findCanonicalIVIncrement
VPInstruction * findCanonicalIVIncrement(VPlan &Plan)
Find the canonical IV increment of Plan's vector loop region.
Definition VPlanUtils.cpp:774

llvm::vputils::getMemoryLocation
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
Definition VPlanUtils.cpp:761

llvm::vputils::onlyFirstLaneUsed
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
Definition VPlanUtils.cpp:25

llvm::vputils::findIncomingAliasMask
VPValue * findIncomingAliasMask(const VPlan &Plan)
Finds the incoming alias-mask within the vector preheader.
Definition VPlanUtils.cpp:715

llvm::vputils::findRecipe
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:139

llvm::vputils::findHeaderMask
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) Note: If ...
Definition VPlanUtils.cpp:640

llvm::vputils::onlyScalarValuesUsed
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
Definition VPlanUtils.cpp:35

llvm::vputils::isUniformAcrossVFsAndUFs
bool isUniformAcrossVFsAndUFs(const VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
Definition VPlanUtils.cpp:441

llvm::vputils::getRecipesForUncountableExit
LLVM_ABI_FOR_TEST std::optional< VPValue * > getRecipesForUncountableExit(SmallVectorImpl< VPInstruction * > &Recipes, SmallVectorImpl< VPInstruction * > &GEPs, VPBasicBlock *LatchVPBB)
Returns the VPValue representing the uncountable exit comparison used by AnyOf if the recipes it depe...
Definition VPlanUtils.cpp:532

llvm::vputils::getSCEVExprForVPValue
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
Definition VPlanUtils.cpp:170

llvm::vputils::isHeaderMask
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
Definition VPlanUtils.cpp:59

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition FunctionInfo.h:25

llvm::drop_begin
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315

llvm::vp_rpo_plain_cfg_loop_body
SmallVector< VPBasicBlock * > vp_rpo_plain_cfg_loop_body(VPBasicBlock *Header)
Returns the VPBasicBlocks forming the loop body of a plain (pre-region) VPlan in reverse post-order s...
Definition VPlanCFG.h:265

llvm::Offset
@ Offset
Definition DWP.cpp:558

llvm::not_equal_to
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2179

llvm::stable_sort
void stable_sort(R &&Range)
Definition STLExtras.h:2115

llvm::min_element
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2077

llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738

llvm::size
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1668

llvm::getVectorIntrinsicIDForCall
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
Definition VectorUtils.cpp:236

llvm::TailFoldingOpts::Reverse
@ Reverse
Definition AArch64BaseInfo.h:670

llvm::zip_equal
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:840

llvm::ValueToSCEVMapTy
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
Definition ScalarEvolutionExpressions.h:994

llvm::enumerate
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553

llvm::dyn_cast
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643

llvm::getLoadStorePointerOperand
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Definition Instructions.h:5293

llvm::from_range
constexpr from_range_t from_range
Definition STLForwardCompat.h:206

llvm::make_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Definition iterator_range.h:70

llvm::append_range
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207

llvm::make_early_inc_range
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633

llvm::cast_or_null
auto cast_or_null(const Y &Val)
Definition Casting.h:714

llvm::vp_depth_first_shallow
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253

llvm::bind_back
constexpr auto bind_back(FnT &&Fn, BindArgsT &&...BindArgs)
C++23 bind_back.
Definition STLForwardCompat.h:317

llvm::vp_depth_first_deep
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:288

llvm::equal_to
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172

llvm::operator==
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
Definition AddressRanges.h:151

llvm::calculateRegisterUsageForPlan
SmallVector< VPRegisterUsage, 8 > calculateRegisterUsageForPlan(VPlan &Plan, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Estimate the register usage for Plan and vectorization factors in VFs by calculating the highest numb...
Definition VPlanAnalysis.cpp:109

llvm::map_range
auto map_range(ContainerTy &&C, FuncTy F)
Return a range that applies F to the elements of C.
Definition STLExtras.h:365

llvm::concat
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1151

llvm::PowerOf2Ceil
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385

llvm::dyn_cast_or_null
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753

llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745

llvm::reverse
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407

llvm::range_size
constexpr size_t range_size(R &&Range)
Returns the size of the Range, i.e., the number of elements.
Definition STLExtras.h:1693

llvm::sort
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635

llvm::EnableWideActiveLaneMask
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask

llvm::UncountableExitStyle
UncountableExitStyle
Different methods of handling early exits.
Definition VPlan.h:79

llvm::UncountableExitStyle::MaskedHandleExitInScalarLoop
@ MaskedHandleExitInScalarLoop
All memory operations other than the load(s) required to determine whether an uncountable exit occurr...
Definition VPlan.h:89

llvm::none_of
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752

llvm::to_vector
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Definition SmallVector.h:1325

llvm::make_filter_range
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:551

llvm::canConstantBeExtended
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant C can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1850

llvm::SmallVector
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
Definition SmallVector.h:1151

llvm::isa
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547

llvm::drop_end
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322

llvm::IRMemLocation::Other
@ Other
Any other memory.
Definition ModRef.h:68

llvm::TTI
TargetTransformInfo TTI
Definition TargetTransformInfo.h:263

llvm::LEB128Sign::Signed
@ Signed
Definition LEB128.h:232

llvm::RecurKind
RecurKind
These are the kinds of recurrences that we support.
Definition IVDescriptors.h:35

llvm::RecurKind::UMin
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
Definition IVDescriptors.h:47

llvm::RecurKind::FindIV
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
Definition IVDescriptors.h:64

llvm::RecurKind::Or
@ Or
Bitwise or logical OR of integers.
Definition IVDescriptors.h:42

llvm::RecurKind::Mul
@ Mul
Product of integers.
Definition IVDescriptors.h:41

llvm::RecurKind::FSub
@ FSub
Subtraction of floats.
Definition IVDescriptors.h:51

llvm::RecurKind::FMul
@ FMul
Product of floats.
Definition IVDescriptors.h:52

llvm::RecurKind::SMax
@ SMax
Signed integer max implemented in terms of select(cmp()).
Definition IVDescriptors.h:46

llvm::RecurKind::SMin
@ SMin
Signed integer min implemented in terms of select(cmp()).
Definition IVDescriptors.h:45

llvm::RecurKind::Sub
@ Sub
Subtraction of integers.
Definition IVDescriptors.h:39

llvm::RecurKind::Add
@ Add
Sum of integers.
Definition IVDescriptors.h:38

llvm::RecurKind::AddChainWithSubs
@ AddChainWithSubs
A chain of adds and subs.
Definition IVDescriptors.h:40

llvm::RecurKind::FAdd
@ FAdd
Sum of floats.
Definition IVDescriptors.h:49

llvm::RecurKind::UMax
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
Definition IVDescriptors.h:48

llvm::getRecurrenceIdentity
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
Definition LoopUtils.cpp:1535

llvm::SplitBlock
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
Definition BasicBlockUtils.cpp:1049

llvm::Next
FunctionAddr VTableAddr Next
Definition InstrProf.h:141

llvm::count
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2011

llvm::Op
DWARFExpression::Operation Op
Definition DWARFExpressionPrinter.cpp:25

llvm::max_element
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2087

llvm::ArrayRef
ArrayRef(const T &OneElt) -> ArrayRef< T >

llvm::make_second_range
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1408

llvm::cast
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559

llvm::find_if
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1771

llvm::is_contained
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946

llvm::getLoadStoreType
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
Definition Instructions.h:5348

llvm::all_equal
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2165

llvm::hash_combine
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:325

llvm::equal
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2145

llvm::toVectorTy
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
Definition VectorTypeUtils.h:20

llvm::PseudoProbeAttributes::Sentinel
@ Sentinel
Definition PseudoProbe.h:34

llvm::isDereferenceableAndAlignedInLoop
LLVM_ABI bool isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L, ScalarEvolution &SE, DominatorTree &DT, AssumptionCache *AC=nullptr, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Return true if we can prove that the given load (which is assumed to be within the specified loop) wo...
Definition Loads.cpp:311

llvm::ValueUniformity::Default
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20

llvm::IsaPred
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866

llvm::hash_combine_range
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:305

std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:863

N
#define N

EarlyExitInfo
Definition VPlanTransforms.cpp:4205

EarlyExitInfo::EarlyExitingVPBB
VPBasicBlock * EarlyExitingVPBB
Definition VPlanTransforms.cpp:4206

EarlyExitInfo::CondToExit
VPValue * CondToExit
Definition VPlanTransforms.cpp:4208

EarlyExitInfo::EarlyExitVPBB
VPIRBasicBlock * EarlyExitVPBB
Definition VPlanTransforms.cpp:4207

RemoveMask_match
Definition VPlanTransforms.cpp:2925

RemoveMask_match::RemoveMask_match
RemoveMask_match(const Op0_t &In, Op1_t &Out)
Definition VPlanTransforms.cpp:2929

RemoveMask_match::match
bool match(OpTy *V) const
Definition VPlanTransforms.cpp:2931

RemoveMask_match::In
Op0_t In
Definition VPlanTransforms.cpp:2926

RemoveMask_match::Out
Op1_t & Out
Definition VPlanTransforms.cpp:2927

llvm::AAMDNodes::Scope
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786

llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39

llvm::DenseMapInfo
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition DenseMapInfo.h:54

llvm::PointerDiffInfo
Definition LoopAccessAnalysis.h:493

llvm::RdxUnordered
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2847

llvm::VFInfo
Holds the VFShape for a specific scalar to vector function mapping.
Definition VFABIDemangler.h:125

llvm::VFParameter
Encapsulates information needed to describe a parameter.
Definition VFABIDemangler.h:64

llvm::VFRange
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Definition VPlanHelpers.h:57

llvm::VPCostContext
Struct to hold various analysis needed for cost computations.
Definition VPlanHelpers.h:322

llvm::VPCostContext::L
const Loop * L
Definition VPlanHelpers.h:330

llvm::VPCostContext::isFreeScalarIntrinsic
static bool isFreeScalarIntrinsic(Intrinsic::ID ID)
Returns true if ID is a pseudo intrinsic that is dropped via scalarization rather than widened.
Definition VPlan.cpp:1946

llvm::VPCostContext::isMaskRequired
bool isMaskRequired(Instruction *I) const
Forwards to LoopVectorizationCostModel::isMaskRequired.
Definition LoopVectorize.cpp:5628

llvm::VPCostContext::PSE
PredicatedScalarEvolution & PSE
Definition VPlanHelpers.h:329

llvm::VPCostContext::willBeScalarized
bool willBeScalarized(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalarized at VF.
Definition LoopVectorize.cpp:5622

llvm::VPCostContext::CostKind
TargetTransformInfo::TargetCostKind CostKind
Definition VPlanHelpers.h:328

llvm::VPCostContext::TLI
const TargetLibraryInfo & TLI
Definition VPlanHelpers.h:324

llvm::VPCostContext::TTI
const TargetTransformInfo & TTI
Definition VPlanHelpers.h:323

llvm::VPIRFlags::WrapFlagsTy
Definition VPlan.h:710

llvm::VPIRValue
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:246

llvm::VPIRValue::getType
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:147

llvm::VPPhi
Definition VPlan.h:1672

llvm::VPRegisterUsage
A struct that represents some properties of the register usage of a loop.
Definition VPlanAnalysis.h:35

llvm::VPRegisterUsage::MaxLocalUsers
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
Definition VPlanAnalysis.h:41

llvm::VPRegisterUsage::spillCost
InstructionCost spillCost(const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, unsigned OverrideMaxNumRegs=0) const
Calculate the estimated cost of any spills due to using more registers than the number available for ...
Definition VPlanAnalysis.cpp:84

llvm::VPSymbolicValue
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:286

llvm::VPSymbolicValue::isMaterialized
bool isMaterialized() const
Returns true if this symbolic value has been materialized.
Definition VPlanValue.h:297

llvm::VPWidenLoadEVLRecipe
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3853

llvm::VPWidenLoadRecipe
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3804

llvm::VPWidenStoreEVLRecipe
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3955

llvm::VPWidenStoreRecipe
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3902

llvm::VPlanTransforms::materializeAliasMask
static VPValue * materializeAliasMask(VPlan &Plan, VPBasicBlock *AliasCheckVPBB, ArrayRef< PointerDiffInfo > DiffChecks)
Materializes the alias mask within the AliasCheckVPBB block.
Definition VPlanTransforms.cpp:5362

llvm::VPlanTransforms::tryToConvertVPInstructionsToVPRecipes
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
Definition VPlanTransforms.cpp:51

llvm::VPlanTransforms::makeMemOpWideningDecisions
static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder)
Convert load/store VPInstructions in Plan into widened or replicate recipes.
Definition VPlanTransforms.cpp:6879

llvm::VPlanTransforms::expandSCEVsToVPInstructions
static void expandSCEVsToVPInstructions(VPlan &Plan, ScalarEvolution &SE)
Try to expand VPExpandSCEVRecipes in Plan's entry block to VPInstructions.
Definition VPlanTransforms.cpp:5455

llvm::VPlanTransforms::materializeBroadcasts
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
Definition VPlanTransforms.cpp:4853

llvm::VPlanTransforms::materializePacksAndUnpacks
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
Definition VPlanTransforms.cpp:5137

llvm::VPlanTransforms::createInterleaveGroups
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, const bool &EpilogueAllowed)
Definition VPlanTransforms.cpp:3612

llvm::VPlanTransforms::simplifyKnownEVL
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL is known to be <= VF,...
Definition VPlanTransforms.cpp:2342

llvm::VPlanTransforms::removeBranchOnConst
static void removeBranchOnConst(VPlan &Plan, bool OnlyLatches=false)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
Definition VPlanTransforms.cpp:2727

llvm::VPlanTransforms::materializeFactors
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
Definition VPlanTransforms.cpp:5299

llvm::VPlanTransforms::materializeBackedgeTakenCount
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
Definition VPlanTransforms.cpp:5123

llvm::VPlanTransforms::addActiveLaneMask
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
Definition VPlanTransforms.cpp:2897

llvm::VPlanTransforms::replaceWideCanonicalIVWithWideIV
static void replaceWideCanonicalIVWithWideIV(VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, ElementCount VF, unsigned UF, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Replace a VPWidenCanonicalIVRecipe if it is present in Plan, with a VPWidenIntOrFpInductionRecipe,...
Definition VPlanTransforms.cpp:710

llvm::VPlanTransforms::createAndOptimizeReplicateRegions
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
Definition VPlanTransforms.cpp:618

llvm::VPlanTransforms::convertToVariableLengthStep
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
Definition VPlanTransforms.cpp:3370

llvm::VPlanTransforms::addBranchWeightToMiddleTerminator
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
Definition VPlanTransforms.cpp:5863

llvm::VPlanTransforms::narrowInterleaveGroups
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
Definition VPlanTransforms.cpp:5689

llvm::VPlanTransforms::materializeAliasMaskCheckBlock
static void materializeAliasMaskCheckBlock(VPlan &Plan, ArrayRef< PointerDiffInfo > DiffChecks, bool HasBranchWeights)
Materializes the alias mask within a check block before the loop.
Definition VPlanTransforms.cpp:5410

llvm::VPlanTransforms::expandSCEVs
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand remaining VPExpandSCEVRecipes in Plan's entry block using SCEVExpander.
Definition VPlanTransforms.cpp:5480

llvm::VPlanTransforms::convertToConcreteRecipes
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
Definition VPlanTransforms.cpp:4022

llvm::VPlanTransforms::expandBranchOnTwoConds
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
Definition VPlanTransforms.cpp:3957

llvm::VPlanTransforms::materializeVectorTripCount
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step, std::optional< uint64_t > MaxRuntimeStep=std::nullopt)
Materialize vector trip count computations to a set of VPInstructions.
Definition VPlanTransforms.cpp:5228

llvm::VPlanTransforms::hoistPredicatedLoads
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
Definition VPlanTransforms.cpp:4973

llvm::VPlanTransforms::mergeBlocksIntoPredecessors
static bool mergeBlocksIntoPredecessors(VPlan &Plan)
Remove redundant VPBasicBlocks by merging them into their single predecessor if the latter has a sing...
Definition VPlanTransforms.cpp:587

llvm::VPlanTransforms::attachAliasMaskToHeaderMask
static void attachAliasMaskToHeaderMask(VPlan &Plan)
Attaches the alias-mask to the existing header-mask.
Definition VPlanTransforms.cpp:5339

llvm::VPlanTransforms::optimizeFindIVReductions
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
Definition VPlanTransforms.cpp:6042

llvm::VPlanTransforms::convertToAbstractRecipes
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
Definition VPlanTransforms.cpp:4842

llvm::VPlanTransforms::materializeConstantVectorTripCount
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Definition VPlanTransforms.cpp:5090

llvm::VPlanTransforms::makeScalarizationDecisions
static void makeScalarizationDecisions(VPlan &Plan, VFRange &Range)
Make VPlan-based scalarization decisions prior to delegating to the ones made by the legacy CM.
Definition VPlanTransforms.cpp:6931

llvm::VPlanTransforms::addExplicitVectorLength
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replaces all uses of the canonical ...
Definition VPlanTransforms.cpp:3309

llvm::VPlanTransforms::makeCallWideningDecisions
static void makeCallWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder, VPCostContext &CostCtx)
Convert call VPInstructions in Plan into widened call, vector intrinsic or replicate recipes based on...
Definition VPlanTransforms.cpp:7087

llvm::VPlanTransforms::adjustFirstOrderRecurrenceMiddleUsers
static void adjustFirstOrderRecurrenceMiddleUsers(VPlan &Plan, VFRange &Range)
Adjust first-order recurrence users in the middle block: create penultimate element extracts for LCSS...
Definition VPlanTransforms.cpp:5885

llvm::VPlanTransforms::optimizeEVLMasks
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
Definition VPlanTransforms.cpp:3099

llvm::VPlanTransforms::replaceSymbolicStrides
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
Definition VPlanTransforms.cpp:3463

llvm::VPlanTransforms::removeDeadRecipes
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
Definition VPlanTransforms.cpp:805

llvm::VPlanTransforms::simplifyRecipes
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
Definition VPlanTransforms.cpp:1829

llvm::VPlanTransforms::sinkPredicatedStores
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
Definition VPlanTransforms.cpp:5040

llvm::VPlanTransforms::convertToStridedAccesses
static void convertToStridedAccesses(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L, VPCostContext &Ctx, VFRange &Range)
Transform widen memory recipes into strided access recipes when legal and profitable.
Definition VPlanTransforms.cpp:7143

llvm::VPlanTransforms::handleUncountableEarlyExits
static bool handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT, AssumptionCache *AC, UncountableExitStyle Style)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
Definition VPlanTransforms.cpp:4367

llvm::VPlanTransforms::clearReductionWrapFlags
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
Definition VPlanTransforms.cpp:2392

llvm::VPlanTransforms::optimizeInductionLiveOutUsers
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
Definition VPlanTransforms.cpp:1147

llvm::VPlanTransforms::createPartialReductions
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
Definition VPlanTransforms.cpp:6761

llvm::VPlanTransforms::cse
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
Definition VPlanTransforms.cpp:2503

llvm::VPlanTransforms::attachVPCheckBlock
static void attachVPCheckBlock(VPlan &Plan, VPValue *Cond, VPBasicBlock *CheckBlock, bool AddBranchWeights)
Wrap runtime check block CheckBlock in a VPIRBB and Cond in a VPValue and connect the block to Plan,...
Definition VPlanConstruction.cpp:1486

llvm::VPlanTransforms::optimize
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
Definition VPlanTransforms.cpp:2797

llvm::VPlanTransforms::dissolveLoopRegions
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
Definition VPlanTransforms.cpp:3945

llvm::VPlanTransforms::truncateToMinimalBitwidths
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
Definition VPlanTransforms.cpp:2628

llvm::VPlanTransforms::dropPoisonGeneratingRecipes
static void dropPoisonGeneratingRecipes(VPlan &Plan)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
Definition VPlanTransforms.cpp:3524

llvm::VPlanTransforms::optimizeForVFAndUF
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
Definition VPlanTransforms.cpp:2376

llvm::VPlanTransforms::convertEVLExitCond
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...
Definition VPlanTransforms.cpp:3409