Exemplo n.º 1
0
static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI,
                            ScalarEvolution &SE) {
  bool Changed = false;
  // Copy blocks into a temporary array to avoid iterator invalidation issues
  // as we remove them.
  SmallVector<WeakTrackingVH, 16> Blocks(L.blocks());

  for (auto &Block : Blocks) {
    // Attempt to merge blocks in the trivial case. Don't modify blocks which
    // belong to other loops.
    BasicBlock *Succ = cast_or_null<BasicBlock>(Block);
    if (!Succ)
      continue;

    BasicBlock *Pred = Succ->getSinglePredecessor();
    if (!Pred || !Pred->getSingleSuccessor() || LI.getLoopFor(Pred) != &L)
      continue;

    // Merge Succ into Pred and delete it.
    MergeBlockIntoPredecessor(Succ, &DT, &LI);

    SE.forgetLoop(&L);
    Changed = true;
  }

  return Changed;
}
Exemplo n.º 2
0
/// Insert code in the prolog code when unrolling a loop with a
/// run-time trip-count.
///
/// This method assumes that the loop unroll factor is total number
/// of loop bodes in the loop after unrolling. (Some folks refer
/// to the unroll factor as the number of *extra* copies added).
/// We assume also that the loop unroll factor is a power-of-two. So, after
/// unrolling the loop, the number of loop bodies executed is 2,
/// 4, 8, etc.  Note - LLVM converts the if-then-sequence to a switch
/// instruction in SimplifyCFG.cpp.  Then, the backend decides how code for
/// the switch instruction is generated.
///
///    extraiters = tripcount % loopfactor
///    if (extraiters == 0) jump Loop:
///    if (extraiters == loopfactor) jump L1
///    if (extraiters == loopfactor-1) jump L2
///    ...
///    L1:  LoopBody;
///    L2:  LoopBody;
///    ...
///    if tripcount < loopfactor jump End
///    Loop:
///    ...
///    End:
///
bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
                                   LPPassManager *LPM) {
  // for now, only unroll loops that contain a single exit
  if (!L->getExitingBlock())
    return false;

  // Make sure the loop is in canonical form, and there is a single
  // exit block only.
  if (!L->isLoopSimplifyForm() || !L->getUniqueExitBlock())
    return false;

  // Use Scalar Evolution to compute the trip count.  This allows more
  // loops to be unrolled than relying on induction var simplification
  if (!LPM)
    return false;
  ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>();
  if (!SE)
    return false;

  // Only unroll loops with a computable trip count and the trip count needs
  // to be an int value (allowing a pointer type is a TODO item)
  const SCEV *BECount = SE->getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BECount) || !BECount->getType()->isIntegerTy())
    return false;

  // Add 1 since the backedge count doesn't include the first loop iteration
  const SCEV *TripCountSC =
    SE->getAddExpr(BECount, SE->getConstant(BECount->getType(), 1));
  if (isa<SCEVCouldNotCompute>(TripCountSC))
    return false;

  // We only handle cases when the unroll factor is a power of 2.
  // Count is the loop unroll factor, the number of extra copies added + 1.
  if ((Count & (Count-1)) != 0)
    return false;

  // If this loop is nested, then the loop unroller changes the code in
  // parent loop, so the Scalar Evolution pass needs to be run again
  if (Loop *ParentLoop = L->getParentLoop())
    SE->forgetLoop(ParentLoop);

  BasicBlock *PH = L->getLoopPreheader();
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  // It helps to splits the original preheader twice, one for the end of the
  // prolog code and one for a new loop preheader
  BasicBlock *PEnd = SplitEdge(PH, Header, LPM->getAsPass());
  BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), LPM->getAsPass());
  BranchInst *PreHeaderBR = cast<BranchInst>(PH->getTerminator());

  // Compute the number of extra iterations required, which is:
  //  extra iterations = run-time trip count % (loop unroll factor + 1)
  SCEVExpander Expander(*SE, "loop-unroll");
  Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
                                            PreHeaderBR);
  Type *CountTy = TripCount->getType();
  BinaryOperator *ModVal =
    BinaryOperator::CreateURem(TripCount,
                               ConstantInt::get(CountTy, Count),
                               "xtraiter");
  ModVal->insertBefore(PreHeaderBR);

  // Check if for no extra iterations, then jump to unrolled loop
  Value *BranchVal = new ICmpInst(PreHeaderBR,
                                  ICmpInst::ICMP_NE, ModVal,
                                  ConstantInt::get(CountTy, 0), "lcmp");
  // Branch to either the extra iterations or the unrolled loop
  // We will fix up the true branch label when adding loop body copies
  BranchInst::Create(PEnd, PEnd, BranchVal, PreHeaderBR);
  assert(PreHeaderBR->isUnconditional() &&
         PreHeaderBR->getSuccessor(0) == PEnd &&
         "CFG edges in Preheader are not correct");
  PreHeaderBR->eraseFromParent();

  ValueToValueMapTy LVMap;
  Function *F = Header->getParent();
  // These variables are used to update the CFG links in each iteration
  BasicBlock *CompareBB = nullptr;
  BasicBlock *LastLoopBB = PH;
  // Get an ordered list of blocks in the loop to help with the ordering of the
  // cloned blocks in the prolog code
  LoopBlocksDFS LoopBlocks(L);
  LoopBlocks.perform(LI);

  //
  // For each extra loop iteration, create a copy of the loop's basic blocks
  // and generate a condition that branches to the copy depending on the
  // number of 'left over' iterations.
  //
  for (unsigned leftOverIters = Count-1; leftOverIters > 0; --leftOverIters) {
    std::vector<BasicBlock*> NewBlocks;
    ValueToValueMapTy VMap;

    // Clone all the basic blocks in the loop, but we don't clone the loop
    // This function adds the appropriate CFG connections.
    CloneLoopBlocks(L, (leftOverIters == Count-1), LastLoopBB, PEnd, NewBlocks,
                    LoopBlocks, VMap, LVMap, LI);
    LastLoopBB = cast<BasicBlock>(VMap[Latch]);

    // Insert the cloned blocks into function just before the original loop
    F->getBasicBlockList().splice(PEnd, F->getBasicBlockList(),
                                  NewBlocks[0], F->end());

    // Generate the code for the comparison which determines if the loop
    // prolog code needs to be executed.
    if (leftOverIters == Count-1) {
      // There is no compare block for the fall-thru case when for the last
      // left over iteration
      CompareBB = NewBlocks[0];
    } else {
      // Create a new block for the comparison
      BasicBlock *NewBB = BasicBlock::Create(CompareBB->getContext(), "unr.cmp",
                                             F, CompareBB);
      if (Loop *ParentLoop = L->getParentLoop()) {
        // Add the new block to the parent loop, if needed
        ParentLoop->addBasicBlockToLoop(NewBB, LI->getBase());
      }

      // The comparison w/ the extra iteration value and branch
      Value *BranchVal = new ICmpInst(*NewBB, ICmpInst::ICMP_EQ, ModVal,
                                      ConstantInt::get(CountTy, leftOverIters),
                                      "un.tmp");
      // Branch to either the extra iterations or the unrolled loop
      BranchInst::Create(NewBlocks[0], CompareBB,
                         BranchVal, NewBB);
      CompareBB = NewBB;
      PH->getTerminator()->setSuccessor(0, NewBB);
      VMap[NewPH] = CompareBB;
    }

    // Rewrite the cloned instruction operands to use the values
    // created when the clone is created.
    for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) {
      for (BasicBlock::iterator I = NewBlocks[i]->begin(),
             E = NewBlocks[i]->end(); I != E; ++I) {
        RemapInstruction(I, VMap,
                         RF_NoModuleLevelChanges|RF_IgnoreMissingEntries);
      }
    }
  }

  // Connect the prolog code to the original loop and update the
  // PHI functions.
  ConnectProlog(L, TripCount, Count, LastLoopBB, PEnd, PH, NewPH, LVMap,
                LPM->getAsPass());
  NumRuntimeUnrolled++;
  return true;
}
Exemplo n.º 3
0
/// Remove dead loops, by which we mean loops that do not impact the observable
/// behavior of the program other than finite running time.  Note we do ensure
/// that this never remove a loop that might be infinite, as doing so could
/// change the halting/non-halting nature of a program. NOTE: This entire
/// process relies pretty heavily on LoopSimplify and LCSSA in order to make
/// various safety checks work.
bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
                               LoopInfo &loopInfo) {
  assert(L->isLCSSAForm(DT) && "Expected LCSSA!");

  // We can only remove the loop if there is a preheader that we can
  // branch from after removing it.
  BasicBlock *preheader = L->getLoopPreheader();
  if (!preheader)
    return false;

  // If LoopSimplify form is not available, stay out of trouble.
  if (!L->hasDedicatedExits())
    return false;

  // We can't remove loops that contain subloops.  If the subloops were dead,
  // they would already have been removed in earlier executions of this pass.
  if (L->begin() != L->end())
    return false;

  SmallVector<BasicBlock *, 4> exitingBlocks;
  L->getExitingBlocks(exitingBlocks);

  SmallVector<BasicBlock *, 4> exitBlocks;
  L->getUniqueExitBlocks(exitBlocks);

  // We require that the loop only have a single exit block.  Otherwise, we'd
  // be in the situation of needing to be able to solve statically which exit
  // block will be branched to, or trying to preserve the branching logic in
  // a loop invariant manner.
  if (exitBlocks.size() != 1)
    return false;

  // Finally, we have to check that the loop really is dead.
  bool Changed = false;
  if (!isLoopDead(L, SE, exitingBlocks, exitBlocks, Changed, preheader))
    return Changed;

  // Don't remove loops for which we can't solve the trip count.
  // They could be infinite, in which case we'd be changing program behavior.
  const SCEV *S = SE.getMaxBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(S))
    return Changed;

  // Now that we know the removal is safe, remove the loop by changing the
  // branch from the preheader to go to the single exit block.
  BasicBlock *exitBlock = exitBlocks[0];

  // Because we're deleting a large chunk of code at once, the sequence in which
  // we remove things is very important to avoid invalidation issues.  Don't
  // mess with this unless you have good reason and know what you're doing.

  // Tell ScalarEvolution that the loop is deleted. Do this before
  // deleting the loop so that ScalarEvolution can look at the loop
  // to determine what it needs to clean up.
  SE.forgetLoop(L);

  // Connect the preheader directly to the exit block.
  TerminatorInst *TI = preheader->getTerminator();
  TI->replaceUsesOfWith(L->getHeader(), exitBlock);

  // Rewrite phis in the exit block to get their inputs from
  // the preheader instead of the exiting block.
  BasicBlock *exitingBlock = exitingBlocks[0];
  BasicBlock::iterator BI = exitBlock->begin();
  while (PHINode *P = dyn_cast<PHINode>(BI)) {
    int j = P->getBasicBlockIndex(exitingBlock);
    assert(j >= 0 && "Can't find exiting block in exit block's phi node!");
    P->setIncomingBlock(j, preheader);
    for (unsigned i = 1; i < exitingBlocks.size(); ++i)
      P->removeIncomingValue(exitingBlocks[i]);
    ++BI;
  }

  // Update the dominator tree and remove the instructions and blocks that will
  // be deleted from the reference counting scheme.
  SmallVector<DomTreeNode*, 8> ChildNodes;
  for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
       LI != LE; ++LI) {
    // Move all of the block's children to be children of the preheader, which
    // allows us to remove the domtree entry for the block.
    ChildNodes.insert(ChildNodes.begin(), DT[*LI]->begin(), DT[*LI]->end());
    for (DomTreeNode *ChildNode : ChildNodes) {
      DT.changeImmediateDominator(ChildNode, DT[preheader]);
    }

    ChildNodes.clear();
    DT.eraseNode(*LI);

    // Remove the block from the reference counting scheme, so that we can
    // delete it freely later.
    (*LI)->dropAllReferences();
  }

  // Erase the instructions and the blocks without having to worry
  // about ordering because we already dropped the references.
  // NOTE: This iteration is safe because erasing the block does not remove its
  // entry from the loop's block list.  We do that in the next section.
  for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
       LI != LE; ++LI)
    (*LI)->eraseFromParent();

  // Finally, the blocks from loopinfo.  This has to happen late because
  // otherwise our loop iterators won't work.

  SmallPtrSet<BasicBlock *, 8> blocks;
  blocks.insert(L->block_begin(), L->block_end());
  for (BasicBlock *BB : blocks)
    loopInfo.removeBlock(BB);

  // The last step is to update LoopInfo now that we've eliminated this loop.
  loopInfo.markAsRemoved(L);
  Changed = true;

  ++NumDeleted;

  return Changed;
}
Exemplo n.º 4
0
/// Unroll the given loop by Count. The loop must be in LCSSA form. Returns true
/// if unrolling was successful, or false if the loop was unmodified. Unrolling
/// can only fail when the loop's latch block is not terminated by a conditional
/// branch instruction. However, if the trip count (and multiple) are not known,
/// loop unrolling will mostly produce more code that is no faster.
///
/// TripCount is generally defined as the number of times the loop header
/// executes. UnrollLoop relaxes the definition to permit early exits: here
/// TripCount is the iteration on which control exits LatchBlock if no early
/// exits were taken. Note that UnrollLoop assumes that the loop counter test
/// terminates LatchBlock in order to remove unnecesssary instances of the
/// test. In other words, control may exit the loop prior to TripCount
/// iterations via an early branch, but control may not exit the loop from the
/// LatchBlock's terminator prior to TripCount iterations.
///
/// Similarly, TripMultiple divides the number of times that the LatchBlock may
/// execute without exiting the loop.
///
/// The LoopInfo Analysis that is passed will be kept consistent.
///
/// If a LoopPassManager is passed in, and the loop is fully removed, it will be
/// removed from the LoopPassManager as well. LPM can also be NULL.
///
/// This utility preserves LoopInfo. If DominatorTree or ScalarEvolution are
/// available from the Pass it must also preserve those analyses.
bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
                      bool AllowRuntime, unsigned TripMultiple,
                      LoopInfo *LI, Pass *PP, LPPassManager *LPM) {
  BasicBlock *Preheader = L->getLoopPreheader();
  if (!Preheader) {
    DEBUG(dbgs() << "  Can't unroll; loop preheader-insertion failed.\n");
    return false;
  }

  BasicBlock *LatchBlock = L->getLoopLatch();
  if (!LatchBlock) {
    DEBUG(dbgs() << "  Can't unroll; loop exit-block-insertion failed.\n");
    return false;
  }

  // Loops with indirectbr cannot be cloned.
  if (!L->isSafeToClone()) {
    DEBUG(dbgs() << "  Can't unroll; Loop body cannot be cloned.\n");
    return false;
  }

  BasicBlock *Header = L->getHeader();
  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());

  if (!BI || BI->isUnconditional()) {
    // The loop-rotate pass can be helpful to avoid this in many cases.
    DEBUG(dbgs() <<
             "  Can't unroll; loop not terminated by a conditional branch.\n");
    return false;
  }

  if (Header->hasAddressTaken()) {
    // The loop-rotate pass can be helpful to avoid this in many cases.
    DEBUG(dbgs() <<
          "  Won't unroll loop: address of header block is taken.\n");
    return false;
  }

  if (TripCount != 0)
    DEBUG(dbgs() << "  Trip Count = " << TripCount << "\n");
  if (TripMultiple != 1)
    DEBUG(dbgs() << "  Trip Multiple = " << TripMultiple << "\n");

  // Effectively "DCE" unrolled iterations that are beyond the tripcount
  // and will never be executed.
  if (TripCount != 0 && Count > TripCount)
    Count = TripCount;

  // Don't enter the unroll code if there is nothing to do. This way we don't
  // need to support "partial unrolling by 1".
  if (TripCount == 0 && Count < 2)
    return false;

  assert(Count > 0);
  assert(TripMultiple > 0);
  assert(TripCount == 0 || TripCount % TripMultiple == 0);

  // Are we eliminating the loop control altogether?
  bool CompletelyUnroll = Count == TripCount;

  // We assume a run-time trip count if the compiler cannot
  // figure out the loop trip count and the unroll-runtime
  // flag is specified.
  bool RuntimeTripCount = (TripCount == 0 && Count > 0 && AllowRuntime);

  if (RuntimeTripCount && !UnrollRuntimeLoopProlog(L, Count, LI, LPM))
    return false;

  // Notify ScalarEvolution that the loop will be substantially changed,
  // if not outright eliminated.
  if (PP) {
    ScalarEvolution *SE = PP->getAnalysisIfAvailable<ScalarEvolution>();
    if (SE)
      SE->forgetLoop(L);
  }

  // If we know the trip count, we know the multiple...
  unsigned BreakoutTrip = 0;
  if (TripCount != 0) {
    BreakoutTrip = TripCount % Count;
    TripMultiple = 0;
  } else {
    // Figure out what multiple to use.
    BreakoutTrip = TripMultiple =
      (unsigned)GreatestCommonDivisor64(Count, TripMultiple);
  }

  // Report the unrolling decision.
  DebugLoc LoopLoc = L->getStartLoc();
  Function *F = Header->getParent();
  LLVMContext &Ctx = F->getContext();

  if (CompletelyUnroll) {
    DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName()
          << " with trip count " << TripCount << "!\n");
    emitOptimizationRemark(Ctx, DEBUG_TYPE, *F, LoopLoc,
                           Twine("completely unrolled loop with ") +
                               Twine(TripCount) + " iterations");
  } else {
    DEBUG(dbgs() << "UNROLLING loop %" << Header->getName()
          << " by " << Count);
    Twine DiagMsg("unrolled loop by a factor of " + Twine(Count));
    if (TripMultiple == 0 || BreakoutTrip != TripMultiple) {
      DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip);
      DiagMsg.concat(" with a breakout at trip " + Twine(BreakoutTrip));
    } else if (TripMultiple != 1) {
      DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
      DiagMsg.concat(" with " + Twine(TripMultiple) + " trips per branch");
    } else if (RuntimeTripCount) {
      DEBUG(dbgs() << " with run-time trip count");
      DiagMsg.concat(" with run-time trip count");
    }
    DEBUG(dbgs() << "!\n");
    emitOptimizationRemark(Ctx, DEBUG_TYPE, *F, LoopLoc, DiagMsg);
  }

  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);

  // For the first iteration of the loop, we should use the precloned values for
  // PHI nodes.  Insert associations now.
  ValueToValueMapTy LastValueMap;
  std::vector<PHINode*> OrigPHINode;
  for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
    OrigPHINode.push_back(cast<PHINode>(I));
  }

  std::vector<BasicBlock*> Headers;
  std::vector<BasicBlock*> Latches;
  Headers.push_back(Header);
  Latches.push_back(LatchBlock);

  // The current on-the-fly SSA update requires blocks to be processed in
  // reverse postorder so that LastValueMap contains the correct value at each
  // exit.
  LoopBlocksDFS DFS(L);
  DFS.perform(LI);

  // Stash the DFS iterators before adding blocks to the loop.
  LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
  LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();

  for (unsigned It = 1; It != Count; ++It) {
    std::vector<BasicBlock*> NewBlocks;

    for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
      ValueToValueMapTy VMap;
      BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
      Header->getParent()->getBasicBlockList().push_back(New);

      // Loop over all of the PHI nodes in the block, changing them to use the
      // incoming values from the previous block.
      if (*BB == Header)
        for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) {
          PHINode *NewPHI = cast<PHINode>(VMap[OrigPHINode[i]]);
          Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
          if (Instruction *InValI = dyn_cast<Instruction>(InVal))
            if (It > 1 && L->contains(InValI))
              InVal = LastValueMap[InValI];
          VMap[OrigPHINode[i]] = InVal;
          New->getInstList().erase(NewPHI);
        }

      // Update our running map of newest clones
      LastValueMap[*BB] = New;
      for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
           VI != VE; ++VI)
        LastValueMap[VI->first] = VI->second;

      L->addBasicBlockToLoop(New, LI->getBase());

      // Add phi entries for newly created values to all exit blocks.
      for (succ_iterator SI = succ_begin(*BB), SE = succ_end(*BB);
           SI != SE; ++SI) {
        if (L->contains(*SI))
          continue;
        for (BasicBlock::iterator BBI = (*SI)->begin();
             PHINode *phi = dyn_cast<PHINode>(BBI); ++BBI) {
          Value *Incoming = phi->getIncomingValueForBlock(*BB);
          ValueToValueMapTy::iterator It = LastValueMap.find(Incoming);
          if (It != LastValueMap.end())
            Incoming = It->second;
          phi->addIncoming(Incoming, New);
        }
      }
      // Keep track of new headers and latches as we create them, so that
      // we can insert the proper branches later.
      if (*BB == Header)
        Headers.push_back(New);
      if (*BB == LatchBlock)
        Latches.push_back(New);

      NewBlocks.push_back(New);
    }

    // Remap all instructions in the most recent iteration
    for (unsigned i = 0; i < NewBlocks.size(); ++i)
      for (BasicBlock::iterator I = NewBlocks[i]->begin(),
           E = NewBlocks[i]->end(); I != E; ++I)
        ::RemapInstruction(I, LastValueMap);
  }

  // Loop over the PHI nodes in the original block, setting incoming values.
  for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) {
    PHINode *PN = OrigPHINode[i];
    if (CompletelyUnroll) {
      PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
      Header->getInstList().erase(PN);
    }
    else if (Count > 1) {
      Value *InVal = PN->removeIncomingValue(LatchBlock, false);
      // If this value was defined in the loop, take the value defined by the
      // last iteration of the loop.
      if (Instruction *InValI = dyn_cast<Instruction>(InVal)) {
        if (L->contains(InValI))
          InVal = LastValueMap[InVal];
      }
      assert(Latches.back() == LastValueMap[LatchBlock] && "bad last latch");
      PN->addIncoming(InVal, Latches.back());
    }
  }

  // Now that all the basic blocks for the unrolled iterations are in place,
  // set up the branches to connect them.
  for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
    // The original branch was replicated in each unrolled iteration.
    BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());

    // The branch destination.
    unsigned j = (i + 1) % e;
    BasicBlock *Dest = Headers[j];
    bool NeedConditional = true;

    if (RuntimeTripCount && j != 0) {
      NeedConditional = false;
    }

    // For a complete unroll, make the last iteration end with a branch
    // to the exit block.
    if (CompletelyUnroll && j == 0) {
      Dest = LoopExit;
      NeedConditional = false;
    }

    // If we know the trip count or a multiple of it, we can safely use an
    // unconditional branch for some iterations.
    if (j != BreakoutTrip && (TripMultiple == 0 || j % TripMultiple != 0)) {
      NeedConditional = false;
    }

    if (NeedConditional) {
      // Update the conditional branch's successor for the following
      // iteration.
      Term->setSuccessor(!ContinueOnTrue, Dest);
    } else {
      // Remove phi operands at this loop exit
      if (Dest != LoopExit) {
        BasicBlock *BB = Latches[i];
        for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB);
             SI != SE; ++SI) {
          if (*SI == Headers[i])
            continue;
          for (BasicBlock::iterator BBI = (*SI)->begin();
               PHINode *Phi = dyn_cast<PHINode>(BBI); ++BBI) {
            Phi->removeIncomingValue(BB, false);
          }
        }
      }
      // Replace the conditional branch with an unconditional one.
      BranchInst::Create(Dest, Term);
      Term->eraseFromParent();
    }
  }

  // Merge adjacent basic blocks, if possible.
  for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
    BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());
    if (Term->isUnconditional()) {
      BasicBlock *Dest = Term->getSuccessor(0);
      if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI, LPM))
        std::replace(Latches.begin(), Latches.end(), Dest, Fold);
    }
  }

  DominatorTree *DT = nullptr;
  if (PP) {
    // FIXME: Reconstruct dom info, because it is not preserved properly.
    // Incrementally updating domtree after loop unrolling would be easy.
    if (DominatorTreeWrapperPass *DTWP =
            PP->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) {
      DT = &DTWP->getDomTree();
      DT->recalculate(*L->getHeader()->getParent());
    }

    // Simplify any new induction variables in the partially unrolled loop.
    ScalarEvolution *SE = PP->getAnalysisIfAvailable<ScalarEvolution>();
    if (SE && !CompletelyUnroll) {
      SmallVector<WeakVH, 16> DeadInsts;
      simplifyLoopIVs(L, SE, LPM, DeadInsts);

      // Aggressively clean up dead instructions that simplifyLoopIVs already
      // identified. Any remaining should be cleaned up below.
      while (!DeadInsts.empty())
        if (Instruction *Inst =
            dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()))
          RecursivelyDeleteTriviallyDeadInstructions(Inst);
    }
  }
  // At this point, the code is well formed.  We now do a quick sweep over the
  // inserted code, doing constant propagation and dead code elimination as we
  // go.
  const std::vector<BasicBlock*> &NewLoopBlocks = L->getBlocks();
  for (std::vector<BasicBlock*>::const_iterator BB = NewLoopBlocks.begin(),
       BBE = NewLoopBlocks.end(); BB != BBE; ++BB)
    for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; ) {
      Instruction *Inst = I++;

      if (isInstructionTriviallyDead(Inst))
        (*BB)->getInstList().erase(Inst);
      else if (Value *V = SimplifyInstruction(Inst))
        if (LI->replacementPreservesLCSSAForm(Inst, V)) {
          Inst->replaceAllUsesWith(V);
          (*BB)->getInstList().erase(Inst);
        }
    }

  NumCompletelyUnrolled += CompletelyUnroll;
  ++NumUnrolled;

  Loop *OuterL = L->getParentLoop();
  // Remove the loop from the LoopPassManager if it's completely removed.
  if (CompletelyUnroll && LPM != nullptr)
    LPM->deleteLoopFromQueue(L);

  // If we have a pass and a DominatorTree we should re-simplify impacted loops
  // to ensure subsequent analyses can rely on this form. We want to simplify
  // at least one layer outside of the loop that was unrolled so that any
  // changes to the parent loop exposed by the unrolling are considered.
  if (PP && DT) {
    if (!OuterL && !CompletelyUnroll)
      OuterL = L;
    if (OuterL) {
      ScalarEvolution *SE = PP->getAnalysisIfAvailable<ScalarEvolution>();
      simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE);

      // LCSSA must be performed on the outermost affected loop. The unrolled
      // loop's last loop latch is guaranteed to be in the outermost loop after
      // deleteLoopFromQueue updates LoopInfo.
      Loop *LatchLoop = LI->getLoopFor(Latches.back());
      if (!OuterL->contains(LatchLoop))
        while (OuterL->getParentLoop() != LatchLoop)
          OuterL = OuterL->getParentLoop();

      formLCSSARecursively(*OuterL, *DT, SE);
    }
  }

  return true;
}
Exemplo n.º 5
0
/// Insert code in the prolog code when unrolling a loop with a
/// run-time trip-count.
///
/// This method assumes that the loop unroll factor is total number
/// of loop bodes in the loop after unrolling. (Some folks refer
/// to the unroll factor as the number of *extra* copies added).
/// We assume also that the loop unroll factor is a power-of-two. So, after
/// unrolling the loop, the number of loop bodies executed is 2,
/// 4, 8, etc.  Note - LLVM converts the if-then-sequence to a switch
/// instruction in SimplifyCFG.cpp.  Then, the backend decides how code for
/// the switch instruction is generated.
///
///        extraiters = tripcount % loopfactor
///        if (extraiters == 0) jump Loop:
///        else jump Prol
/// Prol:  LoopBody;
///        extraiters -= 1                 // Omitted if unroll factor is 2.
///        if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2.
///        if (tripcount < loopfactor) jump End
/// Loop:
/// ...
/// End:
///
bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,
                                   bool AllowExpensiveTripCount, LoopInfo *LI,
                                   LPPassManager *LPM) {
    // for now, only unroll loops that contain a single exit
    if (!L->getExitingBlock())
        return false;

    // Make sure the loop is in canonical form, and there is a single
    // exit block only.
    if (!L->isLoopSimplifyForm() || !L->getUniqueExitBlock())
        return false;

    // Use Scalar Evolution to compute the trip count.  This allows more
    // loops to be unrolled than relying on induction var simplification
    if (!LPM)
        return false;
    ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>();
    if (!SE)
        return false;

    // Only unroll loops with a computable trip count and the trip count needs
    // to be an int value (allowing a pointer type is a TODO item)
    const SCEV *BECountSC = SE->getBackedgeTakenCount(L);
    if (isa<SCEVCouldNotCompute>(BECountSC) ||
            !BECountSC->getType()->isIntegerTy())
        return false;

    unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth();

    // Add 1 since the backedge count doesn't include the first loop iteration
    const SCEV *TripCountSC =
        SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));
    if (isa<SCEVCouldNotCompute>(TripCountSC))
        return false;

    BasicBlock *Header = L->getHeader();
    const DataLayout &DL = Header->getModule()->getDataLayout();
    SCEVExpander Expander(*SE, DL, "loop-unroll");
    if (!AllowExpensiveTripCount && Expander.isHighCostExpansion(TripCountSC, L))
        return false;

    // We only handle cases when the unroll factor is a power of 2.
    // Count is the loop unroll factor, the number of extra copies added + 1.
    if (!isPowerOf2_32(Count))
        return false;

    // This constraint lets us deal with an overflowing trip count easily; see the
    // comment on ModVal below.
    if (Log2_32(Count) > BEWidth)
        return false;

    // If this loop is nested, then the loop unroller changes the code in
    // parent loop, so the Scalar Evolution pass needs to be run again
    if (Loop *ParentLoop = L->getParentLoop())
        SE->forgetLoop(ParentLoop);

    // Grab analyses that we preserve.
    auto *DTWP = LPM->getAnalysisIfAvailable<DominatorTreeWrapperPass>();
    auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;

    BasicBlock *PH = L->getLoopPreheader();
    BasicBlock *Latch = L->getLoopLatch();
    // It helps to splits the original preheader twice, one for the end of the
    // prolog code and one for a new loop preheader
    BasicBlock *PEnd = SplitEdge(PH, Header, DT, LI);
    BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), DT, LI);
    BranchInst *PreHeaderBR = cast<BranchInst>(PH->getTerminator());

    // Compute the number of extra iterations required, which is:
    //  extra iterations = run-time trip count % (loop unroll factor + 1)
    Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
                       PreHeaderBR);
    Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(),
                                            PreHeaderBR);

    IRBuilder<> B(PreHeaderBR);
    Value *ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");

    // If ModVal is zero, we know that either
    //  1. there are no iteration to be run in the prologue loop
    // OR
    //  2. the addition computing TripCount overflowed
    //
    // If (2) is true, we know that TripCount really is (1 << BEWidth) and so the
    // number of iterations that remain to be run in the original loop is a
    // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we
    // explicitly check this above).

    Value *BranchVal = B.CreateIsNotNull(ModVal, "lcmp.mod");

    // Branch to either the extra iterations or the cloned/unrolled loop
    // We will fix up the true branch label when adding loop body copies
    B.CreateCondBr(BranchVal, PEnd, PEnd);
    assert(PreHeaderBR->isUnconditional() &&
           PreHeaderBR->getSuccessor(0) == PEnd &&
           "CFG edges in Preheader are not correct");
    PreHeaderBR->eraseFromParent();
    Function *F = Header->getParent();
    // Get an ordered list of blocks in the loop to help with the ordering of the
    // cloned blocks in the prolog code
    LoopBlocksDFS LoopBlocks(L);
    LoopBlocks.perform(LI);

    //
    // For each extra loop iteration, create a copy of the loop's basic blocks
    // and generate a condition that branches to the copy depending on the
    // number of 'left over' iterations.
    //
    std::vector<BasicBlock *> NewBlocks;
    ValueToValueMapTy VMap;

    bool UnrollPrologue = Count == 2;

    // Clone all the basic blocks in the loop. If Count is 2, we don't clone
    // the loop, otherwise we create a cloned loop to execute the extra
    // iterations. This function adds the appropriate CFG connections.
    CloneLoopBlocks(L, ModVal, UnrollPrologue, PH, PEnd, NewBlocks, LoopBlocks,
                    VMap, LI);

    // Insert the cloned blocks into function just before the original loop
    F->getBasicBlockList().splice(PEnd, F->getBasicBlockList(), NewBlocks[0],
                                  F->end());

    // Rewrite the cloned instruction operands to use the values
    // created when the clone is created.
    for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) {
        for (BasicBlock::iterator I = NewBlocks[i]->begin(),
                E = NewBlocks[i]->end();
                I != E; ++I) {
            RemapInstruction(I, VMap,
                             RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
        }
    }

    // Connect the prolog code to the original loop and update the
    // PHI functions.
    BasicBlock *LastLoopBB = cast<BasicBlock>(VMap[Latch]);
    ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap,
                  /*AliasAnalysis*/ nullptr, DT, LI, LPM->getAsPass());
    NumRuntimeUnrolled++;
    return true;
}