Ejemplo n.º 1
0
/// Create a clone of the blocks in a loop and connect them together.
/// This function doesn't create a clone of the loop structure.
///
/// There are two value maps that are defined and used.  VMap is
/// for the values in the current loop instance.  LVMap contains
/// the values from the last loop instance.  We need the LVMap values
/// to update the initial values for the current loop instance.
///
static void CloneLoopBlocks(Loop *L,
                            bool FirstCopy,
                            BasicBlock *InsertTop,
                            BasicBlock *InsertBot,
                            std::vector<BasicBlock *> &NewBlocks,
                            LoopBlocksDFS &LoopBlocks,
                            ValueToValueMapTy &VMap,
                            ValueToValueMapTy &LVMap,
                            LoopInfo *LI) {

  BasicBlock *Preheader = L->getLoopPreheader();
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  Function *F = Header->getParent();
  LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
  LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
  // For each block in the original loop, create a new copy,
  // and update the value map with the newly created values.
  for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
    BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".unr", F);
    NewBlocks.push_back(NewBB);

    if (Loop *ParentLoop = L->getParentLoop())
      ParentLoop->addBasicBlockToLoop(NewBB, LI->getBase());

    VMap[*BB] = NewBB;
    if (Header == *BB) {
      // For the first block, add a CFG connection to this newly
      // created block
      InsertTop->getTerminator()->setSuccessor(0, NewBB);

      // Change the incoming values to the ones defined in the
      // previously cloned loop.
      for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
        PHINode *NewPHI = cast<PHINode>(VMap[I]);
        if (FirstCopy) {
          // We replace the first phi node with the value from the preheader
          VMap[I] = NewPHI->getIncomingValueForBlock(Preheader);
          NewBB->getInstList().erase(NewPHI);
        } else {
          // Update VMap with values from the previous block
          unsigned idx = NewPHI->getBasicBlockIndex(Latch);
          Value *InVal = NewPHI->getIncomingValue(idx);
          if (Instruction *I = dyn_cast<Instruction>(InVal))
            if (L->contains(I))
              InVal = LVMap[InVal];
          NewPHI->setIncomingValue(idx, InVal);
          NewPHI->setIncomingBlock(idx, InsertTop);
        }
      }
    }

    if (Latch == *BB) {
      VMap.erase((*BB)->getTerminator());
      NewBB->getTerminator()->eraseFromParent();
      BranchInst::Create(InsertBot, NewBB);
    }
  }
  // LastValueMap is updated with the values for the current loop
  // which are used the next time this function is called.
  for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
       VI != VE; ++VI) {
    LVMap[VI->first] = VI->second;
  }
}
Ejemplo n.º 2
0
/// Unroll the given loop by Count. The loop must be in LCSSA form. Returns true
/// if unrolling was successful, or false if the loop was unmodified. Unrolling
/// can only fail when the loop's latch block is not terminated by a conditional
/// branch instruction. However, if the trip count (and multiple) are not known,
/// loop unrolling will mostly produce more code that is no faster.
///
/// TripCount is generally defined as the number of times the loop header
/// executes. UnrollLoop relaxes the definition to permit early exits: here
/// TripCount is the iteration on which control exits LatchBlock if no early
/// exits were taken. Note that UnrollLoop assumes that the loop counter test
/// terminates LatchBlock in order to remove unnecesssary instances of the
/// test. In other words, control may exit the loop prior to TripCount
/// iterations via an early branch, but control may not exit the loop from the
/// LatchBlock's terminator prior to TripCount iterations.
///
/// Similarly, TripMultiple divides the number of times that the LatchBlock may
/// execute without exiting the loop.
///
/// The LoopInfo Analysis that is passed will be kept consistent.
///
/// If a LoopPassManager is passed in, and the loop is fully removed, it will be
/// removed from the LoopPassManager as well. LPM can also be NULL.
///
/// This utility preserves LoopInfo. If DominatorTree or ScalarEvolution are
/// available from the Pass it must also preserve those analyses.
bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
                      bool AllowRuntime, unsigned TripMultiple,
                      LoopInfo *LI, Pass *PP, LPPassManager *LPM) {
  BasicBlock *Preheader = L->getLoopPreheader();
  if (!Preheader) {
    DEBUG(dbgs() << "  Can't unroll; loop preheader-insertion failed.\n");
    return false;
  }

  BasicBlock *LatchBlock = L->getLoopLatch();
  if (!LatchBlock) {
    DEBUG(dbgs() << "  Can't unroll; loop exit-block-insertion failed.\n");
    return false;
  }

  // Loops with indirectbr cannot be cloned.
  if (!L->isSafeToClone()) {
    DEBUG(dbgs() << "  Can't unroll; Loop body cannot be cloned.\n");
    return false;
  }

  BasicBlock *Header = L->getHeader();
  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());

  if (!BI || BI->isUnconditional()) {
    // The loop-rotate pass can be helpful to avoid this in many cases.
    DEBUG(dbgs() <<
             "  Can't unroll; loop not terminated by a conditional branch.\n");
    return false;
  }

  if (Header->hasAddressTaken()) {
    // The loop-rotate pass can be helpful to avoid this in many cases.
    DEBUG(dbgs() <<
          "  Won't unroll loop: address of header block is taken.\n");
    return false;
  }

  if (TripCount != 0)
    DEBUG(dbgs() << "  Trip Count = " << TripCount << "\n");
  if (TripMultiple != 1)
    DEBUG(dbgs() << "  Trip Multiple = " << TripMultiple << "\n");

  // Effectively "DCE" unrolled iterations that are beyond the tripcount
  // and will never be executed.
  if (TripCount != 0 && Count > TripCount)
    Count = TripCount;

  // Don't enter the unroll code if there is nothing to do. This way we don't
  // need to support "partial unrolling by 1".
  if (TripCount == 0 && Count < 2)
    return false;

  assert(Count > 0);
  assert(TripMultiple > 0);
  assert(TripCount == 0 || TripCount % TripMultiple == 0);

  // Are we eliminating the loop control altogether?
  bool CompletelyUnroll = Count == TripCount;

  // We assume a run-time trip count if the compiler cannot
  // figure out the loop trip count and the unroll-runtime
  // flag is specified.
  bool RuntimeTripCount = (TripCount == 0 && Count > 0 && AllowRuntime);

  if (RuntimeTripCount && !UnrollRuntimeLoopProlog(L, Count, LI, LPM))
    return false;

  // Notify ScalarEvolution that the loop will be substantially changed,
  // if not outright eliminated.
  if (PP) {
    ScalarEvolution *SE = PP->getAnalysisIfAvailable<ScalarEvolution>();
    if (SE)
      SE->forgetLoop(L);
  }

  // If we know the trip count, we know the multiple...
  unsigned BreakoutTrip = 0;
  if (TripCount != 0) {
    BreakoutTrip = TripCount % Count;
    TripMultiple = 0;
  } else {
    // Figure out what multiple to use.
    BreakoutTrip = TripMultiple =
      (unsigned)GreatestCommonDivisor64(Count, TripMultiple);
  }

  // Report the unrolling decision.
  DebugLoc LoopLoc = L->getStartLoc();
  Function *F = Header->getParent();
  LLVMContext &Ctx = F->getContext();

  if (CompletelyUnroll) {
    DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName()
          << " with trip count " << TripCount << "!\n");
    emitOptimizationRemark(Ctx, DEBUG_TYPE, *F, LoopLoc,
                           Twine("completely unrolled loop with ") +
                               Twine(TripCount) + " iterations");
  } else {
    DEBUG(dbgs() << "UNROLLING loop %" << Header->getName()
          << " by " << Count);
    Twine DiagMsg("unrolled loop by a factor of " + Twine(Count));
    if (TripMultiple == 0 || BreakoutTrip != TripMultiple) {
      DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip);
      DiagMsg.concat(" with a breakout at trip " + Twine(BreakoutTrip));
    } else if (TripMultiple != 1) {
      DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
      DiagMsg.concat(" with " + Twine(TripMultiple) + " trips per branch");
    } else if (RuntimeTripCount) {
      DEBUG(dbgs() << " with run-time trip count");
      DiagMsg.concat(" with run-time trip count");
    }
    DEBUG(dbgs() << "!\n");
    emitOptimizationRemark(Ctx, DEBUG_TYPE, *F, LoopLoc, DiagMsg);
  }

  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);

  // For the first iteration of the loop, we should use the precloned values for
  // PHI nodes.  Insert associations now.
  ValueToValueMapTy LastValueMap;
  std::vector<PHINode*> OrigPHINode;
  for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
    OrigPHINode.push_back(cast<PHINode>(I));
  }

  std::vector<BasicBlock*> Headers;
  std::vector<BasicBlock*> Latches;
  Headers.push_back(Header);
  Latches.push_back(LatchBlock);

  // The current on-the-fly SSA update requires blocks to be processed in
  // reverse postorder so that LastValueMap contains the correct value at each
  // exit.
  LoopBlocksDFS DFS(L);
  DFS.perform(LI);

  // Stash the DFS iterators before adding blocks to the loop.
  LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
  LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();

  for (unsigned It = 1; It != Count; ++It) {
    std::vector<BasicBlock*> NewBlocks;

    for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
      ValueToValueMapTy VMap;
      BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
      Header->getParent()->getBasicBlockList().push_back(New);

      // Loop over all of the PHI nodes in the block, changing them to use the
      // incoming values from the previous block.
      if (*BB == Header)
        for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) {
          PHINode *NewPHI = cast<PHINode>(VMap[OrigPHINode[i]]);
          Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
          if (Instruction *InValI = dyn_cast<Instruction>(InVal))
            if (It > 1 && L->contains(InValI))
              InVal = LastValueMap[InValI];
          VMap[OrigPHINode[i]] = InVal;
          New->getInstList().erase(NewPHI);
        }

      // Update our running map of newest clones
      LastValueMap[*BB] = New;
      for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
           VI != VE; ++VI)
        LastValueMap[VI->first] = VI->second;

      L->addBasicBlockToLoop(New, LI->getBase());

      // Add phi entries for newly created values to all exit blocks.
      for (succ_iterator SI = succ_begin(*BB), SE = succ_end(*BB);
           SI != SE; ++SI) {
        if (L->contains(*SI))
          continue;
        for (BasicBlock::iterator BBI = (*SI)->begin();
             PHINode *phi = dyn_cast<PHINode>(BBI); ++BBI) {
          Value *Incoming = phi->getIncomingValueForBlock(*BB);
          ValueToValueMapTy::iterator It = LastValueMap.find(Incoming);
          if (It != LastValueMap.end())
            Incoming = It->second;
          phi->addIncoming(Incoming, New);
        }
      }
      // Keep track of new headers and latches as we create them, so that
      // we can insert the proper branches later.
      if (*BB == Header)
        Headers.push_back(New);
      if (*BB == LatchBlock)
        Latches.push_back(New);

      NewBlocks.push_back(New);
    }

    // Remap all instructions in the most recent iteration
    for (unsigned i = 0; i < NewBlocks.size(); ++i)
      for (BasicBlock::iterator I = NewBlocks[i]->begin(),
           E = NewBlocks[i]->end(); I != E; ++I)
        ::RemapInstruction(I, LastValueMap);
  }

  // Loop over the PHI nodes in the original block, setting incoming values.
  for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) {
    PHINode *PN = OrigPHINode[i];
    if (CompletelyUnroll) {
      PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
      Header->getInstList().erase(PN);
    }
    else if (Count > 1) {
      Value *InVal = PN->removeIncomingValue(LatchBlock, false);
      // If this value was defined in the loop, take the value defined by the
      // last iteration of the loop.
      if (Instruction *InValI = dyn_cast<Instruction>(InVal)) {
        if (L->contains(InValI))
          InVal = LastValueMap[InVal];
      }
      assert(Latches.back() == LastValueMap[LatchBlock] && "bad last latch");
      PN->addIncoming(InVal, Latches.back());
    }
  }

  // Now that all the basic blocks for the unrolled iterations are in place,
  // set up the branches to connect them.
  for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
    // The original branch was replicated in each unrolled iteration.
    BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());

    // The branch destination.
    unsigned j = (i + 1) % e;
    BasicBlock *Dest = Headers[j];
    bool NeedConditional = true;

    if (RuntimeTripCount && j != 0) {
      NeedConditional = false;
    }

    // For a complete unroll, make the last iteration end with a branch
    // to the exit block.
    if (CompletelyUnroll && j == 0) {
      Dest = LoopExit;
      NeedConditional = false;
    }

    // If we know the trip count or a multiple of it, we can safely use an
    // unconditional branch for some iterations.
    if (j != BreakoutTrip && (TripMultiple == 0 || j % TripMultiple != 0)) {
      NeedConditional = false;
    }

    if (NeedConditional) {
      // Update the conditional branch's successor for the following
      // iteration.
      Term->setSuccessor(!ContinueOnTrue, Dest);
    } else {
      // Remove phi operands at this loop exit
      if (Dest != LoopExit) {
        BasicBlock *BB = Latches[i];
        for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB);
             SI != SE; ++SI) {
          if (*SI == Headers[i])
            continue;
          for (BasicBlock::iterator BBI = (*SI)->begin();
               PHINode *Phi = dyn_cast<PHINode>(BBI); ++BBI) {
            Phi->removeIncomingValue(BB, false);
          }
        }
      }
      // Replace the conditional branch with an unconditional one.
      BranchInst::Create(Dest, Term);
      Term->eraseFromParent();
    }
  }

  // Merge adjacent basic blocks, if possible.
  for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
    BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());
    if (Term->isUnconditional()) {
      BasicBlock *Dest = Term->getSuccessor(0);
      if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI, LPM))
        std::replace(Latches.begin(), Latches.end(), Dest, Fold);
    }
  }

  DominatorTree *DT = nullptr;
  if (PP) {
    // FIXME: Reconstruct dom info, because it is not preserved properly.
    // Incrementally updating domtree after loop unrolling would be easy.
    if (DominatorTreeWrapperPass *DTWP =
            PP->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) {
      DT = &DTWP->getDomTree();
      DT->recalculate(*L->getHeader()->getParent());
    }

    // Simplify any new induction variables in the partially unrolled loop.
    ScalarEvolution *SE = PP->getAnalysisIfAvailable<ScalarEvolution>();
    if (SE && !CompletelyUnroll) {
      SmallVector<WeakVH, 16> DeadInsts;
      simplifyLoopIVs(L, SE, LPM, DeadInsts);

      // Aggressively clean up dead instructions that simplifyLoopIVs already
      // identified. Any remaining should be cleaned up below.
      while (!DeadInsts.empty())
        if (Instruction *Inst =
            dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()))
          RecursivelyDeleteTriviallyDeadInstructions(Inst);
    }
  }
  // At this point, the code is well formed.  We now do a quick sweep over the
  // inserted code, doing constant propagation and dead code elimination as we
  // go.
  const std::vector<BasicBlock*> &NewLoopBlocks = L->getBlocks();
  for (std::vector<BasicBlock*>::const_iterator BB = NewLoopBlocks.begin(),
       BBE = NewLoopBlocks.end(); BB != BBE; ++BB)
    for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; ) {
      Instruction *Inst = I++;

      if (isInstructionTriviallyDead(Inst))
        (*BB)->getInstList().erase(Inst);
      else if (Value *V = SimplifyInstruction(Inst))
        if (LI->replacementPreservesLCSSAForm(Inst, V)) {
          Inst->replaceAllUsesWith(V);
          (*BB)->getInstList().erase(Inst);
        }
    }

  NumCompletelyUnrolled += CompletelyUnroll;
  ++NumUnrolled;

  Loop *OuterL = L->getParentLoop();
  // Remove the loop from the LoopPassManager if it's completely removed.
  if (CompletelyUnroll && LPM != nullptr)
    LPM->deleteLoopFromQueue(L);

  // If we have a pass and a DominatorTree we should re-simplify impacted loops
  // to ensure subsequent analyses can rely on this form. We want to simplify
  // at least one layer outside of the loop that was unrolled so that any
  // changes to the parent loop exposed by the unrolling are considered.
  if (PP && DT) {
    if (!OuterL && !CompletelyUnroll)
      OuterL = L;
    if (OuterL) {
      ScalarEvolution *SE = PP->getAnalysisIfAvailable<ScalarEvolution>();
      simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE);

      // LCSSA must be performed on the outermost affected loop. The unrolled
      // loop's last loop latch is guaranteed to be in the outermost loop after
      // deleteLoopFromQueue updates LoopInfo.
      Loop *LatchLoop = LI->getLoopFor(Latches.back());
      if (!OuterL->contains(LatchLoop))
        while (OuterL->getParentLoop() != LatchLoop)
          OuterL = OuterL->getParentLoop();

      formLCSSARecursively(*OuterL, *DT, SE);
    }
  }

  return true;
}
Ejemplo n.º 3
0
/// Unroll the given loop by Count. The loop must be in LCSSA form. Returns true
/// if unrolling was succesful, or false if the loop was unmodified. Unrolling
/// can only fail when the loop's latch block is not terminated by a conditional
/// branch instruction. However, if the trip count (and multiple) are not known,
/// loop unrolling will mostly produce more code that is no faster.
///
/// The LoopInfo Analysis that is passed will be kept consistent.
///
/// If a LoopPassManager is passed in, and the loop is fully removed, it will be
/// removed from the LoopPassManager as well. LPM can also be NULL.
bool llvm::UnrollLoop(Loop *L, unsigned Count, LoopInfo* LI, LPPassManager* LPM) {
    BasicBlock *Preheader = L->getLoopPreheader();
    if (!Preheader) {
        DEBUG(dbgs() << "  Can't unroll; loop preheader-insertion failed.\n");
        return false;
    }

    BasicBlock *LatchBlock = L->getLoopLatch();
    if (!LatchBlock) {
        DEBUG(dbgs() << "  Can't unroll; loop exit-block-insertion failed.\n");
        return false;
    }

    BasicBlock *Header = L->getHeader();
    BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());

    if (!BI || BI->isUnconditional()) {
        // The loop-rotate pass can be helpful to avoid this in many cases.
        DEBUG(dbgs() <<
              "  Can't unroll; loop not terminated by a conditional branch.\n");
        return false;
    }

    // Notify ScalarEvolution that the loop will be substantially changed,
    // if not outright eliminated.
    if (ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>())
        SE->forgetLoop(L);

    // Find trip count
    unsigned TripCount = L->getSmallConstantTripCount();
    // Find trip multiple if count is not available
    unsigned TripMultiple = 1;
    if (TripCount == 0)
        TripMultiple = L->getSmallConstantTripMultiple();

    if (TripCount != 0)
        DEBUG(dbgs() << "  Trip Count = " << TripCount << "\n");
    if (TripMultiple != 1)
        DEBUG(dbgs() << "  Trip Multiple = " << TripMultiple << "\n");

    // Effectively "DCE" unrolled iterations that are beyond the tripcount
    // and will never be executed.
    if (TripCount != 0 && Count > TripCount)
        Count = TripCount;

    assert(Count > 0);
    assert(TripMultiple > 0);
    assert(TripCount == 0 || TripCount % TripMultiple == 0);

    // Are we eliminating the loop control altogether?
    bool CompletelyUnroll = Count == TripCount;

    // If we know the trip count, we know the multiple...
    unsigned BreakoutTrip = 0;
    if (TripCount != 0) {
        BreakoutTrip = TripCount % Count;
        TripMultiple = 0;
    } else {
        // Figure out what multiple to use.
        BreakoutTrip = TripMultiple =
                           (unsigned)GreatestCommonDivisor64(Count, TripMultiple);
    }

    if (CompletelyUnroll) {
        DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName()
              << " with trip count " << TripCount << "!\n");
    } else {
        DEBUG(dbgs() << "UNROLLING loop %" << Header->getName()
              << " by " << Count);
        if (TripMultiple == 0 || BreakoutTrip != TripMultiple) {
            DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip);
        } else if (TripMultiple != 1) {
            DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
        }
        DEBUG(dbgs() << "!\n");
    }

    std::vector<BasicBlock*> LoopBlocks = L->getBlocks();

    bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
    BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);

    // For the first iteration of the loop, we should use the precloned values for
    // PHI nodes.  Insert associations now.
    typedef ValueMap<const Value*, Value*> ValueToValueMapTy;
    ValueToValueMapTy LastValueMap;
    std::vector<PHINode*> OrigPHINode;
    for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
        PHINode *PN = cast<PHINode>(I);
        OrigPHINode.push_back(PN);
        if (Instruction *I =
                    dyn_cast<Instruction>(PN->getIncomingValueForBlock(LatchBlock)))
            if (L->contains(I))
                LastValueMap[I] = I;
    }

    std::vector<BasicBlock*> Headers;
    std::vector<BasicBlock*> Latches;
    Headers.push_back(Header);
    Latches.push_back(LatchBlock);

    for (unsigned It = 1; It != Count; ++It) {
        std::vector<BasicBlock*> NewBlocks;

        for (std::vector<BasicBlock*>::iterator BB = LoopBlocks.begin(),
                E = LoopBlocks.end(); BB != E; ++BB) {
            ValueToValueMapTy VMap;
            BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
            Header->getParent()->getBasicBlockList().push_back(New);

            // Loop over all of the PHI nodes in the block, changing them to use the
            // incoming values from the previous block.
            if (*BB == Header)
                for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) {
                    PHINode *NewPHI = cast<PHINode>(VMap[OrigPHINode[i]]);
                    Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
                    if (Instruction *InValI = dyn_cast<Instruction>(InVal))
                        if (It > 1 && L->contains(InValI))
                            InVal = LastValueMap[InValI];
                    VMap[OrigPHINode[i]] = InVal;
                    New->getInstList().erase(NewPHI);
                }

            // Update our running map of newest clones
            LastValueMap[*BB] = New;
            for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
                    VI != VE; ++VI)
                LastValueMap[VI->first] = VI->second;

            L->addBasicBlockToLoop(New, LI->getBase());

            // Add phi entries for newly created values to all exit blocks except
            // the successor of the latch block.  The successor of the exit block will
            // be updated specially after unrolling all the way.
            if (*BB != LatchBlock)
                for (Value::use_iterator UI = (*BB)->use_begin(), UE = (*BB)->use_end();
                        UI != UE;) {
                    Instruction *UseInst = cast<Instruction>(*UI);
                    ++UI;
                    if (isa<PHINode>(UseInst) && !L->contains(UseInst)) {
                        PHINode *phi = cast<PHINode>(UseInst);
                        Value *Incoming = phi->getIncomingValueForBlock(*BB);
                        phi->addIncoming(Incoming, New);
                    }
                }

            // Keep track of new headers and latches as we create them, so that
            // we can insert the proper branches later.
            if (*BB == Header)
                Headers.push_back(New);
            if (*BB == LatchBlock) {
                Latches.push_back(New);

                // Also, clear out the new latch's back edge so that it doesn't look
                // like a new loop, so that it's amenable to being merged with adjacent
                // blocks later on.
                TerminatorInst *Term = New->getTerminator();
                assert(L->contains(Term->getSuccessor(!ContinueOnTrue)));
                assert(Term->getSuccessor(ContinueOnTrue) == LoopExit);
                Term->setSuccessor(!ContinueOnTrue, NULL);
            }

            NewBlocks.push_back(New);
        }

        // Remap all instructions in the most recent iteration
        for (unsigned i = 0; i < NewBlocks.size(); ++i)
            for (BasicBlock::iterator I = NewBlocks[i]->begin(),
                    E = NewBlocks[i]->end(); I != E; ++I)
                RemapInstruction(I, LastValueMap);
    }

    // The latch block exits the loop.  If there are any PHI nodes in the
    // successor blocks, update them to use the appropriate values computed as the
    // last iteration of the loop.
    if (Count != 1) {
        SmallPtrSet<PHINode*, 8> Users;
        for (Value::use_iterator UI = LatchBlock->use_begin(),
                UE = LatchBlock->use_end(); UI != UE; ++UI)
            if (PHINode *phi = dyn_cast<PHINode>(*UI))
                Users.insert(phi);

        BasicBlock *LastIterationBB = cast<BasicBlock>(LastValueMap[LatchBlock]);
        for (SmallPtrSet<PHINode*,8>::iterator SI = Users.begin(), SE = Users.end();
                SI != SE; ++SI) {
            PHINode *PN = *SI;
            Value *InVal = PN->removeIncomingValue(LatchBlock, false);
            // If this value was defined in the loop, take the value defined by the
            // last iteration of the loop.
            if (Instruction *InValI = dyn_cast<Instruction>(InVal)) {
                if (L->contains(InValI))
                    InVal = LastValueMap[InVal];
            }
            PN->addIncoming(InVal, LastIterationBB);
        }
    }

    // Now, if we're doing complete unrolling, loop over the PHI nodes in the
    // original block, setting them to their incoming values.
    if (CompletelyUnroll) {
        BasicBlock *Preheader = L->getLoopPreheader();
        for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) {
            PHINode *PN = OrigPHINode[i];
            PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
            Header->getInstList().erase(PN);
        }
    }

    // Now that all the basic blocks for the unrolled iterations are in place,
    // set up the branches to connect them.
    for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
        // The original branch was replicated in each unrolled iteration.
        BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());

        // The branch destination.
        unsigned j = (i + 1) % e;
        BasicBlock *Dest = Headers[j];
        bool NeedConditional = true;

        // For a complete unroll, make the last iteration end with a branch
        // to the exit block.
        if (CompletelyUnroll && j == 0) {
            Dest = LoopExit;
            NeedConditional = false;
        }

        // If we know the trip count or a multiple of it, we can safely use an
        // unconditional branch for some iterations.
        if (j != BreakoutTrip && (TripMultiple == 0 || j % TripMultiple != 0)) {
            NeedConditional = false;
        }

        if (NeedConditional) {
            // Update the conditional branch's successor for the following
            // iteration.
            Term->setSuccessor(!ContinueOnTrue, Dest);
        } else {
            Term->setUnconditionalDest(Dest);
            // Merge adjacent basic blocks, if possible.
            if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI)) {
                std::replace(Latches.begin(), Latches.end(), Dest, Fold);
                std::replace(Headers.begin(), Headers.end(), Dest, Fold);
            }
        }
    }

    // At this point, the code is well formed.  We now do a quick sweep over the
    // inserted code, doing constant propagation and dead code elimination as we
    // go.
    const std::vector<BasicBlock*> &NewLoopBlocks = L->getBlocks();
    for (std::vector<BasicBlock*>::const_iterator BB = NewLoopBlocks.begin(),
            BBE = NewLoopBlocks.end(); BB != BBE; ++BB)
        for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; ) {
            Instruction *Inst = I++;

            if (isInstructionTriviallyDead(Inst))
                (*BB)->getInstList().erase(Inst);
            else if (Constant *C = ConstantFoldInstruction(Inst)) {
                Inst->replaceAllUsesWith(C);
                (*BB)->getInstList().erase(Inst);
            }
        }

    NumCompletelyUnrolled += CompletelyUnroll;
    ++NumUnrolled;
    // Remove the loop from the LoopPassManager if it's completely removed.
    if (CompletelyUnroll && LPM != NULL)
        LPM->deleteLoopFromQueue(L);

    return true;
}
Ejemplo n.º 4
0
// CreateOneLoop should take the current Loop 
Loop * SplitPass2::CreateOneLoop(Loop *L, std::map<Instruction*, Instruction*>  &instMap) {
    DEBUG(dbgs() << "Running CreateOneLoop\n");
    // TODO check if only one exit block.
    // TODO check if save to clone.

    BasicBlock *Header = L->getHeader();
    std::vector< BasicBlock * > body = L->getBlocks();
    BasicBlock *PreHeader = L->getLoopPredecessor(); 

    BasicBlock *Exit = L->getExitBlock(); 
    ValueToValueMapTy LastValueMap;
    std::vector<BasicBlock *> new_body;

    for(std::vector<BasicBlock * >::iterator it = body.begin(); it != body.end(); ++it) {
        ValueToValueMapTy VMap;
        BasicBlock *New = CloneBasicBlock(*it, VMap, ".copied");
        Header->getParent()->getBasicBlockList().push_back(New);
        new_body.push_back(New);

        // add to instMap
        std::vector<Instruction *> new_insts;
        for (BasicBlock::iterator I = New->begin(); I != New->end(); ++I) {
            new_insts.push_back(I);
        }
        std::vector<Instruction *> old_insts;
        for (BasicBlock::iterator I = (*it)->begin(); I != (*it)->end(); ++I) {
            old_insts.push_back(I);
        }

        for (unsigned i=0; i< new_insts.size(); ++i) {
            instMap[new_insts[i]] = old_insts[i];
        }

        // Update our running map of newest clones
        LastValueMap[*it] = New;
        for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end(); VI != VE; ++VI) {
            LastValueMap[VI->first] = VI->second;
        }
    }
    // new_body[0] is Header new_body(new_body.size()-1) is Latch
    
    // change vars
    for(unsigned i=0; i<new_body.size(); ++i) {
        for (BasicBlock::iterator I = new_body[i]->begin(); I != new_body[i]->end(); ++I) {
            remapInstruction(I, LastValueMap);
        }
    }
    DEBUG(dbgs() << "Cloned loop's register name changed ok\n");

    // fix the phi node in cloned Loops Header  
    for (BasicBlock::iterator I = new_body[0]->begin(); isa<PHINode>(I); ++I) {
        PHINode *NewPHI = cast<PHINode>(I); 

        for (unsigned i=0; i<NewPHI->getNumIncomingValues(); ++i) {
            if (NewPHI->getIncomingBlock(i) == PreHeader) {
                NewPHI->setIncomingBlock(i, Header);
            }
        }
    }
    DEBUG(dbgs() << "Start working on exit block \n");
    // fix the phi node in original Loop's exit BB. Because the cloned loop points to it.
    for (BasicBlock::iterator I = Exit->begin(); isa<PHINode>(I); ++I) {
        PHINode *NewPHI = cast<PHINode>(I);

        for (unsigned i=0; i<NewPHI->getNumIncomingValues(); ++i) {
            if (NewPHI->getIncomingBlock(i) == Header) {
                NewPHI->setIncomingBlock(i, new_body[0]); // cloned loop's Header
            }
        }
        DEBUG(dbgs() << "Exit block, Done phi node: "<<  NewPHI << "\n");
    }

    DEBUG(dbgs() << "All Phi Node done \n");



    BranchInst *new_back_edge = cast<BranchInst>(new_body[new_body.size()-1]->getTerminator());   // Latch's last inst is branch
    new_back_edge->setSuccessor(0, new_body[0]); // Set to branch to new Cond (Header) 1st BB

    // link new loop body together
    for(unsigned i=0; i<new_body.size()-1; i++) {
        BranchInst *next = cast<BranchInst>(new_body[i]->getTerminator());   
        next->setSuccessor(0, new_body[i+1]);
    }

    // first cond exit points to second cond block
    BranchInst *first_loop_to_next_loop = cast<BranchInst>(Header->getTerminator());
    for(unsigned i=0; i< first_loop_to_next_loop->getNumSuccessors(); ++i) { 
        if (first_loop_to_next_loop->getSuccessor(i) == Exit) {
            first_loop_to_next_loop->setSuccessor(i, new_body[0]);
        }
    }

    Loop *new_loop = new Loop();

    for (unsigned i=0; i<new_body.size();i++) {
        new_loop->addBlockEntry(new_body[i]);
    }
    new_loop->moveToHeader(new_body[0]); // set Header for new loop 
    
    return new_loop;
}
Ejemplo n.º 5
0
// MakeFunctionClone - If the specified function needs to be modified for pool
// allocation support, make a clone of it, adding additional arguments as
// necessary, and return it.  If not, just return null.
//
Function* RTAssociate::MakeFunctionClone(Function &F, FuncInfo& FI, DSGraph* G) {
  if (G->node_begin() == G->node_end()) return 0;

  if (FI.ArgNodes.empty())
    return 0; // No need to clone if no pools need to be passed in!

  // Update statistics..
  NumArgsAdded += FI.ArgNodes.size();
  if (MaxArgsAdded < FI.ArgNodes.size()) MaxArgsAdded = FI.ArgNodes.size();
  ++NumCloned;

  // Figure out what the arguments are to be for the new version of the
  // function
  FunctionType *OldFuncTy = F.getFunctionType();
  std::vector<Type*> ArgTys(FI.ArgNodes.size(), PoolDescPtrTy);
  ArgTys.reserve(OldFuncTy->getNumParams() + FI.ArgNodes.size());

  ArgTys.insert(ArgTys.end(), OldFuncTy->param_begin(), OldFuncTy->param_end());

  // Create the new function prototype
  FunctionType *FuncTy = FunctionType::get(OldFuncTy->getReturnType(), ArgTys,
                                           OldFuncTy->isVarArg());
  // Create the new function...
  Function *New = Function::Create(FuncTy, Function::InternalLinkage, F.getName());
  New->copyAttributesFrom(&F);
  F.getParent()->getFunctionList().insert(&F, New);

  // Set the rest of the new arguments names to be PDa<n> and add entries to the
  // pool descriptors map
  Function::arg_iterator NI = New->arg_begin();
  for (unsigned i = 0, e = FI.ArgNodes.size(); i != e; ++i, ++NI) {
    FI.PoolDescriptors[FI.ArgNodes[i]] = CreateArgPool(FI.ArgNodes[i], NI);
    NI->setName("PDa");
  }

  // Map the existing arguments of the old function to the corresponding
  // arguments of the new function, and copy over the names.
  ValueToValueMapTy ValueMap;
  for (Function::arg_iterator I = F.arg_begin();
          NI != New->arg_end(); ++I, ++NI) {
    ValueMap[I] = NI;
    NI->setName(I->getName());
  }

  // Perform the cloning.
  SmallVector<ReturnInst*,100> Returns;
  // TODO: review the boolean flag here
  CloneFunctionInto(New, &F, ValueMap, true, Returns);

  //
  // The CloneFunctionInto() function will copy the parameter attributes
  // verbatim.  This is incorrect; each attribute should be shifted one so
  // that the pool descriptor has no attributes.
  //
  const AttributeSet OldAttrs = New->getAttributes();
  if (!OldAttrs.isEmpty()) {
    AttributeSet NewAttrs;
    for (unsigned index = 0; index < OldAttrs.getNumSlots(); ++index) {
      const AttributeSet & PAWI = OldAttrs.getSlotAttributes(index);
      unsigned argIndex = OldAttrs.getSlotIndex(index);

      // If it's not the return value, move the attribute to the next
      // parameter.
      if (argIndex) ++argIndex;

      // Add the parameter to the new list.
      NewAttrs = NewAttrs.addAttributes(F.getContext(), argIndex, PAWI);
    }

    // Assign the new attributes to the function clone
    New->setAttributes(NewAttrs);
  }

  for (ValueToValueMapTy::iterator I = ValueMap.begin(),
          E = ValueMap.end(); I != E; ++I)
    FI.NewToOldValueMap.insert(std::make_pair(I->second, const_cast<Value*>(I->first)));

  return FI.Clone = New;
}
Ejemplo n.º 6
0
/*
  This method performs Unroll and Jam. For a simple loop like:
  for (i = ..)
    Fore(i)
    for (j = ..)
      SubLoop(i, j)
    Aft(i)

  Instead of doing normal inner or outer unrolling, we do:
  for (i = .., i+=2)
    Fore(i)
    Fore(i+1)
    for (j = ..)
      SubLoop(i, j)
      SubLoop(i+1, j)
    Aft(i)
    Aft(i+1)

  So the outer loop is essetially unrolled and then the inner loops are fused
  ("jammed") together into a single loop. This can increase speed when there
  are loads in SubLoop that are invariant to i, as they become shared between
  the now jammed inner loops.

  We do this by spliting the blocks in the loop into Fore, Subloop and Aft.
  Fore blocks are those before the inner loop, Aft are those after. Normal
  Unroll code is used to copy each of these sets of blocks and the results are
  combined together into the final form above.

  isSafeToUnrollAndJam should be used prior to calling this to make sure the
  unrolling will be valid. Checking profitablility is also advisable.
*/
LoopUnrollResult
llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
                       unsigned TripMultiple, bool UnrollRemainder,
                       LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
                       AssumptionCache *AC, OptimizationRemarkEmitter *ORE) {

  // When we enter here we should have already checked that it is safe
  BasicBlock *Header = L->getHeader();
  assert(L->getSubLoops().size() == 1);
  Loop *SubLoop = *L->begin();

  // Don't enter the unroll code if there is nothing to do.
  if (TripCount == 0 && Count < 2) {
    LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
    return LoopUnrollResult::Unmodified;
  }

  assert(Count > 0);
  assert(TripMultiple > 0);
  assert(TripCount == 0 || TripCount % TripMultiple == 0);

  // Are we eliminating the loop control altogether?
  bool CompletelyUnroll = (Count == TripCount);

  // We use the runtime remainder in cases where we don't know trip multiple
  if (TripMultiple == 1 || TripMultiple % Count != 0) {
    if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
                                    /*UseEpilogRemainder*/ true,
                                    UnrollRemainder, LI, SE, DT, AC, true)) {
      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
                           "generated when assuming runtime trip count\n");
      return LoopUnrollResult::Unmodified;
    }
  }

  // Notify ScalarEvolution that the loop will be substantially changed,
  // if not outright eliminated.
  if (SE) {
    SE->forgetLoop(L);
    SE->forgetLoop(SubLoop);
  }

  using namespace ore;
  // Report the unrolling decision.
  if (CompletelyUnroll) {
    LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
                      << Header->getName() << " with trip count " << TripCount
                      << "!\n");
    ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
                                 L->getHeader())
              << "completely unroll and jammed loop with "
              << NV("UnrollCount", TripCount) << " iterations");
  } else {
    auto DiagBuilder = [&]() {
      OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
                              L->getHeader());
      return Diag << "unroll and jammed loop by a factor of "
                  << NV("UnrollCount", Count);
    };

    LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
                      << " by " << Count);
    if (TripMultiple != 1) {
      LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
      ORE->emit([&]() {
        return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
                             << " trips per branch";
      });
    } else {
      LLVM_DEBUG(dbgs() << " with run-time trip count");
      ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
    }
    LLVM_DEBUG(dbgs() << "!\n");
  }

  BasicBlock *Preheader = L->getLoopPreheader();
  BasicBlock *LatchBlock = L->getLoopLatch();
  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
  assert(Preheader && LatchBlock && Header);
  assert(BI && !BI->isUnconditional());
  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
  bool SubLoopContinueOnTrue = SubLoop->contains(
      SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));

  // Partition blocks in an outer/inner loop pair into blocks before and after
  // the loop
  BasicBlockSet SubLoopBlocks;
  BasicBlockSet ForeBlocks;
  BasicBlockSet AftBlocks;
  partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
                           DT);

  // We keep track of the entering/first and exiting/last block of each of
  // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
  // blocks easier.
  std::vector<BasicBlock *> ForeBlocksFirst;
  std::vector<BasicBlock *> ForeBlocksLast;
  std::vector<BasicBlock *> SubLoopBlocksFirst;
  std::vector<BasicBlock *> SubLoopBlocksLast;
  std::vector<BasicBlock *> AftBlocksFirst;
  std::vector<BasicBlock *> AftBlocksLast;
  ForeBlocksFirst.push_back(Header);
  ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
  SubLoopBlocksFirst.push_back(SubLoop->getHeader());
  SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
  AftBlocksFirst.push_back(SubLoop->getExitBlock());
  AftBlocksLast.push_back(L->getExitingBlock());
  // Maps Blocks[0] -> Blocks[It]
  ValueToValueMapTy LastValueMap;

  // Move any instructions from fore phi operands from AftBlocks into Fore.
  moveHeaderPhiOperandsToForeBlocks(
      Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
      AftBlocks);

  // The current on-the-fly SSA update requires blocks to be processed in
  // reverse postorder so that LastValueMap contains the correct value at each
  // exit.
  LoopBlocksDFS DFS(L);
  DFS.perform(LI);
  // Stash the DFS iterators before adding blocks to the loop.
  LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
  LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();

  if (Header->getParent()->isDebugInfoForProfiling())
    for (BasicBlock *BB : L->getBlocks())
      for (Instruction &I : *BB)
        if (!isa<DbgInfoIntrinsic>(&I))
          if (const DILocation *DIL = I.getDebugLoc())
            I.setDebugLoc(DIL->cloneWithDuplicationFactor(Count));

  // Copy all blocks
  for (unsigned It = 1; It != Count; ++It) {
    std::vector<BasicBlock *> NewBlocks;
    // Maps Blocks[It] -> Blocks[It-1]
    DenseMap<Value *, Value *> PrevItValueMap;

    for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
      ValueToValueMapTy VMap;
      BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
      Header->getParent()->getBasicBlockList().push_back(New);

      if (ForeBlocks.count(*BB)) {
        L->addBasicBlockToLoop(New, *LI);

        if (*BB == ForeBlocksFirst[0])
          ForeBlocksFirst.push_back(New);
        if (*BB == ForeBlocksLast[0])
          ForeBlocksLast.push_back(New);
      } else if (SubLoopBlocks.count(*BB)) {
        SubLoop->addBasicBlockToLoop(New, *LI);

        if (*BB == SubLoopBlocksFirst[0])
          SubLoopBlocksFirst.push_back(New);
        if (*BB == SubLoopBlocksLast[0])
          SubLoopBlocksLast.push_back(New);
      } else if (AftBlocks.count(*BB)) {
        L->addBasicBlockToLoop(New, *LI);

        if (*BB == AftBlocksFirst[0])
          AftBlocksFirst.push_back(New);
        if (*BB == AftBlocksLast[0])
          AftBlocksLast.push_back(New);
      } else {
        llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
      }

      // Update our running maps of newest clones
      PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
      LastValueMap[*BB] = New;
      for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
           VI != VE; ++VI) {
        PrevItValueMap[VI->second] =
            const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
        LastValueMap[VI->first] = VI->second;
      }

      NewBlocks.push_back(New);

      // Update DomTree:
      if (*BB == ForeBlocksFirst[0])
        DT->addNewBlock(New, ForeBlocksLast[It - 1]);
      else if (*BB == SubLoopBlocksFirst[0])
        DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
      else if (*BB == AftBlocksFirst[0])
        DT->addNewBlock(New, AftBlocksLast[It - 1]);
      else {
        // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
        // structure.
        auto BBDomNode = DT->getNode(*BB);
        auto BBIDom = BBDomNode->getIDom();
        BasicBlock *OriginalBBIDom = BBIDom->getBlock();
        assert(OriginalBBIDom);
        assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
        DT->addNewBlock(
            New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
      }
    }

    // Remap all instructions in the most recent iteration
    for (BasicBlock *NewBlock : NewBlocks) {
      for (Instruction &I : *NewBlock) {
        ::remapInstruction(&I, LastValueMap);
        if (auto *II = dyn_cast<IntrinsicInst>(&I))
          if (II->getIntrinsicID() == Intrinsic::assume)
            AC->registerAssumption(II);
      }
    }

    // Alter the ForeBlocks phi's, pointing them at the latest version of the
    // value from the previous iteration's phis
    for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
      Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
      assert(OldValue && "should have incoming edge from Aft[It]");
      Value *NewValue = OldValue;
      if (Value *PrevValue = PrevItValueMap[OldValue])
        NewValue = PrevValue;

      assert(Phi.getNumOperands() == 2);
      Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
      Phi.setIncomingValue(0, NewValue);
      Phi.removeIncomingValue(1);
    }
  }

  // Now that all the basic blocks for the unrolled iterations are in place,
  // finish up connecting the blocks and phi nodes. At this point LastValueMap
  // is the last unrolled iterations values.

  // Update Phis in BB from OldBB to point to NewBB
  auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
                            BasicBlock *NewBB) {
    for (PHINode &Phi : BB->phis()) {
      int I = Phi.getBasicBlockIndex(OldBB);
      Phi.setIncomingBlock(I, NewBB);
    }
  };
  // Update Phis in BB from OldBB to point to NewBB and use the latest value
  // from LastValueMap
  auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
                                     BasicBlock *NewBB,
                                     ValueToValueMapTy &LastValueMap) {
    for (PHINode &Phi : BB->phis()) {
      for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
        if (Phi.getIncomingBlock(b) == OldBB) {
          Value *OldValue = Phi.getIncomingValue(b);
          if (Value *LastValue = LastValueMap[OldValue])
            Phi.setIncomingValue(b, LastValue);
          Phi.setIncomingBlock(b, NewBB);
          break;
        }
      }
    }
  };
  // Move all the phis from Src into Dest
  auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
    Instruction *insertPoint = Dest->getFirstNonPHI();
    while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
      Phi->moveBefore(insertPoint);
  };

  // Update the PHI values outside the loop to point to the last block
  updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
                           LastValueMap);

  // Update ForeBlocks successors and phi nodes
  BranchInst *ForeTerm =
      cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
  BasicBlock *Dest = SubLoopBlocksFirst[0];
  ForeTerm->setSuccessor(0, Dest);

  if (CompletelyUnroll) {
    while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
      Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
      Phi->getParent()->getInstList().erase(Phi);
    }
  } else {
    // Update the PHI values to point to the last aft block
    updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
                             AftBlocksLast.back(), LastValueMap);
  }

  for (unsigned It = 1; It != Count; It++) {
    // Remap ForeBlock successors from previous iteration to this
    BranchInst *ForeTerm =
        cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
    BasicBlock *Dest = ForeBlocksFirst[It];
    ForeTerm->setSuccessor(0, Dest);
  }

  // Subloop successors and phis
  BranchInst *SubTerm =
      cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
  SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
  SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
  updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
                  ForeBlocksLast.back());
  updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
                  SubLoopBlocksLast.back());

  for (unsigned It = 1; It != Count; It++) {
    // Replace the conditional branch of the previous iteration subloop with an
    // unconditional one to this one
    BranchInst *SubTerm =
        cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
    BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
    SubTerm->eraseFromParent();

    updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
                    ForeBlocksLast.back());
    updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
                    SubLoopBlocksLast.back());
    movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
  }

  // Aft blocks successors and phis
  BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
  if (CompletelyUnroll) {
    BranchInst::Create(LoopExit, Term);
    Term->eraseFromParent();
  } else {
    Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
  }
  updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
                  SubLoopBlocksLast.back());

  for (unsigned It = 1; It != Count; It++) {
    // Replace the conditional branch of the previous iteration subloop with an
    // unconditional one to this one
    BranchInst *AftTerm =
        cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
    BranchInst::Create(AftBlocksFirst[It], AftTerm);
    AftTerm->eraseFromParent();

    updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
                    SubLoopBlocksLast.back());
    movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
  }

  // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
  // new ones required.
  if (Count != 1) {
    SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
                           SubLoopBlocksFirst[0]);
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
                           SubLoopBlocksLast[0], AftBlocksFirst[0]);

    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
                           ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
                           SubLoopBlocksLast.back(), AftBlocksFirst[0]);
    DT->applyUpdates(DTUpdates);
  }

  // Merge adjacent basic blocks, if possible.
  SmallPtrSet<BasicBlock *, 16> MergeBlocks;
  MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
  MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
  MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
  while (!MergeBlocks.empty()) {
    BasicBlock *BB = *MergeBlocks.begin();
    BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
    if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
      BasicBlock *Dest = Term->getSuccessor(0);
      if (BasicBlock *Fold = foldBlockIntoPredecessor(Dest, LI, SE, DT)) {
        // Don't remove BB and add Fold as they are the same BB
        assert(Fold == BB);
        (void)Fold;
        MergeBlocks.erase(Dest);
      } else
        MergeBlocks.erase(BB);
    } else
      MergeBlocks.erase(BB);
  }

  // At this point, the code is well formed.  We now do a quick sweep over the
  // inserted code, doing constant propagation and dead code elimination as we
  // go.
  simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
  simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);

  NumCompletelyUnrolledAndJammed += CompletelyUnroll;
  ++NumUnrolledAndJammed;

#ifndef NDEBUG
  // We shouldn't have done anything to break loop simplify form or LCSSA.
  Loop *OuterL = L->getParentLoop();
  Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
  assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
  if (!CompletelyUnroll)
    assert(L->isLoopSimplifyForm());
  assert(SubLoop->isLoopSimplifyForm());
  assert(DT->verify());
#endif

  // Update LoopInfo if the loop is completely removed.
  if (CompletelyUnroll)
    LI->erase(L);

  return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
                          : LoopUnrollResult::PartiallyUnrolled;
}