static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE) { bool Changed = false; // Copy blocks into a temporary array to avoid iterator invalidation issues // as we remove them. SmallVector<WeakTrackingVH, 16> Blocks(L.blocks()); for (auto &Block : Blocks) { // Attempt to merge blocks in the trivial case. Don't modify blocks which // belong to other loops. BasicBlock *Succ = cast_or_null<BasicBlock>(Block); if (!Succ) continue; BasicBlock *Pred = Succ->getSinglePredecessor(); if (!Pred || !Pred->getSingleSuccessor() || LI.getLoopFor(Pred) != &L) continue; // Merge Succ into Pred and delete it. MergeBlockIntoPredecessor(Succ, &DT, &LI); SE.forgetLoop(&L); Changed = true; } return Changed; }
/// Insert code in the prolog code when unrolling a loop with a /// run-time trip-count. /// /// This method assumes that the loop unroll factor is total number /// of loop bodes in the loop after unrolling. (Some folks refer /// to the unroll factor as the number of *extra* copies added). /// We assume also that the loop unroll factor is a power-of-two. So, after /// unrolling the loop, the number of loop bodies executed is 2, /// 4, 8, etc. Note - LLVM converts the if-then-sequence to a switch /// instruction in SimplifyCFG.cpp. Then, the backend decides how code for /// the switch instruction is generated. /// /// extraiters = tripcount % loopfactor /// if (extraiters == 0) jump Loop: /// if (extraiters == loopfactor) jump L1 /// if (extraiters == loopfactor-1) jump L2 /// ... /// L1: LoopBody; /// L2: LoopBody; /// ... /// if tripcount < loopfactor jump End /// Loop: /// ... /// End: /// bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, LPPassManager *LPM) { // for now, only unroll loops that contain a single exit if (!L->getExitingBlock()) return false; // Make sure the loop is in canonical form, and there is a single // exit block only. if (!L->isLoopSimplifyForm() || !L->getUniqueExitBlock()) return false; // Use Scalar Evolution to compute the trip count. This allows more // loops to be unrolled than relying on induction var simplification if (!LPM) return false; ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); if (!SE) return false; // Only unroll loops with a computable trip count and the trip count needs // to be an int value (allowing a pointer type is a TODO item) const SCEV *BECount = SE->getBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(BECount) || !BECount->getType()->isIntegerTy()) return false; // Add 1 since the backedge count doesn't include the first loop iteration const SCEV *TripCountSC = SE->getAddExpr(BECount, SE->getConstant(BECount->getType(), 1)); if (isa<SCEVCouldNotCompute>(TripCountSC)) return false; // We only handle cases when the unroll factor is a power of 2. // Count is the loop unroll factor, the number of extra copies added + 1. if ((Count & (Count-1)) != 0) return false; // If this loop is nested, then the loop unroller changes the code in // parent loop, so the Scalar Evolution pass needs to be run again if (Loop *ParentLoop = L->getParentLoop()) SE->forgetLoop(ParentLoop); BasicBlock *PH = L->getLoopPreheader(); BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); // It helps to splits the original preheader twice, one for the end of the // prolog code and one for a new loop preheader BasicBlock *PEnd = SplitEdge(PH, Header, LPM->getAsPass()); BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), LPM->getAsPass()); BranchInst *PreHeaderBR = cast<BranchInst>(PH->getTerminator()); // Compute the number of extra iterations required, which is: // extra iterations = run-time trip count % (loop unroll factor + 1) SCEVExpander Expander(*SE, "loop-unroll"); Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(), PreHeaderBR); Type *CountTy = TripCount->getType(); BinaryOperator *ModVal = BinaryOperator::CreateURem(TripCount, ConstantInt::get(CountTy, Count), "xtraiter"); ModVal->insertBefore(PreHeaderBR); // Check if for no extra iterations, then jump to unrolled loop Value *BranchVal = new ICmpInst(PreHeaderBR, ICmpInst::ICMP_NE, ModVal, ConstantInt::get(CountTy, 0), "lcmp"); // Branch to either the extra iterations or the unrolled loop // We will fix up the true branch label when adding loop body copies BranchInst::Create(PEnd, PEnd, BranchVal, PreHeaderBR); assert(PreHeaderBR->isUnconditional() && PreHeaderBR->getSuccessor(0) == PEnd && "CFG edges in Preheader are not correct"); PreHeaderBR->eraseFromParent(); ValueToValueMapTy LVMap; Function *F = Header->getParent(); // These variables are used to update the CFG links in each iteration BasicBlock *CompareBB = nullptr; BasicBlock *LastLoopBB = PH; // Get an ordered list of blocks in the loop to help with the ordering of the // cloned blocks in the prolog code LoopBlocksDFS LoopBlocks(L); LoopBlocks.perform(LI); // // For each extra loop iteration, create a copy of the loop's basic blocks // and generate a condition that branches to the copy depending on the // number of 'left over' iterations. // for (unsigned leftOverIters = Count-1; leftOverIters > 0; --leftOverIters) { std::vector<BasicBlock*> NewBlocks; ValueToValueMapTy VMap; // Clone all the basic blocks in the loop, but we don't clone the loop // This function adds the appropriate CFG connections. CloneLoopBlocks(L, (leftOverIters == Count-1), LastLoopBB, PEnd, NewBlocks, LoopBlocks, VMap, LVMap, LI); LastLoopBB = cast<BasicBlock>(VMap[Latch]); // Insert the cloned blocks into function just before the original loop F->getBasicBlockList().splice(PEnd, F->getBasicBlockList(), NewBlocks[0], F->end()); // Generate the code for the comparison which determines if the loop // prolog code needs to be executed. if (leftOverIters == Count-1) { // There is no compare block for the fall-thru case when for the last // left over iteration CompareBB = NewBlocks[0]; } else { // Create a new block for the comparison BasicBlock *NewBB = BasicBlock::Create(CompareBB->getContext(), "unr.cmp", F, CompareBB); if (Loop *ParentLoop = L->getParentLoop()) { // Add the new block to the parent loop, if needed ParentLoop->addBasicBlockToLoop(NewBB, LI->getBase()); } // The comparison w/ the extra iteration value and branch Value *BranchVal = new ICmpInst(*NewBB, ICmpInst::ICMP_EQ, ModVal, ConstantInt::get(CountTy, leftOverIters), "un.tmp"); // Branch to either the extra iterations or the unrolled loop BranchInst::Create(NewBlocks[0], CompareBB, BranchVal, NewBB); CompareBB = NewBB; PH->getTerminator()->setSuccessor(0, NewBB); VMap[NewPH] = CompareBB; } // Rewrite the cloned instruction operands to use the values // created when the clone is created. for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) { for (BasicBlock::iterator I = NewBlocks[i]->begin(), E = NewBlocks[i]->end(); I != E; ++I) { RemapInstruction(I, VMap, RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); } } } // Connect the prolog code to the original loop and update the // PHI functions. ConnectProlog(L, TripCount, Count, LastLoopBB, PEnd, PH, NewPH, LVMap, LPM->getAsPass()); NumRuntimeUnrolled++; return true; }
/// Remove dead loops, by which we mean loops that do not impact the observable /// behavior of the program other than finite running time. Note we do ensure /// that this never remove a loop that might be infinite, as doing so could /// change the halting/non-halting nature of a program. NOTE: This entire /// process relies pretty heavily on LoopSimplify and LCSSA in order to make /// various safety checks work. bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE, LoopInfo &loopInfo) { assert(L->isLCSSAForm(DT) && "Expected LCSSA!"); // We can only remove the loop if there is a preheader that we can // branch from after removing it. BasicBlock *preheader = L->getLoopPreheader(); if (!preheader) return false; // If LoopSimplify form is not available, stay out of trouble. if (!L->hasDedicatedExits()) return false; // We can't remove loops that contain subloops. If the subloops were dead, // they would already have been removed in earlier executions of this pass. if (L->begin() != L->end()) return false; SmallVector<BasicBlock *, 4> exitingBlocks; L->getExitingBlocks(exitingBlocks); SmallVector<BasicBlock *, 4> exitBlocks; L->getUniqueExitBlocks(exitBlocks); // We require that the loop only have a single exit block. Otherwise, we'd // be in the situation of needing to be able to solve statically which exit // block will be branched to, or trying to preserve the branching logic in // a loop invariant manner. if (exitBlocks.size() != 1) return false; // Finally, we have to check that the loop really is dead. bool Changed = false; if (!isLoopDead(L, SE, exitingBlocks, exitBlocks, Changed, preheader)) return Changed; // Don't remove loops for which we can't solve the trip count. // They could be infinite, in which case we'd be changing program behavior. const SCEV *S = SE.getMaxBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(S)) return Changed; // Now that we know the removal is safe, remove the loop by changing the // branch from the preheader to go to the single exit block. BasicBlock *exitBlock = exitBlocks[0]; // Because we're deleting a large chunk of code at once, the sequence in which // we remove things is very important to avoid invalidation issues. Don't // mess with this unless you have good reason and know what you're doing. // Tell ScalarEvolution that the loop is deleted. Do this before // deleting the loop so that ScalarEvolution can look at the loop // to determine what it needs to clean up. SE.forgetLoop(L); // Connect the preheader directly to the exit block. TerminatorInst *TI = preheader->getTerminator(); TI->replaceUsesOfWith(L->getHeader(), exitBlock); // Rewrite phis in the exit block to get their inputs from // the preheader instead of the exiting block. BasicBlock *exitingBlock = exitingBlocks[0]; BasicBlock::iterator BI = exitBlock->begin(); while (PHINode *P = dyn_cast<PHINode>(BI)) { int j = P->getBasicBlockIndex(exitingBlock); assert(j >= 0 && "Can't find exiting block in exit block's phi node!"); P->setIncomingBlock(j, preheader); for (unsigned i = 1; i < exitingBlocks.size(); ++i) P->removeIncomingValue(exitingBlocks[i]); ++BI; } // Update the dominator tree and remove the instructions and blocks that will // be deleted from the reference counting scheme. SmallVector<DomTreeNode*, 8> ChildNodes; for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); LI != LE; ++LI) { // Move all of the block's children to be children of the preheader, which // allows us to remove the domtree entry for the block. ChildNodes.insert(ChildNodes.begin(), DT[*LI]->begin(), DT[*LI]->end()); for (DomTreeNode *ChildNode : ChildNodes) { DT.changeImmediateDominator(ChildNode, DT[preheader]); } ChildNodes.clear(); DT.eraseNode(*LI); // Remove the block from the reference counting scheme, so that we can // delete it freely later. (*LI)->dropAllReferences(); } // Erase the instructions and the blocks without having to worry // about ordering because we already dropped the references. // NOTE: This iteration is safe because erasing the block does not remove its // entry from the loop's block list. We do that in the next section. for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); LI != LE; ++LI) (*LI)->eraseFromParent(); // Finally, the blocks from loopinfo. This has to happen late because // otherwise our loop iterators won't work. SmallPtrSet<BasicBlock *, 8> blocks; blocks.insert(L->block_begin(), L->block_end()); for (BasicBlock *BB : blocks) loopInfo.removeBlock(BB); // The last step is to update LoopInfo now that we've eliminated this loop. loopInfo.markAsRemoved(L); Changed = true; ++NumDeleted; return Changed; }
/// Unroll the given loop by Count. The loop must be in LCSSA form. Returns true /// if unrolling was successful, or false if the loop was unmodified. Unrolling /// can only fail when the loop's latch block is not terminated by a conditional /// branch instruction. However, if the trip count (and multiple) are not known, /// loop unrolling will mostly produce more code that is no faster. /// /// TripCount is generally defined as the number of times the loop header /// executes. UnrollLoop relaxes the definition to permit early exits: here /// TripCount is the iteration on which control exits LatchBlock if no early /// exits were taken. Note that UnrollLoop assumes that the loop counter test /// terminates LatchBlock in order to remove unnecesssary instances of the /// test. In other words, control may exit the loop prior to TripCount /// iterations via an early branch, but control may not exit the loop from the /// LatchBlock's terminator prior to TripCount iterations. /// /// Similarly, TripMultiple divides the number of times that the LatchBlock may /// execute without exiting the loop. /// /// The LoopInfo Analysis that is passed will be kept consistent. /// /// If a LoopPassManager is passed in, and the loop is fully removed, it will be /// removed from the LoopPassManager as well. LPM can also be NULL. /// /// This utility preserves LoopInfo. If DominatorTree or ScalarEvolution are /// available from the Pass it must also preserve those analyses. bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool AllowRuntime, unsigned TripMultiple, LoopInfo *LI, Pass *PP, LPPassManager *LPM) { BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) { DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n"); return false; } BasicBlock *LatchBlock = L->getLoopLatch(); if (!LatchBlock) { DEBUG(dbgs() << " Can't unroll; loop exit-block-insertion failed.\n"); return false; } // Loops with indirectbr cannot be cloned. if (!L->isSafeToClone()) { DEBUG(dbgs() << " Can't unroll; Loop body cannot be cloned.\n"); return false; } BasicBlock *Header = L->getHeader(); BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator()); if (!BI || BI->isUnconditional()) { // The loop-rotate pass can be helpful to avoid this in many cases. DEBUG(dbgs() << " Can't unroll; loop not terminated by a conditional branch.\n"); return false; } if (Header->hasAddressTaken()) { // The loop-rotate pass can be helpful to avoid this in many cases. DEBUG(dbgs() << " Won't unroll loop: address of header block is taken.\n"); return false; } if (TripCount != 0) DEBUG(dbgs() << " Trip Count = " << TripCount << "\n"); if (TripMultiple != 1) DEBUG(dbgs() << " Trip Multiple = " << TripMultiple << "\n"); // Effectively "DCE" unrolled iterations that are beyond the tripcount // and will never be executed. if (TripCount != 0 && Count > TripCount) Count = TripCount; // Don't enter the unroll code if there is nothing to do. This way we don't // need to support "partial unrolling by 1". if (TripCount == 0 && Count < 2) return false; assert(Count > 0); assert(TripMultiple > 0); assert(TripCount == 0 || TripCount % TripMultiple == 0); // Are we eliminating the loop control altogether? bool CompletelyUnroll = Count == TripCount; // We assume a run-time trip count if the compiler cannot // figure out the loop trip count and the unroll-runtime // flag is specified. bool RuntimeTripCount = (TripCount == 0 && Count > 0 && AllowRuntime); if (RuntimeTripCount && !UnrollRuntimeLoopProlog(L, Count, LI, LPM)) return false; // Notify ScalarEvolution that the loop will be substantially changed, // if not outright eliminated. if (PP) { ScalarEvolution *SE = PP->getAnalysisIfAvailable<ScalarEvolution>(); if (SE) SE->forgetLoop(L); } // If we know the trip count, we know the multiple... unsigned BreakoutTrip = 0; if (TripCount != 0) { BreakoutTrip = TripCount % Count; TripMultiple = 0; } else { // Figure out what multiple to use. BreakoutTrip = TripMultiple = (unsigned)GreatestCommonDivisor64(Count, TripMultiple); } // Report the unrolling decision. DebugLoc LoopLoc = L->getStartLoc(); Function *F = Header->getParent(); LLVMContext &Ctx = F->getContext(); if (CompletelyUnroll) { DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName() << " with trip count " << TripCount << "!\n"); emitOptimizationRemark(Ctx, DEBUG_TYPE, *F, LoopLoc, Twine("completely unrolled loop with ") + Twine(TripCount) + " iterations"); } else { DEBUG(dbgs() << "UNROLLING loop %" << Header->getName() << " by " << Count); Twine DiagMsg("unrolled loop by a factor of " + Twine(Count)); if (TripMultiple == 0 || BreakoutTrip != TripMultiple) { DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip); DiagMsg.concat(" with a breakout at trip " + Twine(BreakoutTrip)); } else if (TripMultiple != 1) { DEBUG(dbgs() << " with " << TripMultiple << " trips per branch"); DiagMsg.concat(" with " + Twine(TripMultiple) + " trips per branch"); } else if (RuntimeTripCount) { DEBUG(dbgs() << " with run-time trip count"); DiagMsg.concat(" with run-time trip count"); } DEBUG(dbgs() << "!\n"); emitOptimizationRemark(Ctx, DEBUG_TYPE, *F, LoopLoc, DiagMsg); } bool ContinueOnTrue = L->contains(BI->getSuccessor(0)); BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue); // For the first iteration of the loop, we should use the precloned values for // PHI nodes. Insert associations now. ValueToValueMapTy LastValueMap; std::vector<PHINode*> OrigPHINode; for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { OrigPHINode.push_back(cast<PHINode>(I)); } std::vector<BasicBlock*> Headers; std::vector<BasicBlock*> Latches; Headers.push_back(Header); Latches.push_back(LatchBlock); // The current on-the-fly SSA update requires blocks to be processed in // reverse postorder so that LastValueMap contains the correct value at each // exit. LoopBlocksDFS DFS(L); DFS.perform(LI); // Stash the DFS iterators before adding blocks to the loop. LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO(); LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO(); for (unsigned It = 1; It != Count; ++It) { std::vector<BasicBlock*> NewBlocks; for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) { ValueToValueMapTy VMap; BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It)); Header->getParent()->getBasicBlockList().push_back(New); // Loop over all of the PHI nodes in the block, changing them to use the // incoming values from the previous block. if (*BB == Header) for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) { PHINode *NewPHI = cast<PHINode>(VMap[OrigPHINode[i]]); Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock); if (Instruction *InValI = dyn_cast<Instruction>(InVal)) if (It > 1 && L->contains(InValI)) InVal = LastValueMap[InValI]; VMap[OrigPHINode[i]] = InVal; New->getInstList().erase(NewPHI); } // Update our running map of newest clones LastValueMap[*BB] = New; for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end(); VI != VE; ++VI) LastValueMap[VI->first] = VI->second; L->addBasicBlockToLoop(New, LI->getBase()); // Add phi entries for newly created values to all exit blocks. for (succ_iterator SI = succ_begin(*BB), SE = succ_end(*BB); SI != SE; ++SI) { if (L->contains(*SI)) continue; for (BasicBlock::iterator BBI = (*SI)->begin(); PHINode *phi = dyn_cast<PHINode>(BBI); ++BBI) { Value *Incoming = phi->getIncomingValueForBlock(*BB); ValueToValueMapTy::iterator It = LastValueMap.find(Incoming); if (It != LastValueMap.end()) Incoming = It->second; phi->addIncoming(Incoming, New); } } // Keep track of new headers and latches as we create them, so that // we can insert the proper branches later. if (*BB == Header) Headers.push_back(New); if (*BB == LatchBlock) Latches.push_back(New); NewBlocks.push_back(New); } // Remap all instructions in the most recent iteration for (unsigned i = 0; i < NewBlocks.size(); ++i) for (BasicBlock::iterator I = NewBlocks[i]->begin(), E = NewBlocks[i]->end(); I != E; ++I) ::RemapInstruction(I, LastValueMap); } // Loop over the PHI nodes in the original block, setting incoming values. for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) { PHINode *PN = OrigPHINode[i]; if (CompletelyUnroll) { PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader)); Header->getInstList().erase(PN); } else if (Count > 1) { Value *InVal = PN->removeIncomingValue(LatchBlock, false); // If this value was defined in the loop, take the value defined by the // last iteration of the loop. if (Instruction *InValI = dyn_cast<Instruction>(InVal)) { if (L->contains(InValI)) InVal = LastValueMap[InVal]; } assert(Latches.back() == LastValueMap[LatchBlock] && "bad last latch"); PN->addIncoming(InVal, Latches.back()); } } // Now that all the basic blocks for the unrolled iterations are in place, // set up the branches to connect them. for (unsigned i = 0, e = Latches.size(); i != e; ++i) { // The original branch was replicated in each unrolled iteration. BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator()); // The branch destination. unsigned j = (i + 1) % e; BasicBlock *Dest = Headers[j]; bool NeedConditional = true; if (RuntimeTripCount && j != 0) { NeedConditional = false; } // For a complete unroll, make the last iteration end with a branch // to the exit block. if (CompletelyUnroll && j == 0) { Dest = LoopExit; NeedConditional = false; } // If we know the trip count or a multiple of it, we can safely use an // unconditional branch for some iterations. if (j != BreakoutTrip && (TripMultiple == 0 || j % TripMultiple != 0)) { NeedConditional = false; } if (NeedConditional) { // Update the conditional branch's successor for the following // iteration. Term->setSuccessor(!ContinueOnTrue, Dest); } else { // Remove phi operands at this loop exit if (Dest != LoopExit) { BasicBlock *BB = Latches[i]; for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) { if (*SI == Headers[i]) continue; for (BasicBlock::iterator BBI = (*SI)->begin(); PHINode *Phi = dyn_cast<PHINode>(BBI); ++BBI) { Phi->removeIncomingValue(BB, false); } } } // Replace the conditional branch with an unconditional one. BranchInst::Create(Dest, Term); Term->eraseFromParent(); } } // Merge adjacent basic blocks, if possible. for (unsigned i = 0, e = Latches.size(); i != e; ++i) { BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator()); if (Term->isUnconditional()) { BasicBlock *Dest = Term->getSuccessor(0); if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI, LPM)) std::replace(Latches.begin(), Latches.end(), Dest, Fold); } } DominatorTree *DT = nullptr; if (PP) { // FIXME: Reconstruct dom info, because it is not preserved properly. // Incrementally updating domtree after loop unrolling would be easy. if (DominatorTreeWrapperPass *DTWP = PP->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { DT = &DTWP->getDomTree(); DT->recalculate(*L->getHeader()->getParent()); } // Simplify any new induction variables in the partially unrolled loop. ScalarEvolution *SE = PP->getAnalysisIfAvailable<ScalarEvolution>(); if (SE && !CompletelyUnroll) { SmallVector<WeakVH, 16> DeadInsts; simplifyLoopIVs(L, SE, LPM, DeadInsts); // Aggressively clean up dead instructions that simplifyLoopIVs already // identified. Any remaining should be cleaned up below. while (!DeadInsts.empty()) if (Instruction *Inst = dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) RecursivelyDeleteTriviallyDeadInstructions(Inst); } } // At this point, the code is well formed. We now do a quick sweep over the // inserted code, doing constant propagation and dead code elimination as we // go. const std::vector<BasicBlock*> &NewLoopBlocks = L->getBlocks(); for (std::vector<BasicBlock*>::const_iterator BB = NewLoopBlocks.begin(), BBE = NewLoopBlocks.end(); BB != BBE; ++BB) for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; ) { Instruction *Inst = I++; if (isInstructionTriviallyDead(Inst)) (*BB)->getInstList().erase(Inst); else if (Value *V = SimplifyInstruction(Inst)) if (LI->replacementPreservesLCSSAForm(Inst, V)) { Inst->replaceAllUsesWith(V); (*BB)->getInstList().erase(Inst); } } NumCompletelyUnrolled += CompletelyUnroll; ++NumUnrolled; Loop *OuterL = L->getParentLoop(); // Remove the loop from the LoopPassManager if it's completely removed. if (CompletelyUnroll && LPM != nullptr) LPM->deleteLoopFromQueue(L); // If we have a pass and a DominatorTree we should re-simplify impacted loops // to ensure subsequent analyses can rely on this form. We want to simplify // at least one layer outside of the loop that was unrolled so that any // changes to the parent loop exposed by the unrolling are considered. if (PP && DT) { if (!OuterL && !CompletelyUnroll) OuterL = L; if (OuterL) { ScalarEvolution *SE = PP->getAnalysisIfAvailable<ScalarEvolution>(); simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE); // LCSSA must be performed on the outermost affected loop. The unrolled // loop's last loop latch is guaranteed to be in the outermost loop after // deleteLoopFromQueue updates LoopInfo. Loop *LatchLoop = LI->getLoopFor(Latches.back()); if (!OuterL->contains(LatchLoop)) while (OuterL->getParentLoop() != LatchLoop) OuterL = OuterL->getParentLoop(); formLCSSARecursively(*OuterL, *DT, SE); } } return true; }
/// Insert code in the prolog code when unrolling a loop with a /// run-time trip-count. /// /// This method assumes that the loop unroll factor is total number /// of loop bodes in the loop after unrolling. (Some folks refer /// to the unroll factor as the number of *extra* copies added). /// We assume also that the loop unroll factor is a power-of-two. So, after /// unrolling the loop, the number of loop bodies executed is 2, /// 4, 8, etc. Note - LLVM converts the if-then-sequence to a switch /// instruction in SimplifyCFG.cpp. Then, the backend decides how code for /// the switch instruction is generated. /// /// extraiters = tripcount % loopfactor /// if (extraiters == 0) jump Loop: /// else jump Prol /// Prol: LoopBody; /// extraiters -= 1 // Omitted if unroll factor is 2. /// if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2. /// if (tripcount < loopfactor) jump End /// Loop: /// ... /// End: /// bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, bool AllowExpensiveTripCount, LoopInfo *LI, LPPassManager *LPM) { // for now, only unroll loops that contain a single exit if (!L->getExitingBlock()) return false; // Make sure the loop is in canonical form, and there is a single // exit block only. if (!L->isLoopSimplifyForm() || !L->getUniqueExitBlock()) return false; // Use Scalar Evolution to compute the trip count. This allows more // loops to be unrolled than relying on induction var simplification if (!LPM) return false; ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); if (!SE) return false; // Only unroll loops with a computable trip count and the trip count needs // to be an int value (allowing a pointer type is a TODO item) const SCEV *BECountSC = SE->getBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(BECountSC) || !BECountSC->getType()->isIntegerTy()) return false; unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth(); // Add 1 since the backedge count doesn't include the first loop iteration const SCEV *TripCountSC = SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1)); if (isa<SCEVCouldNotCompute>(TripCountSC)) return false; BasicBlock *Header = L->getHeader(); const DataLayout &DL = Header->getModule()->getDataLayout(); SCEVExpander Expander(*SE, DL, "loop-unroll"); if (!AllowExpensiveTripCount && Expander.isHighCostExpansion(TripCountSC, L)) return false; // We only handle cases when the unroll factor is a power of 2. // Count is the loop unroll factor, the number of extra copies added + 1. if (!isPowerOf2_32(Count)) return false; // This constraint lets us deal with an overflowing trip count easily; see the // comment on ModVal below. if (Log2_32(Count) > BEWidth) return false; // If this loop is nested, then the loop unroller changes the code in // parent loop, so the Scalar Evolution pass needs to be run again if (Loop *ParentLoop = L->getParentLoop()) SE->forgetLoop(ParentLoop); // Grab analyses that we preserve. auto *DTWP = LPM->getAnalysisIfAvailable<DominatorTreeWrapperPass>(); auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; BasicBlock *PH = L->getLoopPreheader(); BasicBlock *Latch = L->getLoopLatch(); // It helps to splits the original preheader twice, one for the end of the // prolog code and one for a new loop preheader BasicBlock *PEnd = SplitEdge(PH, Header, DT, LI); BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), DT, LI); BranchInst *PreHeaderBR = cast<BranchInst>(PH->getTerminator()); // Compute the number of extra iterations required, which is: // extra iterations = run-time trip count % (loop unroll factor + 1) Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(), PreHeaderBR); Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(), PreHeaderBR); IRBuilder<> B(PreHeaderBR); Value *ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter"); // If ModVal is zero, we know that either // 1. there are no iteration to be run in the prologue loop // OR // 2. the addition computing TripCount overflowed // // If (2) is true, we know that TripCount really is (1 << BEWidth) and so the // number of iterations that remain to be run in the original loop is a // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we // explicitly check this above). Value *BranchVal = B.CreateIsNotNull(ModVal, "lcmp.mod"); // Branch to either the extra iterations or the cloned/unrolled loop // We will fix up the true branch label when adding loop body copies B.CreateCondBr(BranchVal, PEnd, PEnd); assert(PreHeaderBR->isUnconditional() && PreHeaderBR->getSuccessor(0) == PEnd && "CFG edges in Preheader are not correct"); PreHeaderBR->eraseFromParent(); Function *F = Header->getParent(); // Get an ordered list of blocks in the loop to help with the ordering of the // cloned blocks in the prolog code LoopBlocksDFS LoopBlocks(L); LoopBlocks.perform(LI); // // For each extra loop iteration, create a copy of the loop's basic blocks // and generate a condition that branches to the copy depending on the // number of 'left over' iterations. // std::vector<BasicBlock *> NewBlocks; ValueToValueMapTy VMap; bool UnrollPrologue = Count == 2; // Clone all the basic blocks in the loop. If Count is 2, we don't clone // the loop, otherwise we create a cloned loop to execute the extra // iterations. This function adds the appropriate CFG connections. CloneLoopBlocks(L, ModVal, UnrollPrologue, PH, PEnd, NewBlocks, LoopBlocks, VMap, LI); // Insert the cloned blocks into function just before the original loop F->getBasicBlockList().splice(PEnd, F->getBasicBlockList(), NewBlocks[0], F->end()); // Rewrite the cloned instruction operands to use the values // created when the clone is created. for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) { for (BasicBlock::iterator I = NewBlocks[i]->begin(), E = NewBlocks[i]->end(); I != E; ++I) { RemapInstruction(I, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); } } // Connect the prolog code to the original loop and update the // PHI functions. BasicBlock *LastLoopBB = cast<BasicBlock>(VMap[Latch]); ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, /*AliasAnalysis*/ nullptr, DT, LI, LPM->getAsPass()); NumRuntimeUnrolled++; return true; }