bool llvm::cannotBeMaxInLoop(const SCEV *S, const Loop *L, ScalarEvolution &SE, bool Signed) { unsigned BitWidth = cast<IntegerType>(S->getType())->getBitWidth(); APInt Max = Signed ? APInt::getSignedMaxValue(BitWidth) : APInt::getMaxValue(BitWidth); auto Predicate = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; return SE.isAvailableAtLoopEntry(S, L) && SE.isLoopEntryGuardedByCond(L, Predicate, S, SE.getConstant(Max)); }
/// Insert code in the prolog code when unrolling a loop with a /// run-time trip-count. /// /// This method assumes that the loop unroll factor is total number /// of loop bodes in the loop after unrolling. (Some folks refer /// to the unroll factor as the number of *extra* copies added). /// We assume also that the loop unroll factor is a power-of-two. So, after /// unrolling the loop, the number of loop bodies executed is 2, /// 4, 8, etc. Note - LLVM converts the if-then-sequence to a switch /// instruction in SimplifyCFG.cpp. Then, the backend decides how code for /// the switch instruction is generated. /// /// extraiters = tripcount % loopfactor /// if (extraiters == 0) jump Loop: /// if (extraiters == loopfactor) jump L1 /// if (extraiters == loopfactor-1) jump L2 /// ... /// L1: LoopBody; /// L2: LoopBody; /// ... /// if tripcount < loopfactor jump End /// Loop: /// ... /// End: /// bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, LPPassManager *LPM) { // for now, only unroll loops that contain a single exit if (!L->getExitingBlock()) return false; // Make sure the loop is in canonical form, and there is a single // exit block only. if (!L->isLoopSimplifyForm() || !L->getUniqueExitBlock()) return false; // Use Scalar Evolution to compute the trip count. This allows more // loops to be unrolled than relying on induction var simplification if (!LPM) return false; ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); if (!SE) return false; // Only unroll loops with a computable trip count and the trip count needs // to be an int value (allowing a pointer type is a TODO item) const SCEV *BECount = SE->getBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(BECount) || !BECount->getType()->isIntegerTy()) return false; // Add 1 since the backedge count doesn't include the first loop iteration const SCEV *TripCountSC = SE->getAddExpr(BECount, SE->getConstant(BECount->getType(), 1)); if (isa<SCEVCouldNotCompute>(TripCountSC)) return false; // We only handle cases when the unroll factor is a power of 2. // Count is the loop unroll factor, the number of extra copies added + 1. if ((Count & (Count-1)) != 0) return false; // If this loop is nested, then the loop unroller changes the code in // parent loop, so the Scalar Evolution pass needs to be run again if (Loop *ParentLoop = L->getParentLoop()) SE->forgetLoop(ParentLoop); BasicBlock *PH = L->getLoopPreheader(); BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); // It helps to splits the original preheader twice, one for the end of the // prolog code and one for a new loop preheader BasicBlock *PEnd = SplitEdge(PH, Header, LPM->getAsPass()); BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), LPM->getAsPass()); BranchInst *PreHeaderBR = cast<BranchInst>(PH->getTerminator()); // Compute the number of extra iterations required, which is: // extra iterations = run-time trip count % (loop unroll factor + 1) SCEVExpander Expander(*SE, "loop-unroll"); Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(), PreHeaderBR); Type *CountTy = TripCount->getType(); BinaryOperator *ModVal = BinaryOperator::CreateURem(TripCount, ConstantInt::get(CountTy, Count), "xtraiter"); ModVal->insertBefore(PreHeaderBR); // Check if for no extra iterations, then jump to unrolled loop Value *BranchVal = new ICmpInst(PreHeaderBR, ICmpInst::ICMP_NE, ModVal, ConstantInt::get(CountTy, 0), "lcmp"); // Branch to either the extra iterations or the unrolled loop // We will fix up the true branch label when adding loop body copies BranchInst::Create(PEnd, PEnd, BranchVal, PreHeaderBR); assert(PreHeaderBR->isUnconditional() && PreHeaderBR->getSuccessor(0) == PEnd && "CFG edges in Preheader are not correct"); PreHeaderBR->eraseFromParent(); ValueToValueMapTy LVMap; Function *F = Header->getParent(); // These variables are used to update the CFG links in each iteration BasicBlock *CompareBB = nullptr; BasicBlock *LastLoopBB = PH; // Get an ordered list of blocks in the loop to help with the ordering of the // cloned blocks in the prolog code LoopBlocksDFS LoopBlocks(L); LoopBlocks.perform(LI); // // For each extra loop iteration, create a copy of the loop's basic blocks // and generate a condition that branches to the copy depending on the // number of 'left over' iterations. // for (unsigned leftOverIters = Count-1; leftOverIters > 0; --leftOverIters) { std::vector<BasicBlock*> NewBlocks; ValueToValueMapTy VMap; // Clone all the basic blocks in the loop, but we don't clone the loop // This function adds the appropriate CFG connections. CloneLoopBlocks(L, (leftOverIters == Count-1), LastLoopBB, PEnd, NewBlocks, LoopBlocks, VMap, LVMap, LI); LastLoopBB = cast<BasicBlock>(VMap[Latch]); // Insert the cloned blocks into function just before the original loop F->getBasicBlockList().splice(PEnd, F->getBasicBlockList(), NewBlocks[0], F->end()); // Generate the code for the comparison which determines if the loop // prolog code needs to be executed. if (leftOverIters == Count-1) { // There is no compare block for the fall-thru case when for the last // left over iteration CompareBB = NewBlocks[0]; } else { // Create a new block for the comparison BasicBlock *NewBB = BasicBlock::Create(CompareBB->getContext(), "unr.cmp", F, CompareBB); if (Loop *ParentLoop = L->getParentLoop()) { // Add the new block to the parent loop, if needed ParentLoop->addBasicBlockToLoop(NewBB, LI->getBase()); } // The comparison w/ the extra iteration value and branch Value *BranchVal = new ICmpInst(*NewBB, ICmpInst::ICMP_EQ, ModVal, ConstantInt::get(CountTy, leftOverIters), "un.tmp"); // Branch to either the extra iterations or the unrolled loop BranchInst::Create(NewBlocks[0], CompareBB, BranchVal, NewBB); CompareBB = NewBB; PH->getTerminator()->setSuccessor(0, NewBB); VMap[NewPH] = CompareBB; } // Rewrite the cloned instruction operands to use the values // created when the clone is created. for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) { for (BasicBlock::iterator I = NewBlocks[i]->begin(), E = NewBlocks[i]->end(); I != E; ++I) { RemapInstruction(I, VMap, RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); } } } // Connect the prolog code to the original loop and update the // PHI functions. ConnectProlog(L, TripCount, Count, LastLoopBB, PEnd, PH, NewPH, LVMap, LPM->getAsPass()); NumRuntimeUnrolled++; return true; }
// Return the number of iterations to peel off that make conditions in the // body true/false. For example, if we peel 2 iterations off the loop below, // the condition i < 2 can be evaluated at compile time. // for (i = 0; i < n; i++) // if (i < 2) // .. // else // .. // } static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount, ScalarEvolution &SE) { assert(L.isLoopSimplifyForm() && "Loop needs to be in loop simplify form"); unsigned DesiredPeelCount = 0; for (auto *BB : L.blocks()) { auto *BI = dyn_cast<BranchInst>(BB->getTerminator()); if (!BI || BI->isUnconditional()) continue; // Ignore loop exit condition. if (L.getLoopLatch() == BB) continue; Value *Condition = BI->getCondition(); Value *LeftVal, *RightVal; CmpInst::Predicate Pred; if (!match(Condition, m_ICmp(Pred, m_Value(LeftVal), m_Value(RightVal)))) continue; const SCEV *LeftSCEV = SE.getSCEV(LeftVal); const SCEV *RightSCEV = SE.getSCEV(RightVal); // Do not consider predicates that are known to be true or false // independently of the loop iteration. if (SE.isKnownPredicate(Pred, LeftSCEV, RightSCEV) || SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), LeftSCEV, RightSCEV)) continue; // Check if we have a condition with one AddRec and one non AddRec // expression. Normalize LeftSCEV to be the AddRec. if (!isa<SCEVAddRecExpr>(LeftSCEV)) { if (isa<SCEVAddRecExpr>(RightSCEV)) { std::swap(LeftSCEV, RightSCEV); Pred = ICmpInst::getSwappedPredicate(Pred); } else continue; } const SCEVAddRecExpr *LeftAR = cast<SCEVAddRecExpr>(LeftSCEV); // Avoid huge SCEV computations in the loop below, make sure we only // consider AddRecs of the loop we are trying to peel and avoid // non-monotonic predicates, as we will not be able to simplify the loop // body. // FIXME: For the non-monotonic predicates ICMP_EQ and ICMP_NE we can // simplify the loop, if we peel 1 additional iteration, if there // is no wrapping. bool Increasing; if (!LeftAR->isAffine() || LeftAR->getLoop() != &L || !SE.isMonotonicPredicate(LeftAR, Pred, Increasing)) continue; (void)Increasing; // Check if extending the current DesiredPeelCount lets us evaluate Pred // or !Pred in the loop body statically. unsigned NewPeelCount = DesiredPeelCount; const SCEV *IterVal = LeftAR->evaluateAtIteration( SE.getConstant(LeftSCEV->getType(), NewPeelCount), SE); // If the original condition is not known, get the negated predicate // (which holds on the else branch) and check if it is known. This allows // us to peel of iterations that make the original condition false. if (!SE.isKnownPredicate(Pred, IterVal, RightSCEV)) Pred = ICmpInst::getInversePredicate(Pred); const SCEV *Step = LeftAR->getStepRecurrence(SE); while (NewPeelCount < MaxPeelCount && SE.isKnownPredicate(Pred, IterVal, RightSCEV)) { IterVal = SE.getAddExpr(IterVal, Step); NewPeelCount++; } // Only peel the loop if the monotonic predicate !Pred becomes known in the // first iteration of the loop body after peeling. if (NewPeelCount > DesiredPeelCount && SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), IterVal, RightSCEV)) DesiredPeelCount = NewPeelCount; } return DesiredPeelCount; }
/// Insert code in the prolog code when unrolling a loop with a /// run-time trip-count. /// /// This method assumes that the loop unroll factor is total number /// of loop bodes in the loop after unrolling. (Some folks refer /// to the unroll factor as the number of *extra* copies added). /// We assume also that the loop unroll factor is a power-of-two. So, after /// unrolling the loop, the number of loop bodies executed is 2, /// 4, 8, etc. Note - LLVM converts the if-then-sequence to a switch /// instruction in SimplifyCFG.cpp. Then, the backend decides how code for /// the switch instruction is generated. /// /// extraiters = tripcount % loopfactor /// if (extraiters == 0) jump Loop: /// else jump Prol /// Prol: LoopBody; /// extraiters -= 1 // Omitted if unroll factor is 2. /// if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2. /// if (tripcount < loopfactor) jump End /// Loop: /// ... /// End: /// bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, bool AllowExpensiveTripCount, LoopInfo *LI, LPPassManager *LPM) { // for now, only unroll loops that contain a single exit if (!L->getExitingBlock()) return false; // Make sure the loop is in canonical form, and there is a single // exit block only. if (!L->isLoopSimplifyForm() || !L->getUniqueExitBlock()) return false; // Use Scalar Evolution to compute the trip count. This allows more // loops to be unrolled than relying on induction var simplification if (!LPM) return false; ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); if (!SE) return false; // Only unroll loops with a computable trip count and the trip count needs // to be an int value (allowing a pointer type is a TODO item) const SCEV *BECountSC = SE->getBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(BECountSC) || !BECountSC->getType()->isIntegerTy()) return false; unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth(); // Add 1 since the backedge count doesn't include the first loop iteration const SCEV *TripCountSC = SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1)); if (isa<SCEVCouldNotCompute>(TripCountSC)) return false; BasicBlock *Header = L->getHeader(); const DataLayout &DL = Header->getModule()->getDataLayout(); SCEVExpander Expander(*SE, DL, "loop-unroll"); if (!AllowExpensiveTripCount && Expander.isHighCostExpansion(TripCountSC, L)) return false; // We only handle cases when the unroll factor is a power of 2. // Count is the loop unroll factor, the number of extra copies added + 1. if (!isPowerOf2_32(Count)) return false; // This constraint lets us deal with an overflowing trip count easily; see the // comment on ModVal below. if (Log2_32(Count) > BEWidth) return false; // If this loop is nested, then the loop unroller changes the code in // parent loop, so the Scalar Evolution pass needs to be run again if (Loop *ParentLoop = L->getParentLoop()) SE->forgetLoop(ParentLoop); // Grab analyses that we preserve. auto *DTWP = LPM->getAnalysisIfAvailable<DominatorTreeWrapperPass>(); auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; BasicBlock *PH = L->getLoopPreheader(); BasicBlock *Latch = L->getLoopLatch(); // It helps to splits the original preheader twice, one for the end of the // prolog code and one for a new loop preheader BasicBlock *PEnd = SplitEdge(PH, Header, DT, LI); BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), DT, LI); BranchInst *PreHeaderBR = cast<BranchInst>(PH->getTerminator()); // Compute the number of extra iterations required, which is: // extra iterations = run-time trip count % (loop unroll factor + 1) Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(), PreHeaderBR); Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(), PreHeaderBR); IRBuilder<> B(PreHeaderBR); Value *ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter"); // If ModVal is zero, we know that either // 1. there are no iteration to be run in the prologue loop // OR // 2. the addition computing TripCount overflowed // // If (2) is true, we know that TripCount really is (1 << BEWidth) and so the // number of iterations that remain to be run in the original loop is a // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we // explicitly check this above). Value *BranchVal = B.CreateIsNotNull(ModVal, "lcmp.mod"); // Branch to either the extra iterations or the cloned/unrolled loop // We will fix up the true branch label when adding loop body copies B.CreateCondBr(BranchVal, PEnd, PEnd); assert(PreHeaderBR->isUnconditional() && PreHeaderBR->getSuccessor(0) == PEnd && "CFG edges in Preheader are not correct"); PreHeaderBR->eraseFromParent(); Function *F = Header->getParent(); // Get an ordered list of blocks in the loop to help with the ordering of the // cloned blocks in the prolog code LoopBlocksDFS LoopBlocks(L); LoopBlocks.perform(LI); // // For each extra loop iteration, create a copy of the loop's basic blocks // and generate a condition that branches to the copy depending on the // number of 'left over' iterations. // std::vector<BasicBlock *> NewBlocks; ValueToValueMapTy VMap; bool UnrollPrologue = Count == 2; // Clone all the basic blocks in the loop. If Count is 2, we don't clone // the loop, otherwise we create a cloned loop to execute the extra // iterations. This function adds the appropriate CFG connections. CloneLoopBlocks(L, ModVal, UnrollPrologue, PH, PEnd, NewBlocks, LoopBlocks, VMap, LI); // Insert the cloned blocks into function just before the original loop F->getBasicBlockList().splice(PEnd, F->getBasicBlockList(), NewBlocks[0], F->end()); // Rewrite the cloned instruction operands to use the values // created when the clone is created. for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) { for (BasicBlock::iterator I = NewBlocks[i]->begin(), E = NewBlocks[i]->end(); I != E; ++I) { RemapInstruction(I, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); } } // Connect the prolog code to the original loop and update the // PHI functions. BasicBlock *LastLoopBB = cast<BasicBlock>(VMap[Latch]); ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, /*AliasAnalysis*/ nullptr, DT, LI, LPM->getAsPass()); NumRuntimeUnrolled++; return true; }