static void shuffleValueUseLists(Value *V, std::minstd_rand0 &Gen, DenseSet<Value *> &Seen) { if (!Seen.insert(V).second) return; if (auto *C = dyn_cast<Constant>(V)) if (!isa<GlobalValue>(C)) for (Value *Op : C->operands()) shuffleValueUseLists(Op, Gen, Seen); if (V->use_empty() || std::next(V->use_begin()) == V->use_end()) // Nothing to shuffle for 0 or 1 users. return; // Generate random numbers between 10 and 99, which will line up nicely in // debug output. We're not worried about collisons here. DEBUG(dbgs() << "V = "; V->dump()); std::uniform_int_distribution<short> Dist(10, 99); SmallDenseMap<const Use *, short, 16> Order; for (const Use &U : V->uses()) { auto I = Dist(Gen); Order[&U] = I; DEBUG(dbgs() << " - order: " << I << ", U = "; U.getUser()->dump()); } DEBUG(dbgs() << " => shuffle\n"); V->sortUseList( [&Order](const Use &L, const Use &R) { return Order[&L] < Order[&R]; }); DEBUG({ for (const Use &U : V->uses()) DEBUG(dbgs() << " - order: " << Order.lookup(&U) << ", U = "; U.getUser()->dump()); });
/// Sort local variables so that variables appearing inside of helper /// expressions come first. static SmallVector<DbgVariable *, 8> sortLocalVars(SmallVectorImpl<DbgVariable *> &Input) { SmallVector<DbgVariable *, 8> Result; SmallVector<PointerIntPair<DbgVariable *, 1>, 8> WorkList; // Map back from a DIVariable to its containing DbgVariable. SmallDenseMap<const DILocalVariable *, DbgVariable *> DbgVar; // Set of DbgVariables in Result. SmallDenseSet<DbgVariable *, 8> Visited; // For cycle detection. SmallDenseSet<DbgVariable *, 8> Visiting; // Initialize the worklist and the DIVariable lookup table. for (auto Var : reverse(Input)) { DbgVar.insert({Var->getVariable(), Var}); WorkList.push_back({Var, 0}); } // Perform a stable topological sort by doing a DFS. while (!WorkList.empty()) { auto Item = WorkList.back(); DbgVariable *Var = Item.getPointer(); bool visitedAllDependencies = Item.getInt(); WorkList.pop_back(); // Dependency is in a different lexical scope or a global. if (!Var) continue; // Already handled. if (Visited.count(Var)) continue; // Add to Result if all dependencies are visited. if (visitedAllDependencies) { Visited.insert(Var); Result.push_back(Var); continue; } // Detect cycles. auto Res = Visiting.insert(Var); if (!Res.second) { assert(false && "dependency cycle in local variables"); return Result; } // Push dependencies and this node onto the worklist, so that this node is // visited again after all of its dependencies are handled. WorkList.push_back({Var, 1}); for (auto *Dependency : dependencies(Var)) { auto Dep = dyn_cast_or_null<const DILocalVariable>(Dependency); WorkList.push_back({DbgVar[Dep], 0}); } } return Result; }
// This function calculates the number of iterations after which the given Phi // becomes an invariant. The pre-calculated values are memorized in the map. The // function (shortcut is I) is calculated according to the following definition: // Given %x = phi <Inputs from above the loop>, ..., [%y, %back.edge]. // If %y is a loop invariant, then I(%x) = 1. // If %y is a Phi from the loop header, I(%x) = I(%y) + 1. // Otherwise, I(%x) is infinite. // TODO: Actually if %y is an expression that depends only on Phi %z and some // loop invariants, we can estimate I(%x) = I(%z) + 1. The example // looks like: // %x = phi(0, %a), <-- becomes invariant starting from 3rd iteration. // %y = phi(0, 5), // %a = %y + 1. static unsigned calculateIterationsToInvariance( PHINode *Phi, Loop *L, BasicBlock *BackEdge, SmallDenseMap<PHINode *, unsigned> &IterationsToInvariance) { assert(Phi->getParent() == L->getHeader() && "Non-loop Phi should not be checked for turning into invariant."); assert(BackEdge == L->getLoopLatch() && "Wrong latch?"); // If we already know the answer, take it from the map. auto I = IterationsToInvariance.find(Phi); if (I != IterationsToInvariance.end()) return I->second; // Otherwise we need to analyze the input from the back edge. Value *Input = Phi->getIncomingValueForBlock(BackEdge); // Place infinity to map to avoid infinite recursion for cycled Phis. Such // cycles can never stop on an invariant. IterationsToInvariance[Phi] = InfiniteIterationsToInvariance; unsigned ToInvariance = InfiniteIterationsToInvariance; if (L->isLoopInvariant(Input)) ToInvariance = 1u; else if (PHINode *IncPhi = dyn_cast<PHINode>(Input)) { // Only consider Phis in header block. if (IncPhi->getParent() != L->getHeader()) return InfiniteIterationsToInvariance; // If the input becomes an invariant after X iterations, then our Phi // becomes an invariant after X + 1 iterations. 
unsigned InputToInvariance = calculateIterationsToInvariance( IncPhi, L, BackEdge, IterationsToInvariance); if (InputToInvariance != InfiniteIterationsToInvariance) ToInvariance = InputToInvariance + 1u; } // If we found that this Phi lies in an invariant chain, update the map. if (ToInvariance != InfiniteIterationsToInvariance) IterationsToInvariance[Phi] = ToInvariance; return ToInvariance; }
/// Return true if \p PHI has exactly the incoming (block, value) pairs
/// described by \p ValueMapping.
///
/// \param PHI          the PHI node to test.
/// \param ValueMapping expected incoming value for each incoming block. It is
///                     only read; no entries are added.
static bool
IsEquivalentPHI(PHINode *PHI,
                SmallDenseMap<BasicBlock *, Value *, 8> &ValueMapping) {
  unsigned PHINumValues = PHI->getNumIncomingValues();
  if (PHINumValues != ValueMapping.size())
    return false;

  // Scan the phi to see if it matches. lookup() returns null for a block that
  // is not in the mapping instead of inserting one, which would otherwise
  // change ValueMapping.size() for later calls.
  for (unsigned i = 0, e = PHINumValues; i != e; ++i)
    if (ValueMapping.lookup(PHI->getIncomingBlock(i)) !=
        PHI->getIncomingValue(i))
      return false;

  return true;
}
/// Remove dead functions that are not included in DNR (Do Not Remove) list. bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) { SmallVector<CallGraphNode*, 16> FunctionsToRemove; SmallVector<CallGraphNode *, 16> DeadFunctionsInComdats; SmallDenseMap<const Comdat *, int, 16> ComdatEntriesAlive; auto RemoveCGN = [&](CallGraphNode *CGN) { // Remove any call graph edges from the function to its callees. CGN->removeAllCalledFunctions(); // Remove any edges from the external node to the function's call graph // node. These edges might have been made irrelegant due to // optimization of the program. CG.getExternalCallingNode()->removeAnyCallEdgeTo(CGN); // Removing the node for callee from the call graph and delete it. FunctionsToRemove.push_back(CGN); }; // Scan for all of the functions, looking for ones that should now be removed // from the program. Insert the dead ones in the FunctionsToRemove set. for (CallGraph::iterator I = CG.begin(), E = CG.end(); I != E; ++I) { CallGraphNode *CGN = I->second; Function *F = CGN->getFunction(); if (!F || F->isDeclaration()) continue; // Handle the case when this function is called and we only want to care // about always-inline functions. This is a bit of a hack to share code // between here and the InlineAlways pass. if (AlwaysInlineOnly && !F->hasFnAttribute(Attribute::AlwaysInline)) continue; // If the only remaining users of the function are dead constants, remove // them. F->removeDeadConstantUsers(); if (!F->isDefTriviallyDead()) continue; // It is unsafe to drop a function with discardable linkage from a COMDAT // without also dropping the other members of the COMDAT. // The inliner doesn't visit non-function entities which are in COMDAT // groups so it is unsafe to do so *unless* the linkage is local. 
if (!F->hasLocalLinkage()) { if (const Comdat *C = F->getComdat()) { --ComdatEntriesAlive[C]; DeadFunctionsInComdats.push_back(CGN); continue; } } RemoveCGN(CGN); } if (!DeadFunctionsInComdats.empty()) { // Count up all the entities in COMDAT groups auto ComdatGroupReferenced = [&](const Comdat *C) { auto I = ComdatEntriesAlive.find(C); if (I != ComdatEntriesAlive.end()) ++(I->getSecond()); }; for (const Function &F : CG.getModule()) if (const Comdat *C = F.getComdat()) ComdatGroupReferenced(C); for (const GlobalVariable &GV : CG.getModule().globals()) if (const Comdat *C = GV.getComdat()) ComdatGroupReferenced(C); for (const GlobalAlias &GA : CG.getModule().aliases()) if (const Comdat *C = GA.getComdat()) ComdatGroupReferenced(C); for (CallGraphNode *CGN : DeadFunctionsInComdats) { Function *F = CGN->getFunction(); const Comdat *C = F->getComdat(); int NumAlive = ComdatEntriesAlive[C]; // We can remove functions in a COMDAT group if the entire group is dead. assert(NumAlive >= 0); if (NumAlive > 0) continue; RemoveCGN(CGN); } } if (FunctionsToRemove.empty()) return false; // Now that we know which functions to delete, do so. We didn't want to do // this inline, because that would invalidate our CallGraph::iterator // objects. :( // // Note that it doesn't matter that we are iterating over a non-stable order // here to do this, it doesn't matter which order the functions are deleted // in. array_pod_sort(FunctionsToRemove.begin(), FunctionsToRemove.end()); FunctionsToRemove.erase(std::unique(FunctionsToRemove.begin(), FunctionsToRemove.end()), FunctionsToRemove.end()); for (SmallVectorImpl<CallGraphNode *>::iterator I = FunctionsToRemove.begin(), E = FunctionsToRemove.end(); I != E; ++I) { delete CG.removeFunctionFromModule(*I); ++NumDeleted; } return true; }
/// Unroll the given loop by Count. The loop must be in LCSSA form. Returns true /// if unrolling was successful, or false if the loop was unmodified. Unrolling /// can only fail when the loop's latch block is not terminated by a conditional /// branch instruction. However, if the trip count (and multiple) are not known, /// loop unrolling will mostly produce more code that is no faster. /// /// TripCount is generally defined as the number of times the loop header /// executes. UnrollLoop relaxes the definition to permit early exits: here /// TripCount is the iteration on which control exits LatchBlock if no early /// exits were taken. Note that UnrollLoop assumes that the loop counter test /// terminates LatchBlock in order to remove unnecesssary instances of the /// test. In other words, control may exit the loop prior to TripCount /// iterations via an early branch, but control may not exit the loop from the /// LatchBlock's terminator prior to TripCount iterations. /// /// Similarly, TripMultiple divides the number of times that the LatchBlock may /// execute without exiting the loop. /// /// The LoopInfo Analysis that is passed will be kept consistent. /// /// If a LoopPassManager is passed in, and the loop is fully removed, it will be /// removed from the LoopPassManager as well. LPM can also be NULL. /// /// This utility preserves LoopInfo. If DominatorTree or ScalarEvolution are /// available from the Pass it must also preserve those analyses. bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool AllowRuntime, unsigned TripMultiple, LoopInfo *LI, Pass *PP, LPPassManager *LPM, AssumptionCache *AC) { BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) { DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n"); return false; } BasicBlock *LatchBlock = L->getLoopLatch(); if (!LatchBlock) { DEBUG(dbgs() << " Can't unroll; loop exit-block-insertion failed.\n"); return false; } // Loops with indirectbr cannot be cloned. 
if (!L->isSafeToClone()) { DEBUG(dbgs() << " Can't unroll; Loop body cannot be cloned.\n"); return false; } BasicBlock *Header = L->getHeader(); BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator()); if (!BI || BI->isUnconditional()) { // The loop-rotate pass can be helpful to avoid this in many cases. DEBUG(dbgs() << " Can't unroll; loop not terminated by a conditional branch.\n"); return false; } if (Header->hasAddressTaken()) { // The loop-rotate pass can be helpful to avoid this in many cases. DEBUG(dbgs() << " Won't unroll loop: address of header block is taken.\n"); return false; } if (TripCount != 0) DEBUG(dbgs() << " Trip Count = " << TripCount << "\n"); if (TripMultiple != 1) DEBUG(dbgs() << " Trip Multiple = " << TripMultiple << "\n"); // Effectively "DCE" unrolled iterations that are beyond the tripcount // and will never be executed. if (TripCount != 0 && Count > TripCount) Count = TripCount; // Don't enter the unroll code if there is nothing to do. This way we don't // need to support "partial unrolling by 1". if (TripCount == 0 && Count < 2) return false; assert(Count > 0); assert(TripMultiple > 0); assert(TripCount == 0 || TripCount % TripMultiple == 0); // Are we eliminating the loop control altogether? bool CompletelyUnroll = Count == TripCount; // We assume a run-time trip count if the compiler cannot // figure out the loop trip count and the unroll-runtime // flag is specified. bool RuntimeTripCount = (TripCount == 0 && Count > 0 && AllowRuntime); if (RuntimeTripCount && !UnrollRuntimeLoopProlog(L, Count, LI, LPM)) return false; // Notify ScalarEvolution that the loop will be substantially changed, // if not outright eliminated. ScalarEvolution *SE = PP ? PP->getAnalysisIfAvailable<ScalarEvolution>() : nullptr; if (SE) SE->forgetLoop(L); // If we know the trip count, we know the multiple... unsigned BreakoutTrip = 0; if (TripCount != 0) { BreakoutTrip = TripCount % Count; TripMultiple = 0; } else { // Figure out what multiple to use. 
BreakoutTrip = TripMultiple = (unsigned)GreatestCommonDivisor64(Count, TripMultiple); } // Report the unrolling decision. DebugLoc LoopLoc = L->getStartLoc(); Function *F = Header->getParent(); LLVMContext &Ctx = F->getContext(); if (CompletelyUnroll) { DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName() << " with trip count " << TripCount << "!\n"); emitOptimizationRemark(Ctx, DEBUG_TYPE, *F, LoopLoc, Twine("completely unrolled loop with ") + Twine(TripCount) + " iterations"); } else { auto EmitDiag = [&](const Twine &T) { emitOptimizationRemark(Ctx, DEBUG_TYPE, *F, LoopLoc, "unrolled loop by a factor of " + Twine(Count) + T); }; DEBUG(dbgs() << "UNROLLING loop %" << Header->getName() << " by " << Count); if (TripMultiple == 0 || BreakoutTrip != TripMultiple) { DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip); EmitDiag(" with a breakout at trip " + Twine(BreakoutTrip)); } else if (TripMultiple != 1) { DEBUG(dbgs() << " with " << TripMultiple << " trips per branch"); EmitDiag(" with " + Twine(TripMultiple) + " trips per branch"); } else if (RuntimeTripCount) { DEBUG(dbgs() << " with run-time trip count"); EmitDiag(" with run-time trip count"); } DEBUG(dbgs() << "!\n"); } bool ContinueOnTrue = L->contains(BI->getSuccessor(0)); BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue); // For the first iteration of the loop, we should use the precloned values for // PHI nodes. Insert associations now. ValueToValueMapTy LastValueMap; std::vector<PHINode*> OrigPHINode; for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { OrigPHINode.push_back(cast<PHINode>(I)); } std::vector<BasicBlock*> Headers; std::vector<BasicBlock*> Latches; Headers.push_back(Header); Latches.push_back(LatchBlock); // The current on-the-fly SSA update requires blocks to be processed in // reverse postorder so that LastValueMap contains the correct value at each // exit. 
LoopBlocksDFS DFS(L); DFS.perform(LI); // Stash the DFS iterators before adding blocks to the loop. LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO(); LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO(); for (unsigned It = 1; It != Count; ++It) { std::vector<BasicBlock*> NewBlocks; SmallDenseMap<const Loop *, Loop *, 4> NewLoops; NewLoops[L] = L; for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) { ValueToValueMapTy VMap; BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It)); Header->getParent()->getBasicBlockList().push_back(New); // Tell LI about New. if (*BB == Header) { assert(LI->getLoopFor(*BB) == L && "Header should not be in a sub-loop"); L->addBasicBlockToLoop(New, *LI); } else { // Figure out which loop New is in. const Loop *OldLoop = LI->getLoopFor(*BB); assert(OldLoop && "Should (at least) be in the loop being unrolled!"); Loop *&NewLoop = NewLoops[OldLoop]; if (!NewLoop) { // Found a new sub-loop. assert(*BB == OldLoop->getHeader() && "Header should be first in RPO"); Loop *NewLoopParent = NewLoops.lookup(OldLoop->getParentLoop()); assert(NewLoopParent && "Expected parent loop before sub-loop in RPO"); NewLoop = new Loop; NewLoopParent->addChildLoop(NewLoop); // Forget the old loop, since its inputs may have changed. if (SE) SE->forgetLoop(OldLoop); } NewLoop->addBasicBlockToLoop(New, *LI); } if (*BB == Header) // Loop over all of the PHI nodes in the block, changing them to use // the incoming values from the previous block. 
for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) { PHINode *NewPHI = cast<PHINode>(VMap[OrigPHINode[i]]); Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock); if (Instruction *InValI = dyn_cast<Instruction>(InVal)) if (It > 1 && L->contains(InValI)) InVal = LastValueMap[InValI]; VMap[OrigPHINode[i]] = InVal; New->getInstList().erase(NewPHI); } // Update our running map of newest clones LastValueMap[*BB] = New; for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end(); VI != VE; ++VI) LastValueMap[VI->first] = VI->second; // Add phi entries for newly created values to all exit blocks. for (succ_iterator SI = succ_begin(*BB), SE = succ_end(*BB); SI != SE; ++SI) { if (L->contains(*SI)) continue; for (BasicBlock::iterator BBI = (*SI)->begin(); PHINode *phi = dyn_cast<PHINode>(BBI); ++BBI) { Value *Incoming = phi->getIncomingValueForBlock(*BB); ValueToValueMapTy::iterator It = LastValueMap.find(Incoming); if (It != LastValueMap.end()) Incoming = It->second; phi->addIncoming(Incoming, New); } } // Keep track of new headers and latches as we create them, so that // we can insert the proper branches later. if (*BB == Header) Headers.push_back(New); if (*BB == LatchBlock) Latches.push_back(New); NewBlocks.push_back(New); } // Remap all instructions in the most recent iteration for (unsigned i = 0; i < NewBlocks.size(); ++i) for (BasicBlock::iterator I = NewBlocks[i]->begin(), E = NewBlocks[i]->end(); I != E; ++I) ::RemapInstruction(I, LastValueMap); } // Loop over the PHI nodes in the original block, setting incoming values. for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) { PHINode *PN = OrigPHINode[i]; if (CompletelyUnroll) { PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader)); Header->getInstList().erase(PN); } else if (Count > 1) { Value *InVal = PN->removeIncomingValue(LatchBlock, false); // If this value was defined in the loop, take the value defined by the // last iteration of the loop. 
if (Instruction *InValI = dyn_cast<Instruction>(InVal)) { if (L->contains(InValI)) InVal = LastValueMap[InVal]; } assert(Latches.back() == LastValueMap[LatchBlock] && "bad last latch"); PN->addIncoming(InVal, Latches.back()); } } // Now that all the basic blocks for the unrolled iterations are in place, // set up the branches to connect them. for (unsigned i = 0, e = Latches.size(); i != e; ++i) { // The original branch was replicated in each unrolled iteration. BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator()); // The branch destination. unsigned j = (i + 1) % e; BasicBlock *Dest = Headers[j]; bool NeedConditional = true; if (RuntimeTripCount && j != 0) { NeedConditional = false; } // For a complete unroll, make the last iteration end with a branch // to the exit block. if (CompletelyUnroll && j == 0) { Dest = LoopExit; NeedConditional = false; } // If we know the trip count or a multiple of it, we can safely use an // unconditional branch for some iterations. if (j != BreakoutTrip && (TripMultiple == 0 || j % TripMultiple != 0)) { NeedConditional = false; } if (NeedConditional) { // Update the conditional branch's successor for the following // iteration. Term->setSuccessor(!ContinueOnTrue, Dest); } else { // Remove phi operands at this loop exit if (Dest != LoopExit) { BasicBlock *BB = Latches[i]; for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) { if (*SI == Headers[i]) continue; for (BasicBlock::iterator BBI = (*SI)->begin(); PHINode *Phi = dyn_cast<PHINode>(BBI); ++BBI) { Phi->removeIncomingValue(BB, false); } } } // Replace the conditional branch with an unconditional one. BranchInst::Create(Dest, Term); Term->eraseFromParent(); } } // Merge adjacent basic blocks, if possible. 
SmallPtrSet<Loop *, 4> ForgottenLoops; for (unsigned i = 0, e = Latches.size(); i != e; ++i) { BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator()); if (Term->isUnconditional()) { BasicBlock *Dest = Term->getSuccessor(0); if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI, LPM, ForgottenLoops)) std::replace(Latches.begin(), Latches.end(), Dest, Fold); } } // FIXME: We could register any cloned assumptions instead of clearing the // whole function's cache. AC->clear(); DominatorTree *DT = nullptr; if (PP) { // FIXME: Reconstruct dom info, because it is not preserved properly. // Incrementally updating domtree after loop unrolling would be easy. if (DominatorTreeWrapperPass *DTWP = PP->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { DT = &DTWP->getDomTree(); DT->recalculate(*L->getHeader()->getParent()); } // Simplify any new induction variables in the partially unrolled loop. if (SE && !CompletelyUnroll) { SmallVector<WeakVH, 16> DeadInsts; simplifyLoopIVs(L, SE, LPM, DeadInsts); // Aggressively clean up dead instructions that simplifyLoopIVs already // identified. Any remaining should be cleaned up below. while (!DeadInsts.empty()) if (Instruction *Inst = dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) RecursivelyDeleteTriviallyDeadInstructions(Inst); } } // At this point, the code is well formed. We now do a quick sweep over the // inserted code, doing constant propagation and dead code elimination as we // go. 
const std::vector<BasicBlock*> &NewLoopBlocks = L->getBlocks(); for (std::vector<BasicBlock*>::const_iterator BB = NewLoopBlocks.begin(), BBE = NewLoopBlocks.end(); BB != BBE; ++BB) for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; ) { Instruction *Inst = I++; if (isInstructionTriviallyDead(Inst)) (*BB)->getInstList().erase(Inst); else if (Value *V = SimplifyInstruction(Inst)) if (LI->replacementPreservesLCSSAForm(Inst, V)) { Inst->replaceAllUsesWith(V); (*BB)->getInstList().erase(Inst); } } NumCompletelyUnrolled += CompletelyUnroll; ++NumUnrolled; Loop *OuterL = L->getParentLoop(); // Remove the loop from the LoopPassManager if it's completely removed. if (CompletelyUnroll && LPM != nullptr) LPM->deleteLoopFromQueue(L); // If we have a pass and a DominatorTree we should re-simplify impacted loops // to ensure subsequent analyses can rely on this form. We want to simplify // at least one layer outside of the loop that was unrolled so that any // changes to the parent loop exposed by the unrolling are considered. if (PP && DT) { if (!OuterL && !CompletelyUnroll) OuterL = L; if (OuterL) { DataLayoutPass *DLP = PP->getAnalysisIfAvailable<DataLayoutPass>(); const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE, DL, AC); // LCSSA must be performed on the outermost affected loop. The unrolled // loop's last loop latch is guaranteed to be in the outermost loop after // deleteLoopFromQueue updates LoopInfo. Loop *LatchLoop = LI->getLoopFor(Latches.back()); if (!OuterL->contains(LatchLoop)) while (OuterL->getParentLoop() != LatchLoop) OuterL = OuterL->getParentLoop(); formLCSSARecursively(*OuterL, *DT, LI, SE); } } return true; }
// Sinks \p I from the loop \p L's preheader to its uses. Returns true if // sinking is successful. // \p LoopBlockNumber is used to sort the insertion blocks to ensure // determinism. static bool sinkInstruction(Loop &L, Instruction &I, const SmallVectorImpl<BasicBlock *> &ColdLoopBBs, const SmallDenseMap<BasicBlock *, int, 16> &LoopBlockNumber, LoopInfo &LI, DominatorTree &DT, BlockFrequencyInfo &BFI) { // Compute the set of blocks in loop L which contain a use of I. SmallPtrSet<BasicBlock *, 2> BBs; for (auto &U : I.uses()) { Instruction *UI = cast<Instruction>(U.getUser()); // We cannot sink I to PHI-uses. if (dyn_cast<PHINode>(UI)) return false; // We cannot sink I if it has uses outside of the loop. if (!L.contains(LI.getLoopFor(UI->getParent()))) return false; BBs.insert(UI->getParent()); } // findBBsToSinkInto is O(BBs.size() * ColdLoopBBs.size()). We cap the max // BBs.size() to avoid expensive computation. // FIXME: Handle code size growth for min_size and opt_size. if (BBs.size() > MaxNumberOfUseBBsForSinking) return false; // Find the set of BBs that we should insert a copy of I. SmallPtrSet<BasicBlock *, 2> BBsToSinkInto = findBBsToSinkInto(L, BBs, ColdLoopBBs, DT, BFI); if (BBsToSinkInto.empty()) return false; // Copy the final BBs into a vector and sort them using the total ordering // of the loop block numbers as iterating the set doesn't give a useful // order. No need to stable sort as the block numbers are a total ordering. SmallVector<BasicBlock *, 2> SortedBBsToSinkInto; SortedBBsToSinkInto.insert(SortedBBsToSinkInto.begin(), BBsToSinkInto.begin(), BBsToSinkInto.end()); std::sort(SortedBBsToSinkInto.begin(), SortedBBsToSinkInto.end(), [&](BasicBlock *A, BasicBlock *B) { return *LoopBlockNumber.find(A) < *LoopBlockNumber.find(B); }); BasicBlock *MoveBB = *SortedBBsToSinkInto.begin(); // FIXME: Optimize the efficiency for cloned value replacement. The current // implementation is O(SortedBBsToSinkInto.size() * I.num_uses()). 
for (BasicBlock *N : SortedBBsToSinkInto) { if (N == MoveBB) continue; // Clone I and replace its uses. Instruction *IC = I.clone(); IC->setName(I.getName()); IC->insertBefore(&*N->getFirstInsertionPt()); // Replaces uses of I with IC in N for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;) { Use &U = *UI++; auto *I = cast<Instruction>(U.getUser()); if (I->getParent() == N) U.set(IC); } // Replaces uses of I with IC in blocks dominated by N replaceDominatedUsesWith(&I, IC, DT, N); DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName() << '\n'); NumLoopSunkCloned++; } DEBUG(dbgs() << "Sinking " << I << " To: " << MoveBB->getName() << '\n'); NumLoopSunk++; I.moveBefore(&*MoveBB->getFirstInsertionPt()); return true; }
/// For every instruction from the worklist, check to see if it has any uses /// that are outside the current loop. If so, insert LCSSA PHI nodes and /// rewrite the uses. bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, DominatorTree &DT, LoopInfo &LI) { SmallVector<Use *, 16> UsesToRewrite; SmallSetVector<PHINode *, 16> PHIsToRemove; PredIteratorCache PredCache; bool Changed = false; // Cache the Loop ExitBlocks across this loop. We expect to get a lot of // instructions within the same loops, computing the exit blocks is // expensive, and we're not mutating the loop structure. SmallDenseMap<Loop*, SmallVector<BasicBlock *,1>> LoopExitBlocks; while (!Worklist.empty()) { UsesToRewrite.clear(); Instruction *I = Worklist.pop_back_val(); BasicBlock *InstBB = I->getParent(); Loop *L = LI.getLoopFor(InstBB); if (!LoopExitBlocks.count(L)) L->getExitBlocks(LoopExitBlocks[L]); assert(LoopExitBlocks.count(L)); const SmallVectorImpl<BasicBlock *> &ExitBlocks = LoopExitBlocks[L]; if (ExitBlocks.empty()) continue; // Tokens cannot be used in PHI nodes, so we skip over them. // We can run into tokens which are live out of a loop with catchswitch // instructions in Windows EH if the catchswitch has one catchpad which // is inside the loop and another which is not. if (I->getType()->isTokenTy()) continue; for (Use &U : I->uses()) { Instruction *User = cast<Instruction>(U.getUser()); BasicBlock *UserBB = User->getParent(); if (PHINode *PN = dyn_cast<PHINode>(User)) UserBB = PN->getIncomingBlock(U); if (InstBB != UserBB && !L->contains(UserBB)) UsesToRewrite.push_back(&U); } // If there are no uses outside the loop, exit with no change. if (UsesToRewrite.empty()) continue; ++NumLCSSA; // We are applying the transformation // Invoke instructions are special in that their result value is not // available along their unwind edge. 
The code below tests to see whether // DomBB dominates the value, so adjust DomBB to the normal destination // block, which is effectively where the value is first usable. BasicBlock *DomBB = InstBB; if (InvokeInst *Inv = dyn_cast<InvokeInst>(I)) DomBB = Inv->getNormalDest(); DomTreeNode *DomNode = DT.getNode(DomBB); SmallVector<PHINode *, 16> AddedPHIs; SmallVector<PHINode *, 8> PostProcessPHIs; SmallVector<PHINode *, 4> InsertedPHIs; SSAUpdater SSAUpdate(&InsertedPHIs); SSAUpdate.Initialize(I->getType(), I->getName()); // Insert the LCSSA phi's into all of the exit blocks dominated by the // value, and add them to the Phi's map. for (BasicBlock *ExitBB : ExitBlocks) { if (!DT.dominates(DomNode, DT.getNode(ExitBB))) continue; // If we already inserted something for this BB, don't reprocess it. if (SSAUpdate.HasValueForBlock(ExitBB)) continue; PHINode *PN = PHINode::Create(I->getType(), PredCache.size(ExitBB), I->getName() + ".lcssa", &ExitBB->front()); // Add inputs from inside the loop for this PHI. for (BasicBlock *Pred : PredCache.get(ExitBB)) { PN->addIncoming(I, Pred); // If the exit block has a predecessor not within the loop, arrange for // the incoming value use corresponding to that predecessor to be // rewritten in terms of a different LCSSA PHI. if (!L->contains(Pred)) UsesToRewrite.push_back( &PN->getOperandUse(PN->getOperandNumForIncomingValue( PN->getNumIncomingValues() - 1))); } AddedPHIs.push_back(PN); // Remember that this phi makes the value alive in this block. SSAUpdate.AddAvailableValue(ExitBB, PN); // LoopSimplify might fail to simplify some loops (e.g. when indirect // branches are involved). In such situations, it might happen that an // exit for Loop L1 is the header of a disjoint Loop L2. Thus, when we // create PHIs in such an exit block, we are also inserting PHIs into L2's // header. This could break LCSSA form for L2 because these inserted PHIs // can also have uses outside of L2. 
Remember all PHIs in such situation // as to revisit than later on. FIXME: Remove this if indirectbr support // into LoopSimplify gets improved. if (auto *OtherLoop = LI.getLoopFor(ExitBB)) if (!L->contains(OtherLoop)) PostProcessPHIs.push_back(PN); } // Rewrite all uses outside the loop in terms of the new PHIs we just // inserted. for (Use *UseToRewrite : UsesToRewrite) { // If this use is in an exit block, rewrite to use the newly inserted PHI. // This is required for correctness because SSAUpdate doesn't handle uses // in the same block. It assumes the PHI we inserted is at the end of the // block. Instruction *User = cast<Instruction>(UseToRewrite->getUser()); BasicBlock *UserBB = User->getParent(); if (PHINode *PN = dyn_cast<PHINode>(User)) UserBB = PN->getIncomingBlock(*UseToRewrite); if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) { // Tell the VHs that the uses changed. This updates SCEV's caches. if (UseToRewrite->get()->hasValueHandle()) ValueHandleBase::ValueIsRAUWd(*UseToRewrite, &UserBB->front()); UseToRewrite->set(&UserBB->front()); continue; } // Otherwise, do full PHI insertion. SSAUpdate.RewriteUse(*UseToRewrite); } // SSAUpdater might have inserted phi-nodes inside other loops. We'll need // to post-process them to keep LCSSA form. for (PHINode *InsertedPN : InsertedPHIs) { if (auto *OtherLoop = LI.getLoopFor(InsertedPN->getParent())) if (!L->contains(OtherLoop)) PostProcessPHIs.push_back(InsertedPN); } // Post process PHI instructions that were inserted into another disjoint // loop and update their exits properly. for (auto *PostProcessPN : PostProcessPHIs) { if (PostProcessPN->use_empty()) continue; // Reprocess each PHI instruction. Worklist.push_back(PostProcessPN); } // Keep track of PHI nodes that we want to remove because they did not have // any uses rewritten. for (PHINode *PN : AddedPHIs) if (PN->use_empty()) PHIsToRemove.insert(PN); Changed = true; } // Remove PHI nodes that did not have any uses rewritten. 
for (PHINode *PN : PHIsToRemove) { assert (PN->use_empty() && "Trying to remove a phi with uses."); PN->eraseFromParent(); } return Changed; }
// Create output section objects and add them to OutputSections. template <class ELFT> void Writer<ELFT>::createSections() { // .interp needs to be on the first page in the output file. if (needsInterpSection()) OutputSections.push_back(Out<ELFT>::Interp); SmallDenseMap<SectionKey<ELFT::Is64Bits>, OutputSectionBase<ELFT> *> Map; std::vector<OutputSectionBase<ELFT> *> RegularSections; for (const std::unique_ptr<ObjectFile<ELFT>> &F : Symtab.getObjectFiles()) { for (InputSectionBase<ELFT> *C : F->getSections()) { if (isDiscarded(C)) continue; const Elf_Shdr *H = C->getSectionHdr(); uintX_t OutFlags = H->sh_flags & ~SHF_GROUP; // For SHF_MERGE we create different output sections for each sh_entsize. // This makes each output section simple and keeps a single level // mapping from input to output. typename InputSectionBase<ELFT>::Kind K = C->SectionKind; uintX_t EntSize = K != InputSectionBase<ELFT>::Merge ? 0 : H->sh_entsize; uint32_t OutType = H->sh_type; if (OutType == SHT_PROGBITS && C->getSectionName() == ".eh_frame" && Config->EMachine == EM_X86_64) OutType = SHT_X86_64_UNWIND; SectionKey<ELFT::Is64Bits> Key{getOutputSectionName(C->getSectionName()), OutType, OutFlags, EntSize}; OutputSectionBase<ELFT> *&Sec = Map[Key]; if (!Sec) { switch (K) { case InputSectionBase<ELFT>::Regular: Sec = new (SecAlloc.Allocate()) OutputSection<ELFT>(Key.Name, Key.Type, Key.Flags); break; case InputSectionBase<ELFT>::EHFrame: Sec = new (EHSecAlloc.Allocate()) EHOutputSection<ELFT>(Key.Name, Key.Type, Key.Flags); break; case InputSectionBase<ELFT>::Merge: Sec = new (MSecAlloc.Allocate()) MergeOutputSection<ELFT>(Key.Name, Key.Type, Key.Flags); break; } OutputSections.push_back(Sec); RegularSections.push_back(Sec); } switch (K) { case InputSectionBase<ELFT>::Regular: static_cast<OutputSection<ELFT> *>(Sec) ->addSection(cast<InputSection<ELFT>>(C)); break; case InputSectionBase<ELFT>::EHFrame: static_cast<EHOutputSection<ELFT> *>(Sec) ->addSection(cast<EHInputSection<ELFT>>(C)); 
break; case InputSectionBase<ELFT>::Merge: static_cast<MergeOutputSection<ELFT> *>(Sec) ->addSection(cast<MergeInputSection<ELFT>>(C)); break; } } } Out<ELFT>::Bss = static_cast<OutputSection<ELFT> *>( Map[{".bss", SHT_NOBITS, SHF_ALLOC | SHF_WRITE, 0}]); Out<ELFT>::Dynamic->PreInitArraySec = Map.lookup( {".preinit_array", SHT_PREINIT_ARRAY, SHF_WRITE | SHF_ALLOC, 0}); Out<ELFT>::Dynamic->InitArraySec = Map.lookup({".init_array", SHT_INIT_ARRAY, SHF_WRITE | SHF_ALLOC, 0}); Out<ELFT>::Dynamic->FiniArraySec = Map.lookup({".fini_array", SHT_FINI_ARRAY, SHF_WRITE | SHF_ALLOC, 0}); auto AddStartEnd = [&](StringRef Start, StringRef End, OutputSectionBase<ELFT> *OS) { if (OS) { Symtab.addSyntheticSym(Start, *OS, 0); Symtab.addSyntheticSym(End, *OS, OS->getSize()); } else { Symtab.addIgnoredSym(Start); Symtab.addIgnoredSym(End); } }; AddStartEnd("__preinit_array_start", "__preinit_array_end", Out<ELFT>::Dynamic->PreInitArraySec); AddStartEnd("__init_array_start", "__init_array_end", Out<ELFT>::Dynamic->InitArraySec); AddStartEnd("__fini_array_start", "__fini_array_end", Out<ELFT>::Dynamic->FiniArraySec); for (OutputSectionBase<ELFT> *Sec : RegularSections) addStartStopSymbols(Sec); // __tls_get_addr is defined by the dynamic linker for dynamic ELFs. For // static linking the linker is required to optimize away any references to // __tls_get_addr, so it's not defined anywhere. Create a hidden definition // to avoid the undefined symbol error. if (!isOutputDynamic()) Symtab.addIgnoredSym("__tls_get_addr"); // If the "_end" symbol is referenced, it is expected to point to the address // right after the data segment. Usually, this symbol points to the end // of .bss section or to the end of .data section if .bss section is absent. // The order of the sections can be affected by linker script, // so it is hard to predict which section will be the last one. // So, if this symbol is referenced, we just add the placeholder here // and update its value later. 
if (Symtab.find("_end")) Symtab.addAbsoluteSym("_end", DefinedAbsolute<ELFT>::End); // If there is an undefined symbol "end", we should initialize it // with the same value as "_end". In any other case it should stay intact, // because it is an allowable name for a user symbol. if (SymbolBody *B = Symtab.find("end")) if (B->isUndefined()) Symtab.addAbsoluteSym("end", DefinedAbsolute<ELFT>::End); // Scan relocations. This must be done after every symbol is declared so that // we can correctly decide if a dynamic relocation is needed. for (const std::unique_ptr<ObjectFile<ELFT>> &F : Symtab.getObjectFiles()) { for (InputSectionBase<ELFT> *C : F->getSections()) { if (isDiscarded(C)) continue; if (auto *S = dyn_cast<InputSection<ELFT>>(C)) scanRelocs(*S); else if (auto *S = dyn_cast<EHInputSection<ELFT>>(C)) if (S->RelocSection) scanRelocs(*S, *S->RelocSection); } } std::vector<DefinedCommon<ELFT> *> CommonSymbols; std::vector<SharedSymbol<ELFT> *> SharedCopySymbols; for (auto &P : Symtab.getSymbols()) { SymbolBody *Body = P.second->Body; if (auto *U = dyn_cast<Undefined<ELFT>>(Body)) if (!U->isWeak() && !U->canKeepUndefined()) reportUndefined<ELFT>(Symtab, *Body); if (auto *C = dyn_cast<DefinedCommon<ELFT>>(Body)) CommonSymbols.push_back(C); if (auto *SC = dyn_cast<SharedSymbol<ELFT>>(Body)) if (SC->needsCopy()) SharedCopySymbols.push_back(SC); if (!includeInSymtab<ELFT>(*Body)) continue; if (Out<ELFT>::SymTab) Out<ELFT>::SymTab->addSymbol(Body); if (isOutputDynamic() && includeInDynamicSymtab(*Body)) Out<ELFT>::DynSymTab->addSymbol(Body); } addCommonSymbols(CommonSymbols); addSharedCopySymbols(SharedCopySymbols); // This order is not the same as the final output order // because we sort the sections using their attributes below. 
if (Out<ELFT>::SymTab) OutputSections.push_back(Out<ELFT>::SymTab); OutputSections.push_back(Out<ELFT>::ShStrTab); if (Out<ELFT>::StrTab) OutputSections.push_back(Out<ELFT>::StrTab); if (isOutputDynamic()) { OutputSections.push_back(Out<ELFT>::DynSymTab); if (Out<ELFT>::GnuHashTab) OutputSections.push_back(Out<ELFT>::GnuHashTab); if (Out<ELFT>::HashTab) OutputSections.push_back(Out<ELFT>::HashTab); OutputSections.push_back(Out<ELFT>::Dynamic); OutputSections.push_back(Out<ELFT>::DynStrTab); if (Out<ELFT>::RelaDyn->hasRelocs()) OutputSections.push_back(Out<ELFT>::RelaDyn); if (Out<ELFT>::RelaPlt && Out<ELFT>::RelaPlt->hasRelocs()) OutputSections.push_back(Out<ELFT>::RelaPlt); // This is a MIPS specific section to hold a space within the data segment // of executable file which is pointed to by the DT_MIPS_RLD_MAP entry. // See "Dynamic section" in Chapter 5 in the following document: // ftp://www.linux-mips.org/pub/linux/mips/doc/ABI/mipsabi.pdf if (Config->EMachine == EM_MIPS && !Config->Shared) { Out<ELFT>::MipsRldMap = new (SecAlloc.Allocate()) OutputSection<ELFT>(".rld_map", SHT_PROGBITS, SHF_ALLOC | SHF_WRITE); Out<ELFT>::MipsRldMap->setSize(ELFT::Is64Bits ? 8 : 4); Out<ELFT>::MipsRldMap->updateAlign(ELFT::Is64Bits ? 8 : 4); OutputSections.push_back(Out<ELFT>::MipsRldMap); } } // We add the .got section to the result for dynamic MIPS target because // its address and properties are mentioned in the .dynamic section. 
if (!Out<ELFT>::Got->empty() || (isOutputDynamic() && Config->EMachine == EM_MIPS)) OutputSections.push_back(Out<ELFT>::Got); if (Out<ELFT>::GotPlt && !Out<ELFT>::GotPlt->empty()) OutputSections.push_back(Out<ELFT>::GotPlt); if (!Out<ELFT>::Plt->empty()) OutputSections.push_back(Out<ELFT>::Plt); std::stable_sort(OutputSections.begin(), OutputSections.end(), compareSections<ELFT>); for (unsigned I = 0, N = OutputSections.size(); I < N; ++I) { OutputSections[I]->SectionIndex = I + 1; HasRelro |= (Config->ZRelro && isRelroSection(OutputSections[I])); } for (OutputSectionBase<ELFT> *Sec : OutputSections) Out<ELFT>::ShStrTab->add(Sec->getName()); // Finalizers fix each section's size. // .dynamic section's finalizer may add strings to .dynstr, // so finalize that early. // Likewise, .dynsym is finalized early since that may fill up .gnu.hash. Out<ELFT>::Dynamic->finalize(); if (isOutputDynamic()) Out<ELFT>::DynSymTab->finalize(); // Fill other section headers. for (OutputSectionBase<ELFT> *Sec : OutputSections) Sec->finalize(); // If we have a .opd section (used under PPC64 for function descriptors), // store a pointer to it here so that we can use it later when processing // relocations. Out<ELFT>::Opd = Map.lookup({".opd", SHT_PROGBITS, SHF_WRITE | SHF_ALLOC, 0}); }