/// Build the BlockFrequencyInfo result for \p F.
///
/// Fetches the BranchProbabilityAnalysis and LoopAnalysis results for \p F
/// from \p AM and hands them to BlockFrequencyInfo::calculate.
///
/// \returns the freshly calculated BlockFrequencyInfo, by value.
BlockFrequencyInfo BlockFrequencyAnalysis::run(Function &F,
                                               FunctionAnalysisManager &AM) {
  auto &BranchProbs = AM.getResult<BranchProbabilityAnalysis>(F);
  auto &Loops = AM.getResult<LoopAnalysis>(F);

  BlockFrequencyInfo Result;
  Result.calculate(F, BranchProbs, Loops);
  return Result;
}
/// Return adjusted total frequency of \p BBs. /// /// * If there is only one BB, sinking instruction will not introduce code /// size increase. Thus there is no need to adjust the frequency. /// * If there are more than one BB, sinking would lead to code size increase. /// In this case, we add some "tax" to the total frequency to make it harder /// to sink. E.g. /// Freq(Preheader) = 100 /// Freq(BBs) = sum(50, 49) = 99 /// Even if Freq(BBs) < Freq(Preheader), we will not sink from Preheade to /// BBs as the difference is too small to justify the code size increase. /// To model this, The adjusted Freq(BBs) will be: /// AdjustedFreq(BBs) = 99 / SinkFrequencyPercentThreshold% static BlockFrequency adjustedSumFreq(SmallPtrSetImpl<BasicBlock *> &BBs, BlockFrequencyInfo &BFI) { BlockFrequency T = 0; for (BasicBlock *B : BBs) T += BFI.getBlockFreq(B); if (BBs.size() > 1) T /= BranchProbability(SinkFrequencyPercentThreshold, 100); return T; }
/// Pick the basic blocks into which a sunk instruction should be inserted.
///
/// The returned set is intended to satisfy:
///
///   * every member is a block of loop \p L;
///   * each block in \p UseBBs is dominated by at least one member;
///   * the (adjusted) total frequency of the members is as small as this
///     greedy search can make it, and does not exceed the frequency of the
///     loop preheader.
///
/// Execution cost is modelled as "sum of frequency of the chosen blocks", so
/// a smaller sum is better. If even the best set found is costlier than the
/// preheader, not sinking is the better choice and an empty set is returned.
/// An empty \p UseBBs also yields an empty set.
///
/// \p ColdLoopBBs lists candidate insertion blocks. As built by
/// sinkLoopInvariantInstructions it contains loop blocks strictly colder than
/// the preheader, sorted by ascending frequency.
///
/// The search is O(UseBBs.size() * ColdLoopBBs.size()) dominance queries; the
/// caller is expected to bound UseBBs.size() to keep this cheap.
static SmallPtrSet<BasicBlock *, 2>
findBBsToSinkInto(const Loop &L, const SmallPtrSetImpl<BasicBlock *> &UseBBs,
                  const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
                  DominatorTree &DT, BlockFrequencyInfo &BFI) {
  SmallPtrSet<BasicBlock *, 2> SinkTargets;
  if (UseBBs.empty())
    return SinkTargets;

  // Start from the trivial answer: sink into every use block.
  SinkTargets.insert(UseBBs.begin(), UseBBs.end());

  // Scratch set, reused across iterations: the current targets that the
  // candidate block dominates.
  SmallPtrSet<BasicBlock *, 2> Covered;

  // Walk the candidates in the order given. For each one, collect the current
  // targets it dominates; if replacing them by the single candidate lowers
  // the adjusted frequency, do the replacement.
  for (BasicBlock *Candidate : ColdLoopBBs) {
    Covered.clear();
    for (BasicBlock *Target : SinkTargets)
      if (DT.dominates(Candidate, Target))
        Covered.insert(Target);

    if (Covered.empty())
      continue;

    if (adjustedSumFreq(Covered, BFI) > BFI.getBlockFreq(Candidate)) {
      for (BasicBlock *Replaced : Covered)
        SinkTargets.erase(Replaced);
      SinkTargets.insert(Candidate);
    }
  }

  // Sinking only pays off if the chosen blocks are, in total, no hotter than
  // the preheader the instruction currently lives in.
  if (adjustedSumFreq(SinkTargets, BFI) >
      BFI.getBlockFreq(L.getLoopPreheader()))
    SinkTargets.clear();

  return SinkTargets;
}
// Create the callsite to profile count map which is // used to update the original function's entry count, // after the function is partially inlined into the callsite. void PartialInlinerImpl::computeCallsiteToProfCountMap( Function *DuplicateFunction, DenseMap<User *, uint64_t> &CallSiteToProfCountMap) { std::vector<User *> Users(DuplicateFunction->user_begin(), DuplicateFunction->user_end()); Function *CurrentCaller = nullptr; std::unique_ptr<BlockFrequencyInfo> TempBFI; BlockFrequencyInfo *CurrentCallerBFI = nullptr; auto ComputeCurrBFI = [&,this](Function *Caller) { // For the old pass manager: if (!GetBFI) { DominatorTree DT(*Caller); LoopInfo LI(DT); BranchProbabilityInfo BPI(*Caller, LI); TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI)); CurrentCallerBFI = TempBFI.get(); } else { // New pass manager: CurrentCallerBFI = &(*GetBFI)(*Caller); } }; for (User *User : Users) { CallSite CS = getCallSite(User); Function *Caller = CS.getCaller(); if (CurrentCaller != Caller) { CurrentCaller = Caller; ComputeCurrBFI(Caller); } else { assert(CurrentCallerBFI && "CallerBFI is not set"); } BasicBlock *CallBB = CS.getInstruction()->getParent(); auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB); if (Count) CallSiteToProfCountMap[User] = *Count; else CallSiteToProfCountMap[User] = 0; } }
/// Sinks instructions from loop's preheader to the loop body if the /// sum frequency of inserted copy is smaller than preheader's frequency. static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI, DominatorTree &DT, BlockFrequencyInfo &BFI, ScalarEvolution *SE) { BasicBlock *Preheader = L.getLoopPreheader(); if (!Preheader) return false; // Enable LoopSink only when runtime profile is available. // With static profile, the sinking decision may be sub-optimal. if (!Preheader->getParent()->getEntryCount()) return false; const BlockFrequency PreheaderFreq = BFI.getBlockFreq(Preheader); // If there are no basic blocks with lower frequency than the preheader then // we can avoid the detailed analysis as we will never find profitable sinking // opportunities. if (all_of(L.blocks(), [&](const BasicBlock *BB) { return BFI.getBlockFreq(BB) > PreheaderFreq; })) return false; bool Changed = false; AliasSetTracker CurAST(AA); // Compute alias set. for (BasicBlock *BB : L.blocks()) CurAST.add(*BB); // Sort loop's basic blocks by frequency SmallVector<BasicBlock *, 10> ColdLoopBBs; SmallDenseMap<BasicBlock *, int, 16> LoopBlockNumber; int i = 0; for (BasicBlock *B : L.blocks()) if (BFI.getBlockFreq(B) < BFI.getBlockFreq(L.getLoopPreheader())) { ColdLoopBBs.push_back(B); LoopBlockNumber[B] = ++i; } std::stable_sort(ColdLoopBBs.begin(), ColdLoopBBs.end(), [&](BasicBlock *A, BasicBlock *B) { return BFI.getBlockFreq(A) < BFI.getBlockFreq(B); }); // Traverse preheader's instructions in reverse order becaue if A depends // on B (A appears after B), A needs to be sinked first before B can be // sinked. for (auto II = Preheader->rbegin(), E = Preheader->rend(); II != E;) { Instruction *I = &*II++; if (!canSinkOrHoistInst(*I, &AA, &DT, &L, &CurAST, nullptr)) continue; if (sinkInstruction(L, *I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI)) Changed = true; } if (Changed && SE) SE->forgetLoopDispositions(&L); return Changed; }
/// Given \p BBs as input, find another set of BBs which collectively /// dominates \p BBs and have the minimal sum of frequencies. Return the BB /// set found in \p BBs. static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, BasicBlock *Entry, SmallPtrSet<BasicBlock *, 8> &BBs) { assert(!BBs.count(Entry) && "Assume Entry is not in BBs"); // Nodes on the current path to the root. SmallPtrSet<BasicBlock *, 8> Path; // Candidates includes any block 'BB' in set 'BBs' that is not strictly // dominated by any other blocks in set 'BBs', and all nodes in the path // in the dominator tree from Entry to 'BB'. SmallPtrSet<BasicBlock *, 16> Candidates; for (auto BB : BBs) { // Ignore unreachable basic blocks. if (!DT.isReachableFromEntry(BB)) continue; Path.clear(); // Walk up the dominator tree until Entry or another BB in BBs // is reached. Insert the nodes on the way to the Path. BasicBlock *Node = BB; // The "Path" is a candidate path to be added into Candidates set. bool isCandidate = false; do { Path.insert(Node); if (Node == Entry || Candidates.count(Node)) { isCandidate = true; break; } assert(DT.getNode(Node)->getIDom() && "Entry doens't dominate current Node"); Node = DT.getNode(Node)->getIDom()->getBlock(); } while (!BBs.count(Node)); // If isCandidate is false, Node is another Block in BBs dominating // current 'BB'. Drop the nodes on the Path. if (!isCandidate) continue; // Add nodes on the Path into Candidates. Candidates.insert(Path.begin(), Path.end()); } // Sort the nodes in Candidates in top-down order and save the nodes // in Orders. unsigned Idx = 0; SmallVector<BasicBlock *, 16> Orders; Orders.push_back(Entry); while (Idx != Orders.size()) { BasicBlock *Node = Orders[Idx++]; for (auto ChildDomNode : DT.getNode(Node)->getChildren()) { if (Candidates.count(ChildDomNode->getBlock())) Orders.push_back(ChildDomNode->getBlock()); } } // Visit Orders in bottom-up order. 
using InsertPtsCostPair = std::pair<SmallPtrSet<BasicBlock *, 16>, BlockFrequency>; // InsertPtsMap is a map from a BB to the best insertion points for the // subtree of BB (subtree not including the BB itself). DenseMap<BasicBlock *, InsertPtsCostPair> InsertPtsMap; InsertPtsMap.reserve(Orders.size() + 1); for (auto RIt = Orders.rbegin(); RIt != Orders.rend(); RIt++) { BasicBlock *Node = *RIt; bool NodeInBBs = BBs.count(Node); SmallPtrSet<BasicBlock *, 16> &InsertPts = InsertPtsMap[Node].first; BlockFrequency &InsertPtsFreq = InsertPtsMap[Node].second; // Return the optimal insert points in BBs. if (Node == Entry) { BBs.clear(); if (InsertPtsFreq > BFI.getBlockFreq(Node) || (InsertPtsFreq == BFI.getBlockFreq(Node) && InsertPts.size() > 1)) BBs.insert(Entry); else BBs.insert(InsertPts.begin(), InsertPts.end()); break; } BasicBlock *Parent = DT.getNode(Node)->getIDom()->getBlock(); // Initially, ParentInsertPts is empty and ParentPtsFreq is 0. Every child // will update its parent's ParentInsertPts and ParentPtsFreq. SmallPtrSet<BasicBlock *, 16> &ParentInsertPts = InsertPtsMap[Parent].first; BlockFrequency &ParentPtsFreq = InsertPtsMap[Parent].second; // Choose to insert in Node or in subtree of Node. // Don't hoist to EHPad because we may not find a proper place to insert // in EHPad. // If the total frequency of InsertPts is the same as the frequency of the // target Node, and InsertPts contains more than one nodes, choose hoisting // to reduce code size. if (NodeInBBs || (!Node->isEHPad() && (InsertPtsFreq > BFI.getBlockFreq(Node) || (InsertPtsFreq == BFI.getBlockFreq(Node) && InsertPts.size() > 1)))) { ParentInsertPts.insert(Node); ParentPtsFreq += BFI.getBlockFreq(Node); } else { ParentInsertPts.insert(InsertPts.begin(), InsertPts.end()); ParentPtsFreq += InsertPtsFreq; } } }