bool CostModelAnalysis::runOnFunction(Function &F) {
  this->F = &F;

  // Target information.
  TargetTransformInfo *TTI;
  TTI = getAnalysisIfAvailable<TargetTransformInfo>();
  if (TTI)
    VTTI = TTI->getVectorTargetTransformInfo();

  return false;
}
static bool canSplitCallSite(CallSite CS, TargetTransformInfo &TTI) {
  // FIXME: As of now we handle only CallInst. InvokeInst could be handled
  // without too much effort.
  Instruction *Instr = CS.getInstruction();
  if (!isa<CallInst>(Instr))
    return false;

  BasicBlock *CallSiteBB = Instr->getParent();

  // Allow splitting a call-site only when the CodeSize cost of the
  // instructions before the call is less than DuplicationThreshold. The
  // instructions before the call will be duplicated in the split blocks and
  // corresponding uses will be updated.
  unsigned Cost = 0;
  for (auto &InstBeforeCall :
       llvm::make_range(CallSiteBB->begin(), Instr->getIterator())) {
    Cost += TTI.getInstructionCost(&InstBeforeCall,
                                   TargetTransformInfo::TCK_CodeSize);
    if (Cost >= DuplicationThreshold)
      return false;
  }

  // Need 2 predecessors and cannot split an edge from an IndirectBrInst.
  SmallVector<BasicBlock *, 2> Preds(predecessors(CallSiteBB));
  if (Preds.size() != 2 || isa<IndirectBrInst>(Preds[0]->getTerminator()) ||
      isa<IndirectBrInst>(Preds[1]->getTerminator()))
    return false;

  return CallSiteBB->canSplitPredecessors();
}
static unsigned ComputeSpeculationCost(const Instruction *I,
                                       const TargetTransformInfo &TTI) {
  switch (Operator::getOpcode(I)) {
  case Instruction::GetElementPtr:
  case Instruction::Add:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Select:
  case Instruction::Shl:
  case Instruction::Sub:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::Xor:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::Call:
  case Instruction::BitCast:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::AddrSpaceCast:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::UIToFP:
  case Instruction::SIToFP:
  case Instruction::FPExt:
  case Instruction::FPTrunc:
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::ICmp:
  case Instruction::FCmp:
    return TTI.getUserCost(I);

  default:
    return UINT_MAX; // Disallow anything not whitelisted.
  }
}
static unsigned ComputeSpeculationCost(const Instruction *I,
                                       const TargetTransformInfo &TTI) {
  switch (Operator::getOpcode(I)) {
  case Instruction::GetElementPtr:
  case Instruction::Add:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Select:
  case Instruction::Shl:
  case Instruction::Sub:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::Xor:
  case Instruction::ZExt:
  case Instruction::SExt:
    return TTI.getUserCost(I);

  default:
    return UINT_MAX; // Disallow anything not whitelisted.
  }
}
/// \brief Figure out if the loop is worth full unrolling.
///
/// Complete loop unrolling can make some loads constant, and we need to know
/// if that would expose any further optimization opportunities. This routine
/// estimates this optimization. It computes the cost of the unrolled loop
/// (UnrolledCost) and the dynamic cost of the original loop
/// (RolledDynamicCost). By dynamic cost we mean that we won't count costs of
/// blocks that are known not to be executed (i.e. if we have a branch in the
/// loop and we know that at the given iteration its condition would be
/// resolved to true, we won't add up the cost of the 'false'-block).
/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
/// the analysis failed (no benefits expected from the unrolling, or the loop
/// is too big to analyze), the returned value is None.
Optional<EstimatedUnrollCost>
analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE,
                      const TargetTransformInfo &TTI,
                      unsigned MaxUnrolledLoopSize) {
  // We want to be able to scale offsets by the trip count and add more offsets
  // to them without checking for overflows, and we already don't want to
  // analyze *massive* trip counts, so we force the max to be reasonably small.
  assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) &&
         "The unroll iterations max is too large!");

  // Don't simulate loops with a big or unknown tripcount.
  if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
      TripCount > UnrollMaxIterationsCountToAnalyze)
    return None;

  SmallSetVector<BasicBlock *, 16> BBWorklist;
  DenseMap<Value *, Constant *> SimplifiedValues;

  // The estimated cost of the unrolled form of the loop. We try to estimate
  // this by simplifying as much as we can while computing the estimate.
  unsigned UnrolledCost = 0;
  // We also track the estimated dynamic (that is, actually executed) cost in
  // the rolled form. This helps identify cases when the savings from unrolling
  // aren't just exposing dead control flows, but actual reduced dynamic
  // instructions due to the simplifications which we expect to occur after
  // unrolling.
  unsigned RolledDynamicCost = 0;

  // Simulate execution of each iteration of the loop counting instructions,
  // which would be simplified.
  // Since the same load will take different values on different iterations,
  // we literally have to go through all loop's iterations.
  for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
    SimplifiedValues.clear();
    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, L, SE);

    BBWorklist.clear();
    BBWorklist.insert(L->getHeader());
    // Note that we *must not* cache the size, this loop grows the worklist.
    for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
      BasicBlock *BB = BBWorklist[Idx];

      // Visit all instructions in the given basic block and try to simplify
      // it. We don't change the actual IR, just count optimization
      // opportunities.
      for (Instruction &I : *BB) {
        unsigned InstCost = TTI.getUserCost(&I);

        // Visit the instruction to analyze its loop cost after unrolling,
        // and if the visitor returns false, include this instruction in the
        // unrolled cost.
        if (!Analyzer.visit(I))
          UnrolledCost += InstCost;

        // Also track this instruction's expected cost when executing the
        // rolled loop form.
        RolledDynamicCost += InstCost;

        // If unrolled body turns out to be too big, bail out.
        if (UnrolledCost > MaxUnrolledLoopSize)
          return None;
      }

      // Add BB's successors to the worklist.
      for (BasicBlock *Succ : successors(BB))
        if (L->contains(Succ))
          BBWorklist.insert(Succ);
    }

    // If we found no optimization opportunities on the first iteration, we
    // won't find them on later ones either.
    if (UnrolledCost == RolledDynamicCost)
      return None;
  }
  return {{UnrolledCost, RolledDynamicCost}};
}
/// \brief Figure out if the loop is worth full unrolling.
///
/// Complete loop unrolling can make some loads constant, and we need to know
/// if that would expose any further optimization opportunities. This routine
/// estimates this optimization. It computes the cost of the unrolled loop
/// (UnrolledCost) and the dynamic cost of the original loop
/// (RolledDynamicCost). By dynamic cost we mean that we won't count costs of
/// blocks that are known not to be executed (i.e. if we have a branch in the
/// loop and we know that at the given iteration its condition would be
/// resolved to true, we won't add up the cost of the 'false'-block).
/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
/// the analysis failed (no benefits expected from the unrolling, or the loop
/// is too big to analyze), the returned value is None.
static Optional<EstimatedUnrollCost>
analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
                      ScalarEvolution &SE, const TargetTransformInfo &TTI,
                      int MaxUnrolledLoopSize) {
  // We want to be able to scale offsets by the trip count and add more offsets
  // to them without checking for overflows, and we already don't want to
  // analyze *massive* trip counts, so we force the max to be reasonably small.
  assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) &&
         "The unroll iterations max is too large!");

  // Only analyze inner loops. We can't properly estimate cost of nested loops
  // and we won't visit inner loops again anyway.
  if (!L->empty())
    return None;

  // Don't simulate loops with a big or unknown tripcount.
  if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
      TripCount > UnrollMaxIterationsCountToAnalyze)
    return None;

  SmallSetVector<BasicBlock *, 16> BBWorklist;
  SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;
  DenseMap<Value *, Constant *> SimplifiedValues;
  SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;

  // The estimated cost of the unrolled form of the loop. We try to estimate
  // this by simplifying as much as we can while computing the estimate.
  int UnrolledCost = 0;

  // We also track the estimated dynamic (that is, actually executed) cost in
  // the rolled form. This helps identify cases when the savings from unrolling
  // aren't just exposing dead control flows, but actual reduced dynamic
  // instructions due to the simplifications which we expect to occur after
  // unrolling.
  int RolledDynamicCost = 0;

  // We track the simplification of each instruction in each iteration. We use
  // this to recursively merge costs into the unrolled cost on-demand so that
  // we don't count the cost of any dead code. This is essentially a map from
  // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
  DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;

  // A small worklist used to accumulate cost of instructions from each
  // observable and reached root in the loop.
  SmallVector<Instruction *, 16> CostWorklist;

  // PHI-used worklist used between iterations while accumulating cost.
  SmallVector<Instruction *, 4> PHIUsedList;

  // Helper function to accumulate cost for instructions in the loop.
  auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
    assert(Iteration >= 0 && "Cannot have a negative iteration!");
    assert(CostWorklist.empty() && "Must start with an empty cost list");
    assert(PHIUsedList.empty() && "Must start with an empty phi used list");
    CostWorklist.push_back(&RootI);
    for (;; --Iteration) {
      do {
        Instruction *I = CostWorklist.pop_back_val();

        // InstCostMap only uses I and Iteration as a key, the other two values
        // don't matter here.
        auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
        if (CostIter == InstCostMap.end())
          // If an input to a PHI node comes from a dead path through the loop
          // we may have no cost data for it here. What that actually means is
          // that it is free.
          continue;
        auto &Cost = *CostIter;
        if (Cost.IsCounted)
          // Already counted this instruction.
          continue;

        // Mark that we are counting the cost of this instruction now.
        Cost.IsCounted = true;

        // If this is a PHI node in the loop header, just add it to the PHI
        // set.
        if (auto *PhiI = dyn_cast<PHINode>(I))
          if (PhiI->getParent() == L->getHeader()) {
            assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
                                  "inherently simplify during unrolling.");
            if (Iteration == 0)
              continue;

            // Push the incoming value from the backedge into the PHI used list
            // if it is an in-loop instruction. We'll use this to populate the
            // cost worklist for the next iteration (as we count backwards).
            if (auto *OpI = dyn_cast<Instruction>(
                    PhiI->getIncomingValueForBlock(L->getLoopLatch())))
              if (L->contains(OpI))
                PHIUsedList.push_back(OpI);
            continue;
          }

        // First accumulate the cost of this instruction.
        if (!Cost.IsFree) {
          UnrolledCost += TTI.getUserCost(I);
          DEBUG(dbgs() << "Adding cost of instruction (iteration " << Iteration
                       << "): ");
          DEBUG(I->dump());
        }

        // We must count the cost of every operand which is not free,
        // recursively. If we reach a loop PHI node, simply add it to the set
        // to be considered on the next iteration (backwards!).
        for (Value *Op : I->operands()) {
          // Check whether this operand is free due to being a constant or
          // outside the loop.
          auto *OpI = dyn_cast<Instruction>(Op);
          if (!OpI || !L->contains(OpI))
            continue;

          // Otherwise accumulate its cost.
          CostWorklist.push_back(OpI);
        }
      } while (!CostWorklist.empty());

      if (PHIUsedList.empty())
        // We've exhausted the search.
        break;

      assert(Iteration > 0 &&
             "Cannot track PHI-used values past the first iteration!");
      CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
      PHIUsedList.clear();
    }
  };

  // Ensure that we don't violate the loop structure invariants relied on by
  // this analysis.
  assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
  assert(L->isLCSSAForm(DT) &&
         "Must have loops in LCSSA form to track live-out values.");

  DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");

  // Simulate execution of each iteration of the loop counting instructions,
  // which would be simplified.
  // Since the same load will take different values on different iterations,
  // we literally have to go through all loop's iterations.
  for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
    DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");

    // Prepare for the iteration by collecting any simplified entry or backedge
    // inputs.
    for (Instruction &I : *L->getHeader()) {
      auto *PHI = dyn_cast<PHINode>(&I);
      if (!PHI)
        break;

      // The loop header PHI nodes must have exactly two inputs: one from the
      // loop preheader and one from the loop latch.
      assert(PHI->getNumIncomingValues() == 2 &&
             "Must have an incoming value only for the preheader and the "
             "latch.");

      Value *V = PHI->getIncomingValueForBlock(
          Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
      Constant *C = dyn_cast<Constant>(V);
      if (Iteration != 0 && !C)
        C = SimplifiedValues.lookup(V);
      if (C)
        SimplifiedInputValues.push_back({PHI, C});
    }

    // Now clear and re-populate the map for the next iteration.
    SimplifiedValues.clear();
    while (!SimplifiedInputValues.empty())
      SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());

    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);

    BBWorklist.clear();
    BBWorklist.insert(L->getHeader());
    // Note that we *must not* cache the size, this loop grows the worklist.
    for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
      BasicBlock *BB = BBWorklist[Idx];

      // Visit all instructions in the given basic block and try to simplify
      // it. We don't change the actual IR, just count optimization
      // opportunities.
      for (Instruction &I : *BB) {
        // Track this instruction's expected baseline cost when executing the
        // rolled loop form.
        RolledDynamicCost += TTI.getUserCost(&I);

        // Visit the instruction to analyze its loop cost after unrolling,
        // and if the visitor returns true, mark the instruction as free after
        // unrolling and continue.
        bool IsFree = Analyzer.visit(I);
        bool Inserted = InstCostMap.insert({&I, (int)Iteration,
                                            (unsigned)IsFree,
                                            /*IsCounted*/ false}).second;
        (void)Inserted;
        assert(Inserted && "Cannot have a state for an unvisited instruction!");

        if (IsFree)
          continue;

        // If the instruction might have a side-effect recursively account for
        // the cost of it and all the instructions leading up to it.
        if (I.mayHaveSideEffects())
          AddCostRecursively(I, Iteration);

        // Can't properly model a cost of a call.
        // FIXME: With a proper cost model we should be able to do it.
        if (isa<CallInst>(&I))
          return None;

        // If unrolled body turns out to be too big, bail out.
        if (UnrolledCost > MaxUnrolledLoopSize) {
          DEBUG(dbgs() << "  Exceeded threshold.. exiting.\n"
                       << "  UnrolledCost: " << UnrolledCost
                       << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
                       << "\n");
          return None;
        }
      }

      TerminatorInst *TI = BB->getTerminator();

      // Add in the live successors by first checking whether we have a
      // terminator that may be simplified based on the values simplified by
      // this call.
      BasicBlock *KnownSucc = nullptr;
      if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
        if (BI->isConditional()) {
          if (Constant *SimpleCond =
                  SimplifiedValues.lookup(BI->getCondition())) {
            // Just take the first successor if condition is undef
            if (isa<UndefValue>(SimpleCond))
              KnownSucc = BI->getSuccessor(0);
            else if (ConstantInt *SimpleCondVal =
                         dyn_cast<ConstantInt>(SimpleCond))
              KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
          }
        }
      } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
        if (Constant *SimpleCond =
                SimplifiedValues.lookup(SI->getCondition())) {
          // Just take the first successor if condition is undef
          if (isa<UndefValue>(SimpleCond))
            KnownSucc = SI->getSuccessor(0);
          else if (ConstantInt *SimpleCondVal =
                       dyn_cast<ConstantInt>(SimpleCond))
            KnownSucc = SI->findCaseValue(SimpleCondVal).getCaseSuccessor();
        }
      }
      if (KnownSucc) {
        if (L->contains(KnownSucc))
          BBWorklist.insert(KnownSucc);
        else
          ExitWorklist.insert({BB, KnownSucc});
        continue;
      }

      // Add BB's successors to the worklist.
      for (BasicBlock *Succ : successors(BB))
        if (L->contains(Succ))
          BBWorklist.insert(Succ);
        else
          ExitWorklist.insert({BB, Succ});
      AddCostRecursively(*TI, Iteration);
    }

    // If we found no optimization opportunities on the first iteration, we
    // won't find them on later ones either.
    if (UnrolledCost == RolledDynamicCost) {
      DEBUG(dbgs() << "  No opportunities found.. exiting.\n"
                   << "  UnrolledCost: " << UnrolledCost << "\n");
      return None;
    }
  }

  while (!ExitWorklist.empty()) {
    BasicBlock *ExitingBB, *ExitBB;
    std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();

    for (Instruction &I : *ExitBB) {
      auto *PN = dyn_cast<PHINode>(&I);
      if (!PN)
        break;

      Value *Op = PN->getIncomingValueForBlock(ExitingBB);
      if (auto *OpI = dyn_cast<Instruction>(Op))
        if (L->contains(OpI))
          AddCostRecursively(*OpI, TripCount - 1);
    }
  }

  DEBUG(dbgs() << "Analysis finished:\n"
               << "UnrolledCost: " << UnrolledCost << ", "
               << "RolledDynamicCost: " << RolledDynamicCost << "\n");
  return {{UnrolledCost, RolledDynamicCost}};
}
/// Gather the various unrolling parameters based on the defaults, compiler
/// flags, TTI overrides and user specified parameters.
static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
    Loop *L, const TargetTransformInfo &TTI, Optional<unsigned> UserThreshold,
    Optional<unsigned> UserCount, Optional<bool> UserAllowPartial,
    Optional<bool> UserRuntime) {
  TargetTransformInfo::UnrollingPreferences UP;

  // Set up the defaults
  UP.Threshold = 150;
  UP.PercentDynamicCostSavedThreshold = 50;
  UP.DynamicCostSavingsDiscount = 100;
  UP.OptSizeThreshold = 0;
  UP.PartialThreshold = UP.Threshold;
  UP.PartialOptSizeThreshold = 0;
  UP.Count = 0;
  UP.MaxCount = UINT_MAX;
  UP.FullUnrollMaxCount = UINT_MAX;
  UP.Partial = false;
  UP.Runtime = false;
  UP.AllowRemainder = true;
  UP.AllowExpensiveTripCount = false;
  UP.Force = false;

  // Override with any target specific settings
  TTI.getUnrollingPreferences(L, UP);

  // Apply size attributes
  if (L->getHeader()->getParent()->optForSize()) {
    UP.Threshold = UP.OptSizeThreshold;
    UP.PartialThreshold = UP.PartialOptSizeThreshold;
  }

  // Apply any user values specified by cl::opt
  if (UnrollThreshold.getNumOccurrences() > 0) {
    UP.Threshold = UnrollThreshold;
    UP.PartialThreshold = UnrollThreshold;
  }
  if (UnrollPercentDynamicCostSavedThreshold.getNumOccurrences() > 0)
    UP.PercentDynamicCostSavedThreshold =
        UnrollPercentDynamicCostSavedThreshold;
  if (UnrollDynamicCostSavingsDiscount.getNumOccurrences() > 0)
    UP.DynamicCostSavingsDiscount = UnrollDynamicCostSavingsDiscount;
  if (UnrollMaxCount.getNumOccurrences() > 0)
    UP.MaxCount = UnrollMaxCount;
  if (UnrollFullMaxCount.getNumOccurrences() > 0)
    UP.FullUnrollMaxCount = UnrollFullMaxCount;
  if (UnrollAllowPartial.getNumOccurrences() > 0)
    UP.Partial = UnrollAllowPartial;
  if (UnrollAllowRemainder.getNumOccurrences() > 0)
    UP.AllowRemainder = UnrollAllowRemainder;
  if (UnrollRuntime.getNumOccurrences() > 0)
    UP.Runtime = UnrollRuntime;

  // Apply user values provided by argument
  if (UserThreshold.hasValue()) {
    UP.Threshold = *UserThreshold;
    UP.PartialThreshold = *UserThreshold;
  }
  if (UserCount.hasValue())
    UP.Count = *UserCount;
  if (UserAllowPartial.hasValue())
    UP.Partial = *UserAllowPartial;
  if (UserRuntime.hasValue())
    UP.Runtime = *UserRuntime;

  return UP;
}
/// analyzeBasicBlock - Fill in the current structure with information gleaned
/// from the specified block.
void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB,
                                    const TargetTransformInfo &TTI) {
  ++NumBlocks;
  unsigned NumInstsBeforeThisBB = NumInsts;
  for (BasicBlock::const_iterator II = BB->begin(), E = BB->end();
       II != E; ++II) {
    // Special handling for calls.
    if (isa<CallInst>(II) || isa<InvokeInst>(II)) {
      ImmutableCallSite CS(cast<Instruction>(II));

      if (const Function *F = CS.getCalledFunction()) {
        // If a function is both internal and has a single use, then it is
        // extremely likely to get inlined in the future (it was probably
        // exposed by an interleaved devirtualization pass).
        if (!CS.isNoInline() && F->hasInternalLinkage() && F->hasOneUse())
          ++NumInlineCandidates;

        // If this call is to the function itself, then the function is
        // recursive. Inlining it into other functions is a bad idea, because
        // this is basically just a form of loop peeling, and our metrics
        // aren't useful for that case.
        if (F == BB->getParent())
          isRecursive = true;

        if (TTI.isLoweredToCall(F))
          ++NumCalls;
      } else {
        // We don't want inline asm to count as a call - that would prevent
        // loop unrolling. The argument setup cost is still real, though.
        if (!isa<InlineAsm>(CS.getCalledValue()))
          ++NumCalls;
      }
    }

    if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
      if (!AI->isStaticAlloca())
        this->usesDynamicAlloca = true;
    }

    if (isa<ExtractElementInst>(II) || II->getType()->isVectorTy())
      ++NumVectorInsts;

    if (const CallInst *CI = dyn_cast<CallInst>(II))
      if (CI->hasFnAttr(Attribute::NoDuplicate))
        notDuplicatable = true;

    if (const InvokeInst *InvI = dyn_cast<InvokeInst>(II))
      if (InvI->hasFnAttr(Attribute::NoDuplicate))
        notDuplicatable = true;

    if (const IntrinsicInst *CI = dyn_cast<const IntrinsicInst>(II))
      if (CI->getIntrinsicID() == Intrinsic::loopbound)
        notDuplicatable = true;

    NumInsts += TTI.getUserCost(&*II);
  }

  if (isa<ReturnInst>(BB->getTerminator()))
    ++NumRets;

  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddress's (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions
  // with indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function. And as a QOI issue,
  // if someone is using a blockaddress without an indirectbr, and that
  // reference somehow ends up in another function or global, we probably
  // don't want to inline this function.
  notDuplicatable |= isa<IndirectBrInst>(BB->getTerminator());

  // Remember NumInsts for this BB.
  NumBBInsts[BB] = NumInsts - NumInstsBeforeThisBB;
}
/// Gather the various unrolling parameters based on the defaults, compiler
/// flags, TTI overrides, pragmas, and user specified parameters.
static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
    Loop *L, const TargetTransformInfo &TTI, Optional<unsigned> UserThreshold,
    Optional<unsigned> UserCount, Optional<bool> UserAllowPartial,
    Optional<bool> UserRuntime, unsigned PragmaCount, bool PragmaFullUnroll,
    bool PragmaEnableUnroll, unsigned TripCount) {
  TargetTransformInfo::UnrollingPreferences UP;

  // Set up the defaults
  UP.Threshold = 150;
  UP.PercentDynamicCostSavedThreshold = 20;
  UP.DynamicCostSavingsDiscount = 2000;
  UP.OptSizeThreshold = 0;
  UP.PartialThreshold = UP.Threshold;
  UP.PartialOptSizeThreshold = 0;
  UP.Count = 0;
  UP.MaxCount = UINT_MAX;
  UP.FullUnrollMaxCount = UINT_MAX;
  UP.Partial = false;
  UP.Runtime = false;
  UP.AllowExpensiveTripCount = false;

  // Override with any target specific settings
  TTI.getUnrollingPreferences(L, UP);

  // Apply size attributes
  if (L->getHeader()->getParent()->optForSize()) {
    UP.Threshold = UP.OptSizeThreshold;
    UP.PartialThreshold = UP.PartialOptSizeThreshold;
  }

  // Apply unroll count pragmas
  if (PragmaCount)
    UP.Count = PragmaCount;
  else if (PragmaFullUnroll)
    UP.Count = TripCount;

  // Apply any user values specified by cl::opt
  if (UnrollThreshold.getNumOccurrences() > 0) {
    UP.Threshold = UnrollThreshold;
    UP.PartialThreshold = UnrollThreshold;
  }
  if (UnrollPercentDynamicCostSavedThreshold.getNumOccurrences() > 0)
    UP.PercentDynamicCostSavedThreshold =
        UnrollPercentDynamicCostSavedThreshold;
  if (UnrollDynamicCostSavingsDiscount.getNumOccurrences() > 0)
    UP.DynamicCostSavingsDiscount = UnrollDynamicCostSavingsDiscount;
  if (UnrollCount.getNumOccurrences() > 0)
    UP.Count = UnrollCount;
  if (UnrollMaxCount.getNumOccurrences() > 0)
    UP.MaxCount = UnrollMaxCount;
  if (UnrollFullMaxCount.getNumOccurrences() > 0)
    UP.FullUnrollMaxCount = UnrollFullMaxCount;
  if (UnrollAllowPartial.getNumOccurrences() > 0)
    UP.Partial = UnrollAllowPartial;
  if (UnrollRuntime.getNumOccurrences() > 0)
    UP.Runtime = UnrollRuntime;

  // Apply user values provided by argument
  if (UserThreshold.hasValue()) {
    UP.Threshold = *UserThreshold;
    UP.PartialThreshold = *UserThreshold;
  }
  if (UserCount.hasValue())
    UP.Count = *UserCount;
  if (UserAllowPartial.hasValue())
    UP.Partial = *UserAllowPartial;
  if (UserRuntime.hasValue())
    UP.Runtime = *UserRuntime;

  if (PragmaCount > 0 ||
      ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount != 0)) {
    // If the loop has an unrolling pragma, we want to be more aggressive with
    // unrolling limits. Set thresholds to at least the PragmaThreshold value
    // which is larger than the default limits.
    if (UP.Threshold != NoThreshold)
      UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
    if (UP.PartialThreshold != NoThreshold)
      UP.PartialThreshold =
          std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
  }

  return UP;
}
/// \brief Figure out if the loop is worth full unrolling.
///
/// Complete loop unrolling can make some loads constant, and we need to know
/// if that would expose any further optimization opportunities. This routine
/// estimates this optimization. It computes the cost of the unrolled loop
/// (UnrolledCost) and the dynamic cost of the original loop
/// (RolledDynamicCost). By dynamic cost we mean that we won't count costs of
/// blocks that are known not to be executed (i.e. if we have a branch in the
/// loop and we know that at the given iteration its condition would be
/// resolved to true, we won't add up the cost of the 'false'-block).
/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
/// the analysis failed (no benefits expected from the unrolling, or the loop
/// is too big to analyze), the returned value is None.
static Optional<EstimatedUnrollCost>
analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
                      ScalarEvolution &SE, const TargetTransformInfo &TTI,
                      int MaxUnrolledLoopSize) {
  // We want to be able to scale offsets by the trip count and add more offsets
  // to them without checking for overflows, and we already don't want to
  // analyze *massive* trip counts, so we force the max to be reasonably small.
  assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) &&
         "The unroll iterations max is too large!");

  // Don't simulate loops with a big or unknown tripcount.
  if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
      TripCount > UnrollMaxIterationsCountToAnalyze)
    return None;

  SmallSetVector<BasicBlock *, 16> BBWorklist;
  DenseMap<Value *, Constant *> SimplifiedValues;
  SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;

  // The estimated cost of the unrolled form of the loop. We try to estimate
  // this by simplifying as much as we can while computing the estimate.
  int UnrolledCost = 0;
  // We also track the estimated dynamic (that is, actually executed) cost in
  // the rolled form. This helps identify cases when the savings from unrolling
  // aren't just exposing dead control flows, but actual reduced dynamic
  // instructions due to the simplifications which we expect to occur after
  // unrolling.
  int RolledDynamicCost = 0;

  // Ensure that we don't violate the loop structure invariants relied on by
  // this analysis.
  assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
  assert(L->isLCSSAForm(DT) &&
         "Must have loops in LCSSA form to track live-out values.");

  DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");

  // Simulate execution of each iteration of the loop counting instructions,
  // which would be simplified.
  // Since the same load will take different values on different iterations,
  // we literally have to go through all loop's iterations.
  for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
    DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");

    // Prepare for the iteration by collecting any simplified entry or backedge
    // inputs.
    for (Instruction &I : *L->getHeader()) {
      auto *PHI = dyn_cast<PHINode>(&I);
      if (!PHI)
        break;

      // The loop header PHI nodes must have exactly two inputs: one from the
      // loop preheader and one from the loop latch.
      assert(PHI->getNumIncomingValues() == 2 &&
             "Must have an incoming value only for the preheader and the "
             "latch.");

      Value *V = PHI->getIncomingValueForBlock(
          Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
      Constant *C = dyn_cast<Constant>(V);
      if (Iteration != 0 && !C)
        C = SimplifiedValues.lookup(V);
      if (C)
        SimplifiedInputValues.push_back({PHI, C});
    }

    // Now clear and re-populate the map for the next iteration.
    SimplifiedValues.clear();
    while (!SimplifiedInputValues.empty())
      SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());

    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE);

    BBWorklist.clear();
    BBWorklist.insert(L->getHeader());
    // Note that we *must not* cache the size, this loop grows the worklist.
    for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
      BasicBlock *BB = BBWorklist[Idx];

      // Visit all instructions in the given basic block and try to simplify
      // it. We don't change the actual IR, just count optimization
      // opportunities.
      for (Instruction &I : *BB) {
        int InstCost = TTI.getUserCost(&I);

        // Visit the instruction to analyze its loop cost after unrolling,
        // and if the visitor returns false, include this instruction in the
        // unrolled cost.
        if (!Analyzer.visit(I))
          UnrolledCost += InstCost;
        else {
          DEBUG(dbgs() << "  " << I
                       << " would be simplified if loop is unrolled.\n");
          (void)0;
        }

        // Also track this instruction's expected cost when executing the
        // rolled loop form.
        RolledDynamicCost += InstCost;

        // If unrolled body turns out to be too big, bail out.
        if (UnrolledCost > MaxUnrolledLoopSize) {
          DEBUG(dbgs() << "  Exceeded threshold.. exiting.\n"
                       << "  UnrolledCost: " << UnrolledCost
                       << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
                       << "\n");
          return None;
        }
      }

      TerminatorInst *TI = BB->getTerminator();

      // Add in the live successors by first checking whether we have a
      // terminator that may be simplified based on the values simplified by
      // this call.
      if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
        if (BI->isConditional()) {
          if (Constant *SimpleCond =
                  SimplifiedValues.lookup(BI->getCondition())) {
            BasicBlock *Succ = nullptr;
            // Just take the first successor if condition is undef
            if (isa<UndefValue>(SimpleCond))
              Succ = BI->getSuccessor(0);
            else
              Succ = BI->getSuccessor(
                  cast<ConstantInt>(SimpleCond)->isZero() ? 1 : 0);
            if (L->contains(Succ))
              BBWorklist.insert(Succ);
            continue;
          }
        }
      } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
        if (Constant *SimpleCond =
                SimplifiedValues.lookup(SI->getCondition())) {
          BasicBlock *Succ = nullptr;
          // Just take the first successor if condition is undef
          if (isa<UndefValue>(SimpleCond))
            Succ = SI->getSuccessor(0);
          else
            Succ = SI->findCaseValue(cast<ConstantInt>(SimpleCond))
                       .getCaseSuccessor();
          if (L->contains(Succ))
            BBWorklist.insert(Succ);
          continue;
        }
      }

      // Add BB's successors to the worklist.
      for (BasicBlock *Succ : successors(BB))
        if (L->contains(Succ))
          BBWorklist.insert(Succ);
    }

    // If we found no optimization opportunities on the first iteration, we
    // won't find them on later ones either.
    if (UnrolledCost == RolledDynamicCost) {
      DEBUG(dbgs() << "  No opportunities found.. exiting.\n"
                   << "  UnrolledCost: " << UnrolledCost << "\n");
      return None;
    }
  }

  DEBUG(dbgs() << "Analysis finished:\n"
               << "UnrolledCost: " << UnrolledCost << ", "
               << "RolledDynamicCost: " << RolledDynamicCost << "\n");
  return {{UnrolledCost, RolledDynamicCost}};
}