MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
  if (!A || !B)
    return nullptr;

  if (A == B)
    return A;

  // For struct-path aware TBAA, we use the access type of the tag.
  bool StructPath = isStructPathTBAA(A) && isStructPathTBAA(B);
  if (StructPath) {
    A = cast_or_null<MDNode>(A->getOperand(1));
    if (!A)
      return nullptr;
    B = cast_or_null<MDNode>(B->getOperand(1));
    if (!B)
      return nullptr;
  }

  SmallSetVector<MDNode *, 4> PathA;
  MDNode *T = A;
  while (T) {
    if (PathA.count(T))
      report_fatal_error("Cycle found in TBAA metadata.");
    PathA.insert(T);
    T = T->getNumOperands() >= 2 ? cast_or_null<MDNode>(T->getOperand(1))
                                 : nullptr;
  }

  SmallSetVector<MDNode *, 4> PathB;
  T = B;
  while (T) {
    if (PathB.count(T))
      report_fatal_error("Cycle found in TBAA metadata.");
    PathB.insert(T);
    T = T->getNumOperands() >= 2 ? cast_or_null<MDNode>(T->getOperand(1))
                                 : nullptr;
  }

  int IA = PathA.size() - 1;
  int IB = PathB.size() - 1;

  MDNode *Ret = nullptr;
  while (IA >= 0 && IB >= 0) {
    if (PathA[IA] == PathB[IB])
      Ret = PathA[IA];
    else
      break;
    --IA;
    --IB;
  }

  if (!StructPath)
    return Ret;

  if (!Ret)
    return nullptr;
  // We need to convert from a type node to a tag node.
  Type *Int64 = IntegerType::get(A->getContext(), 64);
  Metadata *Ops[3] = {Ret, Ret,
                      ConstantAsMetadata::get(ConstantInt::get(Int64, 0))};
  return MDNode::get(A->getContext(), Ops);
}
MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
  if (!A || !B)
    return nullptr;

  if (A == B)
    return A;

  // For struct-path aware TBAA, we use the access type of the tag.
  assert(isStructPathTBAA(A) && isStructPathTBAA(B) &&
         "Auto upgrade should have taken care of this!");
  A = cast_or_null<MDNode>(MutableTBAAStructTagNode(A).getAccessType());
  if (!A)
    return nullptr;
  B = cast_or_null<MDNode>(MutableTBAAStructTagNode(B).getAccessType());
  if (!B)
    return nullptr;

  SmallSetVector<MDNode *, 4> PathA;
  MutableTBAANode TA(A);
  while (TA.getNode()) {
    if (PathA.count(TA.getNode()))
      report_fatal_error("Cycle found in TBAA metadata.");
    PathA.insert(TA.getNode());
    TA = TA.getParent();
  }

  SmallSetVector<MDNode *, 4> PathB;
  MutableTBAANode TB(B);
  while (TB.getNode()) {
    if (PathB.count(TB.getNode()))
      report_fatal_error("Cycle found in TBAA metadata.");
    PathB.insert(TB.getNode());
    TB = TB.getParent();
  }

  int IA = PathA.size() - 1;
  int IB = PathB.size() - 1;

  MDNode *Ret = nullptr;
  while (IA >= 0 && IB >= 0) {
    if (PathA[IA] == PathB[IB])
      Ret = PathA[IA];
    else
      break;
    --IA;
    --IB;
  }

  if (!Ret)
    return nullptr;
  // We need to convert from a type node to a tag node.
  Type *Int64 = IntegerType::get(A->getContext(), 64);
  Metadata *Ops[3] = {Ret, Ret,
                      ConstantAsMetadata::get(ConstantInt::get(Int64, 0))};
  return MDNode::get(A->getContext(), Ops);
}
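// Hedged usage sketch (not part of the snippets above): one way a transform
// might call getMostGenericTBAA when it merges two memory accesses. The
// helper name mergeTBAATags is hypothetical; Instruction::getMetadata /
// setMetadata with the LLVMContext::MD_tbaa kind are the standard attachment
// APIs. A null result from getMostGenericTBAA simply drops the !tbaa
// attachment, which is always conservatively correct.

#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

static void mergeTBAATags(Instruction *Merged, const Instruction *A,
                          const Instruction *B) {
  MDNode *TagA = A->getMetadata(LLVMContext::MD_tbaa);
  MDNode *TagB = B->getMetadata(LLVMContext::MD_tbaa);
  // Keep only the deepest common ancestor of the two access types; setting
  // null metadata erases the attachment entirely.
  Merged->setMetadata(LLVMContext::MD_tbaa,
                      MDNode::getMostGenericTBAA(TagA, TagB));
}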
/// Tests whether this function is known to not return null.
///
/// Requires that the function returns a pointer.
///
/// Returns true if it believes the function will not return null, and sets
/// \p Speculative based on whether the returned conclusion is a speculative
/// conclusion due to SCC calls.
static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes,
                            bool &Speculative) {
  assert(F->getReturnType()->isPointerTy() &&
         "nonnull only meaningful on pointer types");
  Speculative = false;

  SmallSetVector<Value *, 8> FlowsToReturn;
  for (BasicBlock &BB : *F)
    if (auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator()))
      FlowsToReturn.insert(Ret->getReturnValue());

  for (unsigned i = 0; i != FlowsToReturn.size(); ++i) {
    Value *RetVal = FlowsToReturn[i];

    // If this value is locally known to be non-null, we're good.
    if (isKnownNonNull(RetVal))
      continue;

    // Otherwise, we need to look upwards since we can't make any local
    // conclusions.
    Instruction *RVI = dyn_cast<Instruction>(RetVal);
    if (!RVI)
      return false;
    switch (RVI->getOpcode()) {
    // Extend the analysis by looking upwards.
    case Instruction::BitCast:
    case Instruction::GetElementPtr:
    case Instruction::AddrSpaceCast:
      FlowsToReturn.insert(RVI->getOperand(0));
      continue;
    case Instruction::Select: {
      SelectInst *SI = cast<SelectInst>(RVI);
      FlowsToReturn.insert(SI->getTrueValue());
      FlowsToReturn.insert(SI->getFalseValue());
      continue;
    }
    case Instruction::PHI: {
      PHINode *PN = cast<PHINode>(RVI);
      for (int i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
        FlowsToReturn.insert(PN->getIncomingValue(i));
      continue;
    }
    case Instruction::Call:
    case Instruction::Invoke: {
      CallSite CS(RVI);
      Function *Callee = CS.getCalledFunction();
      // A call to a node within the SCC is assumed to return null until
      // proven otherwise.
      if (Callee && SCCNodes.count(Callee)) {
        Speculative = true;
        continue;
      }
      return false;
    }
    default:
      return false; // Unknown source, may be null.
    }
    llvm_unreachable("should have either continued or returned");
  }

  return true;
}
/// Tests whether a function is "malloc-like".
///
/// A function is "malloc-like" if it returns either null or a pointer that
/// doesn't alias any other pointer visible to the caller.
static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) {
  SmallSetVector<Value *, 8> FlowsToReturn;
  for (BasicBlock &BB : *F)
    if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator()))
      FlowsToReturn.insert(Ret->getReturnValue());

  for (unsigned i = 0; i != FlowsToReturn.size(); ++i) {
    Value *RetVal = FlowsToReturn[i];

    if (Constant *C = dyn_cast<Constant>(RetVal)) {
      if (!C->isNullValue() && !isa<UndefValue>(C))
        return false;

      continue;
    }

    if (isa<Argument>(RetVal))
      return false;

    if (Instruction *RVI = dyn_cast<Instruction>(RetVal))
      switch (RVI->getOpcode()) {
      // Extend the analysis by looking upwards.
      case Instruction::BitCast:
      case Instruction::GetElementPtr:
      case Instruction::AddrSpaceCast:
        FlowsToReturn.insert(RVI->getOperand(0));
        continue;
      case Instruction::Select: {
        SelectInst *SI = cast<SelectInst>(RVI);
        FlowsToReturn.insert(SI->getTrueValue());
        FlowsToReturn.insert(SI->getFalseValue());
        continue;
      }
      case Instruction::PHI: {
        PHINode *PN = cast<PHINode>(RVI);
        for (Value *IncValue : PN->incoming_values())
          FlowsToReturn.insert(IncValue);
        continue;
      }

      // Check whether the pointer came from an allocation.
      case Instruction::Alloca:
        break;
      case Instruction::Call:
      case Instruction::Invoke: {
        CallSite CS(RVI);
        if (CS.hasRetAttr(Attribute::NoAlias))
          break;
        if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction()))
          break;
        LLVM_FALLTHROUGH;
      }
      default:
        return false; // Did not come from an allocation.
      }

    if (PointerMayBeCaptured(RetVal, false, /*StoreCaptures=*/false))
      return false;
  }

  return true;
}
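// Both of the return-value walks above rely on the same SmallSetVector
// worklist idiom: iterate by index and re-read size() on every trip, because
// the loop body inserts new entries, while the set half of the container
// silently rejects duplicates so each value is visited at most once. A
// minimal sketch of the bare pattern follows; forEachReachable and Expand
// are hypothetical names, not LLVM APIs.

#include "llvm/ADT/SmallSetVector.h"
#include "llvm/IR/Value.h"
using namespace llvm;

template <typename ExpandFn>
static void forEachReachable(Value *Root, ExpandFn Expand) {
  SmallSetVector<Value *, 8> Worklist;
  Worklist.insert(Root);
  // Deliberately no cached end: Expand may grow the worklist as we go.
  for (unsigned i = 0; i != Worklist.size(); ++i)
    Expand(Worklist[i], Worklist);
}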
/// Sort the blocks, taking special care to make sure that loops are not
/// interrupted by blocks not dominated by their header.
/// TODO: There are many opportunities for improving the heuristics here.
/// Explore them.
static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
                       const MachineDominatorTree &MDT) {
  // Prepare for a topological sort: Record the number of predecessors each
  // block has, ignoring loop backedges.
  MF.RenumberBlocks();
  SmallVector<unsigned, 16> NumPredsLeft(MF.getNumBlockIDs(), 0);
  for (MachineBasicBlock &MBB : MF) {
    unsigned N = MBB.pred_size();
    if (MachineLoop *L = MLI.getLoopFor(&MBB))
      if (L->getHeader() == &MBB)
        for (const MachineBasicBlock *Pred : MBB.predecessors())
          if (L->contains(Pred))
            --N;
    NumPredsLeft[MBB.getNumber()] = N;
  }

  // Topological sort the CFG, with additional constraints:
  //  - Between a loop header and the last block in the loop, there can be
  //    no blocks not dominated by the loop header.
  //  - It's desirable to preserve the original block order when possible.
  // We use two ready lists; Preferred and Ready. Preferred has recently
  // processed successors, to help preserve block sequences from the original
  // order. Ready has the remaining ready blocks.
  PriorityQueue<MachineBasicBlock *, std::vector<MachineBasicBlock *>,
                CompareBlockNumbers>
      Preferred;
  PriorityQueue<MachineBasicBlock *, std::vector<MachineBasicBlock *>,
                CompareBlockNumbersBackwards>
      Ready;
  SmallVector<Entry, 4> Loops;
  for (MachineBasicBlock *MBB = &MF.front();;) {
    const MachineLoop *L = MLI.getLoopFor(MBB);
    if (L) {
      // If MBB is a loop header, add it to the active loop list. We can't put
      // any blocks that it doesn't dominate until we see the end of the loop.
      if (L->getHeader() == MBB)
        Loops.push_back(Entry(L));
      // For each active loop the block is in, decrement the count. If MBB is
      // the last block in an active loop, take it off the list and pick up
      // any blocks deferred because the header didn't dominate them.
      for (Entry &E : Loops)
        if (E.Loop->contains(MBB) && --E.NumBlocksLeft == 0)
          for (auto DeferredBlock : E.Deferred)
            Ready.push(DeferredBlock);
      while (!Loops.empty() && Loops.back().NumBlocksLeft == 0)
        Loops.pop_back();
    }
    // The main topological sort logic.
    for (MachineBasicBlock *Succ : MBB->successors()) {
      // Ignore backedges.
      if (MachineLoop *SuccL = MLI.getLoopFor(Succ))
        if (SuccL->getHeader() == Succ && SuccL->contains(MBB))
          continue;
      // Decrement the predecessor count. If it's now zero, it's ready.
      if (--NumPredsLeft[Succ->getNumber()] == 0)
        Preferred.push(Succ);
    }
    // Determine the block to follow MBB. First try to find a preferred block,
    // to preserve the original block order when possible.
    MachineBasicBlock *Next = nullptr;
    while (!Preferred.empty()) {
      Next = Preferred.top();
      Preferred.pop();
      // If Next isn't dominated by the top active loop header, defer it
      // until that loop is done.
      if (!Loops.empty() &&
          !MDT.dominates(Loops.back().Loop->getHeader(), Next)) {
        Loops.back().Deferred.push_back(Next);
        Next = nullptr;
        continue;
      }
      // If Next was originally ordered before MBB, and it isn't because it
      // was loop-rotated above the header, it's not preferred.
      if (Next->getNumber() < MBB->getNumber() &&
          (!L || !L->contains(Next) ||
           L->getHeader()->getNumber() < Next->getNumber())) {
        Ready.push(Next);
        Next = nullptr;
        continue;
      }
      break;
    }
    // If we didn't find a suitable block in the Preferred list, check the
    // general Ready list.
    if (!Next) {
      // If there are no more blocks to process, we're done.
      if (Ready.empty()) {
        MaybeUpdateTerminator(MBB);
        break;
      }
      for (;;) {
        Next = Ready.top();
        Ready.pop();
        // If Next isn't dominated by the top active loop header, defer it
        // until that loop is done.
        if (!Loops.empty() &&
            !MDT.dominates(Loops.back().Loop->getHeader(), Next)) {
          Loops.back().Deferred.push_back(Next);
          continue;
        }
        break;
      }
    }
    // Move the next block into place and iterate.
    Next->moveAfter(MBB);
    MaybeUpdateTerminator(MBB);
    MBB = Next;
  }
  assert(Loops.empty() && "Active loop list not finished");
  MF.RenumberBlocks();

#ifndef NDEBUG
  SmallSetVector<MachineLoop *, 8> OnStack;

  // Insert a sentinel representing the degenerate loop that starts at the
  // function entry block and includes the entire function as a "loop" that
  // executes once.
  OnStack.insert(nullptr);

  for (auto &MBB : MF) {
    assert(MBB.getNumber() >= 0 && "Renumbered blocks should be non-negative.");

    MachineLoop *Loop = MLI.getLoopFor(&MBB);
    if (Loop && &MBB == Loop->getHeader()) {
      // Loop header. The loop predecessor should be sorted above, and the
      // other predecessors should be backedges below.
      for (auto Pred : MBB.predecessors())
        assert(
            (Pred->getNumber() < MBB.getNumber() || Loop->contains(Pred)) &&
            "Loop header predecessors must be loop predecessors or backedges");
      assert(OnStack.insert(Loop) && "Loops should be declared at most once.");
    } else {
      // Not a loop header. All predecessors should be sorted above.
      for (auto Pred : MBB.predecessors())
        assert(Pred->getNumber() < MBB.getNumber() &&
               "Non-loop-header predecessors should be topologically sorted");
      assert(OnStack.count(MLI.getLoopFor(&MBB)) &&
             "Blocks must be nested in their loops");
    }
    while (OnStack.size() > 1 && &MBB == LoopBottom(OnStack.back()))
      OnStack.pop_back();
  }
  assert(OnStack.pop_back_val() == nullptr &&
         "The function entry block shouldn't actually be a loop header");
  assert(OnStack.empty() &&
         "Control flow stack pushes and pops should be balanced.");
#endif
}
/// \brief Figure out if the loop is worth full unrolling.
///
/// Complete loop unrolling can make some loads constant, and we need to know
/// if that would expose any further optimization opportunities. This routine
/// estimates this optimization. It computes the cost of the unrolled loop
/// (UnrolledCost) and the dynamic cost of the original loop
/// (RolledDynamicCost). By dynamic cost we mean that we won't count costs of
/// blocks that are known not to be executed (i.e. if we have a branch in the
/// loop and we know that at the given iteration its condition would be
/// resolved to true, we won't add up the cost of the 'false'-block).
/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
/// the analysis failed (no benefits expected from the unrolling, or the loop
/// is too big to analyze), the returned value is None.
Optional<EstimatedUnrollCost>
analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE,
                      const TargetTransformInfo &TTI,
                      unsigned MaxUnrolledLoopSize) {
  // We want to be able to scale offsets by the trip count and add more
  // offsets to them without checking for overflows, and we already don't
  // want to analyze *massive* trip counts, so we force the max to be
  // reasonably small.
  assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) &&
         "The unroll iterations max is too large!");

  // Don't simulate loops with a big or unknown trip count.
  if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
      TripCount > UnrollMaxIterationsCountToAnalyze)
    return None;

  SmallSetVector<BasicBlock *, 16> BBWorklist;
  DenseMap<Value *, Constant *> SimplifiedValues;

  // The estimated cost of the unrolled form of the loop. We try to estimate
  // this by simplifying as much as we can while computing the estimate.
  unsigned UnrolledCost = 0;
  // We also track the estimated dynamic (that is, actually executed) cost in
  // the rolled form. This helps identify cases when the savings from
  // unrolling aren't just exposing dead control flows, but actual reduced
  // dynamic instructions due to the simplifications which we expect to occur
  // after unrolling.
  unsigned RolledDynamicCost = 0;

  // Simulate execution of each iteration of the loop counting instructions,
  // which would be simplified.
  // Since the same load will take different values on different iterations,
  // we literally have to go through all loop's iterations.
  for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
    SimplifiedValues.clear();
    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, L, SE);

    BBWorklist.clear();
    BBWorklist.insert(L->getHeader());
    // Note that we *must not* cache the size, this loop grows the worklist.
    for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
      BasicBlock *BB = BBWorklist[Idx];

      // Visit all instructions in the given basic block and try to simplify
      // it. We don't change the actual IR, just count optimization
      // opportunities.
      for (Instruction &I : *BB) {
        unsigned InstCost = TTI.getUserCost(&I);

        // Visit the instruction to analyze its loop cost after unrolling,
        // and if the visitor returns false, include this instruction in the
        // unrolled cost.
        if (!Analyzer.visit(I))
          UnrolledCost += InstCost;

        // Also track this instruction's expected cost when executing the
        // rolled loop form.
        RolledDynamicCost += InstCost;

        // If the unrolled body turns out to be too big, bail out.
        if (UnrolledCost > MaxUnrolledLoopSize)
          return None;
      }

      // Add BB's successors to the worklist.
      for (BasicBlock *Succ : successors(BB))
        if (L->contains(Succ))
          BBWorklist.insert(Succ);
    }

    // If we found no optimization opportunities on the first iteration, we
    // won't find them on later ones either.
    if (UnrolledCost == RolledDynamicCost)
      return None;
  }
  return {{UnrolledCost, RolledDynamicCost}};
}
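// Hedged caller sketch: how a pass might act on the returned pair, assuming
// EstimatedUnrollCost exposes UnrolledCost and RolledDynamicCost fields as
// the doc comment describes, and relying on the declarations from the
// snippet above. shouldFullyUnroll, Threshold, and MinPercentSavings are
// hypothetical names, not part of that snippet.

static bool shouldFullyUnroll(const Loop *L, unsigned TripCount,
                              ScalarEvolution &SE,
                              const TargetTransformInfo &TTI,
                              unsigned Threshold,
                              unsigned MinPercentSavings) {
  Optional<EstimatedUnrollCost> Cost =
      analyzeLoopUnrollCost(L, TripCount, SE, TTI, Threshold);
  if (!Cost)
    return false; // Analysis bailed: no expected benefit or loop too big.
  // UnrolledCost never exceeds RolledDynamicCost above, since the unrolled
  // estimate only omits instructions the analyzer proved simplifiable.
  unsigned Savings = Cost->RolledDynamicCost - Cost->UnrolledCost;
  return Savings * 100 >= Cost->RolledDynamicCost * MinPercentSavings;
}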
/// \brief Figure out if the loop is worth full unrolling.
///
/// Complete loop unrolling can make some loads constant, and we need to know
/// if that would expose any further optimization opportunities. This routine
/// estimates this optimization. It computes the cost of the unrolled loop
/// (UnrolledCost) and the dynamic cost of the original loop
/// (RolledDynamicCost). By dynamic cost we mean that we won't count costs of
/// blocks that are known not to be executed (i.e. if we have a branch in the
/// loop and we know that at the given iteration its condition would be
/// resolved to true, we won't add up the cost of the 'false'-block).
/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
/// the analysis failed (no benefits expected from the unrolling, or the loop
/// is too big to analyze), the returned value is None.
static Optional<EstimatedUnrollCost>
analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
                      ScalarEvolution &SE, const TargetTransformInfo &TTI,
                      int MaxUnrolledLoopSize) {
  // We want to be able to scale offsets by the trip count and add more
  // offsets to them without checking for overflows, and we already don't
  // want to analyze *massive* trip counts, so we force the max to be
  // reasonably small.
  assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) &&
         "The unroll iterations max is too large!");

  // Only analyze inner loops. We can't properly estimate cost of nested loops
  // and we won't visit inner loops again anyway.
  if (!L->empty())
    return None;

  // Don't simulate loops with a big or unknown trip count.
  if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
      TripCount > UnrollMaxIterationsCountToAnalyze)
    return None;

  SmallSetVector<BasicBlock *, 16> BBWorklist;
  SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;
  DenseMap<Value *, Constant *> SimplifiedValues;
  SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;

  // The estimated cost of the unrolled form of the loop. We try to estimate
  // this by simplifying as much as we can while computing the estimate.
  int UnrolledCost = 0;

  // We also track the estimated dynamic (that is, actually executed) cost in
  // the rolled form. This helps identify cases when the savings from
  // unrolling aren't just exposing dead control flows, but actual reduced
  // dynamic instructions due to the simplifications which we expect to occur
  // after unrolling.
  int RolledDynamicCost = 0;

  // We track the simplification of each instruction in each iteration. We use
  // this to recursively merge costs into the unrolled cost on-demand so that
  // we don't count the cost of any dead code. This is essentially a map from
  // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
  DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;

  // A small worklist used to accumulate cost of instructions from each
  // observable and reached root in the loop.
  SmallVector<Instruction *, 16> CostWorklist;

  // PHI-used worklist used between iterations while accumulating cost.
  SmallVector<Instruction *, 4> PHIUsedList;

  // Helper function to accumulate cost for instructions in the loop.
  auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
    assert(Iteration >= 0 && "Cannot have a negative iteration!");
    assert(CostWorklist.empty() && "Must start with an empty cost list");
    assert(PHIUsedList.empty() && "Must start with an empty phi used list");
    CostWorklist.push_back(&RootI);
    for (;; --Iteration) {
      do {
        Instruction *I = CostWorklist.pop_back_val();

        // InstCostMap only uses I and Iteration as a key, the other two
        // values don't matter here.
        auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
        if (CostIter == InstCostMap.end())
          // If an input to a PHI node comes from a dead path through the loop
          // we may have no cost data for it here. What that actually means is
          // that it is free.
          continue;
        auto &Cost = *CostIter;
        if (Cost.IsCounted)
          // Already counted this instruction.
          continue;

        // Mark that we are counting the cost of this instruction now.
        Cost.IsCounted = true;

        // If this is a PHI node in the loop header, just add it to the PHI
        // set.
        if (auto *PhiI = dyn_cast<PHINode>(I))
          if (PhiI->getParent() == L->getHeader()) {
            assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
                                  "inherently simplify during unrolling.");
            if (Iteration == 0)
              continue;

            // Push the incoming value from the backedge into the PHI used
            // list if it is an in-loop instruction. We'll use this to
            // populate the cost worklist for the next iteration (as we count
            // backwards).
            if (auto *OpI = dyn_cast<Instruction>(
                    PhiI->getIncomingValueForBlock(L->getLoopLatch())))
              if (L->contains(OpI))
                PHIUsedList.push_back(OpI);
            continue;
          }

        // First accumulate the cost of this instruction.
        if (!Cost.IsFree) {
          UnrolledCost += TTI.getUserCost(I);
          DEBUG(dbgs() << "Adding cost of instruction (iteration " << Iteration
                       << "): ");
          DEBUG(I->dump());
        }

        // We must count the cost of every operand which is not free,
        // recursively. If we reach a loop PHI node, simply add it to the set
        // to be considered on the next iteration (backwards!).
        for (Value *Op : I->operands()) {
          // Check whether this operand is free due to being a constant or
          // outside the loop.
          auto *OpI = dyn_cast<Instruction>(Op);
          if (!OpI || !L->contains(OpI))
            continue;

          // Otherwise accumulate its cost.
          CostWorklist.push_back(OpI);
        }
      } while (!CostWorklist.empty());

      if (PHIUsedList.empty())
        // We've exhausted the search.
        break;

      assert(Iteration > 0 &&
             "Cannot track PHI-used values past the first iteration!");
      CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
      PHIUsedList.clear();
    }
  };

  // Ensure that we don't violate the loop structure invariants relied on by
  // this analysis.
  assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
  assert(L->isLCSSAForm(DT) &&
         "Must have loops in LCSSA form to track live-out values.");

  DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");

  // Simulate execution of each iteration of the loop counting instructions,
  // which would be simplified.
  // Since the same load will take different values on different iterations,
  // we literally have to go through all loop's iterations.
  for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
    DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");

    // Prepare for the iteration by collecting any simplified entry or
    // backedge inputs.
    for (Instruction &I : *L->getHeader()) {
      auto *PHI = dyn_cast<PHINode>(&I);
      if (!PHI)
        break;

      // The loop header PHI nodes must have exactly two inputs: one from the
      // loop preheader and one from the loop latch.
      assert(PHI->getNumIncomingValues() == 2 &&
             "Must have an incoming value only for the preheader and the "
             "latch.");

      Value *V = PHI->getIncomingValueForBlock(
          Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
      Constant *C = dyn_cast<Constant>(V);
      if (Iteration != 0 && !C)
        C = SimplifiedValues.lookup(V);
      if (C)
        SimplifiedInputValues.push_back({PHI, C});
    }

    // Now clear and re-populate the map for the next iteration.
    SimplifiedValues.clear();
    while (!SimplifiedInputValues.empty())
      SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());

    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);

    BBWorklist.clear();
    BBWorklist.insert(L->getHeader());
    // Note that we *must not* cache the size, this loop grows the worklist.
    for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
      BasicBlock *BB = BBWorklist[Idx];

      // Visit all instructions in the given basic block and try to simplify
      // it. We don't change the actual IR, just count optimization
      // opportunities.
      for (Instruction &I : *BB) {
        // Track this instruction's expected baseline cost when executing the
        // rolled loop form.
        RolledDynamicCost += TTI.getUserCost(&I);

        // Visit the instruction to analyze its loop cost after unrolling,
        // and if the visitor returns true, mark the instruction as free after
        // unrolling and continue.
        bool IsFree = Analyzer.visit(I);
        bool Inserted = InstCostMap.insert({&I, (int)Iteration,
                                            (unsigned)IsFree,
                                            /*IsCounted*/ false}).second;
        (void)Inserted;
        assert(Inserted && "Cannot have a state for an unvisited instruction!");

        if (IsFree)
          continue;

        // If the instruction might have a side-effect, recursively account
        // for the cost of it and all the instructions leading up to it.
        if (I.mayHaveSideEffects())
          AddCostRecursively(I, Iteration);

        // Can't properly model a cost of a call.
        // FIXME: With a proper cost model we should be able to do it.
        if (isa<CallInst>(&I))
          return None;

        // If the unrolled body turns out to be too big, bail out.
        if (UnrolledCost > MaxUnrolledLoopSize) {
          DEBUG(dbgs() << " Exceeded threshold.. exiting.\n"
                       << " UnrolledCost: " << UnrolledCost
                       << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
                       << "\n");
          return None;
        }
      }

      TerminatorInst *TI = BB->getTerminator();

      // Add in the live successors by first checking whether we have a
      // terminator that may be simplified based on the values simplified by
      // this call.
      BasicBlock *KnownSucc = nullptr;
      if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
        if (BI->isConditional()) {
          if (Constant *SimpleCond =
                  SimplifiedValues.lookup(BI->getCondition())) {
            // Just take the first successor if the condition is undef.
            if (isa<UndefValue>(SimpleCond))
              KnownSucc = BI->getSuccessor(0);
            else if (ConstantInt *SimpleCondVal =
                         dyn_cast<ConstantInt>(SimpleCond))
              KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
          }
        }
      } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
        if (Constant *SimpleCond =
                SimplifiedValues.lookup(SI->getCondition())) {
          // Just take the first successor if the condition is undef.
          if (isa<UndefValue>(SimpleCond))
            KnownSucc = SI->getSuccessor(0);
          else if (ConstantInt *SimpleCondVal =
                       dyn_cast<ConstantInt>(SimpleCond))
            KnownSucc = SI->findCaseValue(SimpleCondVal).getCaseSuccessor();
        }
      }
      if (KnownSucc) {
        if (L->contains(KnownSucc))
          BBWorklist.insert(KnownSucc);
        else
          ExitWorklist.insert({BB, KnownSucc});
        continue;
      }

      // Add BB's successors to the worklist.
      for (BasicBlock *Succ : successors(BB))
        if (L->contains(Succ))
          BBWorklist.insert(Succ);
        else
          ExitWorklist.insert({BB, Succ});
      AddCostRecursively(*TI, Iteration);
    }

    // If we found no optimization opportunities on the first iteration, we
    // won't find them on later ones either.
    if (UnrolledCost == RolledDynamicCost) {
      DEBUG(dbgs() << " No opportunities found.. exiting.\n"
                   << " UnrolledCost: " << UnrolledCost << "\n");
      return None;
    }
  }

  while (!ExitWorklist.empty()) {
    BasicBlock *ExitingBB, *ExitBB;
    std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();

    for (Instruction &I : *ExitBB) {
      auto *PN = dyn_cast<PHINode>(&I);
      if (!PN)
        break;

      Value *Op = PN->getIncomingValueForBlock(ExitingBB);
      if (auto *OpI = dyn_cast<Instruction>(Op))
        if (L->contains(OpI))
          AddCostRecursively(*OpI, TripCount - 1);
    }
  }

  DEBUG(dbgs() << "Analysis finished:\n"
               << "UnrolledCost: " << UnrolledCost << ", "
               << "RolledDynamicCost: " << RolledDynamicCost << "\n");
  return {{UnrolledCost, RolledDynamicCost}};
}
/// This works like CloneAndPruneFunctionInto, except that it does not clone
/// the entire function. Instead it starts at an instruction provided by the
/// caller and copies (and prunes) only the code reachable from that
/// instruction.
void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
                                     const Instruction *StartingInst,
                                     ValueToValueMapTy &VMap,
                                     bool ModuleLevelChanges,
                                     SmallVectorImpl<ReturnInst *> &Returns,
                                     const char *NameSuffix,
                                     ClonedCodeInfo *CodeInfo) {
  assert(NameSuffix && "NameSuffix cannot be null!");

  ValueMapTypeRemapper *TypeMapper = nullptr;
  ValueMaterializer *Materializer = nullptr;

#ifndef NDEBUG
  // If the cloning starts at the beginning of the function, verify that
  // the function arguments are mapped.
  if (!StartingInst)
    for (const Argument &II : OldFunc->args())
      assert(VMap.count(&II) && "No mapping from source argument specified!");
#endif

  PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, ModuleLevelChanges,
                            NameSuffix, CodeInfo);
  const BasicBlock *StartingBB;
  if (StartingInst)
    StartingBB = StartingInst->getParent();
  else {
    StartingBB = &OldFunc->getEntryBlock();
    StartingInst = &StartingBB->front();
  }

  // Clone the entry block, and anything recursively reachable from it.
  std::vector<const BasicBlock *> CloneWorklist;
  PFC.CloneBlock(StartingBB, StartingInst->getIterator(), CloneWorklist);
  while (!CloneWorklist.empty()) {
    const BasicBlock *BB = CloneWorklist.back();
    CloneWorklist.pop_back();
    PFC.CloneBlock(BB, BB->begin(), CloneWorklist);
  }

  // Loop over all of the basic blocks in the old function. If the block was
  // reachable, we have cloned it and the old block is now in the value map:
  // insert it into the new function in the right order. If not, ignore it.
  //
  // Defer PHI resolution until rest of function is resolved.
  SmallVector<const PHINode *, 16> PHIToResolve;
  for (const BasicBlock &BI : *OldFunc) {
    Value *V = VMap.lookup(&BI);
    BasicBlock *NewBB = cast_or_null<BasicBlock>(V);
    if (!NewBB)
      continue; // Dead block.

    // Add the new block to the new function.
    NewFunc->getBasicBlockList().push_back(NewBB);

    // Handle PHI nodes specially, as we have to remove references to dead
    // blocks.
    for (BasicBlock::const_iterator I = BI.begin(), E = BI.end(); I != E;
         ++I) {
      // PHI nodes may have been remapped to non-PHI nodes by the caller or
      // during the cloning process.
      if (const PHINode *PN = dyn_cast<PHINode>(I)) {
        if (isa<PHINode>(VMap[PN]))
          PHIToResolve.push_back(PN);
        else
          break;
      } else {
        break;
      }
    }

    // Finally, remap the terminator instructions, as those can't be remapped
    // until all BBs are mapped.
    RemapInstruction(NewBB->getTerminator(), VMap,
                     ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
                     TypeMapper, Materializer);
  }

  // Defer PHI resolution until rest of function is resolved, PHI resolution
  // requires the CFG to be up-to-date.
  for (unsigned phino = 0, e = PHIToResolve.size(); phino != e;) {
    const PHINode *OPN = PHIToResolve[phino];
    unsigned NumPreds = OPN->getNumIncomingValues();
    const BasicBlock *OldBB = OPN->getParent();
    BasicBlock *NewBB = cast<BasicBlock>(VMap[OldBB]);

    // Map operands for blocks that are live and remove operands for blocks
    // that are dead.
    for (; phino != PHIToResolve.size() &&
           PHIToResolve[phino]->getParent() == OldBB;
         ++phino) {
      OPN = PHIToResolve[phino];
      PHINode *PN = cast<PHINode>(VMap[OPN]);
      for (unsigned pred = 0, e = NumPreds; pred != e; ++pred) {
        Value *V = VMap.lookup(PN->getIncomingBlock(pred));
        if (BasicBlock *MappedBlock = cast_or_null<BasicBlock>(V)) {
          Value *InVal =
              MapValue(PN->getIncomingValue(pred), VMap,
                       ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
          assert(InVal && "Unknown input value?");
          PN->setIncomingValue(pred, InVal);
          PN->setIncomingBlock(pred, MappedBlock);
        } else {
          PN->removeIncomingValue(pred, false);
          --pred; // Revisit the next entry.
          --e;
        }
      }
    }

    // The loop above has removed PHI entries for those blocks that are dead
    // and has updated others. However, if a block is live (i.e. copied over)
    // but its terminator has been changed to not go to this block, then our
    // phi nodes will have invalid entries. Update the PHI nodes in this
    // case.
    PHINode *PN = cast<PHINode>(NewBB->begin());
    NumPreds = std::distance(pred_begin(NewBB), pred_end(NewBB));
    if (NumPreds != PN->getNumIncomingValues()) {
      assert(NumPreds < PN->getNumIncomingValues());
      // Count how many times each predecessor comes to this block.
      std::map<BasicBlock *, unsigned> PredCount;
      for (pred_iterator PI = pred_begin(NewBB), E = pred_end(NewBB); PI != E;
           ++PI)
        --PredCount[*PI];

      // Figure out how many entries to remove from each PHI.
      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
        ++PredCount[PN->getIncomingBlock(i)];

      // At this point, the excess predecessor entries are positive in the
      // map. Loop over all of the PHIs and remove excess predecessor
      // entries.
      BasicBlock::iterator I = NewBB->begin();
      for (; (PN = dyn_cast<PHINode>(I)); ++I) {
        for (const auto &PCI : PredCount) {
          BasicBlock *Pred = PCI.first;
          for (unsigned NumToRemove = PCI.second; NumToRemove; --NumToRemove)
            PN->removeIncomingValue(Pred, false);
        }
      }
    }

    // If the loops above have made these phi nodes have 0 or 1 operand,
    // replace them with undef or the input value. We must do this for
    // correctness, because 0-operand phis are not valid.
    PN = cast<PHINode>(NewBB->begin());
    if (PN->getNumIncomingValues() == 0) {
      BasicBlock::iterator I = NewBB->begin();
      BasicBlock::const_iterator OldI = OldBB->begin();
      while ((PN = dyn_cast<PHINode>(I++))) {
        Value *NV = UndefValue::get(PN->getType());
        PN->replaceAllUsesWith(NV);
        assert(VMap[&*OldI] == PN && "VMap mismatch");
        VMap[&*OldI] = NV;
        PN->eraseFromParent();
        ++OldI;
      }
    }
  }

  // Make a second pass over the PHINodes now that all of them have been
  // remapped into the new function, simplifying the PHINode and performing
  // any recursive simplifications exposed. This will transparently update
  // the WeakVH in the VMap. Notably, we rely on that so that if we coalesce
  // two PHINodes, the iteration over the old PHIs remains valid, and the
  // mapping will just map us to the new node (which may not even be a PHI
  // node).
  const DataLayout &DL = NewFunc->getParent()->getDataLayout();
  SmallSetVector<const Value *, 8> Worklist;
  for (unsigned Idx = 0, Size = PHIToResolve.size(); Idx != Size; ++Idx)
    if (isa<PHINode>(VMap[PHIToResolve[Idx]]))
      Worklist.insert(PHIToResolve[Idx]);

  // Note that we must test the size on each iteration, the worklist can grow.
  for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) {
    const Value *OrigV = Worklist[Idx];
    auto *I = dyn_cast_or_null<Instruction>(VMap.lookup(OrigV));
    if (!I)
      continue;

    // See if this instruction simplifies.
    Value *SimpleV = SimplifyInstruction(I, DL);
    if (!SimpleV)
      continue;

    // Stash away all the uses of the old instruction so we can check them for
    // recursive simplifications after a RAUW. This is cheaper than checking
    // all uses of To on the recursive step in most cases.
    for (const User *U : OrigV->users())
      Worklist.insert(cast<Instruction>(U));

    // Replace the instruction with its simplified value.
    I->replaceAllUsesWith(SimpleV);

    // If the original instruction had no side effects, remove it.
    if (isInstructionTriviallyDead(I))
      I->eraseFromParent();
    else
      VMap[OrigV] = I;
  }

  // Now that the inlined function body has been fully constructed, go
  // through and zap unconditional fall-through branches. This happens all
  // the time when specializing code: code specialization turns conditional
  // branches into unconditional branches, and this code folds them.
  Function::iterator Begin = cast<BasicBlock>(VMap[StartingBB])->getIterator();
  Function::iterator I = Begin;
  while (I != NewFunc->end()) {
    // Check if this block has become dead during inlining or other
    // simplifications. Note that the first block will appear dead, as it has
    // not yet been wired up properly.
    if (I != Begin && (pred_begin(&*I) == pred_end(&*I) ||
                       I->getSinglePredecessor() == &*I)) {
      BasicBlock *DeadBB = &*I++;
      DeleteDeadBlock(DeadBB);
      continue;
    }

    // We need to simplify conditional branches and switches with a constant
    // operand. We try to prune these out when cloning, but if the
    // simplification required looking through PHI nodes, those are only
    // available after forming the full basic block. That may leave some
    // here, and we still want to prune the dead code as early as possible.
    ConstantFoldTerminator(&*I);

    BranchInst *BI = dyn_cast<BranchInst>(I->getTerminator());
    if (!BI || BI->isConditional()) {
      ++I;
      continue;
    }

    BasicBlock *Dest = BI->getSuccessor(0);
    if (!Dest->getSinglePredecessor()) {
      ++I;
      continue;
    }

    // We shouldn't be able to get single-entry PHI nodes here, as instsimplify
    // above should have zapped all of them.
    assert(!isa<PHINode>(Dest->begin()));

    // We know all single-entry PHI nodes in the inlined function have been
    // removed, so we just need to splice the blocks.
    BI->eraseFromParent();

    // Make all PHI nodes that referred to Dest now refer to I as their
    // source.
    Dest->replaceAllUsesWith(&*I);

    // Move all the instructions in the succ to the pred.
    I->getInstList().splice(I->end(), Dest->getInstList());

    // Remove the dest block.
    Dest->eraseFromParent();

    // Do not increment I, iteratively merge all things this block branches
    // to.
  }

  // Make a final pass over the basic blocks from the old function to gather
  // any return instructions which survived folding. We have to do this here
  // because we can iteratively remove and merge returns above.
  for (Function::iterator I = cast<BasicBlock>(VMap[StartingBB])->getIterator(),
                          E = NewFunc->end();
       I != E; ++I)
    if (ReturnInst *RI = dyn_cast<ReturnInst>(I->getTerminator()))
      Returns.push_back(RI);
}
void AAEvaluator::runInternal(Function &F, AAResults &AA) {
  const DataLayout &DL = F.getParent()->getDataLayout();

  ++FunctionCount;

  SetVector<Value *> Pointers;
  SmallSetVector<CallBase *, 16> Calls;
  SetVector<Value *> Loads;
  SetVector<Value *> Stores;

  for (auto &I : F.args())
    if (I.getType()->isPointerTy()) // Add all pointer arguments.
      Pointers.insert(&I);

  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
    if (I->getType()->isPointerTy()) // Add all pointer instructions.
      Pointers.insert(&*I);
    if (EvalAAMD && isa<LoadInst>(&*I))
      Loads.insert(&*I);
    if (EvalAAMD && isa<StoreInst>(&*I))
      Stores.insert(&*I);
    Instruction &Inst = *I;
    if (auto *Call = dyn_cast<CallBase>(&Inst)) {
      Value *Callee = Call->getCalledValue();
      // Skip actual functions for direct function calls.
      if (!isa<Function>(Callee) && isInterestingPointer(Callee))
        Pointers.insert(Callee);
      // Consider formals.
      for (Use &DataOp : Call->data_ops())
        if (isInterestingPointer(DataOp))
          Pointers.insert(DataOp);
      Calls.insert(Call);
    } else {
      // Consider all operands.
      for (Instruction::op_iterator OI = Inst.op_begin(), OE = Inst.op_end();
           OI != OE; ++OI)
        if (isInterestingPointer(*OI))
          Pointers.insert(*OI);
    }
  }

  if (PrintAll || PrintNoAlias || PrintMayAlias || PrintPartialAlias ||
      PrintMustAlias || PrintNoModRef || PrintMod || PrintRef || PrintModRef)
    errs() << "Function: " << F.getName() << ": " << Pointers.size()
           << " pointers, " << Calls.size() << " call sites\n";

  // Iterate over the worklist, and run the full (n^2)/2 disambiguations.
  for (SetVector<Value *>::iterator I1 = Pointers.begin(), E = Pointers.end();
       I1 != E; ++I1) {
    auto I1Size = LocationSize::unknown();
    Type *I1ElTy = cast<PointerType>((*I1)->getType())->getElementType();
    if (I1ElTy->isSized())
      I1Size = LocationSize::precise(DL.getTypeStoreSize(I1ElTy));

    for (SetVector<Value *>::iterator I2 = Pointers.begin(); I2 != I1; ++I2) {
      auto I2Size = LocationSize::unknown();
      Type *I2ElTy = cast<PointerType>((*I2)->getType())->getElementType();
      if (I2ElTy->isSized())
        I2Size = LocationSize::precise(DL.getTypeStoreSize(I2ElTy));

      AliasResult AR = AA.alias(*I1, I1Size, *I2, I2Size);
      switch (AR) {
      case NoAlias:
        PrintResults(AR, PrintNoAlias, *I1, *I2, F.getParent());
        ++NoAliasCount;
        break;
      case MayAlias:
        PrintResults(AR, PrintMayAlias, *I1, *I2, F.getParent());
        ++MayAliasCount;
        break;
      case PartialAlias:
        PrintResults(AR, PrintPartialAlias, *I1, *I2, F.getParent());
        ++PartialAliasCount;
        break;
      case MustAlias:
        PrintResults(AR, PrintMustAlias, *I1, *I2, F.getParent());
        ++MustAliasCount;
        break;
      }
    }
  }

  if (EvalAAMD) {
    // Iterate over all pairs of load, store.
    for (Value *Load : Loads) {
      for (Value *Store : Stores) {
        AliasResult AR = AA.alias(MemoryLocation::get(cast<LoadInst>(Load)),
                                  MemoryLocation::get(cast<StoreInst>(Store)));
        switch (AR) {
        case NoAlias:
          PrintLoadStoreResults(AR, PrintNoAlias, Load, Store, F.getParent());
          ++NoAliasCount;
          break;
        case MayAlias:
          PrintLoadStoreResults(AR, PrintMayAlias, Load, Store, F.getParent());
          ++MayAliasCount;
          break;
        case PartialAlias:
          PrintLoadStoreResults(AR, PrintPartialAlias, Load, Store,
                                F.getParent());
          ++PartialAliasCount;
          break;
        case MustAlias:
          PrintLoadStoreResults(AR, PrintMustAlias, Load, Store,
                                F.getParent());
          ++MustAliasCount;
          break;
        }
      }
    }

    // Iterate over all pairs of store, store.
    for (SetVector<Value *>::iterator I1 = Stores.begin(), E = Stores.end();
         I1 != E; ++I1) {
      for (SetVector<Value *>::iterator I2 = Stores.begin(); I2 != I1; ++I2) {
        AliasResult AR = AA.alias(MemoryLocation::get(cast<StoreInst>(*I1)),
                                  MemoryLocation::get(cast<StoreInst>(*I2)));
        switch (AR) {
        case NoAlias:
          PrintLoadStoreResults(AR, PrintNoAlias, *I1, *I2, F.getParent());
          ++NoAliasCount;
          break;
        case MayAlias:
          PrintLoadStoreResults(AR, PrintMayAlias, *I1, *I2, F.getParent());
          ++MayAliasCount;
          break;
        case PartialAlias:
          PrintLoadStoreResults(AR, PrintPartialAlias, *I1, *I2,
                                F.getParent());
          ++PartialAliasCount;
          break;
        case MustAlias:
          PrintLoadStoreResults(AR, PrintMustAlias, *I1, *I2, F.getParent());
          ++MustAliasCount;
          break;
        }
      }
    }
  }

  // Mod/ref alias analysis: compare all pairs of calls and values.
  for (CallBase *Call : Calls) {
    for (auto Pointer : Pointers) {
      auto Size = LocationSize::unknown();
      Type *ElTy = cast<PointerType>(Pointer->getType())->getElementType();
      if (ElTy->isSized())
        Size = LocationSize::precise(DL.getTypeStoreSize(ElTy));

      switch (AA.getModRefInfo(Call, Pointer, Size)) {
      case ModRefInfo::NoModRef:
        PrintModRefResults("NoModRef", PrintNoModRef, Call, Pointer,
                           F.getParent());
        ++NoModRefCount;
        break;
      case ModRefInfo::Mod:
        PrintModRefResults("Just Mod", PrintMod, Call, Pointer, F.getParent());
        ++ModCount;
        break;
      case ModRefInfo::Ref:
        PrintModRefResults("Just Ref", PrintRef, Call, Pointer, F.getParent());
        ++RefCount;
        break;
      case ModRefInfo::ModRef:
        PrintModRefResults("Both ModRef", PrintModRef, Call, Pointer,
                           F.getParent());
        ++ModRefCount;
        break;
      case ModRefInfo::Must:
        PrintModRefResults("Must", PrintMust, Call, Pointer, F.getParent());
        ++MustCount;
        break;
      case ModRefInfo::MustMod:
        PrintModRefResults("Just Mod (MustAlias)", PrintMustMod, Call, Pointer,
                           F.getParent());
        ++MustModCount;
        break;
      case ModRefInfo::MustRef:
        PrintModRefResults("Just Ref (MustAlias)", PrintMustRef, Call, Pointer,
                           F.getParent());
        ++MustRefCount;
        break;
      case ModRefInfo::MustModRef:
        PrintModRefResults("Both ModRef (MustAlias)", PrintMustModRef, Call,
                           Pointer, F.getParent());
        ++MustModRefCount;
        break;
      }
    }
  }

  // Mod/ref alias analysis: compare all pairs of calls.
  for (CallBase *CallA : Calls) {
    for (CallBase *CallB : Calls) {
      if (CallA == CallB)
        continue;
      switch (AA.getModRefInfo(CallA, CallB)) {
      case ModRefInfo::NoModRef:
        PrintModRefResults("NoModRef", PrintNoModRef, CallA, CallB,
                           F.getParent());
        ++NoModRefCount;
        break;
      case ModRefInfo::Mod:
        PrintModRefResults("Just Mod", PrintMod, CallA, CallB, F.getParent());
        ++ModCount;
        break;
      case ModRefInfo::Ref:
        PrintModRefResults("Just Ref", PrintRef, CallA, CallB, F.getParent());
        ++RefCount;
        break;
      case ModRefInfo::ModRef:
        PrintModRefResults("Both ModRef", PrintModRef, CallA, CallB,
                           F.getParent());
        ++ModRefCount;
        break;
      case ModRefInfo::Must:
        PrintModRefResults("Must", PrintMust, CallA, CallB, F.getParent());
        ++MustCount;
        break;
      case ModRefInfo::MustMod:
        PrintModRefResults("Just Mod (MustAlias)", PrintMustMod, CallA, CallB,
                           F.getParent());
        ++MustModCount;
        break;
      case ModRefInfo::MustRef:
        PrintModRefResults("Just Ref (MustAlias)", PrintMustRef, CallA, CallB,
                           F.getParent());
        ++MustRefCount;
        break;
      case ModRefInfo::MustModRef:
        PrintModRefResults("Both ModRef (MustAlias)", PrintMustModRef, CallA,
                           CallB, F.getParent());
        ++MustModRefCount;
        break;
      }
    }
  }
}
void MCObjectDisassembler::buildCFG(MCModule *Module) {
  typedef std::map<uint64_t, BBInfo> BBInfoByAddrTy;
  BBInfoByAddrTy BBInfos;
  AddressSetTy Splits;
  AddressSetTy Calls;

  error_code ec;
  for (symbol_iterator SI = Obj.begin_symbols(), SE = Obj.end_symbols();
       SI != SE; SI.increment(ec)) {
    if (ec)
      break;
    SymbolRef::Type SymType;
    SI->getType(SymType);
    if (SymType == SymbolRef::ST_Function) {
      uint64_t SymAddr;
      SI->getAddress(SymAddr);
      SymAddr = getEffectiveLoadAddr(SymAddr);
      Calls.push_back(SymAddr);
      Splits.push_back(SymAddr);
    }
  }

  assert(Module->func_begin() == Module->func_end() &&
         "Module already has a CFG!");

  // First, determine the basic block boundaries and call targets.
  for (MCModule::atom_iterator AI = Module->atom_begin(),
                               AE = Module->atom_end();
       AI != AE; ++AI) {
    MCTextAtom *TA = dyn_cast<MCTextAtom>(*AI);
    if (!TA)
      continue;
    Calls.push_back(TA->getBeginAddr());
    BBInfos[TA->getBeginAddr()].Atom = TA;
    for (MCTextAtom::const_iterator II = TA->begin(), IE = TA->end();
         II != IE; ++II) {
      if (MIA.isTerminator(II->Inst))
        Splits.push_back(II->Address + II->Size);
      uint64_t Target;
      if (MIA.evaluateBranch(II->Inst, II->Address, II->Size, Target)) {
        if (MIA.isCall(II->Inst))
          Calls.push_back(Target);
        Splits.push_back(Target);
      }
    }
  }

  RemoveDupsFromAddressVector(Splits);
  RemoveDupsFromAddressVector(Calls);

  // Split text atoms into basic block atoms.
  for (AddressSetTy::const_iterator SI = Splits.begin(), SE = Splits.end();
       SI != SE; ++SI) {
    MCAtom *A = Module->findAtomContaining(*SI);
    if (!A)
      continue;
    MCTextAtom *TA = cast<MCTextAtom>(A);
    if (TA->getBeginAddr() == *SI)
      continue;
    MCTextAtom *NewAtom = TA->split(*SI);
    BBInfos[NewAtom->getBeginAddr()].Atom = NewAtom;
    StringRef BBName = TA->getName();
    BBName = BBName.substr(0, BBName.find_last_of(':'));
    NewAtom->setName((BBName + ":" + utohexstr(*SI)).str());
  }

  // Compute succs/preds.
  for (MCModule::atom_iterator AI = Module->atom_begin(),
                               AE = Module->atom_end();
       AI != AE; ++AI) {
    MCTextAtom *TA = dyn_cast<MCTextAtom>(*AI);
    if (!TA)
      continue;
    BBInfo &CurBB = BBInfos[TA->getBeginAddr()];
    const MCDecodedInst &LI = TA->back();
    if (MIA.isBranch(LI.Inst)) {
      uint64_t Target;
      if (MIA.evaluateBranch(LI.Inst, LI.Address, LI.Size, Target))
        CurBB.addSucc(BBInfos[Target]);
      if (MIA.isConditionalBranch(LI.Inst))
        CurBB.addSucc(BBInfos[LI.Address + LI.Size]);
    } else if (!MIA.isTerminator(LI.Inst))
      CurBB.addSucc(BBInfos[LI.Address + LI.Size]);
  }

  // Create functions and basic blocks.
  for (AddressSetTy::const_iterator CI = Calls.begin(), CE = Calls.end();
       CI != CE; ++CI) {
    BBInfo &BBI = BBInfos[*CI];
    if (!BBI.Atom)
      continue;

    MCFunction &MCFN = *Module->createFunction(BBI.Atom->getName());

    // Create MCBBs.
    SmallSetVector<BBInfo *, 16> Worklist;
    Worklist.insert(&BBI);
    for (size_t wi = 0; wi < Worklist.size(); ++wi) {
      BBInfo *BBI = Worklist[wi];
      if (!BBI->Atom)
        continue;
      BBI->BB = &MCFN.createBlock(*BBI->Atom);
      // Add all predecessors and successors to the worklist.
      for (BBInfoSetTy::iterator SI = BBI->Succs.begin(),
                                 SE = BBI->Succs.end();
           SI != SE; ++SI)
        Worklist.insert(*SI);
      for (BBInfoSetTy::iterator PI = BBI->Preds.begin(),
                                 PE = BBI->Preds.end();
           PI != PE; ++PI)
        Worklist.insert(*PI);
    }

    // Set preds/succs.
    for (size_t wi = 0; wi < Worklist.size(); ++wi) {
      BBInfo *BBI = Worklist[wi];
      MCBasicBlock *MCBB = BBI->BB;
      if (!MCBB)
        continue;
      for (BBInfoSetTy::iterator SI = BBI->Succs.begin(),
                                 SE = BBI->Succs.end();
           SI != SE; ++SI)
        if ((*SI)->BB)
          MCBB->addSuccessor((*SI)->BB);
      for (BBInfoSetTy::iterator PI = BBI->Preds.begin(),
                                 PE = BBI->Preds.end();
           PI != PE; ++PI)
        if ((*PI)->BB)
          MCBB->addPredecessor((*PI)->BB);
    }
  }
}
/// Merge the linker flags in Src into the Dest module.
Error IRLinker::linkModuleFlagsMetadata() {
  // If the source module has no module flags, we are done.
  const NamedMDNode *SrcModFlags = SrcM->getModuleFlagsMetadata();
  if (!SrcModFlags)
    return Error::success();

  // If the destination module doesn't have module flags yet, then just copy
  // over the source module's flags.
  NamedMDNode *DstModFlags = DstM.getOrInsertModuleFlagsMetadata();
  if (DstModFlags->getNumOperands() == 0) {
    for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I)
      DstModFlags->addOperand(SrcModFlags->getOperand(I));

    return Error::success();
  }

  // First build a map of the existing module flags and requirements.
  DenseMap<MDString *, std::pair<MDNode *, unsigned>> Flags;
  SmallSetVector<MDNode *, 16> Requirements;
  for (unsigned I = 0, E = DstModFlags->getNumOperands(); I != E; ++I) {
    MDNode *Op = DstModFlags->getOperand(I);
    ConstantInt *Behavior = mdconst::extract<ConstantInt>(Op->getOperand(0));
    MDString *ID = cast<MDString>(Op->getOperand(1));

    if (Behavior->getZExtValue() == Module::Require) {
      Requirements.insert(cast<MDNode>(Op->getOperand(2)));
    } else {
      Flags[ID] = std::make_pair(Op, I);
    }
  }

  // Merge in the flags from the source module, and also collect its set of
  // requirements.
  for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I) {
    MDNode *SrcOp = SrcModFlags->getOperand(I);
    ConstantInt *SrcBehavior =
        mdconst::extract<ConstantInt>(SrcOp->getOperand(0));
    MDString *ID = cast<MDString>(SrcOp->getOperand(1));
    MDNode *DstOp;
    unsigned DstIndex;
    std::tie(DstOp, DstIndex) = Flags.lookup(ID);
    unsigned SrcBehaviorValue = SrcBehavior->getZExtValue();

    // If this is a requirement, add it and continue.
    if (SrcBehaviorValue == Module::Require) {
      // If the destination module does not already have this requirement,
      // add it.
      if (Requirements.insert(cast<MDNode>(SrcOp->getOperand(2)))) {
        DstModFlags->addOperand(SrcOp);
      }
      continue;
    }

    // If there is no existing flag with this ID, just add it.
    if (!DstOp) {
      Flags[ID] = std::make_pair(SrcOp, DstModFlags->getNumOperands());
      DstModFlags->addOperand(SrcOp);
      continue;
    }

    // Otherwise, perform a merge.
    ConstantInt *DstBehavior =
        mdconst::extract<ConstantInt>(DstOp->getOperand(0));
    unsigned DstBehaviorValue = DstBehavior->getZExtValue();

    // If either flag has override behavior, handle it first.
    if (DstBehaviorValue == Module::Override) {
      // Diagnose inconsistent flags which both have override behavior.
      if (SrcBehaviorValue == Module::Override &&
          SrcOp->getOperand(2) != DstOp->getOperand(2))
        return stringErr("linking module flags '" + ID->getString() +
                         "': IDs have conflicting override values");
      continue;
    } else if (SrcBehaviorValue == Module::Override) {
      // Update the destination flag to that of the source.
      DstModFlags->setOperand(DstIndex, SrcOp);
      Flags[ID].first = SrcOp;
      continue;
    }

    // Diagnose inconsistent merge behavior types.
    if (SrcBehaviorValue != DstBehaviorValue)
      return stringErr("linking module flags '" + ID->getString() +
                       "': IDs have conflicting behaviors");

    auto replaceDstValue = [&](MDNode *New) {
      Metadata *FlagOps[] = {DstOp->getOperand(0), ID, New};
      MDNode *Flag = MDNode::get(DstM.getContext(), FlagOps);
      DstModFlags->setOperand(DstIndex, Flag);
      Flags[ID].first = Flag;
    };

    // Perform the merge for standard behavior types.
    switch (SrcBehaviorValue) {
    case Module::Require:
    case Module::Override:
      llvm_unreachable("not possible");
    case Module::Error: {
      // Emit an error if the values differ.
      if (SrcOp->getOperand(2) != DstOp->getOperand(2))
        return stringErr("linking module flags '" + ID->getString() +
                         "': IDs have conflicting values");
      continue;
    }
    case Module::Warning: {
      // Emit a warning if the values differ.
      if (SrcOp->getOperand(2) != DstOp->getOperand(2)) {
        emitWarning("linking module flags '" + ID->getString() +
                    "': IDs have conflicting values");
      }
      continue;
    }
    case Module::Append: {
      MDNode *DstValue = cast<MDNode>(DstOp->getOperand(2));
      MDNode *SrcValue = cast<MDNode>(SrcOp->getOperand(2));
      SmallVector<Metadata *, 8> MDs;
      MDs.reserve(DstValue->getNumOperands() + SrcValue->getNumOperands());
      MDs.append(DstValue->op_begin(), DstValue->op_end());
      MDs.append(SrcValue->op_begin(), SrcValue->op_end());

      replaceDstValue(MDNode::get(DstM.getContext(), MDs));
      break;
    }
    case Module::AppendUnique: {
      SmallSetVector<Metadata *, 16> Elts;
      MDNode *DstValue = cast<MDNode>(DstOp->getOperand(2));
      MDNode *SrcValue = cast<MDNode>(SrcOp->getOperand(2));
      Elts.insert(DstValue->op_begin(), DstValue->op_end());
      Elts.insert(SrcValue->op_begin(), SrcValue->op_end());

      replaceDstValue(MDNode::get(DstM.getContext(),
                                  makeArrayRef(Elts.begin(), Elts.end())));
      break;
    }
    }
  }

  // Check all of the requirements.
  for (unsigned I = 0, E = Requirements.size(); I != E; ++I) {
    MDNode *Requirement = Requirements[I];
    MDString *Flag = cast<MDString>(Requirement->getOperand(0));
    Metadata *ReqValue = Requirement->getOperand(1);

    MDNode *Op = Flags[Flag].first;
    if (!Op || Op->getOperand(2) != ReqValue)
      return stringErr("linking module flags '" + Flag->getString() +
                       "': does not have the required value");
  }
  return Error::success();
}
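// For reference, module flags with these behaviors are normally produced via
// Module::addModuleFlag; a hedged sketch of inputs this routine would then
// reconcile ("wchar_size" is an arbitrary illustrative flag name, and the
// helper below is hypothetical). Identical Error-behavior values link
// cleanly; differing values hit the "IDs have conflicting values" error
// emitted above.

#include "llvm/IR/Module.h"
using namespace llvm;

static void addExampleFlags(Module &SrcM, Module &DstM) {
  SrcM.addModuleFlag(Module::Error, "wchar_size", 4);
  DstM.addModuleFlag(Module::Error, "wchar_size", 4); // Same value: links OK.
}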
/// IsFunctionMallocLike - A function is malloc-like if it returns either null
/// or a pointer that doesn't alias any other pointer visible to the caller.
bool FunctionAttrs::IsFunctionMallocLike(
    Function *F, SmallPtrSet<Function *, 8> &SCCNodes) const {
  SmallSetVector<Value *, 8> FlowsToReturn;
  for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I)
    if (ReturnInst *Ret = dyn_cast<ReturnInst>(I->getTerminator()))
      FlowsToReturn.insert(Ret->getReturnValue());

  for (unsigned i = 0; i != FlowsToReturn.size(); ++i) {
    Value *RetVal = FlowsToReturn[i];

    if (Constant *C = dyn_cast<Constant>(RetVal)) {
      if (!C->isNullValue() && !isa<UndefValue>(C))
        return false;

      continue;
    }

    if (isa<Argument>(RetVal))
      return false;

    if (Instruction *RVI = dyn_cast<Instruction>(RetVal))
      switch (RVI->getOpcode()) {
      // Extend the analysis by looking upwards.
      case Instruction::BitCast:
      case Instruction::GetElementPtr:
        FlowsToReturn.insert(RVI->getOperand(0));
        continue;
      case Instruction::Select: {
        SelectInst *SI = cast<SelectInst>(RVI);
        FlowsToReturn.insert(SI->getTrueValue());
        FlowsToReturn.insert(SI->getFalseValue());
        continue;
      }
      case Instruction::PHI: {
        PHINode *PN = cast<PHINode>(RVI);
        for (int i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
          FlowsToReturn.insert(PN->getIncomingValue(i));
        continue;
      }

      // Check whether the pointer came from an allocation.
      case Instruction::Alloca:
        break;
      case Instruction::Call:
      case Instruction::Invoke: {
        CallSite CS(RVI);
        if (CS.paramHasAttr(0, Attribute::NoAlias))
          break;
        if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction()))
          break;
      } // fall-through
      default:
        return false; // Did not come from an allocation.
      }

    if (PointerMayBeCaptured(RetVal, false, /*StoreCaptures=*/false))
      return false;
  }

  return true;
}
/// Sort the blocks in RPO, taking special care to make sure that loops are
/// contiguous even in the case of split backedges.
///
/// TODO: Determine whether RPO is actually worthwhile, or whether we should
/// move to just a stable-topological-sort-based approach that would preserve
/// more of the original order.
static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI) {
  // Note that we do our own RPO rather than using
  // "llvm/ADT/PostOrderIterator.h" because we want control over the order
  // that successors are visited in (see above). Also, we can sort the blocks
  // in the MachineFunction as we go.
  SmallPtrSet<MachineBasicBlock *, 16> Visited;
  SmallVector<POStackEntry, 16> Stack;

  MachineBasicBlock *EntryBlock = &*MF.begin();
  Visited.insert(EntryBlock);
  Stack.push_back(POStackEntry(EntryBlock, MF, MLI));

  for (;;) {
    POStackEntry &Entry = Stack.back();
    SmallVectorImpl<MachineBasicBlock *> &Succs = Entry.Succs;
    if (!Succs.empty()) {
      MachineBasicBlock *Succ = Succs.pop_back_val();
      if (Visited.insert(Succ).second)
        Stack.push_back(POStackEntry(Succ, MF, MLI));
      continue;
    }

    // Put the block in its position in the MachineFunction.
    MachineBasicBlock &MBB = *Entry.MBB;
    MBB.moveBefore(&*MF.begin());

    // Branch instructions may utilize a fallthrough, so update them if a
    // fallthrough has been added or removed.
    if (!MBB.empty() && MBB.back().isTerminator() && !MBB.back().isBranch() &&
        !MBB.back().isBarrier())
      report_fatal_error(
          "Non-branch terminator with fallthrough cannot yet be rewritten");
    if (MBB.empty() || !MBB.back().isTerminator() || MBB.back().isBranch())
      MBB.updateTerminator();

    Stack.pop_back();
    if (Stack.empty())
      break;
  }

  // Now that we've sorted the blocks in RPO, renumber them.
  MF.RenumberBlocks();

#ifndef NDEBUG
  SmallSetVector<MachineLoop *, 8> OnStack;

  // Insert a sentinel representing the degenerate loop that starts at the
  // function entry block and includes the entire function as a "loop" that
  // executes once.
  OnStack.insert(nullptr);

  for (auto &MBB : MF) {
    assert(MBB.getNumber() >= 0 && "Renumbered blocks should be non-negative.");

    MachineLoop *Loop = MLI.getLoopFor(&MBB);
    if (Loop && &MBB == Loop->getHeader()) {
      // Loop header. The loop predecessor should be sorted above, and the
      // other predecessors should be backedges below.
      for (auto Pred : MBB.predecessors())
        assert(
            (Pred->getNumber() < MBB.getNumber() || Loop->contains(Pred)) &&
            "Loop header predecessors must be loop predecessors or backedges");
      assert(OnStack.insert(Loop) && "Loops should be declared at most once.");
    } else {
      // Not a loop header. All predecessors should be sorted above.
      for (auto Pred : MBB.predecessors())
        assert(Pred->getNumber() < MBB.getNumber() &&
               "Non-loop-header predecessors should be topologically sorted");
      assert(OnStack.count(MLI.getLoopFor(&MBB)) &&
             "Blocks must be nested in their loops");
    }
    while (OnStack.size() > 1 && &MBB == LoopBottom(OnStack.back()))
      OnStack.pop_back();
  }
  assert(OnStack.pop_back_val() == nullptr &&
         "The function entry block shouldn't actually be a loop header");
  assert(OnStack.empty() &&
         "Control flow stack pushes and pops should be balanced.");
#endif
}
/// \brief Figure out if the loop is worth full unrolling.
///
/// Complete loop unrolling can make some loads constant, and we need to know
/// if that would expose any further optimization opportunities. This routine
/// estimates this optimization. It computes the cost of the unrolled loop
/// (UnrolledCost) and the dynamic cost of the original loop
/// (RolledDynamicCost). By dynamic cost we mean that we won't count costs of
/// blocks that are known not to be executed (i.e. if we have a branch in the
/// loop and we know that at the given iteration its condition would be
/// resolved to true, we won't add up the cost of the 'false'-block).
/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
/// the analysis failed (no benefits expected from the unrolling, or the loop
/// is too big to analyze), the returned value is None.
static Optional<EstimatedUnrollCost>
analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
                      ScalarEvolution &SE, const TargetTransformInfo &TTI,
                      int MaxUnrolledLoopSize) {
  // We want to be able to scale offsets by the trip count and add more
  // offsets to them without checking for overflows, and we already don't
  // want to analyze *massive* trip counts, so we force the max to be
  // reasonably small.
  assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) &&
         "The unroll iterations max is too large!");

  // Don't simulate loops with a big or unknown trip count.
  if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
      TripCount > UnrollMaxIterationsCountToAnalyze)
    return None;

  SmallSetVector<BasicBlock *, 16> BBWorklist;
  DenseMap<Value *, Constant *> SimplifiedValues;
  SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;

  // The estimated cost of the unrolled form of the loop. We try to estimate
  // this by simplifying as much as we can while computing the estimate.
  int UnrolledCost = 0;
  // We also track the estimated dynamic (that is, actually executed) cost in
  // the rolled form. This helps identify cases when the savings from
  // unrolling aren't just exposing dead control flows, but actual reduced
  // dynamic instructions due to the simplifications which we expect to occur
  // after unrolling.
  int RolledDynamicCost = 0;

  // Ensure that we don't violate the loop structure invariants relied on by
  // this analysis.
  assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
  assert(L->isLCSSAForm(DT) &&
         "Must have loops in LCSSA form to track live-out values.");

  DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");

  // Simulate execution of each iteration of the loop counting instructions,
  // which would be simplified.
  // Since the same load will take different values on different iterations,
  // we literally have to go through all loop's iterations.
  for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
    DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");

    // Prepare for the iteration by collecting any simplified entry or
    // backedge inputs.
    for (Instruction &I : *L->getHeader()) {
      auto *PHI = dyn_cast<PHINode>(&I);
      if (!PHI)
        break;

      // The loop header PHI nodes must have exactly two inputs: one from the
      // loop preheader and one from the loop latch.
      assert(PHI->getNumIncomingValues() == 2 &&
             "Must have an incoming value only for the preheader and the "
             "latch.");

      Value *V = PHI->getIncomingValueForBlock(
          Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
      Constant *C = dyn_cast<Constant>(V);
      if (Iteration != 0 && !C)
        C = SimplifiedValues.lookup(V);
      if (C)
        SimplifiedInputValues.push_back({PHI, C});
    }

    // Now clear and re-populate the map for the next iteration.
    SimplifiedValues.clear();
    while (!SimplifiedInputValues.empty())
      SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());

    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE);

    BBWorklist.clear();
    BBWorklist.insert(L->getHeader());
    // Note that we *must not* cache the size, this loop grows the worklist.
    for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
      BasicBlock *BB = BBWorklist[Idx];

      // Visit all instructions in the given basic block and try to simplify
      // it. We don't change the actual IR, just count optimization
      // opportunities.
      for (Instruction &I : *BB) {
        int InstCost = TTI.getUserCost(&I);

        // Visit the instruction to analyze its loop cost after unrolling,
        // and if the visitor returns false, include this instruction in the
        // unrolled cost.
        if (!Analyzer.visit(I))
          UnrolledCost += InstCost;
        else {
          DEBUG(dbgs() << " " << I
                       << " would be simplified if loop is unrolled.\n");
          (void)0;
        }

        // Also track this instruction's expected cost when executing the
        // rolled loop form.
        RolledDynamicCost += InstCost;

        // If the unrolled body turns out to be too big, bail out.
        if (UnrolledCost > MaxUnrolledLoopSize) {
          DEBUG(dbgs() << " Exceeded threshold.. exiting.\n"
                       << " UnrolledCost: " << UnrolledCost
                       << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
                       << "\n");
          return None;
        }
      }

      TerminatorInst *TI = BB->getTerminator();

      // Add in the live successors by first checking whether we have a
      // terminator that may be simplified based on the values simplified by
      // this call.
      if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
        if (BI->isConditional()) {
          if (Constant *SimpleCond =
                  SimplifiedValues.lookup(BI->getCondition())) {
            BasicBlock *Succ = nullptr;
            // Just take the first successor if the condition is undef.
            if (isa<UndefValue>(SimpleCond))
              Succ = BI->getSuccessor(0);
            else
              Succ = BI->getSuccessor(
                  cast<ConstantInt>(SimpleCond)->isZero() ? 1 : 0);
            if (L->contains(Succ))
              BBWorklist.insert(Succ);
            continue;
          }
        }
      } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
        if (Constant *SimpleCond =
                SimplifiedValues.lookup(SI->getCondition())) {
          BasicBlock *Succ = nullptr;
          // Just take the first successor if the condition is undef.
          if (isa<UndefValue>(SimpleCond))
            Succ = SI->getSuccessor(0);
          else
            Succ = SI->findCaseValue(cast<ConstantInt>(SimpleCond))
                       .getCaseSuccessor();
          if (L->contains(Succ))
            BBWorklist.insert(Succ);
          continue;
        }
      }

      // Add BB's successors to the worklist.
      for (BasicBlock *Succ : successors(BB))
        if (L->contains(Succ))
          BBWorklist.insert(Succ);
    }

    // If we found no optimization opportunities on the first iteration, we
    // won't find them on later ones either.
    if (UnrolledCost == RolledDynamicCost) {
      DEBUG(dbgs() << " No opportunities found.. exiting.\n"
                   << " UnrolledCost: " << UnrolledCost << "\n");
      return None;
    }
  }

  DEBUG(dbgs() << "Analysis finished:\n"
               << "UnrolledCost: " << UnrolledCost << ", "
               << "RolledDynamicCost: " << RolledDynamicCost << "\n");
  return {{UnrolledCost, RolledDynamicCost}};
}