bool SIDebuggerInsertNops::runOnMachineFunction(MachineFunction &MF) {
  // Skip this pass if the "amdgpu-debugger-insert-nops" attribute was not
  // specified.
  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
  if (!ST.debuggerInsertNops())
    return false;

  // Skip machine functions without debug info.
  if (!MF.getMMI().hasDebugInfo())
    return false;

  // Target instruction info.
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());

  // Set containing line numbers that already have a nop inserted.
  DenseSet<unsigned> NopInserted;

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
      // Skip DBG_VALUE instructions and instructions without a location.
      if (MI->isDebugValue() || !MI->getDebugLoc())
        continue;

      // Insert a nop if this line number does not have one yet.
      auto DL = MI->getDebugLoc();
      if (NopInserted.find(DL.getLine()) == NopInserted.end()) {
        BuildMI(MBB, *MI, DL, TII->get(AMDGPU::S_NOP))
            .addImm(0);
        NopInserted.insert(DL.getLine());
      }
    }
  }

  return true;
}
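// --- Illustrative sketch (not part of the snippet above) ---
// Registration boilerplate that would typically accompany such a
// MachineFunction pass. The pass-name strings and the factory-function name
// are assumptions made for illustration, not taken from the snippet.
INITIALIZE_PASS(SIDebuggerInsertNops, "si-debugger-insert-nops",
                "SI Debugger Insert Nops", false, false)

FunctionPass *llvm::createSIDebuggerInsertNopsPass() {
  return new SIDebuggerInsertNops();
}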
/// \brief Figure out if the loop is worth full unrolling.
///
/// Complete loop unrolling can make some loads constant, and we need to know
/// if that would expose any further optimization opportunities. This routine
/// estimates this optimization. It computes the cost of the unrolled loop
/// (UnrolledCost) and the dynamic cost of the original loop
/// (RolledDynamicCost). By dynamic cost we mean that we won't count costs of
/// blocks that are known not to be executed (i.e. if we have a branch in the
/// loop and we know that at the given iteration its condition would be
/// resolved to true, we won't add up the cost of the 'false'-block).
///
/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
/// the analysis failed (no benefits expected from the unrolling, or the loop is
/// too big to analyze), the returned value is None.
static Optional<EstimatedUnrollCost>
analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
                      ScalarEvolution &SE, const TargetTransformInfo &TTI,
                      int MaxUnrolledLoopSize) {
  // We want to be able to scale offsets by the trip count and add more offsets
  // to them without checking for overflows, and we already don't want to
  // analyze *massive* trip counts, so we force the max to be reasonably small.
  assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) &&
         "The unroll iterations max is too large!");

  // Only analyze inner loops. We can't properly estimate cost of nested loops
  // and we won't visit inner loops again anyway.
  if (!L->empty())
    return None;

  // Don't simulate loops with a big or unknown trip count.
  if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
      TripCount > UnrollMaxIterationsCountToAnalyze)
    return None;

  SmallSetVector<BasicBlock *, 16> BBWorklist;
  SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;
  DenseMap<Value *, Constant *> SimplifiedValues;
  SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;

  // The estimated cost of the unrolled form of the loop. We try to estimate
  // this by simplifying as much as we can while computing the estimate.
  int UnrolledCost = 0;

  // We also track the estimated dynamic (that is, actually executed) cost in
  // the rolled form. This helps identify cases when the savings from unrolling
  // aren't just exposing dead control flows, but actual reduced dynamic
  // instructions due to the simplifications which we expect to occur after
  // unrolling.
  int RolledDynamicCost = 0;

  // We track the simplification of each instruction in each iteration. We use
  // this to recursively merge costs into the unrolled cost on-demand so that
  // we don't count the cost of any dead code. This is essentially a map from
  // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
  DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;

  // A small worklist used to accumulate cost of instructions from each
  // observable and reached root in the loop.
  SmallVector<Instruction *, 16> CostWorklist;

  // PHI-used worklist used between iterations while accumulating cost.
  SmallVector<Instruction *, 4> PHIUsedList;

  // Helper function to accumulate cost for instructions in the loop.
  auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
    assert(Iteration >= 0 && "Cannot have a negative iteration!");
    assert(CostWorklist.empty() && "Must start with an empty cost list");
    assert(PHIUsedList.empty() && "Must start with an empty phi used list");
    CostWorklist.push_back(&RootI);
    for (;; --Iteration) {
      do {
        Instruction *I = CostWorklist.pop_back_val();

        // InstCostMap only uses I and Iteration as a key, the other two values
        // don't matter here.
        auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
        if (CostIter == InstCostMap.end())
          // If an input to a PHI node comes from a dead path through the loop
          // we may have no cost data for it here. What that actually means is
          // that it is free.
          continue;
        auto &Cost = *CostIter;
        if (Cost.IsCounted)
          // Already counted this instruction.
          continue;

        // Mark that we are counting the cost of this instruction now.
        Cost.IsCounted = true;

        // If this is a PHI node in the loop header, just add it to the PHI
        // set.
        if (auto *PhiI = dyn_cast<PHINode>(I))
          if (PhiI->getParent() == L->getHeader()) {
            assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
                                  "inherently simplify during unrolling.");
            if (Iteration == 0)
              continue;

            // Push the incoming value from the backedge into the PHI used list
            // if it is an in-loop instruction. We'll use this to populate the
            // cost worklist for the next iteration (as we count backwards).
            if (auto *OpI = dyn_cast<Instruction>(
                    PhiI->getIncomingValueForBlock(L->getLoopLatch())))
              if (L->contains(OpI))
                PHIUsedList.push_back(OpI);
            continue;
          }

        // First accumulate the cost of this instruction.
        if (!Cost.IsFree) {
          UnrolledCost += TTI.getUserCost(I);
          DEBUG(dbgs() << "Adding cost of instruction (iteration " << Iteration
                       << "): ");
          DEBUG(I->dump());
        }

        // We must count the cost of every operand which is not free,
        // recursively. If we reach a loop PHI node, simply add it to the set
        // to be considered on the next iteration (backwards!).
        for (Value *Op : I->operands()) {
          // Check whether this operand is free due to being a constant or
          // outside the loop.
          auto *OpI = dyn_cast<Instruction>(Op);
          if (!OpI || !L->contains(OpI))
            continue;

          // Otherwise accumulate its cost.
          CostWorklist.push_back(OpI);
        }
      } while (!CostWorklist.empty());

      if (PHIUsedList.empty())
        // We've exhausted the search.
        break;

      assert(Iteration > 0 &&
             "Cannot track PHI-used values past the first iteration!");
      CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
      PHIUsedList.clear();
    }
  };

  // Ensure that we don't violate the loop structure invariants relied on by
  // this analysis.
  assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
  assert(L->isLCSSAForm(DT) &&
         "Must have loops in LCSSA form to track live-out values.");

  DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");

  // Simulate execution of each iteration of the loop, counting the
  // instructions which would be simplified.
  // Since the same load will take different values on different iterations,
  // we literally have to go through all of the loop's iterations.
  for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
    DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");

    // Prepare for the iteration by collecting any simplified entry or backedge
    // inputs.
    for (Instruction &I : *L->getHeader()) {
      auto *PHI = dyn_cast<PHINode>(&I);
      if (!PHI)
        break;

      // The loop header PHI nodes must have exactly two inputs: one from the
      // loop preheader and one from the loop latch.
      assert(
          PHI->getNumIncomingValues() == 2 &&
          "Must have an incoming value only for the preheader and the latch.");

      Value *V = PHI->getIncomingValueForBlock(
          Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
      Constant *C = dyn_cast<Constant>(V);
      if (Iteration != 0 && !C)
        C = SimplifiedValues.lookup(V);
      if (C)
        SimplifiedInputValues.push_back({PHI, C});
    }

    // Now clear and re-populate the map for the next iteration.
    SimplifiedValues.clear();
    while (!SimplifiedInputValues.empty())
      SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());

    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);

    BBWorklist.clear();
    BBWorklist.insert(L->getHeader());
    // Note that we *must not* cache the size, this loop grows the worklist.
    for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
      BasicBlock *BB = BBWorklist[Idx];

      // Visit all instructions in the given basic block and try to simplify
      // them. We don't change the actual IR, just count optimization
      // opportunities.
      for (Instruction &I : *BB) {
        // Track this instruction's expected baseline cost when executing the
        // rolled loop form.
        RolledDynamicCost += TTI.getUserCost(&I);

        // Visit the instruction to analyze its loop cost after unrolling,
        // and if the visitor returns true, mark the instruction as free after
        // unrolling and continue.
        bool IsFree = Analyzer.visit(I);
        bool Inserted = InstCostMap.insert({&I, (int)Iteration,
                                            (unsigned)IsFree,
                                            /*IsCounted*/ false}).second;
        (void)Inserted;
        assert(Inserted && "Cannot have a state for an unvisited instruction!");

        if (IsFree)
          continue;

        // If the instruction might have a side-effect, recursively account for
        // the cost of it and all the instructions leading up to it.
        if (I.mayHaveSideEffects())
          AddCostRecursively(I, Iteration);

        // Can't properly model the cost of a call.
        // FIXME: With a proper cost model we should be able to do it.
        if (isa<CallInst>(&I))
          return None;

        // If the unrolled body turns out to be too big, bail out.
        if (UnrolledCost > MaxUnrolledLoopSize) {
          DEBUG(dbgs() << "  Exceeded threshold.. exiting.\n"
                       << "  UnrolledCost: " << UnrolledCost
                       << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
                       << "\n");
          return None;
        }
      }

      TerminatorInst *TI = BB->getTerminator();

      // Add in the live successors by first checking whether we have a
      // terminator that may be simplified based on the values simplified by
      // this call.
      BasicBlock *KnownSucc = nullptr;
      if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
        if (BI->isConditional()) {
          if (Constant *SimpleCond =
                  SimplifiedValues.lookup(BI->getCondition())) {
            // Just take the first successor if the condition is undef.
            if (isa<UndefValue>(SimpleCond))
              KnownSucc = BI->getSuccessor(0);
            else if (ConstantInt *SimpleCondVal =
                         dyn_cast<ConstantInt>(SimpleCond))
              KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
          }
        }
      } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
        if (Constant *SimpleCond =
                SimplifiedValues.lookup(SI->getCondition())) {
          // Just take the first successor if the condition is undef.
          if (isa<UndefValue>(SimpleCond))
            KnownSucc = SI->getSuccessor(0);
          else if (ConstantInt *SimpleCondVal =
                       dyn_cast<ConstantInt>(SimpleCond))
            KnownSucc = SI->findCaseValue(SimpleCondVal).getCaseSuccessor();
        }
      }
      if (KnownSucc) {
        if (L->contains(KnownSucc))
          BBWorklist.insert(KnownSucc);
        else
          ExitWorklist.insert({BB, KnownSucc});
        continue;
      }

      // Add BB's successors to the worklist.
      for (BasicBlock *Succ : successors(BB))
        if (L->contains(Succ))
          BBWorklist.insert(Succ);
        else
          ExitWorklist.insert({BB, Succ});
      AddCostRecursively(*TI, Iteration);
    }

    // If we found no optimization opportunities on the first iteration, we
    // won't find them on later ones either.
    if (UnrolledCost == RolledDynamicCost) {
      DEBUG(dbgs() << "  No opportunities found.. exiting.\n"
                   << "  UnrolledCost: " << UnrolledCost << "\n");
      return None;
    }
  }

  while (!ExitWorklist.empty()) {
    BasicBlock *ExitingBB, *ExitBB;
    std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();

    for (Instruction &I : *ExitBB) {
      auto *PN = dyn_cast<PHINode>(&I);
      if (!PN)
        break;

      Value *Op = PN->getIncomingValueForBlock(ExitingBB);
      if (auto *OpI = dyn_cast<Instruction>(Op))
        if (L->contains(OpI))
          AddCostRecursively(*OpI, TripCount - 1);
    }
  }

  DEBUG(dbgs() << "Analysis finished:\n"
               << "UnrolledCost: " << UnrolledCost << ", "
               << "RolledDynamicCost: " << RolledDynamicCost << "\n");
  return {{UnrolledCost, RolledDynamicCost}};
}
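// --- Illustrative sketch (not part of the snippet above) ---
// One way a caller might weigh the two costs returned by
// analyzeLoopUnrollCost; the threshold parameter and the helper name are
// hypothetical, not taken from the snippet.
static bool isFullUnrollProfitable(const Loop *L, unsigned TripCount,
                                   DominatorTree &DT, ScalarEvolution &SE,
                                   const TargetTransformInfo &TTI,
                                   int MaxUnrolledLoopSize,
                                   unsigned PercentDynamicCostSavedThreshold) {
  Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
      L, TripCount, DT, SE, TTI, MaxUnrolledLoopSize);
  if (!Cost)
    return false;

  // Fraction of the rolled loop's dynamic cost that unrolling is expected to
  // eliminate through the simplifications counted above.
  unsigned PercentDynamicCostSaved =
      (uint64_t)(Cost->RolledDynamicCost - Cost->UnrolledCost) * 100ull /
      Cost->RolledDynamicCost;
  return PercentDynamicCostSaved >= PercentDynamicCostSavedThreshold &&
         Cost->UnrolledCost <= MaxUnrolledLoopSize;
}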
MCFunction
MCFunction::createFunctionFromMC(StringRef Name, const MCDisassembler *DisAsm,
                                 const MemoryObject &Region, uint64_t Start,
                                 uint64_t End, const MCInstrAnalysis *Ana,
                                 raw_ostream &DebugOut,
                                 SmallVectorImpl<uint64_t> &Calls) {
  std::vector<MCDecodedInst> Instructions;
  std::set<uint64_t> Splits;
  Splits.insert(Start);
  uint64_t Size;

  MCFunction f(Name);

  {
    DenseSet<uint64_t> VisitedInsts;
    SmallVector<uint64_t, 16> WorkList;
    WorkList.push_back(Start);
    // Disassemble code and gather basic block split points.
    while (!WorkList.empty()) {
      uint64_t Index = WorkList.pop_back_val();
      if (VisitedInsts.find(Index) != VisitedInsts.end())
        continue; // Already visited this location.

      for (; Index < End; Index += Size) {
        VisitedInsts.insert(Index);

        MCInst Inst;
        if (DisAsm->getInstruction(Inst, Size, Region, Index, DebugOut,
                                   nulls())) {
          Instructions.push_back(MCDecodedInst(Index, Size, Inst));
          if (Ana->isBranch(Inst)) {
            uint64_t targ = Ana->evaluateBranch(Inst, Index, Size);
            if (targ != -1ULL && targ == Index + Size)
              continue; // Skip nop jumps.

            // If we could determine the branch target, make a note to start a
            // new basic block there and add the target to the worklist.
            if (targ != -1ULL) {
              Splits.insert(targ);
              WorkList.push_back(targ);
              WorkList.push_back(Index + Size);
            }
            Splits.insert(Index + Size);
            break;
          } else if (Ana->isReturn(Inst)) {
            // Return instruction. This basic block ends here.
            Splits.insert(Index + Size);
            break;
          } else if (Ana->isCall(Inst)) {
            uint64_t targ = Ana->evaluateBranch(Inst, Index, Size);
            // Add the call to the call list if the destination is known.
            if (targ != -1ULL && targ != Index + Size)
              Calls.push_back(targ);
          }
        } else {
          errs().write_hex(Index) << ": warning: invalid instruction encoding\n";
          if (Size == 0)
            Size = 1; // skip illegible bytes
        }
      }
    }
  }

  // Make sure the instruction list is sorted.
  std::sort(Instructions.begin(), Instructions.end());

  // Create basic blocks.
  unsigned ii = 0, ie = Instructions.size();
  for (std::set<uint64_t>::iterator spi = Splits.begin(),
       spe = llvm::prior(Splits.end()); spi != spe; ++spi) {
    MCBasicBlock BB;
    uint64_t BlockEnd = *llvm::next(spi);
    // Add instructions to the BB.
    for (; ii != ie; ++ii) {
      if (Instructions[ii].Address < *spi ||
          Instructions[ii].Address >= BlockEnd)
        break;
      BB.addInst(Instructions[ii]);
    }
    f.addBlock(*spi, BB);
  }

  std::sort(f.Blocks.begin(), f.Blocks.end());

  // Calculate successors of each block.
  for (MCFunction::iterator i = f.begin(), e = f.end(); i != e; ++i) {
    MCBasicBlock &BB = const_cast<MCBasicBlock &>(i->second);
    if (BB.getInsts().empty())
      continue;
    const MCDecodedInst &Inst = BB.getInsts().back();

    if (Ana->isBranch(Inst.Inst)) {
      uint64_t targ = Ana->evaluateBranch(Inst.Inst, Inst.Address, Inst.Size);
      if (targ == -1ULL) {
        // Indirect branch. Bail and add all blocks of the function as a
        // successor.
        for (MCFunction::iterator i = f.begin(), e = f.end(); i != e; ++i)
          BB.addSucc(i->first);
      } else if (targ != Inst.Address + Inst.Size)
        BB.addSucc(targ);
      // Conditional branches can also fall through to the next block.
      if (Ana->isConditionalBranch(Inst.Inst) && llvm::next(i) != e)
        BB.addSucc(llvm::next(i)->first);
    } else {
      // No branch. Fall through to the next block.
      if (!Ana->isReturn(Inst.Inst) && llvm::next(i) != e)
        BB.addSucc(llvm::next(i)->first);
    }
  }

  return f;
}
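// --- Illustrative usage sketch (not part of the snippet above) ---
// Walks the returned MCFunction and prints each basic block's start address
// and instruction count, relying only on accessors already used above and
// assuming the same arguments that were passed to the function.
MCFunction F = MCFunction::createFunctionFromMC(Name, DisAsm, Region, Start,
                                                End, Ana, nulls(), Calls);
for (MCFunction::iterator I = F.begin(), E = F.end(); I != E; ++I)
  outs() << format("block @ 0x%llx: %u instructions\n",
                   (unsigned long long)I->first,
                   (unsigned)I->second.getInsts().size());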
/// canonicalizeInputFunction - Functions like swift_retain return an
/// argument as a low-level performance optimization. This makes it difficult
/// to reason about pointer equality though, so undo it as an initial
/// canonicalization step. After this step, retain calls are canonicalized so
/// that nothing uses their result.
///
/// This also does some trivial peephole optimizations as we go.
static bool canonicalizeInputFunction(Function &F, ARCEntryPointBuilder &B,
                                      SwiftRCIdentity *RC) {
  bool Changed = false;
  DenseSet<Value *> NativeRefs;
  DenseMap<Value *, TinyPtrVector<Instruction *>> UnknownRetains;
  DenseMap<Value *, TinyPtrVector<Instruction *>> UnknownReleases;
  for (auto &BB : F) {
    UnknownRetains.clear();
    UnknownReleases.clear();
    NativeRefs.clear();
    for (auto I = BB.begin(); I != BB.end(); ) {
      Instruction &Inst = *I++;
      switch (classifyInstruction(Inst)) {
      // These instructions should not reach here based on the pass ordering,
      // i.e. LLVMARCOpt -> LLVMContractOpt.
      case RT_RetainN:
      case RT_UnknownRetainN:
      case RT_BridgeRetainN:
      case RT_ReleaseN:
      case RT_UnknownReleaseN:
      case RT_BridgeReleaseN:
        llvm_unreachable("These are only created by LLVMARCContract!");
      case RT_Unknown:
      case RT_BridgeRelease:
      case RT_AllocObject:
      case RT_FixLifetime:
      case RT_NoMemoryAccessed:
      case RT_RetainUnowned:
      case RT_CheckUnowned:
        break;
      case RT_Retain: {
        CallInst &CI = cast<CallInst>(Inst);
        Value *ArgVal = RC->getSwiftRCIdentityRoot(CI.getArgOperand(0));
        // retain(null) is a no-op.
        if (isa<ConstantPointerNull>(ArgVal)) {
          CI.eraseFromParent();
          Changed = true;
          ++NumNoopDeleted;
          continue;
        }
        // Rewrite unknown retains into swift_retains.
        NativeRefs.insert(ArgVal);
        for (auto &X : UnknownRetains[ArgVal]) {
          B.setInsertPoint(X);
          B.createRetain(ArgVal, cast<CallInst>(X));
          X->eraseFromParent();
          ++NumUnknownRetainReleaseSRed;
          Changed = true;
        }
        UnknownRetains[ArgVal].clear();
        break;
      }
      case RT_UnknownRetain: {
        CallInst &CI = cast<CallInst>(Inst);
        Value *ArgVal = RC->getSwiftRCIdentityRoot(CI.getArgOperand(0));
        // unknownRetain(null) is a no-op.
        if (isa<ConstantPointerNull>(ArgVal)) {
          CI.eraseFromParent();
          Changed = true;
          ++NumNoopDeleted;
          continue;
        }
        // We have not yet seen a strong retain/release of this value. Keep it
        // in the unknown retain/release list for now; it might get replaced
        // later.
        if (NativeRefs.find(ArgVal) == NativeRefs.end()) {
          UnknownRetains[ArgVal].push_back(&CI);
        } else {
          B.setInsertPoint(&CI);
          B.createRetain(ArgVal, &CI);
          CI.eraseFromParent();
          ++NumUnknownRetainReleaseSRed;
          Changed = true;
        }
        break;
      }
      case RT_Release: {
        CallInst &CI = cast<CallInst>(Inst);
        Value *ArgVal = RC->getSwiftRCIdentityRoot(CI.getArgOperand(0));
        // release(null) is a no-op.
        if (isa<ConstantPointerNull>(ArgVal)) {
          CI.eraseFromParent();
          Changed = true;
          ++NumNoopDeleted;
          continue;
        }
        // Rewrite unknown releases into swift_releases.
        NativeRefs.insert(ArgVal);
        for (auto &X : UnknownReleases[ArgVal]) {
          B.setInsertPoint(X);
          B.createRelease(ArgVal, cast<CallInst>(X));
          X->eraseFromParent();
          ++NumUnknownRetainReleaseSRed;
          Changed = true;
        }
        UnknownReleases[ArgVal].clear();
        break;
      }
      case RT_UnknownRelease: {
        CallInst &CI = cast<CallInst>(Inst);
        Value *ArgVal = RC->getSwiftRCIdentityRoot(CI.getArgOperand(0));
        // unknownRelease(null) is a no-op.
        if (isa<ConstantPointerNull>(ArgVal)) {
          CI.eraseFromParent();
          Changed = true;
          ++NumNoopDeleted;
          continue;
        }
        // We have not yet seen a strong retain/release of this value. Keep it
        // in the unknown retain/release list for now; it might get replaced
        // later.
        if (NativeRefs.find(ArgVal) == NativeRefs.end()) {
          UnknownReleases[ArgVal].push_back(&CI);
        } else {
          B.setInsertPoint(&CI);
          B.createRelease(ArgVal, &CI);
          CI.eraseFromParent();
          ++NumUnknownRetainReleaseSRed;
          Changed = true;
        }
        break;
      }
      case RT_ObjCRelease: {
        CallInst &CI = cast<CallInst>(Inst);
        Value *ArgVal = RC->getSwiftRCIdentityRoot(CI.getArgOperand(0));
        // objc_release(null) is a no-op, zap it.
        if (isa<ConstantPointerNull>(ArgVal)) {
          CI.eraseFromParent();
          Changed = true;
          ++NumNoopDeleted;
          continue;
        }
        break;
      }
      // These retain instructions return their argument so must be processed
      // specially.
      case RT_BridgeRetain:
      case RT_ObjCRetain: {
        // Canonicalize the retain so that nothing uses its result.
        CallInst &CI = cast<CallInst>(Inst);
        // Do not get the RC-identical value here; we could end up with a
        // crash in replaceAllUsesWith as the types may be different.
        Value *ArgVal = CI.getArgOperand(0);
        if (!CI.use_empty()) {
          CI.replaceAllUsesWith(ArgVal);
          Changed = true;
        }
        // {objc_retain,swift_unknownRetain}(null) is a no-op, delete it.
        if (isa<ConstantPointerNull>(ArgVal)) {
          CI.eraseFromParent();
          Changed = true;
          ++NumNoopDeleted;
          continue;
        }
        break;
      }
      }
    }
  }
  return Changed;
}
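// --- Illustrative sketch (not part of the snippet above) ---
// How a pass might drive this canonicalization before running further ARC
// optimizations; the pass name, member RC, and the exact ARCEntryPointBuilder
// constructor are assumptions for illustration.
bool SwiftARCOptSketch::runOnFunction(Function &F) {
  ARCEntryPointBuilder B(F);
  // Canonicalize retain/release calls first so that later analyses can rely
  // on pointer equality and on calls not forwarding their argument.
  bool Changed = canonicalizeInputFunction(F, B, RC);
  // ... subsequent ARC optimizations would go here ...
  return Changed;
}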
Error AnalysisStyle::dump() {
  auto Tpi = File.getPDBTpiStream();
  if (!Tpi)
    return Tpi.takeError();

  TypeDatabase TypeDB(Tpi->getNumTypeRecords());
  TypeDatabaseVisitor DBV(TypeDB);
  TypeVisitorCallbackPipeline Pipeline;
  HashLookupVisitor Hasher(*Tpi);
  // Add them to the database
  Pipeline.addCallbackToPipeline(DBV);
  // Store their hash values
  Pipeline.addCallbackToPipeline(Hasher);

  if (auto EC = codeview::visitTypeStream(Tpi->typeArray(), Pipeline))
    return EC;

  auto &Adjusters = Tpi->getHashAdjusters();
  DenseSet<uint32_t> AdjusterSet;
  for (const auto &Adj : Adjusters) {
    assert(AdjusterSet.find(Adj.second) == AdjusterSet.end());
    AdjusterSet.insert(Adj.second);
  }

  uint32_t Count = 0;
  outs() << "Searching for hash collisions\n";
  for (const auto &H : Hasher.Lookup) {
    if (H.second.size() <= 1)
      continue;
    ++Count;
    outs() << formatv("Hash: {0}, Count: {1} records\n", H.first,
                      H.second.size());
    for (const auto &R : H.second) {
      auto Iter = AdjusterSet.find(R.TI.getIndex());
      StringRef Prefix;
      if (Iter != AdjusterSet.end()) {
        Prefix = "[HEAD]";
        AdjusterSet.erase(Iter);
      }
      StringRef LeafName = getLeafTypeName(R.Record.Type);
      uint32_t TI = R.TI.getIndex();
      StringRef TypeName = TypeDB.getTypeName(R.TI);
      outs() << formatv("{0,-6} {1} ({2:x}) {3}\n", Prefix, LeafName, TI,
                        TypeName);
    }
  }

  outs() << "\n";
  outs() << "Dumping hash adjustment chains\n";
  for (const auto &A : Tpi->getHashAdjusters()) {
    TypeIndex TI(A.second);
    StringRef TypeName = TypeDB.getTypeName(TI);
    const CVType &HeadRecord = TypeDB.getTypeRecord(TI);
    assert(HeadRecord.Hash.hasValue());
    auto CollisionsIter = Hasher.Lookup.find(*HeadRecord.Hash);
    if (CollisionsIter == Hasher.Lookup.end())
      continue;

    const auto &Collisions = CollisionsIter->second;
    outs() << TypeName << "\n";
    outs() << formatv("  [HEAD] {0:x} {1} {2}\n", A.second,
                      getLeafTypeName(HeadRecord.Type), TypeName);
    for (const auto &Chain : Collisions) {
      if (Chain.TI == TI)
        continue;
      const CVType &TailRecord = TypeDB.getTypeRecord(Chain.TI);
      outs() << formatv("  {0:x} {1} {2}\n", Chain.TI.getIndex(),
                        getLeafTypeName(TailRecord.Type),
                        TypeDB.getTypeName(Chain.TI));
    }
  }
  outs() << formatv("There are {0} orphaned hash adjusters\n",
                    AdjusterSet.size());
  for (const auto &Adj : AdjusterSet) {
    outs() << formatv("  {0}\n", Adj);
  }

  uint32_t DistinctHashValues = Hasher.Lookup.size();
  outs() << formatv("{0}/{1} hash collisions", Count, DistinctHashValues);
  return Error::success();
}