/// processMemCpy - perform simplification of memcpy's. If we have memcpy A /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite /// B to be a memcpy from X to Z (or potentially a memmove, depending on /// circumstances). This allows later passes to remove the first memcpy /// altogether. bool MemCpyOpt::processMemCpy(MemCpyInst *M) { // We can only optimize statically-sized memcpy's that are non-volatile. ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength()); if (CopySize == 0 || M->isVolatile()) return false; // If the source and destination of the memcpy are the same, then zap it. if (M->getSource() == M->getDest()) { MD->removeInstruction(M); M->eraseFromParent(); return false; } // There are two possible optimizations we can do for memcpy: // a) memcpy-memcpy xform which exposes redundancy for DSE. // b) call-memcpy xform for return slot optimization. MemDepResult DepInfo = MD->getDependency(M); if (!DepInfo.isClobber()) return false; if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst())) return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue()); if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) { if (performCallSlotOptzn(M, M->getDest(), M->getSource(), CopySize->getZExtValue(), C)) { M->eraseFromParent(); return true; } } return false; }
/// HandleFree - Handle frees of entire structures whose dependency is a store /// to a field of that structure. bool DSE::HandleFree(CallInst *F) { MemDepResult Dep = MD->getDependency(F); do { if (Dep.isNonLocal()) return false; Instruction *Dependency = Dep.getInst(); if (!hasMemoryWrite(Dependency) || !isRemovable(Dependency)) return false; Value *DepPointer = getStoredPointerOperand(Dependency)->getUnderlyingObject(); // Check for aliasing. if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) return false; // DCE instructions only used to calculate that store DeleteDeadInstruction(Dependency, *MD); ++NumFastStores; // Inst's old Dependency is now deleted. Compute the next dependency, // which may also be dead, as in // s[0] = 0; // s[1] = 0; // This has just been deleted. // free(s); Dep = MD->getDependency(F); } while (!Dep.isNonLocal()); return true; }
/// Handle frees of entire structures whose dependency is a store /// to a field of that structure. static bool handleFree(CallInst *F, AliasAnalysis *AA, MemoryDependenceResults *MD, DominatorTree *DT, const TargetLibraryInfo *TLI, InstOverlapIntervalsTy &IOL, DenseMap<Instruction*, size_t> *InstrOrdering) { bool MadeChange = false; MemoryLocation Loc = MemoryLocation(F->getOperand(0)); SmallVector<BasicBlock *, 16> Blocks; Blocks.push_back(F->getParent()); const DataLayout &DL = F->getModule()->getDataLayout(); while (!Blocks.empty()) { BasicBlock *BB = Blocks.pop_back_val(); Instruction *InstPt = BB->getTerminator(); if (BB == F->getParent()) InstPt = F; MemDepResult Dep = MD->getPointerDependencyFrom(Loc, false, InstPt->getIterator(), BB); while (Dep.isDef() || Dep.isClobber()) { Instruction *Dependency = Dep.getInst(); if (!hasMemoryWrite(Dependency, *TLI) || !isRemovable(Dependency)) break; Value *DepPointer = GetUnderlyingObject(getStoredPointerOperand(Dependency), DL); // Check for aliasing. if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) break; DEBUG(dbgs() << "DSE: Dead Store to soon to be freed memory:\n DEAD: " << *Dependency << '\n'); // DCE instructions only used to calculate that store. BasicBlock::iterator BBI(Dependency); deleteDeadInstruction(Dependency, &BBI, *MD, *TLI, IOL, InstrOrdering); ++NumFastStores; MadeChange = true; // Inst's old Dependency is now deleted. Compute the next dependency, // which may also be dead, as in // s[0] = 0; // s[1] = 0; // This has just been deleted. // free(s); Dep = MD->getPointerDependencyFrom(Loc, false, BBI, BB); } if (Dep.isNonLocal()) findUnconditionalPreds(Blocks, BB, DT); } return MadeChange; }
DataDependence::DepInfo DataDependence::getDepInfo(MemDepResult dep) { if (dep.isClobber()) return DepInfo(dep.getInst(), Clobber); if (dep.isDef()) return DepInfo(dep.getInst(), Def); if (dep.isNonFuncLocal()) return DepInfo(dep.getInst(), NonFuncLocal); if (dep.isUnknown()) return DepInfo(dep.getInst(), Unknown); if (dep.isNonLocal()) return DepInfo(dep.getInst(), NonLocal); llvm_unreachable("unknown dependence type"); }
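For reference, a minimal sketch of where such a MemDepResult usually comes from and how its kinds are distinguished. The helper name classifyLocalDep is invented for the illustration, and it assumes the legacy MemoryDependenceAnalysis interface used in the surrounding snippets (pre-new-pass-manager LLVM).

#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Hypothetical helper: query the local dependency of a memory-accessing
// instruction and report which kind of result came back.
static const char *classifyLocalDep(MemoryDependenceAnalysis &MDA, Instruction *I) {
  MemDepResult R = MDA.getDependency(I);
  if (R.isDef())          return "def";            // a defining access in the same block
  if (R.isClobber())      return "clobber";        // a may-aliasing write in the same block
  if (R.isNonLocal())     return "non-local";      // the dependency lies in a predecessor block
  if (R.isNonFuncLocal()) return "non-func-local"; // nothing inside this function provides it
  return "unknown";
}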
/// We've found that the (upward scanning) memory dependence of \p MemCpy is /// \p MemSet. Try to simplify \p MemSet to only set the trailing bytes that /// weren't copied over by \p MemCpy. /// /// In other words, transform: /// \code /// memset(dst, c, dst_size); /// memcpy(dst, src, src_size); /// \endcode /// into: /// \code /// memcpy(dst, src, src_size); /// memset(dst + src_size, c, dst_size <= src_size ? 0 : dst_size - src_size); /// \endcode bool MemCpyOpt::processMemSetMemCpyDependence(MemCpyInst *MemCpy, MemSetInst *MemSet) { // We can only transform memset/memcpy with the same destination. if (MemSet->getDest() != MemCpy->getDest()) return false; // Check that there are no other dependencies on the memset destination. MemDepResult DstDepInfo = MD->getPointerDependencyFrom( MemoryLocation::getForDest(MemSet), false, MemCpy, MemCpy->getParent()); if (DstDepInfo.getInst() != MemSet) return false; // Use the same i8* dest as the memcpy, killing the memset dest if different. Value *Dest = MemCpy->getRawDest(); Value *DestSize = MemSet->getLength(); Value *SrcSize = MemCpy->getLength(); // By default, create an unaligned memset. unsigned Align = 1; // If Dest is aligned, and SrcSize is constant, use the minimum alignment // of the sum. const unsigned DestAlign = std::max(MemSet->getAlignment(), MemCpy->getAlignment()); if (DestAlign > 1) if (ConstantInt *SrcSizeC = dyn_cast<ConstantInt>(SrcSize)) Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign); IRBuilder<> Builder(MemCpy); // If the sizes have different types, zext the smaller one. if (DestSize->getType() != SrcSize->getType()) { if (DestSize->getType()->getIntegerBitWidth() > SrcSize->getType()->getIntegerBitWidth()) SrcSize = Builder.CreateZExt(SrcSize, DestSize->getType()); else DestSize = Builder.CreateZExt(DestSize, SrcSize->getType()); } Value *MemsetLen = Builder.CreateSelect(Builder.CreateICmpULE(DestSize, SrcSize), ConstantInt::getNullValue(DestSize->getType()), Builder.CreateSub(DestSize, SrcSize)); const DataLayout &DL = MemCpy->getModule()->getDataLayout(); Builder.CreateMemSet(Builder.CreateGEP(Dest, SrcSize), MemSet->getOperand(1), MemsetLen, Align, false, NULL, NULL, NULL, DL.isByteAddressable()); MD->removeInstruction(MemSet); MemSet->eraseFromParent(); return true; }
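To make the size arithmetic above concrete, a small source-level illustration with made-up sizes: with dst_size = 16 and src_size = 10 the select picks 16 - 10 = 6, so only the tail the memcpy does not overwrite is re-set; whenever src_size >= dst_size the new memset length degenerates to 0.

#include <string.h>

/* Before the transform: the memset writes 16 bytes, then the memcpy
   immediately overwrites the first 10 of them. */
void before(char *dst, const char *src) {
  memset(dst, 0xAA, 16);   /* dst_size = 16 */
  memcpy(dst, src, 10);    /* src_size = 10 */
}

/* After the transform: only the trailing 6 bytes still need the memset. */
void after(char *dst, const char *src) {
  memcpy(dst, src, 10);
  memset(dst + 10, 0xAA, 16 - 10);  /* dst_size <= src_size ? 0 : dst_size - src_size */
}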
void DeadStoreEliminationPass::runOverwrittenDeadStoreAnalysisOnFn(Function &F) { MDA = &getAnalysis<MemoryDependenceAnalysis>(F); for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); I != IE; ++I) { Instruction *inst = I; if (StoreInst* SI = dyn_cast<StoreInst>(inst)) { Value *ptr = SI->getPointerOperand(); MemDepResult mdr = MDA->getDependency(inst); Instruction *depInst = mdr.getInst(); if (depInst && (isa<CallInst>(depInst) || isa<InvokeInst>(depInst))) { Function *calledFn; if (CallInst* CI = dyn_cast<CallInst>(depInst)) { calledFn = CI->getCalledFunction(); } else { InvokeInst *II = dyn_cast<InvokeInst>(depInst); calledFn = II->getCalledFunction(); } if (!fnThatStoreOnArgs.count(calledFn)) continue; CallSite CS(depInst); CallSite::arg_iterator actualArgIter = CS.arg_begin(); Function::arg_iterator formalArgIter = calledFn->arg_begin(); int size = calledFn->arg_size(); std::set<Value*> storedArgs = fnThatStoreOnArgs[calledFn]; for (int i = 0; i < size; ++i, ++actualArgIter, ++formalArgIter) { Value *formalArg = formalArgIter; Value *actualArg = *actualArgIter; if (ptr == actualArg && storedArgs.count(formalArg)) { int64_t InstWriteOffset, DepWriteOffset; DEBUG(errs() << " Verifying if store is completely overwritten.\n"); AliasAnalysis::Location Loc(ptr, getPointerSize(ptr, *AA), NULL); AliasAnalysis::Location DepLoc(actualArg, getPointerSize(actualArg, *AA), NULL); OverwriteResult OR = isOverwrite(Loc, DepLoc, *AA, DepWriteOffset, InstWriteOffset); if (OR == OverwriteComplete) { DEBUG(errs() << " Store on " << formalArg->getName() << " will be removed with cloning\n"); deadArguments[depInst].insert(formalArg); } } } if (deadArguments.count(depInst)) { fn2Clone[calledFn].push_back(depInst); } } } } } }
/// HandleFree - Handle frees of entire structures whose dependency is a store /// to a field of that structure. bool DSE::HandleFree(CallInst *F) { bool MadeChange = false; MemoryLocation Loc = MemoryLocation(F->getOperand(0)); SmallVector<BasicBlock *, 16> Blocks; Blocks.push_back(F->getParent()); const DataLayout &DL = F->getModule()->getDataLayout(); while (!Blocks.empty()) { BasicBlock *BB = Blocks.pop_back_val(); Instruction *InstPt = BB->getTerminator(); if (BB == F->getParent()) InstPt = F; MemDepResult Dep = MD->getPointerDependencyFrom(Loc, false, InstPt, BB); while (Dep.isDef() || Dep.isClobber()) { Instruction *Dependency = Dep.getInst(); if (!hasMemoryWrite(Dependency, *TLI) || !isRemovable(Dependency)) break; Value *DepPointer = GetUnderlyingObject(getStoredPointerOperand(Dependency), DL); // Check for aliasing. if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) break; Instruction *Next = std::next(BasicBlock::iterator(Dependency)); // DCE instructions only used to calculate that store DeleteDeadInstruction(Dependency, *MD, *TLI); ++NumFastStores; MadeChange = true; // Inst's old Dependency is now deleted. Compute the next dependency, // which may also be dead, as in // s[0] = 0; // s[1] = 0; // This has just been deleted. // free(s); Dep = MD->getPointerDependencyFrom(Loc, false, Next, BB); } if (Dep.isNonLocal()) FindUnconditionalPreds(Blocks, BB, DT); } return MadeChange; }
/// processMemCpy - perform simplification of memcpy's. If we have memcpy A /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite /// B to be a memcpy from X to Z (or potentially a memmove, depending on /// circumstances). This allows later passes to remove the first memcpy /// altogether. bool MemCpyOpt::processMemCpy(MemCpyInst *M) { // We can only optimize statically-sized memcpy's that are non-volatile. ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength()); if (CopySize == 0 || M->isVolatile()) return false; // If the source and destination of the memcpy are the same, then zap it. if (M->getSource() == M->getDest()) { MD->removeInstruction(M); M->eraseFromParent(); return false; } // If copying from a constant, try to turn the memcpy into a memset. if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource())) if (GV->isConstant() && GV->hasDefinitiveInitializer()) if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) { IRBuilder<> Builder(M); Builder.CreateMemSet(M->getRawDest(), ByteVal, CopySize, M->getAlignment(), false); MD->removeInstruction(M); M->eraseFromParent(); ++NumCpyToSet; return true; } // There are two possible optimizations we can do for memcpy: // a) memcpy-memcpy xform which exposes redundancy for DSE. // b) call-memcpy xform for return slot optimization. MemDepResult DepInfo = MD->getDependency(M); if (!DepInfo.isClobber()) return false; if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst())) return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue()); if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) { if (performCallSlotOptzn(M, M->getDest(), M->getSource(), CopySize->getZExtValue(), C)) { MD->removeInstruction(M); M->eraseFromParent(); return true; } } return false; }
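The memcpy-from-constant-global case above fires when every byte of the initializer is the same value (isBytewiseValue). A source-level picture of what that rewrite amounts to, with names invented for the example:

#include <string.h>

static const char kZeros[32] = {0};   /* constant global, all bytes identical */

void before(char *dst) { memcpy(dst, kZeros, 32); }
void after (char *dst) { memset(dst, 0, 32); }   /* what the pass effectively emits */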
bool MemDepPrinter::runOnFunction(Function &F) { this->F = &F; MemoryDependenceAnalysis &MDA = getAnalysis<MemoryDependenceAnalysis>(); // All this code uses non-const interfaces because MemDep is not // const-friendly, though nothing is actually modified. for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) { Instruction *Inst = &*I; if (!Inst->mayReadFromMemory() && !Inst->mayWriteToMemory()) continue; MemDepResult Res = MDA.getDependency(Inst); if (!Res.isNonLocal()) { Deps[Inst].insert(std::make_pair(getInstTypePair(Res), static_cast<BasicBlock *>(nullptr))); } else if (CallSite CS = cast<Value>(Inst)) { const MemoryDependenceAnalysis::NonLocalDepInfo &NLDI = MDA.getNonLocalCallDependency(CS); DepSet &InstDeps = Deps[Inst]; for (MemoryDependenceAnalysis::NonLocalDepInfo::const_iterator I = NLDI.begin(), E = NLDI.end(); I != E; ++I) { const MemDepResult &Res = I->getResult(); InstDeps.insert(std::make_pair(getInstTypePair(Res), I->getBB())); } } else { SmallVector<NonLocalDepResult, 4> NLDI; assert( (isa<LoadInst>(Inst) || isa<StoreInst>(Inst) || isa<VAArgInst>(Inst)) && "Unknown memory instruction!"); MDA.getNonLocalPointerDependency(Inst, NLDI); DepSet &InstDeps = Deps[Inst]; for (SmallVectorImpl<NonLocalDepResult>::const_iterator I = NLDI.begin(), E = NLDI.end(); I != E; ++I) { const MemDepResult &Res = I->getResult(); InstDeps.insert(std::make_pair(getInstTypePair(Res), I->getBB())); } } } return false; }
bool MemDepPrinter::runOnFunction(Function &F) { this->F = &F; MemoryDependenceResults &MDA = getAnalysis<MemoryDependenceWrapperPass>().getMemDep(); // All this code uses non-const interfaces because MemDep is not // const-friendly, though nothing is actually modified. for (auto &I : instructions(F)) { Instruction *Inst = &I; if (!Inst->mayReadFromMemory() && !Inst->mayWriteToMemory()) continue; MemDepResult Res = MDA.getDependency(Inst); if (!Res.isNonLocal()) { Deps[Inst].insert(std::make_pair(getInstTypePair(Res), static_cast<BasicBlock *>(nullptr))); } else if (auto CS = CallSite(Inst)) { const MemoryDependenceResults::NonLocalDepInfo &NLDI = MDA.getNonLocalCallDependency(CS); DepSet &InstDeps = Deps[Inst]; for (const NonLocalDepEntry &I : NLDI) { const MemDepResult &Res = I.getResult(); InstDeps.insert(std::make_pair(getInstTypePair(Res), I.getBB())); } } else { SmallVector<NonLocalDepResult, 4> NLDI; assert( (isa<LoadInst>(Inst) || isa<StoreInst>(Inst) || isa<VAArgInst>(Inst)) && "Unknown memory instruction!"); MDA.getNonLocalPointerDependency(Inst, NLDI); DepSet &InstDeps = Deps[Inst]; for (const NonLocalDepResult &I : NLDI) { const MemDepResult &Res = I.getResult(); InstDeps.insert(std::make_pair(getInstTypePair(Res), I.getBB())); } } } return false; }
/// GetNonLocalInfoForBlock - Compute the memdep value for BB with /// Pointer/PointeeSize using either cached information in Cache or by doing a /// lookup (which may use dirty cache info if available). If we do a lookup, /// add the result to the cache. MemDepResult MemoryDependenceAnalysis:: GetNonLocalInfoForBlock(Value *Pointer, uint64_t PointeeSize, bool isLoad, BasicBlock *BB, NonLocalDepInfo *Cache, unsigned NumSortedEntries) { // Do a binary search to see if we already have an entry for this block in // the cache set. If so, find it. NonLocalDepInfo::iterator Entry = std::upper_bound(Cache->begin(), Cache->begin()+NumSortedEntries, std::make_pair(BB, MemDepResult())); if (Entry != Cache->begin() && prior(Entry)->first == BB) --Entry; MemDepResult *ExistingResult = 0; if (Entry != Cache->begin()+NumSortedEntries && Entry->first == BB) ExistingResult = &Entry->second; // If we have a cached entry, and it is non-dirty, use it as the value for // this dependency. if (ExistingResult && !ExistingResult->isDirty()) { ++NumCacheNonLocalPtr; return *ExistingResult; } // Otherwise, we have to scan for the value. If we have a dirty cache // entry, start scanning from its position, otherwise we scan from the end // of the block. BasicBlock::iterator ScanPos = BB->end(); if (ExistingResult && ExistingResult->getInst()) { assert(ExistingResult->getInst()->getParent() == BB && "Instruction invalidated?"); ++NumCacheDirtyNonLocalPtr; ScanPos = ExistingResult->getInst(); // Eliminating the dirty entry from 'Cache', so update the reverse info. ValueIsLoadPair CacheKey(Pointer, isLoad); RemoveFromReverseMap(ReverseNonLocalPtrDeps, ScanPos, CacheKey.getOpaqueValue()); } else { ++NumUncacheNonLocalPtr; } // Scan the block for the dependency. MemDepResult Dep = getPointerDependencyFrom(Pointer, PointeeSize, isLoad, ScanPos, BB); // If we had a dirty entry for the block, update it. Otherwise, just add // a new entry. if (ExistingResult) *ExistingResult = Dep; else Cache->push_back(std::make_pair(BB, Dep)); // If the block has a dependency (i.e. it isn't completely transparent to // the value), remember the reverse association because we just added it // to Cache! if (Dep.isNonLocal()) return Dep; // Keep the ReverseNonLocalPtrDeps map up to date so we can efficiently // update MemDep when we remove instructions. Instruction *Inst = Dep.getInst(); assert(Inst && "Didn't depend on anything?"); ValueIsLoadPair CacheKey(Pointer, isLoad); ReverseNonLocalPtrDeps[Inst].insert(CacheKey.getOpaqueValue()); return Dep; }
/// getNonLocalCallDependency - Perform a full dependency query for the /// specified call, returning the set of blocks that the value is /// potentially live across. The returned set of results will include a /// "NonLocal" result for all blocks where the value is live across. /// /// This method assumes the instruction returns a "NonLocal" dependency /// within its own block. /// /// This returns a reference to an internal data structure that may be /// invalidated on the next non-local query or when an instruction is /// removed. Clients must copy this data if they want it around longer than /// that. const MemoryDependenceAnalysis::NonLocalDepInfo & MemoryDependenceAnalysis::getNonLocalCallDependency(CallSite QueryCS) { assert(getDependency(QueryCS.getInstruction()).isNonLocal() && "getNonLocalCallDependency should only be used on calls with non-local deps!"); PerInstNLInfo &CacheP = NonLocalDeps[QueryCS.getInstruction()]; NonLocalDepInfo &Cache = CacheP.first; /// DirtyBlocks - This is the set of blocks that need to be recomputed. In /// the cached case, this can happen due to instructions being deleted etc. In /// the uncached case, this starts out as the set of predecessors we care /// about. SmallVector<BasicBlock*, 32> DirtyBlocks; if (!Cache.empty()) { // Okay, we have a cache entry. If we know it is not dirty, just return it // with no computation. if (!CacheP.second) { NumCacheNonLocal++; return Cache; } // If we already have a partially computed set of results, scan them to // determine what is dirty, seeding our initial DirtyBlocks worklist. for (NonLocalDepInfo::iterator I = Cache.begin(), E = Cache.end(); I != E; ++I) if (I->second.isDirty()) DirtyBlocks.push_back(I->first); // Sort the cache so that we can do fast binary search lookups below. std::sort(Cache.begin(), Cache.end()); ++NumCacheDirtyNonLocal; //cerr << "CACHED CASE: " << DirtyBlocks.size() << " dirty: " // << Cache.size() << " cached: " << *QueryInst; } else { // Seed DirtyBlocks with each of the preds of QueryInst's block. BasicBlock *QueryBB = QueryCS.getInstruction()->getParent(); for (BasicBlock **PI = PredCache->GetPreds(QueryBB); *PI; ++PI) DirtyBlocks.push_back(*PI); NumUncacheNonLocal++; } // isReadonlyCall - If this is a read-only call, we can be more aggressive. bool isReadonlyCall = AA->onlyReadsMemory(QueryCS); SmallPtrSet<BasicBlock*, 64> Visited; unsigned NumSortedEntries = Cache.size(); DEBUG(AssertSorted(Cache)); // Iterate while we still have blocks to update. while (!DirtyBlocks.empty()) { BasicBlock *DirtyBB = DirtyBlocks.back(); DirtyBlocks.pop_back(); // Already processed this block? if (!Visited.insert(DirtyBB)) continue; // Do a binary search to see if we already have an entry for this block in // the cache set. If so, find it. DEBUG(AssertSorted(Cache, NumSortedEntries)); NonLocalDepInfo::iterator Entry = std::upper_bound(Cache.begin(), Cache.begin()+NumSortedEntries, std::make_pair(DirtyBB, MemDepResult())); if (Entry != Cache.begin() && prior(Entry)->first == DirtyBB) --Entry; MemDepResult *ExistingResult = 0; if (Entry != Cache.begin()+NumSortedEntries && Entry->first == DirtyBB) { // If we already have an entry, and if it isn't already dirty, the block // is done. if (!Entry->second.isDirty()) continue; // Otherwise, remember this slot so we can update the value. ExistingResult = &Entry->second; } // If the dirty entry has a pointer, start scanning from it so we don't have // to rescan the entire block. 
BasicBlock::iterator ScanPos = DirtyBB->end(); if (ExistingResult) { if (Instruction *Inst = ExistingResult->getInst()) { ScanPos = Inst; // We're removing QueryInst's use of Inst. RemoveFromReverseMap(ReverseNonLocalDeps, Inst, QueryCS.getInstruction()); } } // Find out if this block has a local dependency for QueryInst. MemDepResult Dep; if (ScanPos != DirtyBB->begin()) { Dep = getCallSiteDependencyFrom(QueryCS, isReadonlyCall,ScanPos, DirtyBB); } else if (DirtyBB != &DirtyBB->getParent()->getEntryBlock()) { // No dependence found. If this is the entry block of the function, it is // a clobber, otherwise it is non-local. Dep = MemDepResult::getNonLocal(); } else { Dep = MemDepResult::getClobber(ScanPos); } // If we had a dirty entry for the block, update it. Otherwise, just add // a new entry. if (ExistingResult) *ExistingResult = Dep; else Cache.push_back(std::make_pair(DirtyBB, Dep)); // If the block has a dependency (i.e. it isn't completely transparent to // the value), remember the association! if (!Dep.isNonLocal()) { // Keep the ReverseNonLocalDeps map up to date so we can efficiently // update this when we remove instructions. if (Instruction *Inst = Dep.getInst()) ReverseNonLocalDeps[Inst].insert(QueryCS.getInstruction()); } else { // If the block *is* completely transparent to the load, we need to check // the predecessors of this block. Add them to our worklist. for (BasicBlock **PI = PredCache->GetPreds(DirtyBB); *PI; ++PI) DirtyBlocks.push_back(*PI); } } return Cache; }
/// processMemCpy - perform simplification of memcpy's. If we have memcpy A /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite /// B to be a memcpy from X to Z (or potentially a memmove, depending on /// circumstances). This allows later passes to remove the first memcpy /// altogether. bool MemCpyOpt::processMemCpy(MemCpyInst *M) { // We can only optimize non-volatile memcpy's. if (M->isVolatile()) return false; // If the source and destination of the memcpy are the same, then zap it. if (M->getSource() == M->getDest()) { MD->removeInstruction(M); M->eraseFromParent(); return false; } // If copying from a constant, try to turn the memcpy into a memset. if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource())) if (GV->isConstant() && GV->hasDefinitiveInitializer()) if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) { IRBuilder<> Builder(M); Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(), M->getAlignment(), false); MD->removeInstruction(M); M->eraseFromParent(); ++NumCpyToSet; return true; } MemDepResult DepInfo = MD->getDependency(M); // Try to turn a partially redundant memset + memcpy into // memcpy + smaller memset. We don't need the memcpy size for this. if (DepInfo.isClobber()) if (MemSetInst *MDep = dyn_cast<MemSetInst>(DepInfo.getInst())) if (processMemSetMemCpyDependence(M, MDep)) return true; // The optimizations after this point require the memcpy size. ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength()); if (!CopySize) return false; // There are four possible optimizations we can do for memcpy: // a) memcpy-memcpy xform which exposes redundancy for DSE. // b) call-memcpy xform for return slot optimization. // c) memcpy from freshly alloca'd space or space that has just started its // lifetime copies undefined data, and we can therefore eliminate the // memcpy in favor of the data that was already at the destination. // d) memcpy from a just-memset'd source can be turned into memset. if (DepInfo.isClobber()) { if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) { if (performCallSlotOptzn(M, M->getDest(), M->getSource(), CopySize->getZExtValue(), M->getAlignment(), C)) { MD->removeInstruction(M); M->eraseFromParent(); return true; } } } AliasAnalysis::Location SrcLoc = AliasAnalysis::getLocationForSource(M); MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true, M, M->getParent()); if (SrcDepInfo.isClobber()) { if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst())) return processMemCpyMemCpyDependence(M, MDep); } else if (SrcDepInfo.isDef()) { Instruction *I = SrcDepInfo.getInst(); bool hasUndefContents = false; if (isa<AllocaInst>(I)) { hasUndefContents = true; } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { if (II->getIntrinsicID() == Intrinsic::lifetime_start) if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0))) if (LTSize->getZExtValue() >= CopySize->getZExtValue()) hasUndefContents = true; } if (hasUndefContents) { MD->removeInstruction(M); M->eraseFromParent(); ++NumMemCpyInstr; return true; } } if (SrcDepInfo.isClobber()) if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst())) if (performMemCpyToMemSetOptzn(M, MDep)) { MD->removeInstruction(M); M->eraseFromParent(); ++NumCpyToSet; return true; } return false; }
bool DSE::runOnBasicBlock(BasicBlock &BB) { bool MadeChange = false; // Do a top-down walk on the BB. for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) { Instruction *Inst = BBI++; // Handle 'free' calls specially. if (CallInst *F = isFreeCall(Inst, TLI)) { MadeChange |= HandleFree(F); continue; } // If we find something that writes memory, get its memory dependence. if (!hasMemoryWrite(Inst, TLI)) continue; MemDepResult InstDep = MD->getDependency(Inst); // Ignore any store where we can't find a local dependence. // FIXME: cross-block DSE would be fun. :) if (!InstDep.isDef() && !InstDep.isClobber()) continue; // If we're storing the same value back to a pointer that we just // loaded from, then the store can be removed. if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { if (LoadInst *DepLoad = dyn_cast<LoadInst>(InstDep.getInst())) { if (SI->getPointerOperand() == DepLoad->getPointerOperand() && SI->getOperand(0) == DepLoad && isRemovable(SI)) { DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n " << "LOAD: " << *DepLoad << "\n STORE: " << *SI << '\n'); // DeleteDeadInstruction can delete the current instruction. Save BBI // in case we need it. WeakVH NextInst(BBI); DeleteDeadInstruction(SI, *MD, TLI); if (!NextInst) // Next instruction deleted. BBI = BB.begin(); else if (BBI != BB.begin()) // Revisit this instruction if possible. --BBI; ++NumFastStores; MadeChange = true; continue; } } } // Figure out what location is being stored to. AliasAnalysis::Location Loc = getLocForWrite(Inst, *AA); // If we didn't get a useful location, fail. if (!Loc.Ptr) continue; while (InstDep.isDef() || InstDep.isClobber()) { // Get the memory clobbered by the instruction we depend on. MemDep will // skip any instructions that 'Loc' clearly doesn't interact with. If we // end up depending on a may- or must-aliased load, then we can't optimize // away the store and we bail out. However, if we depend on something // that overwrites the memory location we *can* potentially optimize it. // // Find out what memory location the dependent instruction stores. Instruction *DepWrite = InstDep.getInst(); AliasAnalysis::Location DepLoc = getLocForWrite(DepWrite, *AA); // If we didn't get a useful location, or if it isn't a known size, bail out. if (!DepLoc.Ptr) break; // If we find a write that is a) removable (i.e., non-volatile), b) is // completely obliterated by the store to 'Loc', and c) which we know that // 'Inst' doesn't load from, then we can remove it. if (isRemovable(DepWrite) && !isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) { int64_t InstWriteOffset, DepWriteOffset; OverwriteResult OR = isOverwrite(Loc, DepLoc, *AA, DepWriteOffset, InstWriteOffset); if (OR == OverwriteComplete) { DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite << "\n KILLER: " << *Inst << '\n'); // Delete the store and now-dead instructions that feed it. DeleteDeadInstruction(DepWrite, *MD, TLI); ++NumFastStores; MadeChange = true; // DeleteDeadInstruction can delete the current instruction in loop // cases, reset BBI.
BBI = Inst; if (BBI != BB.begin()) --BBI; break; } else if (OR == OverwriteEnd && isShortenable(DepWrite)) { // TODO: base this on the target vector size so that if the earlier // store was too small to get vector writes anyway then it's likely // a good idea to shorten it. // Power of 2 vector writes are probably always a bad idea to optimize // as any store/memset/memcpy is likely using vector instructions, so // shortening it to a non-vector size is likely to be slower. MemIntrinsic* DepIntrinsic = cast<MemIntrinsic>(DepWrite); unsigned DepWriteAlign = DepIntrinsic->getAlignment(); if (llvm::isPowerOf2_64(InstWriteOffset) || ((DepWriteAlign != 0) && InstWriteOffset % DepWriteAlign == 0)) { DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW END: " << *DepWrite << "\n KILLER (offset " << InstWriteOffset << ", " << DepLoc.Size << ")" << *Inst << '\n'); Value* DepWriteLength = DepIntrinsic->getLength(); Value* TrimmedLength = ConstantInt::get(DepWriteLength->getType(), InstWriteOffset - DepWriteOffset); DepIntrinsic->setLength(TrimmedLength); MadeChange = true; } } } // If this is a may-aliased store that is clobbering the store value, we // can keep searching past it for another must-aliased pointer that stores // to the same location. For example, in: // store -> P // store -> Q // store -> P // we can remove the first store to P even though we don't know if P and Q // alias. if (DepWrite == &BB.front()) break; // Can't look past this instruction if it might read 'Loc'. if (AA->getModRefInfo(DepWrite, Loc) & AliasAnalysis::Ref) break; InstDep = MD->getPointerDependencyFrom(Loc, false, DepWrite, &BB); } } // If this block ends in a return, unwind, or unreachable, all allocas are // dead at its end, which means stores to them are also dead. if (BB.getTerminator()->getNumSuccessors() == 0) MadeChange |= handleEndBlock(BB); return MadeChange; }
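As a source-level picture of the OverwriteComplete case handled above (variable names invented): the first store is deleted because a later store to the same location completely overwrites it and nothing reads the location in between.

void before(int *p, int v) {
  p[0] = 0;    /* dead: completely overwritten below, never read in between */
  p[0] = v;
}

void after(int *p, int v) {
  p[0] = v;
}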
void DSWP::buildPDG(Loop *L) { //Initialize PDG for (Loop::block_iterator bi = L->getBlocks().begin(); bi != L->getBlocks().end(); bi++) { BasicBlock *BB = *bi; for (BasicBlock::iterator ui = BB->begin(); ui != BB->end(); ui++) { Instruction *inst = &(*ui); //standardize the name for all exprs if (util.hasNewDef(inst)) { inst->setName(util.genId()); dname[inst] = inst->getNameStr(); } else { dname[inst] = util.genId(); } pdg[inst] = new vector<Edge>(); rev[inst] = new vector<Edge>(); } } //LoopInfo &li = getAnalysis<LoopInfo>(); /* * Memory dependency analysis */ MemoryDependenceAnalysis &mda = getAnalysis<MemoryDependenceAnalysis>(); for (Loop::block_iterator bi = L->getBlocks().begin(); bi != L->getBlocks().end(); bi++) { BasicBlock *BB = *bi; for (BasicBlock::iterator ii = BB->begin(); ii != BB->end(); ii++) { Instruction *inst = &(*ii); //data dependence = register dependence + memory dependence //begin register dependence for (Value::use_iterator ui = ii->use_begin(); ui != ii->use_end(); ui++) { if (Instruction *user = dyn_cast<Instruction>(*ui)) { addEdge(inst, user, REG); } } //finish register dependence //begin memory dependence MemDepResult mdr = mda.getDependency(inst); //TODO not sure what clobbers mean!! if (mdr.isDef()) { Instruction *dep = mdr.getInst(); if (isa<LoadInst>(inst)) { if (isa<StoreInst>(dep)) { addEdge(dep, inst, DTRUE); //READ AFTER WRITE } } if (isa<StoreInst>(inst)) { if (isa<LoadInst>(dep)) { addEdge(dep, inst, DANTI); //WRITE AFTER READ } if (isa<StoreInst>(dep)) { addEdge(dep, inst, DOUT); //WRITE AFTER WRITE } } //READ AFTER READ IS INSERTED AFTER PDG BUILD } //end memory dependence }//for ii }//for bi /* * begin control dependence */ PostDominatorTree &pdt = getAnalysis<PostDominatorTree>(); //cout << pdt.getRootNode()->getBlock()->getNameStr() << endl; /* * alien code part 1 */ LoopInfo *LI = &getAnalysis<LoopInfo>(); std::set<BranchInst*> backedgeParents; for (Loop::block_iterator bi = L->getBlocks().begin(); bi != L->getBlocks().end(); bi++) { BasicBlock *BB = *bi; for (BasicBlock::iterator ii = BB->begin(); ii != BB->end(); ii++) { Instruction *inst = ii; if (BranchInst *brInst = dyn_cast<BranchInst>(inst)) { // get the loop this instruction (and therefore basic block) belongs to Loop *instLoop = LI->getLoopFor(BB); bool branchesToHeader = false; for (int i = brInst->getNumSuccessors() - 1; i >= 0 && !branchesToHeader; i--) { // if the branch could exit, store it if (LI->getLoopFor(brInst->getSuccessor(i)) != instLoop) { branchesToHeader = true; } } if (branchesToHeader) { backedgeParents.insert(brInst); } } } } //build information for predecessors of blocks in the post dominator tree for (Function::iterator bi = func->begin(); bi != func->end(); bi++) { BasicBlock *BB = bi; DomTreeNode *dn = pdt.getNode(BB); for (DomTreeNode::iterator di = dn->begin(); di != dn->end(); di++) { BasicBlock *CB = (*di)->getBlock(); pre[CB] = BB; } } // // //add dependency within a basicblock // for (Loop::block_iterator bi = L->getBlocks().begin(); bi != L->getBlocks().end(); bi++) { // BasicBlock *BB = *bi; // Instruction *pre = NULL; // for (BasicBlock::iterator ui = BB->begin(); ui != BB->end(); ui++) { // Instruction *inst = &(*ui); // if (pre != NULL) { // addEdge(pre, inst, CONTROL); // } // pre = inst; // } // } // //the special kind of dependence needs loop peeling ? 
I don't know whether this is needed // for (Loop::block_iterator bi = L->getBlocks().begin(); bi != L->getBlocks().end(); bi++) { // BasicBlock *BB = *bi; // for (succ_iterator PI = succ_begin(BB); PI != succ_end(BB); ++PI) { // BasicBlock *succ = *PI; // // checkControlDependence(BB, succ, pdt); // } // } /* * alien code part 2 */ // add normal control dependencies // loop through each instruction for (Loop::block_iterator bbIter = L->block_begin(); bbIter != L->block_end(); ++bbIter) { BasicBlock *bb = *bbIter; // check the successors of this basic block if (BranchInst *branchInst = dyn_cast<BranchInst>(bb->getTerminator())) { if (branchInst->getNumSuccessors() > 1) { BasicBlock * succ = branchInst->getSuccessor(0); // if the successor is nested shallower than the current basic block, continue if (LI->getLoopDepth(bb) < LI->getLoopDepth(succ)) { continue; } // otherwise, add all instructions to graph as control dependence while (succ != NULL && succ != bb && LI->getLoopDepth(succ) >= LI->getLoopDepth(bb)) { Instruction *terminator = bb->getTerminator(); for (BasicBlock::iterator succInstIter = succ->begin(); &(*succInstIter) != succ->getTerminator(); ++succInstIter) { addEdge(terminator, &(*succInstIter), CONTROL); } if (BranchInst *succBrInst = dyn_cast<BranchInst>(succ->getTerminator())) { if (succBrInst->getNumSuccessors() > 1) { addEdge(terminator, succ->getTerminator(), CONTROL); } } if (BranchInst *br = dyn_cast<BranchInst>(succ->getTerminator())) { if (br->getNumSuccessors() == 1) { succ = br->getSuccessor(0); } else { succ = NULL; } } else { succ = NULL; } } } } } /* * alien code part 3 */ for (std::set<BranchInst*>::iterator exitIter = backedgeParents.begin(); exitIter != backedgeParents.end(); ++exitIter) { BranchInst *exitBranch = *exitIter; if (exitBranch->isConditional()) { BasicBlock *header = LI->getLoopFor(exitBranch->getParent())->getHeader(); for (BasicBlock::iterator ctrlIter = header->begin(); ctrlIter != header->end(); ++ctrlIter) { addEdge(exitBranch, &(*ctrlIter), CONTROL); } } } //end control dependence }
bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (!SI->isSimple()) return false; // Avoid merging nontemporal stores since the resulting // memcpy/memset would not be able to preserve the nontemporal hint. // In theory we could teach how to propagate the !nontemporal metadata to // memset calls. However, that change would force the backend to // conservatively expand !nontemporal memset calls back to sequences of // store instructions (effectively undoing the merging). if (SI->getMetadata(LLVMContext::MD_nontemporal)) return false; const DataLayout &DL = SI->getModule()->getDataLayout(); // Load to store forwarding can be interpreted as memcpy. if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) { if (LI->isSimple() && LI->hasOneUse() && LI->getParent() == SI->getParent()) { auto *T = LI->getType(); if (T->isAggregateType()) { AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); MemoryLocation LoadLoc = MemoryLocation::get(LI); // We use alias analysis to check if an instruction may store to // the memory we load from in between the load and the store. If // such an instruction is found, we store it in AI. Instruction *AI = nullptr; for (BasicBlock::iterator I = ++LI->getIterator(), E = SI->getIterator(); I != E; ++I) { if (AA.getModRefInfo(&*I, LoadLoc) & MRI_Mod) { AI = &*I; break; } } // If no aliasing instruction is found, then we can promote the // load/store pair to a memcpy at the store location. if (!AI) { // If we load from memory that may alias the memory we store to, // memmove must be used to preserve semantics. If not, memcpy can // be used. bool UseMemMove = false; if (!AA.isNoAlias(MemoryLocation::get(SI), LoadLoc)) UseMemMove = true; unsigned Align = findCommonAlignment(DL, SI, LI); uint64_t Size = DL.getTypeStoreSize(T); IRBuilder<> Builder(SI); Instruction *M; if (UseMemMove) M = Builder.CreateMemMove(SI->getPointerOperand(), LI->getPointerOperand(), Size, Align, SI->isVolatile()); else M = Builder.CreateMemCpy(SI->getPointerOperand(), LI->getPointerOperand(), Size, Align, SI->isVolatile()); DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => " << *M << "\n"); MD->removeInstruction(SI); SI->eraseFromParent(); MD->removeInstruction(LI); LI->eraseFromParent(); ++NumMemCpyInstr; // Make sure we do not invalidate the iterator. BBI = M->getIterator(); return true; } } // Detect cases where we're performing call slot forwarding, but // happen to be using a load-store pair to implement it, rather than // a memcpy. MemDepResult ldep = MD->getDependency(LI); CallInst *C = nullptr; if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst())) C = dyn_cast<CallInst>(ldep.getInst()); if (C) { // Check that nothing touches the dest of the "copy" between // the call and the store.
AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); MemoryLocation StoreLoc = MemoryLocation::get(SI); for (BasicBlock::iterator I = --SI->getIterator(), E = C->getIterator(); I != E; --I) { if (AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) { C = nullptr; break; } } } if (C) { bool changed = performCallSlotOptzn( LI, SI->getPointerOperand()->stripPointerCasts(), LI->getPointerOperand()->stripPointerCasts(), DL.getTypeStoreSize(SI->getOperand(0)->getType()), findCommonAlignment(DL, SI, LI), C); if (changed) { MD->removeInstruction(SI); SI->eraseFromParent(); MD->removeInstruction(LI); LI->eraseFromParent(); ++NumMemCpyInstr; return true; } } } } // There are two cases that are interesting for this code to handle: memcpy // and memset. Right now we only handle memset. // Ensure that the value being stored is something that can be memset'able a // byte at a time like "0" or "-1" or any width, as well as things like // 0xA0A0A0A0 and 0.0. if (Value *ByteVal = isBytewiseValue(SI->getOperand(0))) if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal)) { BBI = I->getIterator(); // Don't invalidate iterator. return true; } return false; }
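The call-slot path above matches the pattern where a call writes into a temporary that is then copied wholesale into the real destination via a load/store pair. A rough source-level equivalent of what performCallSlotOptzn tries to achieve (struct S and init are invented for the illustration, and init is assumed to write only through its argument):

struct S { int data[8]; };
void init(struct S *out);   /* assumed to only write through its argument */

void before(struct S *dst) {
  struct S tmp;
  init(&tmp);
  *dst = tmp;               /* load/store pair copying tmp into dst */
}

void after(struct S *dst) {
  init(dst);                /* the temporary and the copy are gone */
}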
/// processMemCpy - perform simplification of memcpy's. If we have memcpy A which /// copies X to Y, and memcpy B which copies Y to Z, then we can rewrite B to be /// a memcpy from X to Z (or potentially a memmove, depending on circumstances). /// This allows later passes to remove the first memcpy altogether. bool MemCpyOpt::processMemCpy(MemCpyInst *M) { MemoryDependenceAnalysis &MD = getAnalysis<MemoryDependenceAnalysis>(); // There are two possible optimizations we can do for memcpy: // a) memcpy-memcpy xform which exposes redundancy for DSE. // b) call-memcpy xform for return slot optimization. MemDepResult dep = MD.getDependency(M); if (!dep.isClobber()) return false; if (!isa<MemCpyInst>(dep.getInst())) { if (CallInst *C = dyn_cast<CallInst>(dep.getInst())) return performCallSlotOptzn(M, C); return false; } MemCpyInst *MDep = cast<MemCpyInst>(dep.getInst()); // We can only transform memcpy's where the dest of one is the source of the // other. if (M->getSource() != MDep->getDest()) return false; // Second, the length of the memcpy's must be the same, or the preceding one // must be larger than the following one. ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength()); ConstantInt *C2 = dyn_cast<ConstantInt>(M->getLength()); if (!C1 || !C2) return false; uint64_t DepSize = C1->getValue().getZExtValue(); uint64_t CpySize = C2->getValue().getZExtValue(); if (DepSize < CpySize) return false; // Finally, we have to make sure that the dest of the second does not // alias the source of the first. AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); if (AA.alias(M->getRawDest(), CpySize, MDep->getRawSource(), DepSize) != AliasAnalysis::NoAlias) return false; else if (AA.alias(M->getRawDest(), CpySize, M->getRawSource(), CpySize) != AliasAnalysis::NoAlias) return false; else if (AA.alias(MDep->getRawDest(), DepSize, MDep->getRawSource(), DepSize) != AliasAnalysis::NoAlias) return false; // If all checks passed, then we can transform these memcpy's const Type *ArgTys[3] = { M->getRawDest()->getType(), MDep->getRawSource()->getType(), M->getLength()->getType() }; Function *MemCpyFun = Intrinsic::getDeclaration( M->getParent()->getParent()->getParent(), M->getIntrinsicID(), ArgTys, 3); Value *Args[5] = { M->getRawDest(), MDep->getRawSource(), M->getLength(), M->getAlignmentCst(), M->getVolatileCst() }; CallInst *C = CallInst::Create(MemCpyFun, Args, Args+5, "", M); // If C and M don't interfere, then this is a valid transformation. If they // did, this would mean that the two sources overlap, which would be bad. if (MD.getDependency(C) == dep) { MD.removeInstruction(M); M->eraseFromParent(); ++NumMemCpyInstr; return true; } // Otherwise, there was no point in doing this, so we remove the call we // inserted and act like nothing happened. MD.removeInstruction(C); C->eraseFromParent(); return false; }
bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (!SI->isSimple()) return false; // Avoid merging nontemporal stores since the resulting // memcpy/memset would not be able to preserve the nontemporal hint. // In theory we could teach how to propagate the !nontemporal metadata to // memset calls. However, that change would force the backend to // conservatively expand !nontemporal memset calls back to sequences of // store instructions (effectively undoing the merging). if (SI->getMetadata(LLVMContext::MD_nontemporal)) return false; const DataLayout &DL = SI->getModule()->getDataLayout(); // Detect cases where we're performing call slot forwarding, but // happen to be using a load-store pair to implement it, rather than // a memcpy. if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) { if (LI->isSimple() && LI->hasOneUse() && LI->getParent() == SI->getParent()) { MemDepResult ldep = MD->getDependency(LI); CallInst *C = nullptr; if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst())) C = dyn_cast<CallInst>(ldep.getInst()); if (C) { // Check that nothing touches the dest of the "copy" between // the call and the store. AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); MemoryLocation StoreLoc = MemoryLocation::get(SI); for (BasicBlock::iterator I = --SI->getIterator(), E = C->getIterator(); I != E; --I) { if (AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) { C = nullptr; break; } } } if (C) { unsigned storeAlign = SI->getAlignment(); if (!storeAlign) storeAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType()); unsigned loadAlign = LI->getAlignment(); if (!loadAlign) loadAlign = DL.getABITypeAlignment(LI->getType()); bool changed = performCallSlotOptzn( LI, SI->getPointerOperand()->stripPointerCasts(), LI->getPointerOperand()->stripPointerCasts(), DL.getTypeStoreSize(SI->getOperand(0)->getType()), std::min(storeAlign, loadAlign), C); if (changed) { MD->removeInstruction(SI); SI->eraseFromParent(); MD->removeInstruction(LI); LI->eraseFromParent(); ++NumMemCpyInstr; return true; } } } } // There are two cases that are interesting for this code to handle: memcpy // and memset. Right now we only handle memset. // Ensure that the value being stored is something that can be memset'able a // byte at a time like "0" or "-1" or any width, as well as things like // 0xA0A0A0A0 and 0.0. if (Value *ByteVal = isBytewiseValue(SI->getOperand(0))) if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal)) { BBI = I->getIterator(); // Don't invalidate iterator. return true; } return false; }
/// processStore - When GVN is scanning forward over instructions, we look for /// some other patterns to fold away. In particular, this looks for stores to /// neighboring locations of memory. If it sees enough consecutive ones /// (currently 4) it attempts to merge them together into a memcpy/memset. bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (SI->isVolatile()) return false; TargetData *TD = getAnalysisIfAvailable<TargetData>(); if (!TD) return false; // Detect cases where we're performing call slot forwarding, but // happen to be using a load-store pair to implement it, rather than // a memcpy. if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) { if (!LI->isVolatile() && LI->hasOneUse()) { MemDepResult dep = MD->getDependency(LI); CallInst *C = 0; if (dep.isClobber() && !isa<MemCpyInst>(dep.getInst())) C = dyn_cast<CallInst>(dep.getInst()); if (C) { bool changed = performCallSlotOptzn(LI, SI->getPointerOperand()->stripPointerCasts(), LI->getPointerOperand()->stripPointerCasts(), TD->getTypeStoreSize(SI->getOperand(0)->getType()), C); if (changed) { MD->removeInstruction(SI); SI->eraseFromParent(); LI->eraseFromParent(); ++NumMemCpyInstr; return true; } } } } LLVMContext &Context = SI->getContext(); // There are two cases that are interesting for this code to handle: memcpy // and memset. Right now we only handle memset. // Ensure that the value being stored is something that can be memset'able a // byte at a time like "0" or "-1" or any width, as well as things like // 0xA0A0A0A0 and 0.0. Value *ByteVal = isBytewiseValue(SI->getOperand(0)); if (!ByteVal) return false; AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); Module *M = SI->getParent()->getParent()->getParent(); // Okay, so we now have a single store that can be splattable. Scan to find // all subsequent stores of the same value to offsets from the same pointer. // Join these together into ranges, so we can decide whether contiguous blocks // are stored. MemsetRanges Ranges(*TD); Value *StartPtr = SI->getPointerOperand(); BasicBlock::iterator BI = SI; for (++BI; !isa<TerminatorInst>(BI); ++BI) { if (isa<CallInst>(BI) || isa<InvokeInst>(BI)) { // If the call is readnone, ignore it, otherwise bail out. We don't even // allow readonly here because we don't want something like: // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A). if (AA.getModRefBehavior(CallSite(BI)) == AliasAnalysis::DoesNotAccessMemory) continue; // TODO: If this is a memset, try to join it in. break; } else if (isa<VAArgInst>(BI) || isa<LoadInst>(BI)) break; // If this is a non-store instruction it is fine, ignore it. StoreInst *NextStore = dyn_cast<StoreInst>(BI); if (NextStore == 0) continue; // If this is a store, see if we can merge it in. if (NextStore->isVolatile()) break; // Check to see if this stored value is of the same byte-splattable value. if (ByteVal != isBytewiseValue(NextStore->getOperand(0))) break; // Check to see if this store is to a constant offset from the start ptr. int64_t Offset; if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset, *TD)) break; Ranges.addStore(Offset, NextStore); } // If we have no ranges, then we just had a single store with nothing that // could be merged in. This is a very common case of course. if (Ranges.empty()) return false; // If we had at least one store that could be merged in, add the starting // store as well. We try to avoid this unless there is at least something // interesting as a small compile-time optimization.
Ranges.addStore(0, SI); // Now that we have full information about ranges, loop over the ranges and // emit memset's for anything big enough to be worthwhile. bool MadeChange = false; for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end(); I != E; ++I) { const MemsetRange &Range = *I; if (Range.TheStores.size() == 1) continue; // If it is profitable to lower this range to memset, do so now. if (!Range.isProfitableToUseMemset(*TD)) continue; // Otherwise, we do want to transform this! Create a new memset. We put // the memset right before the first instruction that isn't part of this // memset block. This ensures that the memset is dominated by any addressing // instruction needed by the start of the block. BasicBlock::iterator InsertPt = BI; // Get the starting pointer of the block. StartPtr = Range.StartPtr; // Determine alignment unsigned Alignment = Range.Alignment; if (Alignment == 0) { const Type *EltType = cast<PointerType>(StartPtr->getType())->getElementType(); Alignment = TD->getABITypeAlignment(EltType); } // Cast the start ptr to be i8* as memset requires. const PointerType* StartPTy = cast<PointerType>(StartPtr->getType()); const PointerType *i8Ptr = Type::getInt8PtrTy(Context, StartPTy->getAddressSpace()); if (StartPTy != i8Ptr) StartPtr = new BitCastInst(StartPtr, i8Ptr, StartPtr->getName(), InsertPt); Value *Ops[] = { StartPtr, ByteVal, // Start, value // size ConstantInt::get(Type::getInt64Ty(Context), Range.End-Range.Start), // align ConstantInt::get(Type::getInt32Ty(Context), Alignment), // volatile ConstantInt::getFalse(Context), }; const Type *Tys[] = { Ops[0]->getType(), Ops[2]->getType() }; Function *MemSetF = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys, 2); Value *C = CallInst::Create(MemSetF, Ops, Ops+5, "", InsertPt); DEBUG(dbgs() << "Replace stores:\n"; for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i) dbgs() << *Range.TheStores[i] << '\n'; dbgs() << "With: " << *C << '\n'); C=C; // Don't invalidate the iterator BBI = BI; // Zap all the stores. for (SmallVector<StoreInst*, 16>::const_iterator SI = Range.TheStores.begin(), SE = Range.TheStores.end(); SI != SE; ++SI) (*SI)->eraseFromParent(); ++NumMemSetInfer; MadeChange = true; } return MadeChange; }
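A source-level view of what the range merging above produces when it finds enough adjacent byte-splattable stores (the count and layout here are illustrative):

#include <string.h>

void before(char *a) {
  a[0] = 0; a[1] = 0; a[2] = 0; a[3] = 0;
  a[4] = 0; a[5] = 0; a[6] = 0; a[7] = 0;
}

void after(char *a) {
  memset(a, 0, 8);   /* one call covering the contiguous range [0, 8) */
}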
/// processMemCpyMemCpyDependence - We've found that the (upward scanning) /// memory dependence of memcpy 'M' is the memcpy 'MDep'. Try to simplify M to /// copy from MDep's input if we can. MSize is the size of M's copy. /// bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, uint64_t MSize) { // We can only transform memcpy's where the dest of one is the source of the // other. if (M->getSource() != MDep->getDest() || MDep->isVolatile()) return false; // If the dep instruction is reading from our current input, then it is a noop // transfer and substituting the input won't change this instruction. Just // ignore the input and let someone else zap MDep. This handles cases like: // memcpy(a <- a) // memcpy(b <- a) if (M->getSource() == MDep->getSource()) return false; // Second, the length of the memcpy's must be the same, or the preceding one // must be larger than the following one. ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength()); ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength()); if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue()) return false; AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); // Verify that the copied-from memory doesn't change in between the two // transfers. For example, in: // memcpy(a <- b) // *b = 42; // memcpy(c <- a) // It would be invalid to transform the second memcpy into memcpy(c <- b). // // TODO: If the code between M and MDep is transparent to the destination "c", // then we could still perform the xform by moving M up to the first memcpy. // // NOTE: This is conservative, it will stop on any read from the source loc, // not just the defining memcpy. MemDepResult SourceDep = MD->getPointerDependencyFrom(AA.getLocationForSource(MDep), false, M, M->getParent()); if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) return false; // If the dest of the second might alias the source of the first, then the // source and dest might overlap. We still want to eliminate the intermediate // value, but we have to generate a memmove instead of memcpy. bool UseMemMove = false; if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(MDep))) UseMemMove = true; // If all checks passed, then we can transform M. // Make sure to use the lesser of the alignment of the source and the dest // since we're changing where we're reading from, but don't want to increase // the alignment past what can be read from or written to. // TODO: Is this worth it if we're creating a less aligned memcpy? For // example we could be moving from movaps -> movq on x86. unsigned Align = std::min(MDep->getAlignment(), M->getAlignment()); IRBuilder<> Builder(M); if (UseMemMove) Builder.CreateMemMove(M->getRawDest(), MDep->getRawSource(), M->getLength(), Align, M->isVolatile()); else Builder.CreateMemCpy(M->getRawDest(), MDep->getRawSource(), M->getLength(), Align, M->isVolatile()); // Remove the instruction we're replacing. MD->removeInstruction(M); M->eraseFromParent(); ++NumMemCpyInstr; return true; }
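A source-level sketch of the memcpy forwarding above, including the case where the rewrite has to fall back to memmove (buffer names invented for the example):

#include <string.h>

void before(char *a, char *b, char *c) {
  memcpy(a, b, 64);   /* MDep: a <- b */
  memcpy(c, a, 64);   /* M:    c <- a, with b unchanged in between */
}

void after(char *a, char *b, char *c) {
  memcpy(a, b, 64);   /* left for DSE to remove if a becomes dead */
  memcpy(c, b, 64);   /* read directly from the original source */
  /* If c might alias b, the pass emits memmove(c, b, 64) instead. */
}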
bool DSE::runOnBasicBlock(BasicBlock &BB) { bool MadeChange = false; // Do a top-down walk on the BB. for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) { Instruction *Inst = BBI++; // Handle 'free' calls specially. if (CallInst *F = isFreeCall(Inst)) { MadeChange |= HandleFree(F); continue; } // If we find something that writes memory, get its memory dependence. if (!hasMemoryWrite(Inst)) continue; MemDepResult InstDep = MD->getDependency(Inst); // Ignore non-local store liveness. // FIXME: cross-block DSE would be fun. :) if (InstDep.isNonLocal() || // Ignore self dependence, which happens in the entry block of the // function. InstDep.getInst() == Inst) continue; // If we're storing the same value back to a pointer that we just // loaded from, then the store can be removed. if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { if (LoadInst *DepLoad = dyn_cast<LoadInst>(InstDep.getInst())) { if (SI->getPointerOperand() == DepLoad->getPointerOperand() && SI->getOperand(0) == DepLoad && !SI->isVolatile()) { DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n " << "LOAD: " << *DepLoad << "\n STORE: " << *SI << '\n'); // DeleteDeadInstruction can delete the current instruction. Save BBI // in case we need it. WeakVH NextInst(BBI); DeleteDeadInstruction(SI, *MD); if (NextInst == 0) // Next instruction deleted. BBI = BB.begin(); else if (BBI != BB.begin()) // Revisit this instruction if possible. --BBI; ++NumFastStores; MadeChange = true; continue; } } } // Figure out what location is being stored to. AliasAnalysis::Location Loc = getLocForWrite(Inst, *AA); // If we didn't get a useful location, fail. if (Loc.Ptr == 0) continue; while (!InstDep.isNonLocal()) { // Get the memory clobbered by the instruction we depend on. MemDep will // skip any instructions that 'Loc' clearly doesn't interact with. If we // end up depending on a may- or must-aliased load, then we can't optimize // away the store and we bail out. However, if we depend on something // that overwrites the memory location we *can* potentially optimize it. // // Find out what memory location the dependent instruction stores. Instruction *DepWrite = InstDep.getInst(); AliasAnalysis::Location DepLoc = getLocForWrite(DepWrite, *AA); // If we didn't get a useful location, or if it isn't a known size, bail out. if (DepLoc.Ptr == 0) break; // If we find a write that is a) removable (i.e., non-volatile), b) is // completely obliterated by the store to 'Loc', and c) which we know that // 'Inst' doesn't load from, then we can remove it. if (isRemovable(DepWrite) && isCompleteOverwrite(Loc, DepLoc, *AA) && !isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) { DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite << "\n KILLER: " << *Inst << '\n'); // Delete the store and now-dead instructions that feed it. DeleteDeadInstruction(DepWrite, *MD); ++NumFastStores; MadeChange = true; // DeleteDeadInstruction can delete the current instruction in loop // cases, reset BBI. BBI = Inst; if (BBI != BB.begin()) --BBI; break; } // If this is a may-aliased store that is clobbering the store value, we // can keep searching past it for another must-aliased pointer that stores // to the same location. For example, in: // store -> P // store -> Q // store -> P // we can remove the first store to P even though we don't know if P and Q // alias. if (DepWrite == &BB.front()) break; // Can't look past this instruction if it might read 'Loc'.
if (AA->getModRefInfo(DepWrite, Loc) & AliasAnalysis::Ref) break; InstDep = MD->getPointerDependencyFrom(Loc, false, DepWrite, &BB); } } // If this block ends in a return, unwind, or unreachable, all allocas are // dead at its end, which means stores to them are also dead. if (BB.getTerminator()->getNumSuccessors() == 0) MadeChange |= handleEndBlock(BB); return MadeChange; }
bool MemDepPrinter::runOnFunction(Function &F) { this->F = &F; AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); MemoryDependenceAnalysis &MDA = getAnalysis<MemoryDependenceAnalysis>(); // All this code uses non-const interfaces because MemDep is not // const-friendly, though nothing is actually modified. for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) { Instruction *Inst = &*I; if (!Inst->mayReadFromMemory() && !Inst->mayWriteToMemory()) continue; MemDepResult Res = MDA.getDependency(Inst); if (!Res.isNonLocal()) { Deps[Inst].insert(std::make_pair(getInstTypePair(Res), static_cast<BasicBlock *>(nullptr))); } else if (CallSite CS = cast<Value>(Inst)) { const MemoryDependenceAnalysis::NonLocalDepInfo &NLDI = MDA.getNonLocalCallDependency(CS); DepSet &InstDeps = Deps[Inst]; for (MemoryDependenceAnalysis::NonLocalDepInfo::const_iterator I = NLDI.begin(), E = NLDI.end(); I != E; ++I) { const MemDepResult &Res = I->getResult(); InstDeps.insert(std::make_pair(getInstTypePair(Res), I->getBB())); } } else { SmallVector<NonLocalDepResult, 4> NLDI; if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { if (!LI->isUnordered()) { // FIXME: Handle atomic/volatile loads. Deps[Inst].insert(std::make_pair(getInstTypePair(nullptr, Unknown), static_cast<BasicBlock *>(nullptr))); continue; } AliasAnalysis::Location Loc = AA.getLocation(LI); MDA.getNonLocalPointerDependency(Loc, true, LI->getParent(), NLDI); } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { if (!SI->isUnordered()) { // FIXME: Handle atomic/volatile stores. Deps[Inst].insert(std::make_pair(getInstTypePair(nullptr, Unknown), static_cast<BasicBlock *>(nullptr))); continue; } AliasAnalysis::Location Loc = AA.getLocation(SI); MDA.getNonLocalPointerDependency(Loc, false, SI->getParent(), NLDI); } else if (VAArgInst *VI = dyn_cast<VAArgInst>(Inst)) { AliasAnalysis::Location Loc = AA.getLocation(VI); MDA.getNonLocalPointerDependency(Loc, false, VI->getParent(), NLDI); } else { llvm_unreachable("Unknown memory instruction!"); } DepSet &InstDeps = Deps[Inst]; for (SmallVectorImpl<NonLocalDepResult>::const_iterator I = NLDI.begin(), E = NLDI.end(); I != E; ++I) { const MemDepResult &Res = I->getResult(); InstDeps.insert(std::make_pair(getInstTypePair(Res), I->getBB())); } } } return false; }
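The printer distinguishes local from non-local results. As a conceptual C++ analogue (invented names; after lowering to IR the branches become separate basic blocks), a load whose defining store sits in the same block yields a single local result, while a load whose possible definitions live in predecessor blocks is reported per-block as non-local.

int localDependence(int *p) {
  *p = 1;        // defining store in the same basic block
  return *p;     // the query stops here: a single local Def result
}

int nonLocalDependence(int *p, bool flag) {
  if (flag)
    *p = 1;      // possible definition in one predecessor block
  else
    *p = 2;      // possible definition in the other predecessor block
  return *p;     // non-local: one result is reported per predecessor block
}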
/// getNonLocalPointerDepFromBB - Perform a dependency query based on /// pointer/pointeesize starting at the end of StartBB. Add any clobber/def /// results to the results vector and keep track of which blocks are visited in /// 'Visited'. /// /// This has special behavior for the first block queries (when SkipFirstBlock /// is true). In this special case, it ignores the contents of the specified /// block and starts returning dependence info for its predecessors. /// /// This function returns false on success, or true to indicate that it could /// not compute dependence information for some reason. This should be treated /// as a clobber dependence on the first instruction in the predecessor block. bool MemoryDependenceAnalysis:: getNonLocalPointerDepFromBB(Value *Pointer, uint64_t PointeeSize, bool isLoad, BasicBlock *StartBB, SmallVectorImpl<NonLocalDepEntry> &Result, DenseMap<BasicBlock*, Value*> &Visited, bool SkipFirstBlock) { // Look up the cached info for Pointer. ValueIsLoadPair CacheKey(Pointer, isLoad); std::pair<BBSkipFirstBlockPair, NonLocalDepInfo> *CacheInfo = &NonLocalPointerDeps[CacheKey]; NonLocalDepInfo *Cache = &CacheInfo->second; // If we have valid cached information for exactly the block we are // investigating, just return it with no recomputation. if (CacheInfo->first == BBSkipFirstBlockPair(StartBB, SkipFirstBlock)) { // We have a fully cached result for this query then we can just return the // cached results and populate the visited set. However, we have to verify // that we don't already have conflicting results for these blocks. Check // to ensure that if a block in the results set is in the visited set that // it was for the same pointer query. if (!Visited.empty()) { for (NonLocalDepInfo::iterator I = Cache->begin(), E = Cache->end(); I != E; ++I) { DenseMap<BasicBlock*, Value*>::iterator VI = Visited.find(I->first); if (VI == Visited.end() || VI->second == Pointer) continue; // We have a pointer mismatch in a block. Just return clobber, saying // that something was clobbered in this result. We could also do a // non-fully cached query, but there is little point in doing this. return true; } } for (NonLocalDepInfo::iterator I = Cache->begin(), E = Cache->end(); I != E; ++I) { Visited.insert(std::make_pair(I->first, Pointer)); if (!I->second.isNonLocal()) Result.push_back(*I); } ++NumCacheCompleteNonLocalPtr; return false; } // Otherwise, either this is a new block, a block with an invalid cache // pointer or one that we're about to invalidate by putting more info into it // than its valid cache info. If empty, the result will be valid cache info, // otherwise it isn't. if (Cache->empty()) CacheInfo->first = BBSkipFirstBlockPair(StartBB, SkipFirstBlock); else CacheInfo->first = BBSkipFirstBlockPair(); SmallVector<BasicBlock*, 32> Worklist; Worklist.push_back(StartBB); // Keep track of the entries that we know are sorted. Previously cached // entries will all be sorted. The entries we add we only sort on demand (we // don't insert every element into its sorted position). We know that we // won't get any reuse from currently inserted values, because we don't // revisit blocks after we insert info for them. unsigned NumSortedEntries = Cache->size(); DEBUG(AssertSorted(*Cache)); while (!Worklist.empty()) { BasicBlock *BB = Worklist.pop_back_val(); // Skip the first block if we have it. if (!SkipFirstBlock) { // Analyze the dependency of *Pointer in FromBB. See if we already have // been here. 
assert(Visited.count(BB) && "Should check 'visited' before adding to WL"); // Get the dependency info for Pointer in BB. If we have cached // information, we will use it, otherwise we compute it. DEBUG(AssertSorted(*Cache, NumSortedEntries)); MemDepResult Dep = GetNonLocalInfoForBlock(Pointer, PointeeSize, isLoad, BB, Cache, NumSortedEntries); // If we got a Def or Clobber, add this to the list of results. if (!Dep.isNonLocal()) { Result.push_back(NonLocalDepEntry(BB, Dep)); continue; } } // If 'Pointer' is an instruction defined in this block, then we need to do // phi translation to change it into a value live in the predecessor block. // If phi translation fails, then we can't continue dependence analysis. Instruction *PtrInst = dyn_cast<Instruction>(Pointer); bool NeedsPHITranslation = PtrInst && PtrInst->getParent() == BB; // If no PHI translation is needed, just add all the predecessors of this // block to scan them as well. if (!NeedsPHITranslation) { SkipFirstBlock = false; for (BasicBlock **PI = PredCache->GetPreds(BB); *PI; ++PI) { // Verify that we haven't looked at this block yet. std::pair<DenseMap<BasicBlock*,Value*>::iterator, bool> InsertRes = Visited.insert(std::make_pair(*PI, Pointer)); if (InsertRes.second) { // First time we've looked at *PI. Worklist.push_back(*PI); continue; } // If we have seen this block before, but it was with a different // pointer then we have a phi translation failure and we have to treat // this as a clobber. if (InsertRes.first->second != Pointer) goto PredTranslationFailure; } continue; } // If we do need to do phi translation, then there are a bunch of different // cases, because we have to find a Value* live in the predecessor block. We // know that PtrInst is defined in this block at least. // If this is directly a PHI node, just use the incoming values for each // pred as the phi translated version. if (PHINode *PtrPHI = dyn_cast<PHINode>(PtrInst)) { for (BasicBlock **PI = PredCache->GetPreds(BB); *PI; ++PI) { BasicBlock *Pred = *PI; Value *PredPtr = PtrPHI->getIncomingValueForBlock(Pred); // Check to see if we have already visited this pred block with another // pointer. If so, we can't do this lookup. This failure can occur // with PHI translation when a critical edge exists and the PHI node in // the successor translates to a pointer value different than the // pointer the block was first analyzed with. std::pair<DenseMap<BasicBlock*,Value*>::iterator, bool> InsertRes = Visited.insert(std::make_pair(Pred, PredPtr)); if (!InsertRes.second) { // If the predecessor was visited with PredPtr, then we already did // the analysis and can ignore it. if (InsertRes.first->second == PredPtr) continue; // Otherwise, the block was previously analyzed with a different // pointer. We can't represent the result of this case, so we just // treat this as a phi translation failure. goto PredTranslationFailure; } // We may have added values to the cache list before this PHI // translation. If so, we haven't done anything to ensure that the // cache remains sorted. Sort it now (if needed) so that recursive // invocations of getNonLocalPointerDepFromBB that could reuse the cache // value will only see properly sorted cache arrays. if (Cache && NumSortedEntries != Cache->size()) std::sort(Cache->begin(), Cache->end()); Cache = 0; // FIXME: it is entirely possible that PHI translating will end up with // the same value. Consider PHI translating something like: // X = phi [x, bb1], [y, bb2]. 
PHI translating for bb1 doesn't *need* // to recurse here, pedantically speaking. // If we have a problem phi translating, fall through to the code below // to handle the failure condition. if (getNonLocalPointerDepFromBB(PredPtr, PointeeSize, isLoad, Pred, Result, Visited)) goto PredTranslationFailure; } // Refresh the CacheInfo/Cache pointer so that it isn't invalidated. CacheInfo = &NonLocalPointerDeps[CacheKey]; Cache = &CacheInfo->second; NumSortedEntries = Cache->size(); // Since we did phi translation, the "Cache" set won't contain all of the // results for the query. This is ok (we can still use it to accelerate // specific block queries) but we can't do the fastpath "return all // results from the set" Clear out the indicator for this. CacheInfo->first = BBSkipFirstBlockPair(); SkipFirstBlock = false; continue; } // TODO: BITCAST, GEP. // cerr << "MEMDEP: Could not PHI translate: " << *Pointer; // if (isa<BitCastInst>(PtrInst) || isa<GetElementPtrInst>(PtrInst)) // cerr << "OP:\t\t\t\t" << *PtrInst->getOperand(0); PredTranslationFailure: if (Cache == 0) { // Refresh the CacheInfo/Cache pointer if it got invalidated. CacheInfo = &NonLocalPointerDeps[CacheKey]; Cache = &CacheInfo->second; NumSortedEntries = Cache->size(); } else if (NumSortedEntries != Cache->size()) { std::sort(Cache->begin(), Cache->end()); NumSortedEntries = Cache->size(); } // Since we did phi translation, the "Cache" set won't contain all of the // results for the query. This is ok (we can still use it to accelerate // specific block queries) but we can't do the fastpath "return all // results from the set" Clear out the indicator for this. CacheInfo->first = BBSkipFirstBlockPair(); // If *nothing* works, mark the pointer as being clobbered by the first // instruction in this block. // // If this is the magic first block, return this as a clobber of the whole // incoming value. Since we can't phi translate to one of the predecessors, // we have to bail out. if (SkipFirstBlock) return true; for (NonLocalDepInfo::reverse_iterator I = Cache->rbegin(); ; ++I) { assert(I != Cache->rend() && "Didn't find current block??"); if (I->first != BB) continue; assert(I->second.isNonLocal() && "Should only be here with transparent block"); I->second = MemDepResult::getClobber(BB->begin()); ReverseNonLocalPtrDeps[BB->begin()].insert(CacheKey.getOpaqueValue()); Result.push_back(*I); break; } } // Okay, we're done now. If we added new values to the cache, re-sort it. switch (Cache->size()-NumSortedEntries) { case 0: // done, no new entries. break; case 2: { // Two new entries, insert the last one into place. NonLocalDepEntry Val = Cache->back(); Cache->pop_back(); NonLocalDepInfo::iterator Entry = std::upper_bound(Cache->begin(), Cache->end()-1, Val); Cache->insert(Entry, Val); // FALL THROUGH. } case 1: // One new entry, Just insert the new value at the appropriate position. if (Cache->size() != 1) { NonLocalDepEntry Val = Cache->back(); Cache->pop_back(); NonLocalDepInfo::iterator Entry = std::upper_bound(Cache->begin(), Cache->end(), Val); Cache->insert(Entry, Val); } break; default: // Added many values, do a full scale sort. std::sort(Cache->begin(), Cache->end()); } DEBUG(AssertSorted(*Cache)); return false; }
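The PHI-translation case handled above corresponds, at the source level, to querying a pointer that is itself merged from different incoming pointers. A minimal hypothetical example (invented names; the ternary becomes a PHI node in IR):

int phiTranslatedQuery(int *a, int *b, bool flag) {
  int *p = flag ? a : b;  // 'p' is a PHI of 'a' and 'b' after lowering
  return *p;              // continuing the walk into each predecessor requires
                          // translating 'p' back to 'a' or 'b' respectively
}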
bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (SI->isVolatile()) return false; if (TD == 0) return false; // Detect cases where we're performing call slot forwarding, but // happen to be using a load-store pair to implement it, rather than // a memcpy. if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) { if (!LI->isVolatile() && LI->hasOneUse()) { MemDepResult ldep = MD->getDependency(LI); CallInst *C = 0; if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst())) C = dyn_cast<CallInst>(ldep.getInst()); if (C) { // Check that nothing touches the dest of the "copy" between // the call and the store. MemDepResult sdep = MD->getDependency(SI); if (!sdep.isNonLocal()) { bool FoundCall = false; for (BasicBlock::iterator I = SI, E = sdep.getInst(); I != E; --I) { if (&*I == C) { FoundCall = true; break; } } if (!FoundCall) C = 0; } } if (C) { bool changed = performCallSlotOptzn(LI, SI->getPointerOperand()->stripPointerCasts(), LI->getPointerOperand()->stripPointerCasts(), TD->getTypeStoreSize(SI->getOperand(0)->getType()), C); if (changed) { MD->removeInstruction(SI); SI->eraseFromParent(); MD->removeInstruction(LI); LI->eraseFromParent(); ++NumMemCpyInstr; return true; } } } } // There are two cases that are interesting for this code to handle: memcpy // and memset. Right now we only handle memset. // Ensure that the value being stored is something that can be memset'able a // byte at a time like "0" or "-1" or any width, as well as things like // 0xA0A0A0A0 and 0.0. if (Value *ByteVal = isBytewiseValue(SI->getOperand(0))) if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal)) { BBI = I; // Don't invalidate iterator. return true; } return false; }
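A hedged source-level sketch of the load/store call-slot pattern this routine matches (all names invented; the real code additionally verifies, inside performCallSlotOptzn, that writing straight into the destination is safe):

struct Buffer { int data[16]; };

// Stand-in for a callee that fills *out and does not capture it (assumption).
void produce(Buffer *out) { out->data[0] = 42; }

void beforeCallSlot(Buffer *dest) {
  Buffer tmp;
  produce(&tmp);
  *dest = tmp;     // lowered to the load/store pair the pass looks for
}

void afterCallSlot(Buffer *dest) {
  produce(dest);   // call-slot forwarding: the callee writes into 'dest' directly
}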
/// removeInstruction - Remove an instruction from the dependence analysis, /// updating the dependence of instructions that previously depended on it. /// This method attempts to keep the cache coherent using the reverse map. void MemoryDependenceAnalysis::removeInstruction(Instruction *RemInst) { // Walk through the Non-local dependencies, removing this one as the value // for any cached queries. NonLocalDepMapType::iterator NLDI = NonLocalDeps.find(RemInst); if (NLDI != NonLocalDeps.end()) { NonLocalDepInfo &BlockMap = NLDI->second.first; for (NonLocalDepInfo::iterator DI = BlockMap.begin(), DE = BlockMap.end(); DI != DE; ++DI) if (Instruction *Inst = DI->second.getInst()) RemoveFromReverseMap(ReverseNonLocalDeps, Inst, RemInst); NonLocalDeps.erase(NLDI); } // If we have a cached local dependence query for this instruction, remove it. // LocalDepMapType::iterator LocalDepEntry = LocalDeps.find(RemInst); if (LocalDepEntry != LocalDeps.end()) { // Remove us from DepInst's reverse set now that the local dep info is gone. if (Instruction *Inst = LocalDepEntry->second.getInst()) RemoveFromReverseMap(ReverseLocalDeps, Inst, RemInst); // Remove this local dependency info. LocalDeps.erase(LocalDepEntry); } // If we have any cached pointer dependencies on this instruction, remove // them. If the instruction has non-pointer type, then it can't be a pointer // base. // Remove it from both the load info and the store info. The instruction // can't be in either of these maps if it is non-pointer. if (isa<PointerType>(RemInst->getType())) { RemoveCachedNonLocalPointerDependencies(ValueIsLoadPair(RemInst, false)); RemoveCachedNonLocalPointerDependencies(ValueIsLoadPair(RemInst, true)); } // Loop over all of the things that depend on the instruction we're removing. // SmallVector<std::pair<Instruction*, Instruction*>, 8> ReverseDepsToAdd; // If we find RemInst as a clobber or Def in any of the maps for other values, // we need to replace its entry with a dirty version of the instruction after // it. If RemInst is a terminator, we use a null dirty value. // // Using a dirty version of the instruction after RemInst saves having to scan // the entire block to get to this point. MemDepResult NewDirtyVal; if (!RemInst->isTerminator()) NewDirtyVal = MemDepResult::getDirty(++BasicBlock::iterator(RemInst)); ReverseDepMapType::iterator ReverseDepIt = ReverseLocalDeps.find(RemInst); if (ReverseDepIt != ReverseLocalDeps.end()) { SmallPtrSet<Instruction*, 4> &ReverseDeps = ReverseDepIt->second; // RemInst can't be the terminator if it has local stuff depending on it. assert(!ReverseDeps.empty() && !isa<TerminatorInst>(RemInst) && "Nothing can locally depend on a terminator"); for (SmallPtrSet<Instruction*, 4>::iterator I = ReverseDeps.begin(), E = ReverseDeps.end(); I != E; ++I) { Instruction *InstDependingOnRemInst = *I; assert(InstDependingOnRemInst != RemInst && "Already removed our local dep info"); LocalDeps[InstDependingOnRemInst] = NewDirtyVal; // Make sure to remember that new things depend on NewDepInst. assert(NewDirtyVal.getInst() && "There is no way something else can have " "a local dep on this if it is a terminator!"); ReverseDepsToAdd.push_back(std::make_pair(NewDirtyVal.getInst(), InstDependingOnRemInst)); } ReverseLocalDeps.erase(ReverseDepIt); // Add new reverse deps after scanning the set, to avoid invalidating the // 'ReverseDeps' reference. 
while (!ReverseDepsToAdd.empty()) { ReverseLocalDeps[ReverseDepsToAdd.back().first] .insert(ReverseDepsToAdd.back().second); ReverseDepsToAdd.pop_back(); } } ReverseDepIt = ReverseNonLocalDeps.find(RemInst); if (ReverseDepIt != ReverseNonLocalDeps.end()) { SmallPtrSet<Instruction*, 4> &Set = ReverseDepIt->second; for (SmallPtrSet<Instruction*, 4>::iterator I = Set.begin(), E = Set.end(); I != E; ++I) { assert(*I != RemInst && "Already removed NonLocalDep info for RemInst"); PerInstNLInfo &INLD = NonLocalDeps[*I]; // The information is now dirty! INLD.second = true; for (NonLocalDepInfo::iterator DI = INLD.first.begin(), DE = INLD.first.end(); DI != DE; ++DI) { if (DI->second.getInst() != RemInst) continue; // Convert to a dirty entry for the subsequent instruction. DI->second = NewDirtyVal; if (Instruction *NextI = NewDirtyVal.getInst()) ReverseDepsToAdd.push_back(std::make_pair(NextI, *I)); } } ReverseNonLocalDeps.erase(ReverseDepIt); // Add new reverse deps after scanning the set, to avoid invalidating 'Set' while (!ReverseDepsToAdd.empty()) { ReverseNonLocalDeps[ReverseDepsToAdd.back().first] .insert(ReverseDepsToAdd.back().second); ReverseDepsToAdd.pop_back(); } } // If the instruction is in ReverseNonLocalPtrDeps then it appears as a // value in the NonLocalPointerDeps info. ReverseNonLocalPtrDepTy::iterator ReversePtrDepIt = ReverseNonLocalPtrDeps.find(RemInst); if (ReversePtrDepIt != ReverseNonLocalPtrDeps.end()) { SmallPtrSet<void*, 4> &Set = ReversePtrDepIt->second; SmallVector<std::pair<Instruction*, ValueIsLoadPair>,8> ReversePtrDepsToAdd; for (SmallPtrSet<void*, 4>::iterator I = Set.begin(), E = Set.end(); I != E; ++I) { ValueIsLoadPair P; P.setFromOpaqueValue(*I); assert(P.getPointer() != RemInst && "Already removed NonLocalPointerDeps info for RemInst"); NonLocalDepInfo &NLPDI = NonLocalPointerDeps[P].second; // The cache is not valid for any specific block anymore. NonLocalPointerDeps[P].first = BBSkipFirstBlockPair(); // Update any entries for RemInst to use the instruction after it. for (NonLocalDepInfo::iterator DI = NLPDI.begin(), DE = NLPDI.end(); DI != DE; ++DI) { if (DI->second.getInst() != RemInst) continue; // Convert to a dirty entry for the subsequent instruction. DI->second = NewDirtyVal; if (Instruction *NewDirtyInst = NewDirtyVal.getInst()) ReversePtrDepsToAdd.push_back(std::make_pair(NewDirtyInst, P)); } // Re-sort the NonLocalDepInfo. Changing the dirty entry to its // subsequent value may invalidate the sortedness. std::sort(NLPDI.begin(), NLPDI.end()); } ReverseNonLocalPtrDeps.erase(ReversePtrDepIt); while (!ReversePtrDepsToAdd.empty()) { ReverseNonLocalPtrDeps[ReversePtrDepsToAdd.back().first] .insert(ReversePtrDepsToAdd.back().second.getOpaqueValue()); ReversePtrDepsToAdd.pop_back(); } } assert(!NonLocalDeps.count(RemInst) && "RemInst got reinserted?"); AA->deleteValue(RemInst); DEBUG(verifyRemoved(RemInst)); }
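The bookkeeping above is essentially a pair of maps kept in sync: a forward cache from each instruction to its dependency, and reverse maps from a dependency back to everyone who cached it, so removal can dirty exactly the affected entries instead of rescanning blocks. The following is only a minimal sketch of that shape with invented types, not LLVM's actual data structures:

#include <map>
#include <set>
#include <string>

struct DepCache {
  std::map<std::string, std::string> Deps;                  // user -> dependee
  std::map<std::string, std::set<std::string>> ReverseDeps; // dependee -> users

  void record(const std::string &User, const std::string &Dependee) {
    Deps[User] = Dependee;
    ReverseDeps[Dependee].insert(User);
  }

  void remove(const std::string &Removed) {
    // Anything that depended on Removed now holds a stale ("dirty") entry.
    auto It = ReverseDeps.find(Removed);
    if (It != ReverseDeps.end()) {
      for (const std::string &User : It->second)
        Deps[User] = "<dirty>";  // placeholder for a dependence to recompute
      ReverseDeps.erase(It);
    }
    Deps.erase(Removed);         // drop Removed's own cached query, if any
  }
};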
void DataDependence::processDepResult(Instruction *inst,
    MemoryDependenceAnalysis &MDA, AliasAnalysis &AA) {
  // TODO: This is probably a good place to check if the dependency
  // information is calculated on-demand
  MemDepResult Res = MDA.getDependency(inst);

  if (!Res.isNonLocal()) {
    // Local results (i.e. not non-local) can be handled simply. They are just
    // a pair of instructions and a dependency type.

    // Get dependency information
    DepInfo newInfo;
    newInfo = getDepInfo(Res);

#ifdef MK_DEBUG
    //errs() << "[DEBUG] newInfo depInst == " << Res.getInst() << '\n';
    if (Res.getInst() == NULL) {
      errs() << "[DEBUG] NULL dependency found, dep type: "
             << depTypeToString(newInfo.Type_) << '\n';
    }
#endif

    // Save into map
    assert(newInfo.valid());
    LocalDeps_[inst] = newInfo;
  }
  else {
    // Handle NonLocal dependencies. The function call
    // getNonLocalPointerDependency() assumes that a result of NonLocal
    // has already been encountered

    // Get dependency information
    DepInfo newInfo;
    newInfo = getDepInfo(Res);

    assert(newInfo.Type_ == NonLocal);
    assert(Res.isNonLocal());

    SmallVector<NonLocalDepResult, 4> NLDep;
    if (LoadInst *LI = dyn_cast<LoadInst>(inst)) {
      if (!LI->isUnordered()) {
        // FIXME: Handle atomic/volatile loads.
        errs() << "[WARNING] atomic/volatile loads are not handled\n";
        assert(false && "atomic/volatile loads not handled");
        //Deps[Inst].insert(std::make_pair(getInstTypePair(0, Unknown),
        //static_cast<BasicBlock *>(0)));
        return;
      }
      AliasAnalysis::Location Loc = AA.getLocation(LI);
      MDA.getNonLocalPointerDependency(Loc, true, LI->getParent(), NLDep);
    }
    else if (StoreInst *SI = dyn_cast<StoreInst>(inst)) {
      if (!SI->isUnordered()) {
        // FIXME: Handle atomic/volatile stores.
        errs() << "[WARNING] atomic/volatile stores are not handled\n";
        assert(false && "atomic/volatile stores not handled");
        //Deps[Inst].insert(std::make_pair(getInstTypePair(0, Unknown),
        //static_cast<BasicBlock *>(0)));
        return;
      }
      AliasAnalysis::Location Loc = AA.getLocation(SI);
      MDA.getNonLocalPointerDependency(Loc, false, SI->getParent(), NLDep);
    }
    else if (VAArgInst *VI = dyn_cast<VAArgInst>(inst)) {
      AliasAnalysis::Location Loc = AA.getLocation(VI);
      MDA.getNonLocalPointerDependency(Loc, false, VI->getParent(), NLDep);
    }
    else {
      llvm_unreachable("Unknown memory instruction!");
    }

#ifdef MK_DEBUG
    errs() << "[DEBUG] NLDep.size() == " << NLDep.size() << '\n';
#endif
    for (auto I = NLDep.begin(), E = NLDep.end(); I != E; ++I) {
      NonLocalDeps_[inst].push_back(*I);
    }
  } // end else
}
bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (!SI->isSimple()) return false; if (TD == 0) return false; // Detect cases where we're performing call slot forwarding, but // happen to be using a load-store pair to implement it, rather than // a memcpy. if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) { if (LI->isSimple() && LI->hasOneUse() && LI->getParent() == SI->getParent()) { MemDepResult ldep = MD->getDependency(LI); CallInst *C = 0; if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst())) C = dyn_cast<CallInst>(ldep.getInst()); if (C) { // Check that nothing touches the dest of the "copy" between // the call and the store. AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); AliasAnalysis::Location StoreLoc = AA.getLocation(SI); for (BasicBlock::iterator I = --BasicBlock::iterator(SI), E = C; I != E; --I) { if (AA.getModRefInfo(&*I, StoreLoc) != AliasAnalysis::NoModRef) { C = 0; break; } } } if (C) { unsigned storeAlign = SI->getAlignment(); if (!storeAlign) storeAlign = TD->getABITypeAlignment(SI->getOperand(0)->getType()); unsigned loadAlign = LI->getAlignment(); if (!loadAlign) loadAlign = TD->getABITypeAlignment(LI->getType()); bool changed = performCallSlotOptzn(LI, SI->getPointerOperand()->stripPointerCasts(), LI->getPointerOperand()->stripPointerCasts(), TD->getTypeStoreSize(SI->getOperand(0)->getType()), std::min(storeAlign, loadAlign), C); if (changed) { MD->removeInstruction(SI); SI->eraseFromParent(); MD->removeInstruction(LI); LI->eraseFromParent(); ++NumMemCpyInstr; return true; } } } } // There are two cases that are interesting for this code to handle: memcpy // and memset. Right now we only handle memset. // Ensure that the value being stored is something that can be memset'able a // byte at a time like "0" or "-1" or any width, as well as things like // 0xA0A0A0A0 and 0.0. if (Value *ByteVal = isBytewiseValue(SI->getOperand(0))) if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal)) { BBI = I; // Don't invalidate iterator. return true; } return false; }
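Compared with the earlier version of processStore, this one scans the instructions between the call and the store and gives up if any of them may read or write the store's destination. A hypothetical example of a blocked case (invented names):

struct Slot { int data[16]; };
void fill(Slot *out) { out->data[0] = 1; }  // stand-in callee (assumption)

int blockedCallSlot(Slot *dest) {
  Slot tmp;
  fill(&tmp);
  dest->data[0] = 7;  // touches the destination between the call and the copy,
                      // so the getModRefInfo scan above rejects the forwarding
  *dest = tmp;
  return dest->data[0];
}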
/// processMemCpy - perform simplification of memcpy's. If we have memcpy A
/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
/// B to be a memcpy from X to Z (or potentially a memmove, depending on
/// circumstances). This allows later passes to remove the first memcpy
/// altogether.
bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
  // We can only optimize non-volatile memcpy's.
  if (M->isVolatile()) return false;

  // If the source and destination of the memcpy are the same, then zap it.
  if (M->getSource() == M->getDest()) {
    MD->removeInstruction(M);
    M->eraseFromParent();
    return false;
  }

  // If copying from a constant, try to turn the memcpy into a memset.
  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource()))
    if (GV->isConstant() && GV->hasDefinitiveInitializer())
      if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) {
        IRBuilder<> Builder(M);
        Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
                             M->getAlignment(), false);
        MD->removeInstruction(M);
        M->eraseFromParent();
        ++NumCpyToSet;
        return true;
      }

  // The optimizations after this point require the memcpy size.
  ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
  if (CopySize == 0) return false;

  // There are three possible optimizations we can do for memcpy:
  //   a) memcpy-memcpy xform which exposes redundance for DSE.
  //   b) call-memcpy xform for return slot optimization.
  //   c) memcpy from freshly alloca'd space copies undefined data, and we can
  //      therefore eliminate the memcpy in favor of the data that was already
  //      at the destination.
  MemDepResult DepInfo = MD->getDependency(M);
  if (DepInfo.isClobber()) {
    if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
      if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
                               CopySize->getZExtValue(), M->getAlignment(),
                               C)) {
        MD->removeInstruction(M);
        M->eraseFromParent();
        return true;
      }
    }
  }

  AliasAnalysis::Location SrcLoc = AliasAnalysis::getLocationForSource(M);
  MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true,
                                                         M, M->getParent());
  if (SrcDepInfo.isClobber()) {
    if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
      return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue());
  } else if (SrcDepInfo.isDef()) {
    if (isa<AllocaInst>(SrcDepInfo.getInst())) {
      MD->removeInstruction(M);
      M->eraseFromParent();
      ++NumMemCpyInstr;
      return true;
    }
  }

  return false;
}
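The constant-global case added here folds a copy from a read-only, bytewise-uniform initializer into a memset. A hypothetical source-level analogue (invented names; the pass performs this on IR when isBytewiseValue accepts the initializer):

#include <cstring>

static const char kPadding[64] = {};  // constant global, every byte is zero

void beforeFold(char *dst) {
  std::memcpy(dst, kPadding, sizeof(kPadding));  // source is a constant global
}

void afterFold(char *dst) {
  std::memset(dst, 0, sizeof(kPadding));         // same bytes, no load from the global
}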
/// processByValArgument - This is called on every byval argument in call sites. bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { if (TD == 0) return false; // Find out what feeds this byval argument. Value *ByValArg = CS.getArgument(ArgNo); Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType(); uint64_t ByValSize = TD->getTypeAllocSize(ByValTy); MemDepResult DepInfo = MD->getPointerDependencyFrom(AliasAnalysis::Location(ByValArg, ByValSize), true, CS.getInstruction(), CS.getInstruction()->getParent()); if (!DepInfo.isClobber()) return false; // If the byval argument isn't fed by a memcpy, ignore it. If it is fed by // a memcpy, see if we can byval from the source of the memcpy instead of the // result. MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst()); if (MDep == 0 || MDep->isVolatile() || ByValArg->stripPointerCasts() != MDep->getDest()) return false; // The length of the memcpy must be larger or equal to the size of the byval. ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength()); if (C1 == 0 || C1->getValue().getZExtValue() < ByValSize) return false; // Get the alignment of the byval. If the call doesn't specify the alignment, // then it is some target specific value that we can't know. unsigned ByValAlign = CS.getParamAlignment(ArgNo+1); if (ByValAlign == 0) return false; // If it is greater than the memcpy, then we check to see if we can force the // source of the memcpy to the alignment we need. If we fail, we bail out. if (MDep->getAlignment() < ByValAlign && getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign, TD) < ByValAlign) return false; // Verify that the copied-from memory doesn't change in between the memcpy and // the byval call. // memcpy(a <- b) // *b = 42; // foo(*a) // It would be invalid to transform the second memcpy into foo(*b). // // NOTE: This is conservative, it will stop on any read from the source loc, // not just the defining memcpy. MemDepResult SourceDep = MD->getPointerDependencyFrom(AliasAnalysis::getLocationForSource(MDep), false, CS.getInstruction(), MDep->getParent()); if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) return false; Value *TmpCast = MDep->getSource(); if (MDep->getSource()->getType() != ByValArg->getType()) TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(), "tmpcast", CS.getInstruction()); DEBUG(dbgs() << "MemCpyOpt: Forwarding memcpy to byval:\n" << " " << *MDep << "\n" << " " << *CS.getInstruction() << "\n"); // Otherwise we're good! Update the byval argument. CS.setArgument(ArgNo, TmpCast); ++NumMemCpyInstr; return true; }
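At the source level, the byval case roughly corresponds to a struct that is first copied into a temporary and then passed by value; the transform points the byval argument at the original source instead. A hypothetical analogue (invented names; byval is an IR-level attribute, so this is only an approximation):

#include <cstring>

struct Big { int v[32]; };

int consume(Big b) { return b.v[0]; }   // passed indirectly (byval) on many ABIs (assumption)

int beforeForward(const Big *src) {
  Big tmp;
  std::memcpy(&tmp, src, sizeof(Big));  // the memcpy feeding the byval temporary
  return consume(tmp);
}

int afterForward(const Big *src) {
  return consume(*src);                 // the byval slot is now filled from 'src' directly
}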
bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { if (skipFunction(F)) return false; // TODO: Could probably handle variadic functions. if (F.isVarArg() || F.hasStructRetAttr() || AMDGPU::isEntryFunctionCC(F.getCallingConv())) return false; MDA = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(); unsigned ReturnNumRegs = 0; SmallSet<int, 4> OutArgIndexes; SmallVector<Type *, 4> ReturnTypes; Type *RetTy = F.getReturnType(); if (!RetTy->isVoidTy()) { ReturnNumRegs = DL->getTypeStoreSize(RetTy) / 4; if (ReturnNumRegs >= MaxNumRetRegs) return false; ReturnTypes.push_back(RetTy); } SmallVector<Argument *, 4> OutArgs; for (Argument &Arg : F.args()) { if (isOutArgumentCandidate(Arg)) { LLVM_DEBUG(dbgs() << "Found possible out argument " << Arg << " in function " << F.getName() << '\n'); OutArgs.push_back(&Arg); } } if (OutArgs.empty()) return false; using ReplacementVec = SmallVector<std::pair<Argument *, Value *>, 4>; DenseMap<ReturnInst *, ReplacementVec> Replacements; SmallVector<ReturnInst *, 4> Returns; for (BasicBlock &BB : F) { if (ReturnInst *RI = dyn_cast<ReturnInst>(&BB.back())) Returns.push_back(RI); } if (Returns.empty()) return false; bool Changing; do { Changing = false; // Keep retrying if we are able to successfully eliminate an argument. This // helps with cases with multiple arguments which may alias, such as in a // sincos implemntation. If we have 2 stores to arguments, on the first // attempt the MDA query will succeed for the second store but not the // first. On the second iteration we've removed that out clobbering argument // (by effectively moving it into another function) and will find the second // argument is OK to move. for (Argument *OutArg : OutArgs) { bool ThisReplaceable = true; SmallVector<std::pair<ReturnInst *, StoreInst *>, 4> ReplaceableStores; Type *ArgTy = OutArg->getType()->getPointerElementType(); // Skip this argument if converting it will push us over the register // count to return limit. // TODO: This is an approximation. When legalized this could be more. We // can ask TLI for exactly how many. unsigned ArgNumRegs = DL->getTypeStoreSize(ArgTy) / 4; if (ArgNumRegs + ReturnNumRegs > MaxNumRetRegs) continue; // An argument is convertible only if all exit blocks are able to replace // it. for (ReturnInst *RI : Returns) { BasicBlock *BB = RI->getParent(); MemDepResult Q = MDA->getPointerDependencyFrom(MemoryLocation(OutArg), true, BB->end(), BB, RI); StoreInst *SI = nullptr; if (Q.isDef()) SI = dyn_cast<StoreInst>(Q.getInst()); if (SI) { LLVM_DEBUG(dbgs() << "Found out argument store: " << *SI << '\n'); ReplaceableStores.emplace_back(RI, SI); } else { ThisReplaceable = false; break; } } if (!ThisReplaceable) continue; // Try the next argument candidate. for (std::pair<ReturnInst *, StoreInst *> Store : ReplaceableStores) { Value *ReplVal = Store.second->getValueOperand(); auto &ValVec = Replacements[Store.first]; if (llvm::find_if(ValVec, [OutArg](const std::pair<Argument *, Value *> &Entry) { return Entry.first == OutArg;}) != ValVec.end()) { LLVM_DEBUG(dbgs() << "Saw multiple out arg stores" << *OutArg << '\n'); // It is possible to see stores to the same argument multiple times, // but we expect these would have been optimized out already. 
ThisReplaceable = false; break; } ValVec.emplace_back(OutArg, ReplVal); Store.second->eraseFromParent(); } if (ThisReplaceable) { ReturnTypes.push_back(ArgTy); OutArgIndexes.insert(OutArg->getArgNo()); ++NumOutArgumentsReplaced; Changing = true; } } } while (Changing); if (Replacements.empty()) return false; LLVMContext &Ctx = F.getParent()->getContext(); StructType *NewRetTy = StructType::create(Ctx, ReturnTypes, F.getName()); FunctionType *NewFuncTy = FunctionType::get(NewRetTy, F.getFunctionType()->params(), F.isVarArg()); LLVM_DEBUG(dbgs() << "Computed new return type: " << *NewRetTy << '\n'); Function *NewFunc = Function::Create(NewFuncTy, Function::PrivateLinkage, F.getName() + ".body"); F.getParent()->getFunctionList().insert(F.getIterator(), NewFunc); NewFunc->copyAttributesFrom(&F); NewFunc->setComdat(F.getComdat()); // We want to preserve the function and param attributes, but need to strip // off any return attributes, e.g. zeroext doesn't make sense with a struct. NewFunc->stealArgumentListFrom(F); AttrBuilder RetAttrs; RetAttrs.addAttribute(Attribute::SExt); RetAttrs.addAttribute(Attribute::ZExt); RetAttrs.addAttribute(Attribute::NoAlias); NewFunc->removeAttributes(AttributeList::ReturnIndex, RetAttrs); // TODO: How to preserve metadata? // Move the body of the function into the new rewritten function, and replace // this function with a stub. NewFunc->getBasicBlockList().splice(NewFunc->begin(), F.getBasicBlockList()); for (std::pair<ReturnInst *, ReplacementVec> &Replacement : Replacements) { ReturnInst *RI = Replacement.first; IRBuilder<> B(RI); B.SetCurrentDebugLocation(RI->getDebugLoc()); int RetIdx = 0; Value *NewRetVal = UndefValue::get(NewRetTy); Value *RetVal = RI->getReturnValue(); if (RetVal) NewRetVal = B.CreateInsertValue(NewRetVal, RetVal, RetIdx++); for (std::pair<Argument *, Value *> ReturnPoint : Replacement.second) { Argument *Arg = ReturnPoint.first; Value *Val = ReturnPoint.second; Type *EltTy = Arg->getType()->getPointerElementType(); if (Val->getType() != EltTy) { Type *EffectiveEltTy = EltTy; if (StructType *CT = dyn_cast<StructType>(EltTy)) { assert(CT->getNumElements() == 1); EffectiveEltTy = CT->getElementType(0); } if (DL->getTypeSizeInBits(EffectiveEltTy) != DL->getTypeSizeInBits(Val->getType())) { assert(isVec3ToVec4Shuffle(EffectiveEltTy, Val->getType())); Val = B.CreateShuffleVector(Val, UndefValue::get(Val->getType()), { 0, 1, 2 }); } Val = B.CreateBitCast(Val, EffectiveEltTy); // Re-create single element composite. if (EltTy != EffectiveEltTy) Val = B.CreateInsertValue(UndefValue::get(EltTy), Val, 0); } NewRetVal = B.CreateInsertValue(NewRetVal, Val, RetIdx++); } if (RetVal) RI->setOperand(0, NewRetVal); else { B.CreateRet(NewRetVal); RI->eraseFromParent(); } } SmallVector<Value *, 16> StubCallArgs; for (Argument &Arg : F.args()) { if (OutArgIndexes.count(Arg.getArgNo())) { // It's easier to preserve the type of the argument list. We rely on // DeadArgumentElimination to take care of these. StubCallArgs.push_back(UndefValue::get(Arg.getType())); } else { StubCallArgs.push_back(&Arg); } } BasicBlock *StubBB = BasicBlock::Create(Ctx, "", &F); IRBuilder<> B(StubBB); CallInst *StubCall = B.CreateCall(NewFunc, StubCallArgs); int RetIdx = RetTy->isVoidTy() ? 
0 : 1; for (Argument &Arg : F.args()) { if (!OutArgIndexes.count(Arg.getArgNo())) continue; PointerType *ArgType = cast<PointerType>(Arg.getType()); auto *EltTy = ArgType->getElementType(); unsigned Align = Arg.getParamAlignment(); if (Align == 0) Align = DL->getABITypeAlignment(EltTy); Value *Val = B.CreateExtractValue(StubCall, RetIdx++); Type *PtrTy = Val->getType()->getPointerTo(ArgType->getAddressSpace()); // We can peek through bitcasts, so the type may not match. Value *PtrVal = B.CreateBitCast(&Arg, PtrTy); B.CreateAlignedStore(Val, PtrVal, Align); } if (!RetTy->isVoidTy()) { B.CreateRet(B.CreateExtractValue(StubCall, 0)); } else { B.CreateRetVoid(); } // The function is now a stub we want to inline. F.addFnAttr(Attribute::AlwaysInline); ++NumOutArgumentFunctionsReplaced; return true; }