/// processMemCpy - perform simplification of memcpy's. If we have memcpy A /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite /// B to be a memcpy from X to Z (or potentially a memmove, depending on /// circumstances). This allows later passes to remove the first memcpy /// altogether. bool MemCpyOpt::processMemCpy(MemCpyInst *M) { // We can only optimize statically-sized memcpy's that are non-volatile. ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength()); if (CopySize == 0 || M->isVolatile()) return false; // If the source and destination of the memcpy are the same, then zap it. if (M->getSource() == M->getDest()) { MD->removeInstruction(M); M->eraseFromParent(); return false; } // If copying from a constant, try to turn the memcpy into a memset. if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource())) if (GV->isConstant() && GV->hasDefinitiveInitializer()) if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) { IRBuilder<> Builder(M); Builder.CreateMemSet(M->getRawDest(), ByteVal, CopySize, M->getAlignment(), false); MD->removeInstruction(M); M->eraseFromParent(); ++NumCpyToSet; return true; } // The are two possible optimizations we can do for memcpy: // a) memcpy-memcpy xform which exposes redundance for DSE. // b) call-memcpy xform for return slot optimization. MemDepResult DepInfo = MD->getDependency(M); if (DepInfo.isClobber()) { if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) { if (performCallSlotOptzn(M, M->getDest(), M->getSource(), CopySize->getZExtValue(), M->getAlignment(), C)) { MD->removeInstruction(M); M->eraseFromParent(); return true; } } } AliasAnalysis::Location SrcLoc = AliasAnalysis::getLocationForSource(M); MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true, M, M->getParent()); if (SrcDepInfo.isClobber()) { if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst())) return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue()); } return false; }
/// processMemCpy - perform simplification of memcpy's. If we have memcpy A /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite /// B to be a memcpy from X to Z (or potentially a memmove, depending on /// circumstances). This allows later passes to remove the first memcpy /// altogether. bool MemCpyOpt::processMemCpy(MemCpyInst *M) { // We can only optimize statically-sized memcpy's that are non-volatile. ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength()); if (CopySize == 0 || M->isVolatile()) return false; // If the source and destination of the memcpy are the same, then zap it. if (M->getSource() == M->getDest()) { MD->removeInstruction(M); M->eraseFromParent(); return false; } // The are two possible optimizations we can do for memcpy: // a) memcpy-memcpy xform which exposes redundance for DSE. // b) call-memcpy xform for return slot optimization. MemDepResult DepInfo = MD->getDependency(M); if (!DepInfo.isClobber()) return false; if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst())) return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue()); if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) { if (performCallSlotOptzn(M, M->getDest(), M->getSource(), CopySize->getZExtValue(), C)) { M->eraseFromParent(); return true; } } return false; }
/// Handle frees of entire structures whose dependency is a store /// to a field of that structure. static bool handleFree(CallInst *F, AliasAnalysis *AA, MemoryDependenceResults *MD, DominatorTree *DT, const TargetLibraryInfo *TLI, InstOverlapIntervalsTy &IOL, DenseMap<Instruction*, size_t> *InstrOrdering) { bool MadeChange = false; MemoryLocation Loc = MemoryLocation(F->getOperand(0)); SmallVector<BasicBlock *, 16> Blocks; Blocks.push_back(F->getParent()); const DataLayout &DL = F->getModule()->getDataLayout(); while (!Blocks.empty()) { BasicBlock *BB = Blocks.pop_back_val(); Instruction *InstPt = BB->getTerminator(); if (BB == F->getParent()) InstPt = F; MemDepResult Dep = MD->getPointerDependencyFrom(Loc, false, InstPt->getIterator(), BB); while (Dep.isDef() || Dep.isClobber()) { Instruction *Dependency = Dep.getInst(); if (!hasMemoryWrite(Dependency, *TLI) || !isRemovable(Dependency)) break; Value *DepPointer = GetUnderlyingObject(getStoredPointerOperand(Dependency), DL); // Check for aliasing. if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) break; DEBUG(dbgs() << "DSE: Dead Store to soon to be freed memory:\n DEAD: " << *Dependency << '\n'); // DCE instructions only used to calculate that store. BasicBlock::iterator BBI(Dependency); deleteDeadInstruction(Dependency, &BBI, *MD, *TLI, IOL, InstrOrdering); ++NumFastStores; MadeChange = true; // Inst's old Dependency is now deleted. Compute the next dependency, // which may also be dead, as in // s[0] = 0; // s[1] = 0; // This has just been deleted. // free(s); Dep = MD->getPointerDependencyFrom(Loc, false, BBI, BB); } if (Dep.isNonLocal()) findUnconditionalPreds(Blocks, BB, DT); } return MadeChange; }
/// Translate a MemDepResult into this analysis' DepInfo, pairing the
/// result's instruction (as returned by getInst()) with the matching
/// dependence kind. Aborts via llvm_unreachable if none of the known
/// predicates holds.
DataDependence::DepInfo DataDependence::getDepInfo(MemDepResult dep) {
  // The predicates are tested in a fixed order; the first match wins.
  if (dep.isClobber()) {
    return DepInfo(dep.getInst(), Clobber);
  } else if (dep.isDef()) {
    return DepInfo(dep.getInst(), Def);
  } else if (dep.isNonFuncLocal()) {
    return DepInfo(dep.getInst(), NonFuncLocal);
  } else if (dep.isUnknown()) {
    return DepInfo(dep.getInst(), Unknown);
  } else if (dep.isNonLocal()) {
    return DepInfo(dep.getInst(), NonLocal);
  }

  llvm_unreachable("unknown dependence type");
}
/// HandleFree - Handle frees of entire structures whose dependency is a store /// to a field of that structure. bool DSE::HandleFree(CallInst *F) { bool MadeChange = false; MemoryLocation Loc = MemoryLocation(F->getOperand(0)); SmallVector<BasicBlock *, 16> Blocks; Blocks.push_back(F->getParent()); const DataLayout &DL = F->getModule()->getDataLayout(); while (!Blocks.empty()) { BasicBlock *BB = Blocks.pop_back_val(); Instruction *InstPt = BB->getTerminator(); if (BB == F->getParent()) InstPt = F; MemDepResult Dep = MD->getPointerDependencyFrom(Loc, false, InstPt, BB); while (Dep.isDef() || Dep.isClobber()) { Instruction *Dependency = Dep.getInst(); if (!hasMemoryWrite(Dependency, *TLI) || !isRemovable(Dependency)) break; Value *DepPointer = GetUnderlyingObject(getStoredPointerOperand(Dependency), DL); // Check for aliasing. if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) break; Instruction *Next = std::next(BasicBlock::iterator(Dependency)); // DCE instructions only used to calculate that store DeleteDeadInstruction(Dependency, *MD, *TLI); ++NumFastStores; MadeChange = true; // Inst's old Dependency is now deleted. Compute the next dependency, // which may also be dead, as in // s[0] = 0; // s[1] = 0; // This has just been deleted. // free(s); Dep = MD->getPointerDependencyFrom(Loc, false, Next, BB); } if (Dep.isNonLocal()) FindUnconditionalPreds(Blocks, BB, DT); } return MadeChange; }
/// processByValArgument - This is called on every byval argument in call sites. bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { if (TD == 0) return false; // Find out what feeds this byval argument. Value *ByValArg = CS.getArgument(ArgNo); Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType(); uint64_t ByValSize = TD->getTypeAllocSize(ByValTy); MemDepResult DepInfo = MD->getPointerDependencyFrom(AliasAnalysis::Location(ByValArg, ByValSize), true, CS.getInstruction(), CS.getInstruction()->getParent()); if (!DepInfo.isClobber()) return false; // If the byval argument isn't fed by a memcpy, ignore it. If it is fed by // a memcpy, see if we can byval from the source of the memcpy instead of the // result. MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst()); if (MDep == 0 || MDep->isVolatile() || ByValArg->stripPointerCasts() != MDep->getDest()) return false; // The length of the memcpy must be larger or equal to the size of the byval. ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength()); if (C1 == 0 || C1->getValue().getZExtValue() < ByValSize) return false; // Get the alignment of the byval. If the call doesn't specify the alignment, // then it is some target specific value that we can't know. unsigned ByValAlign = CS.getParamAlignment(ArgNo+1); if (ByValAlign == 0) return false; // If it is greater than the memcpy, then we check to see if we can force the // source of the memcpy to the alignment we need. If we fail, we bail out. if (MDep->getAlignment() < ByValAlign && getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign, TD) < ByValAlign) return false; // Verify that the copied-from memory doesn't change in between the memcpy and // the byval call. // memcpy(a <- b) // *b = 42; // foo(*a) // It would be invalid to transform the second memcpy into foo(*b). // // NOTE: This is conservative, it will stop on any read from the source loc, // not just the defining memcpy. 
MemDepResult SourceDep = MD->getPointerDependencyFrom(AliasAnalysis::getLocationForSource(MDep), false, CS.getInstruction(), MDep->getParent()); if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) return false; Value *TmpCast = MDep->getSource(); if (MDep->getSource()->getType() != ByValArg->getType()) TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(), "tmpcast", CS.getInstruction()); DEBUG(dbgs() << "MemCpyOpt: Forwarding memcpy to byval:\n" << " " << *MDep << "\n" << " " << *CS.getInstruction() << "\n"); // Otherwise we're good! Update the byval argument. CS.setArgument(ArgNo, TmpCast); ++NumMemCpyInstr; return true; }
/// processMemCpyMemCpyDependence - We've found that the (upward scanning) /// memory dependence of memcpy 'M' is the memcpy 'MDep'. Try to simplify M to /// copy from MDep's input if we can. MSize is the size of M's copy. /// bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, uint64_t MSize) { // We can only transforms memcpy's where the dest of one is the source of the // other. if (M->getSource() != MDep->getDest() || MDep->isVolatile()) return false; // If dep instruction is reading from our current input, then it is a noop // transfer and substituting the input won't change this instruction. Just // ignore the input and let someone else zap MDep. This handles cases like: // memcpy(a <- a) // memcpy(b <- a) if (M->getSource() == MDep->getSource()) return false; // Second, the length of the memcpy's must be the same, or the preceding one // must be larger than the following one. ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength()); ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength()); if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue()) return false; AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); // Verify that the copied-from memory doesn't change in between the two // transfers. For example, in: // memcpy(a <- b) // *b = 42; // memcpy(c <- a) // It would be invalid to transform the second memcpy into memcpy(c <- b). // // TODO: If the code between M and MDep is transparent to the destination "c", // then we could still perform the xform by moving M up to the first memcpy. // // NOTE: This is conservative, it will stop on any read from the source loc, // not just the defining memcpy. MemDepResult SourceDep = MD->getPointerDependencyFrom(AA.getLocationForSource(MDep), false, M, M->getParent()); if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) return false; // If the dest of the second might alias the source of the first, then the // source and dest might overlap. 
We still want to eliminate the intermediate // value, but we have to generate a memmove instead of memcpy. bool UseMemMove = false; if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(MDep))) UseMemMove = true; // If all checks passed, then we can transform M. // Make sure to use the lesser of the alignment of the source and the dest // since we're changing where we're reading from, but don't want to increase // the alignment past what can be read from or written to. // TODO: Is this worth it if we're creating a less aligned memcpy? For // example we could be moving from movaps -> movq on x86. unsigned Align = std::min(MDep->getAlignment(), M->getAlignment()); IRBuilder<> Builder(M); if (UseMemMove) Builder.CreateMemMove(M->getRawDest(), MDep->getRawSource(), M->getLength(), Align, M->isVolatile()); else Builder.CreateMemCpy(M->getRawDest(), MDep->getRawSource(), M->getLength(), Align, M->isVolatile()); // Remove the instruction we're replacing. MD->removeInstruction(M); M->eraseFromParent(); ++NumMemCpyInstr; return true; }
bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (!SI->isSimple()) return false; if (TD == 0) return false; // Detect cases where we're performing call slot forwarding, but // happen to be using a load-store pair to implement it, rather than // a memcpy. if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) { if (LI->isSimple() && LI->hasOneUse() && LI->getParent() == SI->getParent()) { MemDepResult ldep = MD->getDependency(LI); CallInst *C = 0; if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst())) C = dyn_cast<CallInst>(ldep.getInst()); if (C) { // Check that nothing touches the dest of the "copy" between // the call and the store. AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); AliasAnalysis::Location StoreLoc = AA.getLocation(SI); for (BasicBlock::iterator I = --BasicBlock::iterator(SI), E = C; I != E; --I) { if (AA.getModRefInfo(&*I, StoreLoc) != AliasAnalysis::NoModRef) { C = 0; break; } } } if (C) { unsigned storeAlign = SI->getAlignment(); if (!storeAlign) storeAlign = TD->getABITypeAlignment(SI->getOperand(0)->getType()); unsigned loadAlign = LI->getAlignment(); if (!loadAlign) loadAlign = TD->getABITypeAlignment(LI->getType()); bool changed = performCallSlotOptzn(LI, SI->getPointerOperand()->stripPointerCasts(), LI->getPointerOperand()->stripPointerCasts(), TD->getTypeStoreSize(SI->getOperand(0)->getType()), std::min(storeAlign, loadAlign), C); if (changed) { MD->removeInstruction(SI); SI->eraseFromParent(); MD->removeInstruction(LI); LI->eraseFromParent(); ++NumMemCpyInstr; return true; } } } } // There are two cases that are interesting for this code to handle: memcpy // and memset. Right now we only handle memset. // Ensure that the value being stored is something that can be memset'able a // byte at a time like "0" or "-1" or any width, as well as things like // 0xA0A0A0A0 and 0.0. 
if (Value *ByteVal = isBytewiseValue(SI->getOperand(0))) if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal)) { BBI = I; // Don't invalidate iterator. return true; } return false; }
/// processMemCpy - perform simplication of memcpy's. If we have memcpy A which /// copies X to Y, and memcpy B which copies Y to Z, then we can rewrite B to be /// a memcpy from X to Z (or potentially a memmove, depending on circumstances). /// This allows later passes to remove the first memcpy altogether. bool MemCpyOpt::processMemCpy(MemCpyInst *M) { MemoryDependenceAnalysis &MD = getAnalysis<MemoryDependenceAnalysis>(); // The are two possible optimizations we can do for memcpy: // a) memcpy-memcpy xform which exposes redundance for DSE. // b) call-memcpy xform for return slot optimization. MemDepResult dep = MD.getDependency(M); if (!dep.isClobber()) return false; if (!isa<MemCpyInst>(dep.getInst())) { if (CallInst *C = dyn_cast<CallInst>(dep.getInst())) return performCallSlotOptzn(M, C); return false; } MemCpyInst *MDep = cast<MemCpyInst>(dep.getInst()); // We can only transforms memcpy's where the dest of one is the source of the // other if (M->getSource() != MDep->getDest()) return false; // Second, the length of the memcpy's must be the same, or the preceeding one // must be larger than the following one. 
ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength()); ConstantInt *C2 = dyn_cast<ConstantInt>(M->getLength()); if (!C1 || !C2) return false; uint64_t DepSize = C1->getValue().getZExtValue(); uint64_t CpySize = C2->getValue().getZExtValue(); if (DepSize < CpySize) return false; // Finally, we have to make sure that the dest of the second does not // alias the source of the first AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); if (AA.alias(M->getRawDest(), CpySize, MDep->getRawSource(), DepSize) != AliasAnalysis::NoAlias) return false; else if (AA.alias(M->getRawDest(), CpySize, M->getRawSource(), CpySize) != AliasAnalysis::NoAlias) return false; else if (AA.alias(MDep->getRawDest(), DepSize, MDep->getRawSource(), DepSize) != AliasAnalysis::NoAlias) return false; // If all checks passed, then we can transform these memcpy's const Type *ArgTys[3] = { M->getRawDest()->getType(), MDep->getRawSource()->getType(), M->getLength()->getType() }; Function *MemCpyFun = Intrinsic::getDeclaration( M->getParent()->getParent()->getParent(), M->getIntrinsicID(), ArgTys, 3); Value *Args[5] = { M->getRawDest(), MDep->getRawSource(), M->getLength(), M->getAlignmentCst(), M->getVolatileCst() }; CallInst *C = CallInst::Create(MemCpyFun, Args, Args+5, "", M); // If C and M don't interfere, then this is a valid transformation. If they // did, this would mean that the two sources overlap, which would be bad. if (MD.getDependency(C) == dep) { MD.removeInstruction(M); M->eraseFromParent(); ++NumMemCpyInstr; return true; } // Otherwise, there was no point in doing this, so we remove the call we // inserted and act like nothing happened. MD.removeInstruction(C); C->eraseFromParent(); return false; }
/// processMemCpy - perform simplification of memcpy's. If we have memcpy A /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite /// B to be a memcpy from X to Z (or potentially a memmove, depending on /// circumstances). This allows later passes to remove the first memcpy /// altogether. bool MemCpyOpt::processMemCpy(MemCpyInst *M) { // We can only optimize non-volatile memcpy's. if (M->isVolatile()) return false; // If the source and destination of the memcpy are the same, then zap it. if (M->getSource() == M->getDest()) { MD->removeInstruction(M); M->eraseFromParent(); return false; } // If copying from a constant, try to turn the memcpy into a memset. if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource())) if (GV->isConstant() && GV->hasDefinitiveInitializer()) if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) { IRBuilder<> Builder(M); Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(), M->getAlignment(), false); MD->removeInstruction(M); M->eraseFromParent(); ++NumCpyToSet; return true; } MemDepResult DepInfo = MD->getDependency(M); // Try to turn a partially redundant memset + memcpy into // memcpy + smaller memset. We don't need the memcpy size for this. if (DepInfo.isClobber()) if (MemSetInst *MDep = dyn_cast<MemSetInst>(DepInfo.getInst())) if (processMemSetMemCpyDependence(M, MDep)) return true; // The optimizations after this point require the memcpy size. ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength()); if (!CopySize) return false; // There are four possible optimizations we can do for memcpy: // a) memcpy-memcpy xform which exposes redundance for DSE. // b) call-memcpy xform for return slot optimization. // c) memcpy from freshly alloca'd space or space that has just started its // lifetime copies undefined data, and we can therefore eliminate the // memcpy in favor of the data that was already at the destination. // d) memcpy from a just-memset'd source can be turned into memset. 
if (DepInfo.isClobber()) { if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) { if (performCallSlotOptzn(M, M->getDest(), M->getSource(), CopySize->getZExtValue(), M->getAlignment(), C)) { MD->removeInstruction(M); M->eraseFromParent(); return true; } } } AliasAnalysis::Location SrcLoc = AliasAnalysis::getLocationForSource(M); MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true, M, M->getParent()); if (SrcDepInfo.isClobber()) { if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst())) return processMemCpyMemCpyDependence(M, MDep); } else if (SrcDepInfo.isDef()) { Instruction *I = SrcDepInfo.getInst(); bool hasUndefContents = false; if (isa<AllocaInst>(I)) { hasUndefContents = true; } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { if (II->getIntrinsicID() == Intrinsic::lifetime_start) if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0))) if (LTSize->getZExtValue() >= CopySize->getZExtValue()) hasUndefContents = true; } if (hasUndefContents) { MD->removeInstruction(M); M->eraseFromParent(); ++NumMemCpyInstr; return true; } } if (SrcDepInfo.isClobber()) if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst())) if (performMemCpyToMemSetOptzn(M, MDep)) { MD->removeInstruction(M); M->eraseFromParent(); ++NumCpyToSet; return true; } return false; }
/// Run dead store elimination over a single basic block.
///
/// Walks \p BB top-down.  'free' calls are forwarded to HandleFree.  For
/// every other instruction that writes memory, the local memory dependences
/// of the written location are walked upward; an earlier write that is
/// completely overwritten is deleted, and an earlier shortenable write whose
/// end is overwritten has its length trimmed.  If the block's terminator has
/// no successors, handleEndBlock is run afterwards.
///
/// \returns true if anything in the block was changed.
bool DSE::runOnBasicBlock(BasicBlock &BB) {
  bool MadeChange = false;

  // Do a top-down walk on the BB.
  for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) {
    // Advance BBI up front so it already points past Inst while Inst is
    // being processed.
    Instruction *Inst = BBI++;

    // Handle 'free' calls specially.
    if (CallInst *F = isFreeCall(Inst, TLI)) {
      MadeChange |= HandleFree(F);
      continue;
    }

    // If we find something that writes memory, get its memory dependence.
    if (!hasMemoryWrite(Inst, TLI))
      continue;

    MemDepResult InstDep = MD->getDependency(Inst);

    // Ignore any store where we can't find a local dependence.
    // FIXME: cross-block DSE would be fun. :)
    if (!InstDep.isDef() && !InstDep.isClobber())
      continue;

    // If we're storing the same value back to a pointer that we just
    // loaded from, then the store can be removed.
    if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
      if (LoadInst *DepLoad = dyn_cast<LoadInst>(InstDep.getInst())) {
        if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
            SI->getOperand(0) == DepLoad && isRemovable(SI)) {
          DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n "
                       << "LOAD: " << *DepLoad << "\n STORE: " << *SI << '\n');

          // DeleteDeadInstruction can delete the current instruction.  Save
          // BBI in a weak handle so we can tell afterwards whether the next
          // instruction was deleted too.
          WeakVH NextInst(BBI);

          DeleteDeadInstruction(SI, *MD, TLI);

          if (!NextInst)  // Next instruction deleted: restart the block.
            BBI = BB.begin();
          else if (BBI != BB.begin())  // Revisit this instruction if possible.
            --BBI;
          ++NumFastStores;
          MadeChange = true;
          continue;
        }
      }
    }

    // Figure out what location is being stored to.
    AliasAnalysis::Location Loc = getLocForWrite(Inst, *AA);

    // If we didn't get a useful location, fail.
    if (!Loc.Ptr)
      continue;

    while (InstDep.isDef() || InstDep.isClobber()) {
      // Get the memory clobbered by the instruction we depend on.  MemDep will
      // skip any instructions that 'Loc' clearly doesn't interact with.  If we
      // end up depending on a may- or must-aliased load, then we can't optimize
      // away the store and we bail out.  However, if we depend on something
      // that overwrites the memory location we *can* potentially optimize it.
      //
      // Find out what memory location the dependent instruction stores.
      Instruction *DepWrite = InstDep.getInst();
      AliasAnalysis::Location DepLoc = getLocForWrite(DepWrite, *AA);
      // If we didn't get a useful location (null pointer), bail out.
      if (!DepLoc.Ptr)
        break;

      // If we find a write that is a) removable (i.e., non-volatile), b) is
      // completely obliterated by the store to 'Loc', and c) which we know that
      // 'Inst' doesn't load from, then we can remove it.
      if (isRemovable(DepWrite) &&
          !isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) {
        // Offsets are filled in by isOverwrite.
        int64_t InstWriteOffset, DepWriteOffset;
        OverwriteResult OR = isOverwrite(Loc, DepLoc, *AA,
                                         DepWriteOffset, InstWriteOffset);
        if (OR == OverwriteComplete) {
          DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: "
                << *DepWrite << "\n KILLER: " << *Inst << '\n');

          // Delete the store and now-dead instructions that feed it.
          DeleteDeadInstruction(DepWrite, *MD, TLI);
          ++NumFastStores;
          MadeChange = true;

          // DeleteDeadInstruction can delete the current instruction in loop
          // cases, reset BBI.  Stepping back one (when possible) means Inst
          // itself is revisited by the outer walk.
          BBI = Inst;
          if (BBI != BB.begin())
            --BBI;
          break;
        } else if (OR == OverwriteEnd && isShortenable(DepWrite)) {
          // TODO: base this on the target vector size so that if the earlier
          // store was too small to get vector writes anyway then its likely
          // a good idea to shorten it
          // Power of 2 vector writes are probably always a bad idea to optimize
          // as any store/memset/memcpy is likely using vector instructions so
          // shortening it to not vector size is likely to be slower
          MemIntrinsic* DepIntrinsic = cast<MemIntrinsic>(DepWrite);
          unsigned DepWriteAlign = DepIntrinsic->getAlignment();
          // Only trim when the later write's offset is a power of two or a
          // multiple of the earlier write's (non-zero) alignment.
          if (llvm::isPowerOf2_64(InstWriteOffset) ||
              ((DepWriteAlign != 0) && InstWriteOffset % DepWriteAlign == 0)) {

            DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW END: "
                  << *DepWrite << "\n KILLER (offset "
                  << InstWriteOffset << ", "
                  << DepLoc.Size << ")"
                  << *Inst << '\n');

            // The earlier write now ends where the later write begins.
            Value* DepWriteLength = DepIntrinsic->getLength();
            Value* TrimmedLength = ConstantInt::get(DepWriteLength->getType(),
                                                    InstWriteOffset -
                                                    DepWriteOffset);
            DepIntrinsic->setLength(TrimmedLength);
            MadeChange = true;
          }
        }
      }

      // If this is a may-aliased store that is clobbering the store value, we
      // can keep searching past it for another must-aliased pointer that stores
      // to the same location.  For example, in:
      //   store -> P
      //   store -> Q
      //   store -> P
      // we can remove the first store to P even though we don't know if P and Q
      // alias.
      //
      // DepWrite being the first instruction of the block means there is
      // nothing earlier left to scan.
      if (DepWrite == &BB.front()) break;

      // Can't look past this instruction if it might read 'Loc'.
      if (AA->getModRefInfo(DepWrite, Loc) & AliasAnalysis::Ref)
        break;

      // Continue the upward scan from DepWrite.
      InstDep = MD->getPointerDependencyFrom(Loc, false, DepWrite, &BB);
    }
  }

  // If this block ends in a return, unwind, or unreachable, all allocas are
  // dead at its end, which means stores to them are also dead.
  if (BB.getTerminator()->getNumSuccessors() == 0)
    MadeChange |= handleEndBlock(BB);

  return MadeChange;
}
/// processStore - Try to optimize the simple store \p SI.
///
/// Two things are attempted, in order:
///   1. If the stored value is a single-use simple load in the same block
///      whose dependence is a (non-memcpy) call, try call slot forwarding via
///      performCallSlotOptzn and erase the load/store pair on success.
///   2. If the stored value is byte-splattable, try to merge the store into a
///      memset via tryMergingIntoMemset.
///
/// \param BBI  set to the instruction returned by tryMergingIntoMemset when
///             that merge succeeds; otherwise left untouched.
/// \returns true if a transformation was performed.
bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
  if (!SI->isSimple())
    return false;

  // Avoid merging nontemporal stores since the resulting memcpy/memset would
  // not be able to preserve the nontemporal hint.  In theory we could teach
  // how to propagate the !nontemporal metadata to memset calls.  However,
  // that change would force the backend to conservatively expand
  // !nontemporal memset calls back to sequences of store instructions
  // (effectively undoing the merging).
  if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return false;

  const DataLayout &DL = SI->getModule()->getDataLayout();

  // Detect cases where we're performing call slot forwarding, but happen to
  // be using a load-store pair to implement it, rather than a memcpy.
  LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0));
  if (LI && LI->isSimple() && LI->hasOneUse() &&
      LI->getParent() == SI->getParent()) {
    MemDepResult LoadDep = MD->getDependency(LI);

    // The load must be clobbered by a call that is not a memcpy.
    CallInst *C = nullptr;
    if (LoadDep.isClobber() && !isa<MemCpyInst>(LoadDep.getInst()))
      C = dyn_cast<CallInst>(LoadDep.getInst());

    if (C) {
      // Walk backwards from the store to the call and make sure nothing in
      // between touches the destination of the "copy".
      AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
      MemoryLocation StoreLoc = MemoryLocation::get(SI);
      for (BasicBlock::iterator I = --SI->getIterator(),
                                E = C->getIterator();
           I != E; --I) {
        if (AA.getModRefInfo(&*I, StoreLoc) == MRI_NoModRef)
          continue;
        C = nullptr;
        break;
      }
    }

    if (C) {
      // Fall back to the ABI alignment when none is specified.
      unsigned storeAlign = SI->getAlignment();
      if (!storeAlign)
        storeAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType());
      unsigned loadAlign = LI->getAlignment();
      if (!loadAlign)
        loadAlign = DL.getABITypeAlignment(LI->getType());

      bool changed = performCallSlotOptzn(
          LI, SI->getPointerOperand()->stripPointerCasts(),
          LI->getPointerOperand()->stripPointerCasts(),
          DL.getTypeStoreSize(SI->getOperand(0)->getType()),
          std::min(storeAlign, loadAlign), C);
      if (changed) {
        // The store goes first: it is the load's only user.
        MD->removeInstruction(SI);
        SI->eraseFromParent();
        MD->removeInstruction(LI);
        LI->eraseFromParent();
        ++NumMemCpyInstr;
        return true;
      }
    }
  }

  // There are two cases that are interesting for this code to handle: memcpy
  // and memset.  Right now we only handle memset.

  // Ensure that the value being stored is something that can be memset'able
  // a byte at a time like "0" or "-1" or any width, as well as things like
  // 0xA0A0A0A0 and 0.0.
  Value *ByteVal = isBytewiseValue(SI->getOperand(0));
  if (!ByteVal)
    return false;

  Instruction *Merged =
      tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal);
  if (!Merged)
    return false;

  BBI = Merged->getIterator();  // Don't invalidate iterator.
  return true;
}
bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (!SI->isSimple()) return false; // Avoid merging nontemporal stores since the resulting // memcpy/memset would not be able to preserve the nontemporal hint. // In theory we could teach how to propagate the !nontemporal metadata to // memset calls. However, that change would force the backend to // conservatively expand !nontemporal memset calls back to sequences of // store instructions (effectively undoing the merging). if (SI->getMetadata(LLVMContext::MD_nontemporal)) return false; const DataLayout &DL = SI->getModule()->getDataLayout(); // Load to store forwarding can be interpreted as memcpy. if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) { if (LI->isSimple() && LI->hasOneUse() && LI->getParent() == SI->getParent()) { auto *T = LI->getType(); if (T->isAggregateType()) { AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); MemoryLocation LoadLoc = MemoryLocation::get(LI); // We use alias analysis to check if an instruction may store to // the memory we load from in between the load and the store. If // such an instruction is found, we store it in AI. Instruction *AI = nullptr; for (BasicBlock::iterator I = ++LI->getIterator(), E = SI->getIterator(); I != E; ++I) { if (AA.getModRefInfo(&*I, LoadLoc) & MRI_Mod) { AI = &*I; break; } } // If no aliasing instruction is found, then we can promote the // load/store pair to a memcpy at the store loaction. if (!AI) { // If we load from memory that may alias the memory we store to, // memmove must be used to preserve semantic. If not, memcpy can // be used. 
bool UseMemMove = false; if (!AA.isNoAlias(MemoryLocation::get(SI), LoadLoc)) UseMemMove = true; unsigned Align = findCommonAlignment(DL, SI, LI); uint64_t Size = DL.getTypeStoreSize(T); IRBuilder<> Builder(SI); Instruction *M; if (UseMemMove) M = Builder.CreateMemMove(SI->getPointerOperand(), LI->getPointerOperand(), Size, Align, SI->isVolatile()); else M = Builder.CreateMemCpy(SI->getPointerOperand(), LI->getPointerOperand(), Size, Align, SI->isVolatile()); DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => " << *M << "\n"); MD->removeInstruction(SI); SI->eraseFromParent(); MD->removeInstruction(LI); LI->eraseFromParent(); ++NumMemCpyInstr; // Make sure we do not invalidate the iterator. BBI = M->getIterator(); return true; } } // Detect cases where we're performing call slot forwarding, but // happen to be using a load-store pair to implement it, rather than // a memcpy. MemDepResult ldep = MD->getDependency(LI); CallInst *C = nullptr; if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst())) C = dyn_cast<CallInst>(ldep.getInst()); if (C) { // Check that nothing touches the dest of the "copy" between // the call and the store. AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); MemoryLocation StoreLoc = MemoryLocation::get(SI); for (BasicBlock::iterator I = --SI->getIterator(), E = C->getIterator(); I != E; --I) { if (AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) { C = nullptr; break; } } } if (C) { bool changed = performCallSlotOptzn( LI, SI->getPointerOperand()->stripPointerCasts(), LI->getPointerOperand()->stripPointerCasts(), DL.getTypeStoreSize(SI->getOperand(0)->getType()), findCommonAlignment(DL, SI, LI), C); if (changed) { MD->removeInstruction(SI); SI->eraseFromParent(); MD->removeInstruction(LI); LI->eraseFromParent(); ++NumMemCpyInstr; return true; } } } } // There are two cases that are interesting for this code to handle: memcpy // and memset. Right now we only handle memset. 
// Ensure that the value being stored is something that can be memset'able a // byte at a time like "0" or "-1" or any width, as well as things like // 0xA0A0A0A0 and 0.0. if (Value *ByteVal = isBytewiseValue(SI->getOperand(0))) if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal)) { BBI = I->getIterator(); // Don't invalidate iterator. return true; } return false; }
/// processStore - When GVN is scanning forward over instructions, we look for /// some other patterns to fold away. In particular, this looks for stores to /// neighboring locations of memory. If it sees enough consequtive ones /// (currently 4) it attempts to merge them together into a memcpy/memset. bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (SI->isVolatile()) return false; TargetData *TD = getAnalysisIfAvailable<TargetData>(); if (!TD) return false; // Detect cases where we're performing call slot forwarding, but // happen to be using a load-store pair to implement it, rather than // a memcpy. if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) { if (!LI->isVolatile() && LI->hasOneUse()) { MemDepResult dep = MD->getDependency(LI); CallInst *C = 0; if (dep.isClobber() && !isa<MemCpyInst>(dep.getInst())) C = dyn_cast<CallInst>(dep.getInst()); if (C) { bool changed = performCallSlotOptzn(LI, SI->getPointerOperand()->stripPointerCasts(), LI->getPointerOperand()->stripPointerCasts(), TD->getTypeStoreSize(SI->getOperand(0)->getType()), C); if (changed) { MD->removeInstruction(SI); SI->eraseFromParent(); LI->eraseFromParent(); ++NumMemCpyInstr; return true; } } } } LLVMContext &Context = SI->getContext(); // There are two cases that are interesting for this code to handle: memcpy // and memset. Right now we only handle memset. // Ensure that the value being stored is something that can be memset'able a // byte at a time like "0" or "-1" or any width, as well as things like // 0xA0A0A0A0 and 0.0. Value *ByteVal = isBytewiseValue(SI->getOperand(0)); if (!ByteVal) return false; AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); Module *M = SI->getParent()->getParent()->getParent(); // Okay, so we now have a single store that can be splatable. Scan to find // all subsequent stores of the same value to offset from the same pointer. // Join these together into ranges, so we can decide whether contiguous blocks // are stored. 
MemsetRanges Ranges(*TD); Value *StartPtr = SI->getPointerOperand(); BasicBlock::iterator BI = SI; for (++BI; !isa<TerminatorInst>(BI); ++BI) { if (isa<CallInst>(BI) || isa<InvokeInst>(BI)) { // If the call is readnone, ignore it, otherwise bail out. We don't even // allow readonly here because we don't want something like: // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A). if (AA.getModRefBehavior(CallSite(BI)) == AliasAnalysis::DoesNotAccessMemory) continue; // TODO: If this is a memset, try to join it in. break; } else if (isa<VAArgInst>(BI) || isa<LoadInst>(BI)) break; // If this is a non-store instruction it is fine, ignore it. StoreInst *NextStore = dyn_cast<StoreInst>(BI); if (NextStore == 0) continue; // If this is a store, see if we can merge it in. if (NextStore->isVolatile()) break; // Check to see if this stored value is of the same byte-splattable value. if (ByteVal != isBytewiseValue(NextStore->getOperand(0))) break; // Check to see if this store is to a constant offset from the start ptr. int64_t Offset; if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset, *TD)) break; Ranges.addStore(Offset, NextStore); } // If we have no ranges, then we just had a single store with nothing that // could be merged in. This is a very common case of course. if (Ranges.empty()) return false; // If we had at least one store that could be merged in, add the starting // store as well. We try to avoid this unless there is at least something // interesting as a small compile-time optimization. Ranges.addStore(0, SI); // Now that we have full information about ranges, loop over the ranges and // emit memset's for anything big enough to be worthwhile. bool MadeChange = false; for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end(); I != E; ++I) { const MemsetRange &Range = *I; if (Range.TheStores.size() == 1) continue; // If it is profitable to lower this range to memset, do so now. 
if (!Range.isProfitableToUseMemset(*TD)) continue; // Otherwise, we do want to transform this! Create a new memset. We put // the memset right before the first instruction that isn't part of this // memset block. This ensure that the memset is dominated by any addressing // instruction needed by the start of the block. BasicBlock::iterator InsertPt = BI; // Get the starting pointer of the block. StartPtr = Range.StartPtr; // Determine alignment unsigned Alignment = Range.Alignment; if (Alignment == 0) { const Type *EltType = cast<PointerType>(StartPtr->getType())->getElementType(); Alignment = TD->getABITypeAlignment(EltType); } // Cast the start ptr to be i8* as memset requires. const PointerType* StartPTy = cast<PointerType>(StartPtr->getType()); const PointerType *i8Ptr = Type::getInt8PtrTy(Context, StartPTy->getAddressSpace()); if (StartPTy!= i8Ptr) StartPtr = new BitCastInst(StartPtr, i8Ptr, StartPtr->getName(), InsertPt); Value *Ops[] = { StartPtr, ByteVal, // Start, value // size ConstantInt::get(Type::getInt64Ty(Context), Range.End-Range.Start), // align ConstantInt::get(Type::getInt32Ty(Context), Alignment), // volatile ConstantInt::getFalse(Context), }; const Type *Tys[] = { Ops[0]->getType(), Ops[2]->getType() }; Function *MemSetF = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys, 2); Value *C = CallInst::Create(MemSetF, Ops, Ops+5, "", InsertPt); DEBUG(dbgs() << "Replace stores:\n"; for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i) dbgs() << *Range.TheStores[i] << '\n'; dbgs() << "With: " << *C << '\n'); C=C; // Don't invalidate the iterator BBI = BI; // Zap all the stores. for (SmallVector<StoreInst*, 16>::const_iterator SI = Range.TheStores.begin(), SE = Range.TheStores.end(); SI != SE; ++SI) (*SI)->eraseFromParent(); ++NumMemSetInfer; MadeChange = true; } return MadeChange; }
bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (SI->isVolatile()) return false; if (TD == 0) return false; // Detect cases where we're performing call slot forwarding, but // happen to be using a load-store pair to implement it, rather than // a memcpy. if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) { if (!LI->isVolatile() && LI->hasOneUse()) { MemDepResult ldep = MD->getDependency(LI); CallInst *C = 0; if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst())) C = dyn_cast<CallInst>(ldep.getInst()); if (C) { // Check that nothing touches the dest of the "copy" between // the call and the store. MemDepResult sdep = MD->getDependency(SI); if (!sdep.isNonLocal()) { bool FoundCall = false; for (BasicBlock::iterator I = SI, E = sdep.getInst(); I != E; --I) { if (&*I == C) { FoundCall = true; break; } } if (!FoundCall) C = 0; } } if (C) { bool changed = performCallSlotOptzn(LI, SI->getPointerOperand()->stripPointerCasts(), LI->getPointerOperand()->stripPointerCasts(), TD->getTypeStoreSize(SI->getOperand(0)->getType()), C); if (changed) { MD->removeInstruction(SI); SI->eraseFromParent(); MD->removeInstruction(LI); LI->eraseFromParent(); ++NumMemCpyInstr; return true; } } } } // There are two cases that are interesting for this code to handle: memcpy // and memset. Right now we only handle memset. // Ensure that the value being stored is something that can be memset'able a // byte at a time like "0" or "-1" or any width, as well as things like // 0xA0A0A0A0 and 0.0. if (Value *ByteVal = isBytewiseValue(SI->getOperand(0))) if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal)) { BBI = I; // Don't invalidate iterator. return true; } return false; }