/// processByValArgument - This is called on every byval argument in call sites. bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { if (TD == 0) return false; // Find out what feeds this byval argument. Value *ByValArg = CS.getArgument(ArgNo); Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType(); uint64_t ByValSize = TD->getTypeAllocSize(ByValTy); MemDepResult DepInfo = MD->getPointerDependencyFrom(AliasAnalysis::Location(ByValArg, ByValSize), true, CS.getInstruction(), CS.getInstruction()->getParent()); if (!DepInfo.isClobber()) return false; // If the byval argument isn't fed by a memcpy, ignore it. If it is fed by // a memcpy, see if we can byval from the source of the memcpy instead of the // result. MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst()); if (MDep == 0 || MDep->isVolatile() || ByValArg->stripPointerCasts() != MDep->getDest()) return false; // The length of the memcpy must be larger or equal to the size of the byval. ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength()); if (C1 == 0 || C1->getValue().getZExtValue() < ByValSize) return false; // Get the alignment of the byval. If the call doesn't specify the alignment, // then it is some target specific value that we can't know. unsigned ByValAlign = CS.getParamAlignment(ArgNo+1); if (ByValAlign == 0) return false; // If it is greater than the memcpy, then we check to see if we can force the // source of the memcpy to the alignment we need. If we fail, we bail out. if (MDep->getAlignment() < ByValAlign && getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign, TD) < ByValAlign) return false; // Verify that the copied-from memory doesn't change in between the memcpy and // the byval call. // memcpy(a <- b) // *b = 42; // foo(*a) // It would be invalid to transform the second memcpy into foo(*b). // // NOTE: This is conservative, it will stop on any read from the source loc, // not just the defining memcpy. MemDepResult SourceDep = MD->getPointerDependencyFrom(AliasAnalysis::getLocationForSource(MDep), false, CS.getInstruction(), MDep->getParent()); if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) return false; Value *TmpCast = MDep->getSource(); if (MDep->getSource()->getType() != ByValArg->getType()) TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(), "tmpcast", CS.getInstruction()); DEBUG(dbgs() << "MemCpyOpt: Forwarding memcpy to byval:\n" << " " << *MDep << "\n" << " " << *CS.getInstruction() << "\n"); // Otherwise we're good! Update the byval argument. CS.setArgument(ArgNo, TmpCast); ++NumMemCpyInstr; return true; }
// FIXME: Should try to pick the most likely to be profitable allocas first. bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { // Array allocations are probably not worth handling, since an allocation of // the array type is the canonical form. if (!I.isStaticAlloca() || I.isArrayAllocation()) return false; IRBuilder<> Builder(&I); // First try to replace the alloca with a vector Type *AllocaTy = I.getAllocatedType(); DEBUG(dbgs() << "Trying to promote " << I << '\n'); if (tryPromoteAllocaToVector(&I, AS)) return true; // Promoted to vector. const Function &ContainingFunction = *I.getParent()->getParent(); CallingConv::ID CC = ContainingFunction.getCallingConv(); // Don't promote the alloca to LDS for shader calling conventions as the work // item ID intrinsics are not supported for these calling conventions. // Furthermore not all LDS is available for some of the stages. switch (CC) { case CallingConv::AMDGPU_KERNEL: case CallingConv::SPIR_KERNEL: break; default: DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n"); return false; } // Not likely to have sufficient local memory for promotion. if (!SufficientLDS) return false; const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction); unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; const DataLayout &DL = Mod->getDataLayout(); unsigned Align = I.getAlignment(); if (Align == 0) Align = DL.getABITypeAlignment(I.getAllocatedType()); // FIXME: This computed padding is likely wrong since it depends on inverse // usage order. // // FIXME: It is also possible that if we're allowed to use all of the memory // could could end up using more than the maximum due to alignment padding. uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align); uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy); NewSize += AllocSize; if (NewSize > LocalMemLimit) { DEBUG(dbgs() << " " << AllocSize << " bytes of local memory not available to promote\n"); return false; } CurrentLocalMemUsage = NewSize; std::vector<Value*> WorkList; if (!collectUsesWithPtrTypes(&I, &I, WorkList)) { DEBUG(dbgs() << " Do not know how to convert all uses\n"); return false; } DEBUG(dbgs() << "Promoting alloca to local memory\n"); Function *F = I.getParent()->getParent(); Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize); GlobalVariable *GV = new GlobalVariable( *Mod, GVTy, false, GlobalValue::InternalLinkage, UndefValue::get(GVTy), Twine(F->getName()) + Twine('.') + I.getName(), nullptr, GlobalVariable::NotThreadLocal, AS.LOCAL_ADDRESS); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); GV->setAlignment(I.getAlignment()); Value *TCntY, *TCntZ; std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder); Value *TIdX = getWorkitemID(Builder, 0); Value *TIdY = getWorkitemID(Builder, 1); Value *TIdZ = getWorkitemID(Builder, 2); Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true); Tmp0 = Builder.CreateMul(Tmp0, TIdX); Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true); Value *TID = Builder.CreateAdd(Tmp0, Tmp1); TID = Builder.CreateAdd(TID, TIdZ); Value *Indices[] = { Constant::getNullValue(Type::getInt32Ty(Mod->getContext())), TID }; Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices); I.mutateType(Offset->getType()); I.replaceAllUsesWith(Offset); I.eraseFromParent(); for (Value *V : WorkList) { CallInst *Call = dyn_cast<CallInst>(V); if (!Call) { if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) { Value *Src0 = CI->getOperand(0); Type *EltTy = Src0->getType()->getPointerElementType(); PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS); if (isa<ConstantPointerNull>(CI->getOperand(0))) CI->setOperand(0, ConstantPointerNull::get(NewTy)); if (isa<ConstantPointerNull>(CI->getOperand(1))) CI->setOperand(1, ConstantPointerNull::get(NewTy)); continue; } // The operand's value should be corrected on its own and we don't want to // touch the users. if (isa<AddrSpaceCastInst>(V)) continue; Type *EltTy = V->getType()->getPointerElementType(); PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS); // FIXME: It doesn't really make sense to try to do this for all // instructions. V->mutateType(NewTy); // Adjust the types of any constant operands. if (SelectInst *SI = dyn_cast<SelectInst>(V)) { if (isa<ConstantPointerNull>(SI->getOperand(1))) SI->setOperand(1, ConstantPointerNull::get(NewTy)); if (isa<ConstantPointerNull>(SI->getOperand(2))) SI->setOperand(2, ConstantPointerNull::get(NewTy)); } else if (PHINode *Phi = dyn_cast<PHINode>(V)) { for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) { if (isa<ConstantPointerNull>(Phi->getIncomingValue(I))) Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy)); } } continue; } IntrinsicInst *Intr = cast<IntrinsicInst>(Call); Builder.SetInsertPoint(Intr); switch (Intr->getIntrinsicID()) { case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: // These intrinsics are for address space 0 only Intr->eraseFromParent(); continue; case Intrinsic::memcpy: { MemCpyInst *MemCpy = cast<MemCpyInst>(Intr); Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getDestAlignment(), MemCpy->getRawSource(), MemCpy->getSourceAlignment(), MemCpy->getLength(), MemCpy->isVolatile()); Intr->eraseFromParent(); continue; } case Intrinsic::memmove: { MemMoveInst *MemMove = cast<MemMoveInst>(Intr); Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getDestAlignment(), MemMove->getRawSource(), MemMove->getSourceAlignment(), MemMove->getLength(), MemMove->isVolatile()); Intr->eraseFromParent(); continue; } case Intrinsic::memset: { MemSetInst *MemSet = cast<MemSetInst>(Intr); Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(), MemSet->getLength(), MemSet->getDestAlignment(), MemSet->isVolatile()); Intr->eraseFromParent(); continue; } case Intrinsic::invariant_start: case Intrinsic::invariant_end: case Intrinsic::invariant_group_barrier: Intr->eraseFromParent(); // FIXME: I think the invariant marker should still theoretically apply, // but the intrinsics need to be changed to accept pointers with any // address space. continue; case Intrinsic::objectsize: { Value *Src = Intr->getOperand(0); Type *SrcTy = Src->getType()->getPointerElementType(); Function *ObjectSize = Intrinsic::getDeclaration(Mod, Intrinsic::objectsize, { Intr->getType(), PointerType::get(SrcTy, AS.LOCAL_ADDRESS) } ); CallInst *NewCall = Builder.CreateCall( ObjectSize, {Src, Intr->getOperand(1), Intr->getOperand(2)}); Intr->replaceAllUsesWith(NewCall); Intr->eraseFromParent(); continue; } default: Intr->print(errs()); llvm_unreachable("Don't know how to promote alloca intrinsic use."); } } return true; }
void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { // Array allocations are probably not worth handling, since an allocation of // the array type is the canonical form. if (!I.isStaticAlloca() || I.isArrayAllocation()) return; IRBuilder<> Builder(&I); // First try to replace the alloca with a vector Type *AllocaTy = I.getAllocatedType(); DEBUG(dbgs() << "Trying to promote " << I << '\n'); if (tryPromoteAllocaToVector(&I)) return; DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); const Function &ContainingFunction = *I.getParent()->getParent(); // FIXME: We should also try to get this value from the reqd_work_group_size // function attribute if it is available. unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction); int AllocaSize = WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy); if (AllocaSize > LocalMemAvailable) { DEBUG(dbgs() << " Not enough local memory to promote alloca.\n"); return; } std::vector<Value*> WorkList; if (!collectUsesWithPtrTypes(&I, WorkList)) { DEBUG(dbgs() << " Do not know how to convert all uses\n"); return; } DEBUG(dbgs() << "Promoting alloca to local memory\n"); LocalMemAvailable -= AllocaSize; Function *F = I.getParent()->getParent(); Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize); GlobalVariable *GV = new GlobalVariable( *Mod, GVTy, false, GlobalValue::InternalLinkage, UndefValue::get(GVTy), Twine(F->getName()) + Twine('.') + I.getName(), nullptr, GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); GV->setUnnamedAddr(true); GV->setAlignment(I.getAlignment()); Value *TCntY, *TCntZ; std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder); Value *TIdX = getWorkitemID(Builder, 0); Value *TIdY = getWorkitemID(Builder, 1); Value *TIdZ = getWorkitemID(Builder, 2); Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true); Tmp0 = Builder.CreateMul(Tmp0, TIdX); Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true); Value *TID = Builder.CreateAdd(Tmp0, Tmp1); TID = Builder.CreateAdd(TID, TIdZ); Value *Indices[] = { Constant::getNullValue(Type::getInt32Ty(Mod->getContext())), TID }; Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices); I.mutateType(Offset->getType()); I.replaceAllUsesWith(Offset); I.eraseFromParent(); for (Value *V : WorkList) { CallInst *Call = dyn_cast<CallInst>(V); if (!Call) { Type *EltTy = V->getType()->getPointerElementType(); PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); // The operand's value should be corrected on its own. if (isa<AddrSpaceCastInst>(V)) continue; // FIXME: It doesn't really make sense to try to do this for all // instructions. V->mutateType(NewTy); continue; } IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call); if (!Intr) { // FIXME: What is this for? It doesn't make sense to promote arbitrary // function calls. If the call is to a defined function that can also be // promoted, we should be able to do this once that function is also // rewritten. std::vector<Type*> ArgTypes; for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands(); ArgIdx != ArgEnd; ++ArgIdx) { ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType()); } Function *F = Call->getCalledFunction(); FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes, F->isVarArg()); Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(), NewType, F->getAttributes()); Function *NewF = cast<Function>(C); Call->setCalledFunction(NewF); continue; } Builder.SetInsertPoint(Intr); switch (Intr->getIntrinsicID()) { case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: // These intrinsics are for address space 0 only Intr->eraseFromParent(); continue; case Intrinsic::memcpy: { MemCpyInst *MemCpy = cast<MemCpyInst>(Intr); Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(), MemCpy->getLength(), MemCpy->getAlignment(), MemCpy->isVolatile()); Intr->eraseFromParent(); continue; } case Intrinsic::memmove: { MemMoveInst *MemMove = cast<MemMoveInst>(Intr); Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(), MemMove->getLength(), MemMove->getAlignment(), MemMove->isVolatile()); Intr->eraseFromParent(); continue; } case Intrinsic::memset: { MemSetInst *MemSet = cast<MemSetInst>(Intr); Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(), MemSet->getLength(), MemSet->getAlignment(), MemSet->isVolatile()); Intr->eraseFromParent(); continue; } case Intrinsic::invariant_start: case Intrinsic::invariant_end: case Intrinsic::invariant_group_barrier: Intr->eraseFromParent(); // FIXME: I think the invariant marker should still theoretically apply, // but the intrinsics need to be changed to accept pointers with any // address space. continue; case Intrinsic::objectsize: { Value *Src = Intr->getOperand(0); Type *SrcTy = Src->getType()->getPointerElementType(); Function *ObjectSize = Intrinsic::getDeclaration(Mod, Intrinsic::objectsize, { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) } ); CallInst *NewCall = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) }); Intr->replaceAllUsesWith(NewCall); Intr->eraseFromParent(); continue; } default: Intr->dump(); llvm_unreachable("Don't know how to promote alloca intrinsic use."); } } }
void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { IRBuilder<> Builder(&I); // First try to replace the alloca with a vector Type *AllocaTy = I.getAllocatedType(); DEBUG(dbgs() << "Trying to promote " << I << '\n'); if (tryPromoteAllocaToVector(&I)) return; DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); // FIXME: This is the maximum work group size. We should try to get // value from the reqd_work_group_size function attribute if it is // available. unsigned WorkGroupSize = 256; int AllocaSize = WorkGroupSize * Mod->getDataLayout()->getTypeAllocSize(AllocaTy); if (AllocaSize > LocalMemAvailable) { DEBUG(dbgs() << " Not enough local memory to promote alloca.\n"); return; } std::vector<Value*> WorkList; if (!collectUsesWithPtrTypes(&I, WorkList)) { DEBUG(dbgs() << " Do not know how to convert all uses\n"); return; } DEBUG(dbgs() << "Promoting alloca to local memory\n"); LocalMemAvailable -= AllocaSize; GlobalVariable *GV = new GlobalVariable( *Mod, ArrayType::get(I.getAllocatedType(), 256), false, GlobalValue::ExternalLinkage, 0, I.getName(), 0, GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); FunctionType *FTy = FunctionType::get( Type::getInt32Ty(Mod->getContext()), false); AttributeSet AttrSet; AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone); Value *ReadLocalSizeY = Mod->getOrInsertFunction( "llvm.r600.read.local.size.y", FTy, AttrSet); Value *ReadLocalSizeZ = Mod->getOrInsertFunction( "llvm.r600.read.local.size.z", FTy, AttrSet); Value *ReadTIDIGX = Mod->getOrInsertFunction( "llvm.r600.read.tidig.x", FTy, AttrSet); Value *ReadTIDIGY = Mod->getOrInsertFunction( "llvm.r600.read.tidig.y", FTy, AttrSet); Value *ReadTIDIGZ = Mod->getOrInsertFunction( "llvm.r600.read.tidig.z", FTy, AttrSet); Value *TCntY = Builder.CreateCall(ReadLocalSizeY); Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ); Value *TIdX = Builder.CreateCall(ReadTIDIGX); Value *TIdY = Builder.CreateCall(ReadTIDIGY); Value *TIdZ = Builder.CreateCall(ReadTIDIGZ); Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ); Tmp0 = Builder.CreateMul(Tmp0, TIdX); Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ); Value *TID = Builder.CreateAdd(Tmp0, Tmp1); TID = Builder.CreateAdd(TID, TIdZ); std::vector<Value*> Indices; Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext()))); Indices.push_back(TID); Value *Offset = Builder.CreateGEP(GV, Indices); I.mutateType(Offset->getType()); I.replaceAllUsesWith(Offset); I.eraseFromParent(); for (std::vector<Value*>::iterator i = WorkList.begin(), e = WorkList.end(); i != e; ++i) { Value *V = *i; CallInst *Call = dyn_cast<CallInst>(V); if (!Call) { Type *EltTy = V->getType()->getPointerElementType(); PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); // The operand's value should be corrected on its own. if (isa<AddrSpaceCastInst>(V)) continue; // FIXME: It doesn't really make sense to try to do this for all // instructions. V->mutateType(NewTy); continue; } IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call); if (!Intr) { std::vector<Type*> ArgTypes; for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands(); ArgIdx != ArgEnd; ++ArgIdx) { ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType()); } Function *F = Call->getCalledFunction(); FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes, F->isVarArg()); Constant *C = Mod->getOrInsertFunction(StringRef(F->getName().str() + ".local"), NewType, F->getAttributes()); Function *NewF = cast<Function>(C); Call->setCalledFunction(NewF); continue; } Builder.SetInsertPoint(Intr); switch (Intr->getIntrinsicID()) { case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: // These intrinsics are for address space 0 only Intr->eraseFromParent(); continue; case Intrinsic::memcpy: { MemCpyInst *MemCpy = cast<MemCpyInst>(Intr); Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(), MemCpy->getLength(), MemCpy->getAlignment(), MemCpy->isVolatile()); Intr->eraseFromParent(); continue; } case Intrinsic::memset: { MemSetInst *MemSet = cast<MemSetInst>(Intr); Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(), MemSet->getLength(), MemSet->getAlignment(), MemSet->isVolatile()); Intr->eraseFromParent(); continue; } default: Intr->dump(); llvm_unreachable("Don't know how to promote alloca intrinsic use."); } } }