void Lint::visitCallSite(CallSite CS) { Instruction &I = *CS.getInstruction(); Value *Callee = CS.getCalledValue(); visitMemoryReference(I, Callee, MemoryLocation::UnknownSize, 0, nullptr, MemRef::Callee); if (Function *F = dyn_cast<Function>(findValue(Callee, /*OffsetOk=*/false))) { Assert(CS.getCallingConv() == F->getCallingConv(), "Undefined behavior: Caller and callee calling convention differ", &I); FunctionType *FT = F->getFunctionType(); unsigned NumActualArgs = CS.arg_size(); Assert(FT->isVarArg() ? FT->getNumParams() <= NumActualArgs : FT->getNumParams() == NumActualArgs, "Undefined behavior: Call argument count mismatches callee " "argument count", &I); Assert(FT->getReturnType() == I.getType(), "Undefined behavior: Call return type mismatches " "callee return type", &I); // Check argument types (in case the callee was casted) and attributes. // TODO: Verify that caller and callee attributes are compatible. Function::arg_iterator PI = F->arg_begin(), PE = F->arg_end(); CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end(); for (; AI != AE; ++AI) { Value *Actual = *AI; if (PI != PE) { Argument *Formal = &*PI++; Assert(Formal->getType() == Actual->getType(), "Undefined behavior: Call argument type mismatches " "callee parameter type", &I); // Check that noalias arguments don't alias other arguments. This is // not fully precise because we don't know the sizes of the dereferenced // memory regions. if (Formal->hasNoAliasAttr() && Actual->getType()->isPointerTy()) for (CallSite::arg_iterator BI = CS.arg_begin(); BI != AE; ++BI) if (AI != BI && (*BI)->getType()->isPointerTy()) { AliasResult Result = AA->alias(*AI, *BI); Assert(Result != MustAlias && Result != PartialAlias, "Unusual: noalias argument aliases another argument", &I); } // Check that an sret argument points to valid memory. if (Formal->hasStructRetAttr() && Actual->getType()->isPointerTy()) { Type *Ty = cast<PointerType>(Formal->getType())->getElementType(); visitMemoryReference(I, Actual, DL->getTypeStoreSize(Ty), DL->getABITypeAlignment(Ty), Ty, MemRef::Read | MemRef::Write); } } } } if (CS.isCall() && cast<CallInst>(CS.getInstruction())->isTailCall()) for (CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end(); AI != AE; ++AI) { Value *Obj = findValue(*AI, /*OffsetOk=*/true); Assert(!isa<AllocaInst>(Obj), "Undefined behavior: Call with \"tail\" keyword references " "alloca", &I); } if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) switch (II->getIntrinsicID()) { default: break; // TODO: Check more intrinsics case Intrinsic::memcpy: { MemCpyInst *MCI = cast<MemCpyInst>(&I); // TODO: If the size is known, use it. visitMemoryReference(I, MCI->getDest(), MemoryLocation::UnknownSize, MCI->getAlignment(), nullptr, MemRef::Write); visitMemoryReference(I, MCI->getSource(), MemoryLocation::UnknownSize, MCI->getAlignment(), nullptr, MemRef::Read); // Check that the memcpy arguments don't overlap. The AliasAnalysis API // isn't expressive enough for what we really want to do. Known partial // overlap is not distinguished from the case where nothing is known. uint64_t Size = 0; if (const ConstantInt *Len = dyn_cast<ConstantInt>(findValue(MCI->getLength(), /*OffsetOk=*/false))) if (Len->getValue().isIntN(32)) Size = Len->getValue().getZExtValue(); Assert(AA->alias(MCI->getSource(), Size, MCI->getDest(), Size) != MustAlias, "Undefined behavior: memcpy source and destination overlap", &I); break; } case Intrinsic::memmove: { MemMoveInst *MMI = cast<MemMoveInst>(&I); // TODO: If the size is known, use it. visitMemoryReference(I, MMI->getDest(), MemoryLocation::UnknownSize, MMI->getAlignment(), nullptr, MemRef::Write); visitMemoryReference(I, MMI->getSource(), MemoryLocation::UnknownSize, MMI->getAlignment(), nullptr, MemRef::Read); break; } case Intrinsic::memset: { MemSetInst *MSI = cast<MemSetInst>(&I); // TODO: If the size is known, use it. visitMemoryReference(I, MSI->getDest(), MemoryLocation::UnknownSize, MSI->getAlignment(), nullptr, MemRef::Write); break; } case Intrinsic::vastart: Assert(I.getParent()->getParent()->isVarArg(), "Undefined behavior: va_start called in a non-varargs function", &I); visitMemoryReference(I, CS.getArgument(0), MemoryLocation::UnknownSize, 0, nullptr, MemRef::Read | MemRef::Write); break; case Intrinsic::vacopy: visitMemoryReference(I, CS.getArgument(0), MemoryLocation::UnknownSize, 0, nullptr, MemRef::Write); visitMemoryReference(I, CS.getArgument(1), MemoryLocation::UnknownSize, 0, nullptr, MemRef::Read); break; case Intrinsic::vaend: visitMemoryReference(I, CS.getArgument(0), MemoryLocation::UnknownSize, 0, nullptr, MemRef::Read | MemRef::Write); break; case Intrinsic::stackrestore: // Stackrestore doesn't read or write memory, but it sets the // stack pointer, which the compiler may read from or write to // at any time, so check it for both readability and writeability. visitMemoryReference(I, CS.getArgument(0), MemoryLocation::UnknownSize, 0, nullptr, MemRef::Read | MemRef::Write); break; } }
// FIXME: Should try to pick the most likely to be profitable allocas first. bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { // Array allocations are probably not worth handling, since an allocation of // the array type is the canonical form. if (!I.isStaticAlloca() || I.isArrayAllocation()) return false; IRBuilder<> Builder(&I); // First try to replace the alloca with a vector Type *AllocaTy = I.getAllocatedType(); DEBUG(dbgs() << "Trying to promote " << I << '\n'); if (tryPromoteAllocaToVector(&I, AS)) return true; // Promoted to vector. const Function &ContainingFunction = *I.getParent()->getParent(); CallingConv::ID CC = ContainingFunction.getCallingConv(); // Don't promote the alloca to LDS for shader calling conventions as the work // item ID intrinsics are not supported for these calling conventions. // Furthermore not all LDS is available for some of the stages. switch (CC) { case CallingConv::AMDGPU_KERNEL: case CallingConv::SPIR_KERNEL: break; default: DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n"); return false; } // Not likely to have sufficient local memory for promotion. if (!SufficientLDS) return false; const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction); unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; const DataLayout &DL = Mod->getDataLayout(); unsigned Align = I.getAlignment(); if (Align == 0) Align = DL.getABITypeAlignment(I.getAllocatedType()); // FIXME: This computed padding is likely wrong since it depends on inverse // usage order. // // FIXME: It is also possible that if we're allowed to use all of the memory // could could end up using more than the maximum due to alignment padding. uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align); uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy); NewSize += AllocSize; if (NewSize > LocalMemLimit) { DEBUG(dbgs() << " " << AllocSize << " bytes of local memory not available to promote\n"); return false; } CurrentLocalMemUsage = NewSize; std::vector<Value*> WorkList; if (!collectUsesWithPtrTypes(&I, &I, WorkList)) { DEBUG(dbgs() << " Do not know how to convert all uses\n"); return false; } DEBUG(dbgs() << "Promoting alloca to local memory\n"); Function *F = I.getParent()->getParent(); Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize); GlobalVariable *GV = new GlobalVariable( *Mod, GVTy, false, GlobalValue::InternalLinkage, UndefValue::get(GVTy), Twine(F->getName()) + Twine('.') + I.getName(), nullptr, GlobalVariable::NotThreadLocal, AS.LOCAL_ADDRESS); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); GV->setAlignment(I.getAlignment()); Value *TCntY, *TCntZ; std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder); Value *TIdX = getWorkitemID(Builder, 0); Value *TIdY = getWorkitemID(Builder, 1); Value *TIdZ = getWorkitemID(Builder, 2); Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true); Tmp0 = Builder.CreateMul(Tmp0, TIdX); Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true); Value *TID = Builder.CreateAdd(Tmp0, Tmp1); TID = Builder.CreateAdd(TID, TIdZ); Value *Indices[] = { Constant::getNullValue(Type::getInt32Ty(Mod->getContext())), TID }; Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices); I.mutateType(Offset->getType()); I.replaceAllUsesWith(Offset); I.eraseFromParent(); for (Value *V : WorkList) { CallInst *Call = dyn_cast<CallInst>(V); if (!Call) { if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) { Value *Src0 = CI->getOperand(0); Type *EltTy = Src0->getType()->getPointerElementType(); PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS); if (isa<ConstantPointerNull>(CI->getOperand(0))) CI->setOperand(0, ConstantPointerNull::get(NewTy)); if (isa<ConstantPointerNull>(CI->getOperand(1))) CI->setOperand(1, ConstantPointerNull::get(NewTy)); continue; } // The operand's value should be corrected on its own and we don't want to // touch the users. if (isa<AddrSpaceCastInst>(V)) continue; Type *EltTy = V->getType()->getPointerElementType(); PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS); // FIXME: It doesn't really make sense to try to do this for all // instructions. V->mutateType(NewTy); // Adjust the types of any constant operands. if (SelectInst *SI = dyn_cast<SelectInst>(V)) { if (isa<ConstantPointerNull>(SI->getOperand(1))) SI->setOperand(1, ConstantPointerNull::get(NewTy)); if (isa<ConstantPointerNull>(SI->getOperand(2))) SI->setOperand(2, ConstantPointerNull::get(NewTy)); } else if (PHINode *Phi = dyn_cast<PHINode>(V)) { for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) { if (isa<ConstantPointerNull>(Phi->getIncomingValue(I))) Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy)); } } continue; } IntrinsicInst *Intr = cast<IntrinsicInst>(Call); Builder.SetInsertPoint(Intr); switch (Intr->getIntrinsicID()) { case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: // These intrinsics are for address space 0 only Intr->eraseFromParent(); continue; case Intrinsic::memcpy: { MemCpyInst *MemCpy = cast<MemCpyInst>(Intr); Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getDestAlignment(), MemCpy->getRawSource(), MemCpy->getSourceAlignment(), MemCpy->getLength(), MemCpy->isVolatile()); Intr->eraseFromParent(); continue; } case Intrinsic::memmove: { MemMoveInst *MemMove = cast<MemMoveInst>(Intr); Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getDestAlignment(), MemMove->getRawSource(), MemMove->getSourceAlignment(), MemMove->getLength(), MemMove->isVolatile()); Intr->eraseFromParent(); continue; } case Intrinsic::memset: { MemSetInst *MemSet = cast<MemSetInst>(Intr); Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(), MemSet->getLength(), MemSet->getDestAlignment(), MemSet->isVolatile()); Intr->eraseFromParent(); continue; } case Intrinsic::invariant_start: case Intrinsic::invariant_end: case Intrinsic::invariant_group_barrier: Intr->eraseFromParent(); // FIXME: I think the invariant marker should still theoretically apply, // but the intrinsics need to be changed to accept pointers with any // address space. continue; case Intrinsic::objectsize: { Value *Src = Intr->getOperand(0); Type *SrcTy = Src->getType()->getPointerElementType(); Function *ObjectSize = Intrinsic::getDeclaration(Mod, Intrinsic::objectsize, { Intr->getType(), PointerType::get(SrcTy, AS.LOCAL_ADDRESS) } ); CallInst *NewCall = Builder.CreateCall( ObjectSize, {Src, Intr->getOperand(1), Intr->getOperand(2)}); Intr->replaceAllUsesWith(NewCall); Intr->eraseFromParent(); continue; } default: Intr->print(errs()); llvm_unreachable("Don't know how to promote alloca intrinsic use."); } } return true; }
void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { // Array allocations are probably not worth handling, since an allocation of // the array type is the canonical form. if (!I.isStaticAlloca() || I.isArrayAllocation()) return; IRBuilder<> Builder(&I); // First try to replace the alloca with a vector Type *AllocaTy = I.getAllocatedType(); DEBUG(dbgs() << "Trying to promote " << I << '\n'); if (tryPromoteAllocaToVector(&I)) return; DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); const Function &ContainingFunction = *I.getParent()->getParent(); // FIXME: We should also try to get this value from the reqd_work_group_size // function attribute if it is available. unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction); int AllocaSize = WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy); if (AllocaSize > LocalMemAvailable) { DEBUG(dbgs() << " Not enough local memory to promote alloca.\n"); return; } std::vector<Value*> WorkList; if (!collectUsesWithPtrTypes(&I, WorkList)) { DEBUG(dbgs() << " Do not know how to convert all uses\n"); return; } DEBUG(dbgs() << "Promoting alloca to local memory\n"); LocalMemAvailable -= AllocaSize; Function *F = I.getParent()->getParent(); Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize); GlobalVariable *GV = new GlobalVariable( *Mod, GVTy, false, GlobalValue::InternalLinkage, UndefValue::get(GVTy), Twine(F->getName()) + Twine('.') + I.getName(), nullptr, GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); GV->setUnnamedAddr(true); GV->setAlignment(I.getAlignment()); Value *TCntY, *TCntZ; std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder); Value *TIdX = getWorkitemID(Builder, 0); Value *TIdY = getWorkitemID(Builder, 1); Value *TIdZ = getWorkitemID(Builder, 2); Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true); Tmp0 = Builder.CreateMul(Tmp0, TIdX); Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true); Value *TID = Builder.CreateAdd(Tmp0, Tmp1); TID = Builder.CreateAdd(TID, TIdZ); Value *Indices[] = { Constant::getNullValue(Type::getInt32Ty(Mod->getContext())), TID }; Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices); I.mutateType(Offset->getType()); I.replaceAllUsesWith(Offset); I.eraseFromParent(); for (Value *V : WorkList) { CallInst *Call = dyn_cast<CallInst>(V); if (!Call) { Type *EltTy = V->getType()->getPointerElementType(); PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); // The operand's value should be corrected on its own. if (isa<AddrSpaceCastInst>(V)) continue; // FIXME: It doesn't really make sense to try to do this for all // instructions. V->mutateType(NewTy); continue; } IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call); if (!Intr) { // FIXME: What is this for? It doesn't make sense to promote arbitrary // function calls. If the call is to a defined function that can also be // promoted, we should be able to do this once that function is also // rewritten. std::vector<Type*> ArgTypes; for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands(); ArgIdx != ArgEnd; ++ArgIdx) { ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType()); } Function *F = Call->getCalledFunction(); FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes, F->isVarArg()); Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(), NewType, F->getAttributes()); Function *NewF = cast<Function>(C); Call->setCalledFunction(NewF); continue; } Builder.SetInsertPoint(Intr); switch (Intr->getIntrinsicID()) { case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: // These intrinsics are for address space 0 only Intr->eraseFromParent(); continue; case Intrinsic::memcpy: { MemCpyInst *MemCpy = cast<MemCpyInst>(Intr); Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(), MemCpy->getLength(), MemCpy->getAlignment(), MemCpy->isVolatile()); Intr->eraseFromParent(); continue; } case Intrinsic::memmove: { MemMoveInst *MemMove = cast<MemMoveInst>(Intr); Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(), MemMove->getLength(), MemMove->getAlignment(), MemMove->isVolatile()); Intr->eraseFromParent(); continue; } case Intrinsic::memset: { MemSetInst *MemSet = cast<MemSetInst>(Intr); Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(), MemSet->getLength(), MemSet->getAlignment(), MemSet->isVolatile()); Intr->eraseFromParent(); continue; } case Intrinsic::invariant_start: case Intrinsic::invariant_end: case Intrinsic::invariant_group_barrier: Intr->eraseFromParent(); // FIXME: I think the invariant marker should still theoretically apply, // but the intrinsics need to be changed to accept pointers with any // address space. continue; case Intrinsic::objectsize: { Value *Src = Intr->getOperand(0); Type *SrcTy = Src->getType()->getPointerElementType(); Function *ObjectSize = Intrinsic::getDeclaration(Mod, Intrinsic::objectsize, { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) } ); CallInst *NewCall = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) }); Intr->replaceAllUsesWith(NewCall); Intr->eraseFromParent(); continue; } default: Intr->dump(); llvm_unreachable("Don't know how to promote alloca intrinsic use."); } } }
void Lint::visitCallSite(CallSite CS) { Instruction &I = *CS.getInstruction(); Value *Callee = CS.getCalledValue(); // TODO: Check function alignment? visitMemoryReference(I, Callee, 0, 0); if (Function *F = dyn_cast<Function>(Callee->stripPointerCasts())) { Assert1(CS.getCallingConv() == F->getCallingConv(), "Undefined behavior: Caller and callee calling convention differ", &I); const FunctionType *FT = F->getFunctionType(); unsigned NumActualArgs = unsigned(CS.arg_end()-CS.arg_begin()); Assert1(FT->isVarArg() ? FT->getNumParams() <= NumActualArgs : FT->getNumParams() == NumActualArgs, "Undefined behavior: Call argument count mismatches callee " "argument count", &I); // TODO: Check argument types (in case the callee was casted) // TODO: Check ABI-significant attributes. // TODO: Check noalias attribute. // TODO: Check sret attribute. } // TODO: Check the "tail" keyword constraints. if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) switch (II->getIntrinsicID()) { default: break; // TODO: Check more intrinsics case Intrinsic::memcpy: { MemCpyInst *MCI = cast<MemCpyInst>(&I); visitMemoryReference(I, MCI->getSource(), MCI->getAlignment(), 0); visitMemoryReference(I, MCI->getDest(), MCI->getAlignment(), 0); // Check that the memcpy arguments don't overlap. The AliasAnalysis API // isn't expressive enough for what we really want to do. Known partial // overlap is not distinguished from the case where nothing is known. unsigned Size = 0; if (const ConstantInt *Len = dyn_cast<ConstantInt>(MCI->getLength()->stripPointerCasts())) if (Len->getValue().isIntN(32)) Size = Len->getValue().getZExtValue(); Assert1(AA->alias(MCI->getSource(), Size, MCI->getDest(), Size) != AliasAnalysis::MustAlias, "Undefined behavior: memcpy source and destination overlap", &I); break; } case Intrinsic::memmove: { MemMoveInst *MMI = cast<MemMoveInst>(&I); visitMemoryReference(I, MMI->getSource(), MMI->getAlignment(), 0); visitMemoryReference(I, MMI->getDest(), MMI->getAlignment(), 0); break; } case Intrinsic::memset: { MemSetInst *MSI = cast<MemSetInst>(&I); visitMemoryReference(I, MSI->getDest(), MSI->getAlignment(), 0); break; } case Intrinsic::vastart: Assert1(I.getParent()->getParent()->isVarArg(), "Undefined behavior: va_start called in a non-varargs function", &I); visitMemoryReference(I, CS.getArgument(0), 0, 0); break; case Intrinsic::vacopy: visitMemoryReference(I, CS.getArgument(0), 0, 0); visitMemoryReference(I, CS.getArgument(1), 0, 0); break; case Intrinsic::vaend: visitMemoryReference(I, CS.getArgument(0), 0, 0); break; case Intrinsic::stackrestore: visitMemoryReference(I, CS.getArgument(0), 0, 0); break; } }