void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args) { // Emit a call to cudaSetupArgument for each arg in Args. llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn(); llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); CharUnits Offset = CharUnits::Zero(); for (const VarDecl *A : Args) { CharUnits TyWidth, TyAlign; std::tie(TyWidth, TyAlign) = CGM.getContext().getTypeInfoInChars(A->getType()); Offset = Offset.alignTo(TyAlign); llvm::Value *Args[] = { CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), VoidPtrTy), llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()), llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), }; llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero); llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock); CGF.EmitBlock(NextBlock); Offset += TyWidth; } // Emit the call to cudaLaunch llvm::FunctionCallee cudaLaunchFn = getLaunchFn(); llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); CGF.EmitBranch(EndBlock); CGF.EmitBlock(EndBlock); }
/// \brief Emits code for OpenMP 'if' clause using specified \a CodeGen /// function. Here is the logic: /// if (Cond) { /// CodeGen(true); /// } else { /// CodeGen(false); /// } static void EmitOMPIfClause(CodeGenFunction &CGF, const Expr *Cond, const std::function<void(bool)> &CodeGen) { CodeGenFunction::LexicalScope ConditionScope(CGF, Cond->getSourceRange()); // If the condition constant folds and can be elided, try to avoid emitting // the condition and the dead arm of the if/else. bool CondConstant; if (CGF.ConstantFoldsToSimpleInteger(Cond, CondConstant)) { CodeGen(CondConstant); return; } // Otherwise, the condition did not fold, or we couldn't elide it. Just // emit the conditional branch. auto ThenBlock = CGF.createBasicBlock(/*name*/ "omp_if.then"); auto ElseBlock = CGF.createBasicBlock(/*name*/ "omp_if.else"); auto ContBlock = CGF.createBasicBlock(/*name*/ "omp_if.end"); CGF.EmitBranchOnBoolExpr(Cond, ThenBlock, ElseBlock, /*TrueCount*/ 0); // Emit the 'then' code. CGF.EmitBlock(ThenBlock); CodeGen(/*ThenBlock*/ true); CGF.EmitBranch(ContBlock); // Emit the 'else' code if present. { // There is no need to emit line number for unconditional branch. SuppressDebugLocation SDL(CGF.Builder); CGF.EmitBlock(ElseBlock); } CodeGen(/*ThenBlock*/ false); { // There is no need to emit line number for unconditional branch. SuppressDebugLocation SDL(CGF.Builder); CGF.EmitBranch(ContBlock); } // Emit the continuation block for code after the if. CGF.EmitBlock(ContBlock, /*IsFinished*/ true); }
static void EmitOMPIfStmt(CodeGenFunction &CGF, llvm::Value *IfCond, const std::function<void()> &BodyOpGen) { llvm::Value *CallBool = CGF.EmitScalarConversion( IfCond, CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true), CGF.getContext().BoolTy); auto *ThenBlock = CGF.createBasicBlock("omp_if.then"); auto *ContBlock = CGF.createBasicBlock("omp_if.end"); // Generate the branch (If-stmt) CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock); CGF.EmitBlock(ThenBlock); BodyOpGen(); // Emit the rest of bblocks/branches CGF.EmitBranch(ContBlock); CGF.EmitBlock(ContBlock, true); }
void CGNVCUDARuntime::EmitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args) { // Build the argument value list and the argument stack struct type. SmallVector<llvm::Value *, 16> ArgValues; std::vector<llvm::Type *> ArgTypes; for (FunctionArgList::const_iterator I = Args.begin(), E = Args.end(); I != E; ++I) { llvm::Value *V = CGF.GetAddrOfLocalVar(*I); ArgValues.push_back(V); assert(isa<llvm::PointerType>(V->getType()) && "Arg type not PointerType"); ArgTypes.push_back(cast<llvm::PointerType>(V->getType())->getElementType()); } llvm::StructType *ArgStackTy = llvm::StructType::get( CGF.getLLVMContext(), ArgTypes); llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); // Emit the calls to cudaSetupArgument llvm::Constant *cudaSetupArgFn = getSetupArgumentFn(); for (unsigned I = 0, E = Args.size(); I != E; ++I) { llvm::Value *Args[3]; llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); Args[0] = CGF.Builder.CreatePointerCast(ArgValues[I], VoidPtrTy); Args[1] = CGF.Builder.CreateIntCast( llvm::ConstantExpr::getSizeOf(ArgTypes[I]), SizeTy, false); Args[2] = CGF.Builder.CreateIntCast( llvm::ConstantExpr::getOffsetOf(ArgStackTy, I), SizeTy, false); llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); llvm::Value *CSZero = CGF.Builder.CreateICmpEQ(CS.getInstruction(), Zero); CGF.Builder.CreateCondBr(CSZero, NextBlock, EndBlock); CGF.EmitBlock(NextBlock); } // Emit the call to cudaLaunch llvm::Constant *cudaLaunchFn = getLaunchFn(); llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); CGF.EmitBranch(EndBlock); CGF.EmitBlock(EndBlock); }
// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local // array and kernels are launched using cudaLaunchKernel(). void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args) { // Build the shadow stack entry at the very start of the function. // Calculate amount of space we will need for all arguments. If we have no // args, allocate a single pointer so we still have a valid pointer to the // argument array that we can pass to runtime, even if it will be unused. Address KernelArgs = CGF.CreateTempAlloca( VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args", llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size()))); // Store pointers to the arguments in a locally allocated launch_args. for (unsigned i = 0; i < Args.size(); ++i) { llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer(); llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy); CGF.Builder.CreateDefaultAlignedStore( VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i)); } llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); // Lookup cudaLaunchKernel function. // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, // void **args, size_t sharedMem, // cudaStream_t stream); TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl(); DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); IdentifierInfo &cudaLaunchKernelII = CGM.getContext().Idents.get("cudaLaunchKernel"); FunctionDecl *cudaLaunchKernelFD = nullptr; for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) { if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result)) cudaLaunchKernelFD = FD; } if (cudaLaunchKernelFD == nullptr) { CGM.Error(CGF.CurFuncDecl->getLocation(), "Can't find declaration for cudaLaunchKernel()"); return; } // Create temporary dim3 grid_dim, block_dim. ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1); QualType Dim3Ty = GridDimParam->getType(); Address GridDim = CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim"); Address BlockDim = CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim"); Address ShmemSize = CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size"); Address Stream = CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream"); llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction( llvm::FunctionType::get(IntTy, {/*gridDim=*/GridDim.getType(), /*blockDim=*/BlockDim.getType(), /*ShmemSize=*/ShmemSize.getType(), /*Stream=*/Stream.getType()}, /*isVarArg=*/false), "__cudaPopCallConfiguration"); CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, {GridDim.getPointer(), BlockDim.getPointer(), ShmemSize.getPointer(), Stream.getPointer()}); // Emit the call to cudaLaunch llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy); CallArgList LaunchKernelArgs; LaunchKernelArgs.add(RValue::get(Kernel), cudaLaunchKernelFD->getParamDecl(0)->getType()); LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty); LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty); LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()), cudaLaunchKernelFD->getParamDecl(3)->getType()); LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)), cudaLaunchKernelFD->getParamDecl(4)->getType()); LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)), cudaLaunchKernelFD->getParamDecl(5)->getType()); QualType QT = cudaLaunchKernelFD->getType(); QualType CQT = QT.getCanonicalType(); llvm::Type *Ty = CGM.getTypes().ConvertType(CQT); llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty); const CGFunctionInfo &FI = CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD); llvm::FunctionCallee cudaLaunchKernelFn = CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel"); CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(), LaunchKernelArgs); CGF.EmitBranch(EndBlock); CGF.EmitBlock(EndBlock); }