void CGOpenMPRuntime::EmitOMPSerialCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, llvm::Value *CapturedStruct) { auto ThreadID = GetOpenMPThreadID(CGF, Loc); // Build calls: // __kmpc_serialized_parallel(&Loc, GTid); llvm::Value *SerArgs[] = {EmitOpenMPUpdateLocation(CGF, Loc), ThreadID}; auto RTLFn = CreateRuntimeFunction(CGOpenMPRuntime::OMPRTL__kmpc_serialized_parallel); CGF.EmitRuntimeCall(RTLFn, SerArgs); // OutlinedFn(>id, &zero, CapturedStruct); auto ThreadIDAddr = EmitThreadIDAddress(CGF, Loc); auto Int32Ty = CGF.getContext().getIntTypeForBitwidth(/*DestWidth*/ 32, /*Signed*/ true); auto ZeroAddr = CGF.CreateMemTemp(Int32Ty, /*Name*/ ".zero.addr"); CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); llvm::Value *OutlinedFnArgs[] = {ThreadIDAddr, ZeroAddr, CapturedStruct}; CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs); // __kmpc_end_serialized_parallel(&Loc, GTid); llvm::Value *EndSerArgs[] = {EmitOpenMPUpdateLocation(CGF, Loc), ThreadID}; RTLFn = CreateRuntimeFunction( CGOpenMPRuntime::OMPRTL__kmpc_end_serialized_parallel); CGF.EmitRuntimeCall(RTLFn, EndSerArgs); }
// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local // array and kernels are launched using cudaLaunchKernel(). void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args) { // Build the shadow stack entry at the very start of the function. // Calculate amount of space we will need for all arguments. If we have no // args, allocate a single pointer so we still have a valid pointer to the // argument array that we can pass to runtime, even if it will be unused. Address KernelArgs = CGF.CreateTempAlloca( VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args", llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size()))); // Store pointers to the arguments in a locally allocated launch_args. for (unsigned i = 0; i < Args.size(); ++i) { llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer(); llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy); CGF.Builder.CreateDefaultAlignedStore( VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i)); } llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); // Lookup cudaLaunchKernel function. // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, // void **args, size_t sharedMem, // cudaStream_t stream); TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl(); DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); IdentifierInfo &cudaLaunchKernelII = CGM.getContext().Idents.get("cudaLaunchKernel"); FunctionDecl *cudaLaunchKernelFD = nullptr; for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) { if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result)) cudaLaunchKernelFD = FD; } if (cudaLaunchKernelFD == nullptr) { CGM.Error(CGF.CurFuncDecl->getLocation(), "Can't find declaration for cudaLaunchKernel()"); return; } // Create temporary dim3 grid_dim, block_dim. ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1); QualType Dim3Ty = GridDimParam->getType(); Address GridDim = CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim"); Address BlockDim = CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim"); Address ShmemSize = CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size"); Address Stream = CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream"); llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction( llvm::FunctionType::get(IntTy, {/*gridDim=*/GridDim.getType(), /*blockDim=*/BlockDim.getType(), /*ShmemSize=*/ShmemSize.getType(), /*Stream=*/Stream.getType()}, /*isVarArg=*/false), "__cudaPopCallConfiguration"); CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, {GridDim.getPointer(), BlockDim.getPointer(), ShmemSize.getPointer(), Stream.getPointer()}); // Emit the call to cudaLaunch llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy); CallArgList LaunchKernelArgs; LaunchKernelArgs.add(RValue::get(Kernel), cudaLaunchKernelFD->getParamDecl(0)->getType()); LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty); LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty); LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()), cudaLaunchKernelFD->getParamDecl(3)->getType()); LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)), cudaLaunchKernelFD->getParamDecl(4)->getType()); LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)), cudaLaunchKernelFD->getParamDecl(5)->getType()); QualType QT = cudaLaunchKernelFD->getType(); QualType CQT = QT.getCanonicalType(); llvm::Type *Ty = CGM.getTypes().ConvertType(CQT); llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty); const CGFunctionInfo &FI = CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD); llvm::FunctionCallee cudaLaunchKernelFn = CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel"); CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(), LaunchKernelArgs); CGF.EmitBranch(EndBlock); CGF.EmitBlock(EndBlock); }