// Emulates a triple-chevron kernel launch, i.e.
//   kernel<<<grid_dim, block_dim, shared_memory_size, stream>>>(args...)
// in a form callable from both host and device code.
//
// Parameters:
//   kernel             - pointer to the __global__ function to launch
//   grid_dim/block_dim - launch configuration
//   shared_memory_size - dynamic shared memory per block, in bytes
//   stream             - stream to enqueue the launch on
//   args...            - kernel arguments, forwarded by const reference
//
// Returns the cudaError_t reported by the underlying launch API, or
// cudaErrorNotSupported when no CUDART is available or when the device-side
// path is asked to launch with more than one kernel argument.
__host__ __device__
cudaError_t triple_chevrons(void* kernel, ::dim3 grid_dim, ::dim3 block_dim, int shared_memory_size, cudaStream_t stream, const Args&... args)
{
  // reference the kernel to encourage the compiler not to optimize it away
  workaround_unused_variable_warning(kernel);

#if __cuda_lib_has_cudart
# ifndef __CUDA_ARCH__
  // Host path: legacy CUDA execution-control API. Order matters here —
  // the launch configuration must be staged first, then each argument is
  // appended to the staged configuration, and finally the launch is issued.
  cudaConfigureCall(grid_dim, block_dim, shared_memory_size, stream);
  setup_kernel_arguments(0, args...);
  return cudaLaunch(kernel);
# else
  // Device path (dynamic parallelism): arguments are copied into a
  // runtime-provided parameter buffer and the kernel is launched with it.
  // XXX generalize to multiple arguments
  if(sizeof...(Args) != 1)
  {
    // The parameter-buffer code below only marshals a single argument.
    return cudaErrorNotSupported;
  }

  using Arg = typename first_type<Args...>::type;

  // Obtain a parameter buffer sized/aligned for the single argument and
  // copy the argument's bytes into it.
  // NOTE(review): cudaGetParameterBuffer can return NULL on failure and the
  // result is not checked before the memcpy — confirm this is acceptable.
  void *param_buffer = cudaGetParameterBuffer(std::alignment_of<Arg>::value, sizeof(Arg));
  std::memcpy(param_buffer, &first_parameter(args...), sizeof(Arg));
  return cudaLaunchDevice(kernel, param_buffer, grid_dim, block_dim, shared_memory_size, stream);
# endif // __CUDA_ARCH__
#else // __cuda_lib_has_cudart
  // No CUDA runtime available in this compilation environment.
  return cudaErrorNotSupported;
#endif
}
// Recursively stages kernel arguments for a previously-configured launch
// using the legacy cudaSetupArgument API: the first argument is appended at
// an aligned byte offset within the argument buffer, then the remaining
// arguments are staged after it.
//
// Parameters:
//   offset  - current byte offset into the staged argument buffer
//   arg1    - the next argument to stage
//   args... - the remaining arguments, staged by the recursive call
//
// NOTE(review): recursion terminates via a base-case overload (presumably
// setup_kernel_arguments(size_t) with no arguments) defined elsewhere in
// this file — confirm it exists.
void setup_kernel_arguments(size_t offset, const Arg1& arg1, const Args&... args)
{
  // Round the offset up for arg1 — align_up<Arg1> presumably aligns to
  // Arg1's alignment requirement, as cudaSetupArgument demands.
  offset = align_up<Arg1>(offset);
  cudaSetupArgument(arg1, offset);
  // Advance past arg1's storage and stage the rest of the pack.
  setup_kernel_arguments(offset + sizeof(Arg1), args...);
}