template<class... Args>
__host__ __device__
cudaError_t triple_chevrons(void* kernel, ::dim3 grid_dim, ::dim3 block_dim, int shared_memory_size, cudaStream_t stream, const Args&... args)
{
  // reference the kernel to encourage the compiler not to optimize it away
  workaround_unused_variable_warning(kernel);

#if __cuda_lib_has_cudart
#  ifndef __CUDA_ARCH__
  // host path: configure the launch with the legacy runtime API,
  // marshal each argument into the configuration, then launch
  cudaConfigureCall(grid_dim, block_dim, shared_memory_size, stream);
  setup_kernel_arguments(0, args...);
  return cudaLaunch(kernel);
#  else
  // device path: launch through the device runtime (dynamic parallelism)
  // XXX generalize to multiple arguments
  if(sizeof...(Args) != 1)
  {
    return cudaErrorNotSupported;
  }

  using Arg = typename first_type<Args...>::type;
  void* param_buffer = cudaGetParameterBuffer(std::alignment_of<Arg>::value, sizeof(Arg));
  std::memcpy(param_buffer, &first_parameter(args...), sizeof(Arg));

  return cudaLaunchDevice(kernel, param_buffer, grid_dim, block_dim, shared_memory_size, stream);
#  endif // __CUDA_ARCH__
#else // __cuda_lib_has_cudart
  return cudaErrorNotSupported;
#endif
}
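`triple_chevrons` leans on several small helpers that are not shown above: `workaround_unused_variable_warning`, `first_type`, `first_parameter`, and `setup_kernel_arguments`. The following is a minimal sketch of what they might look like; the names match the calls above, but these definitions are assumptions for illustration, not the library's actual implementations (in particular, the real `setup_kernel_arguments` would also round each offset up to the argument's alignment).

#include <cstddef>

// Sketch: swallow a value so the compiler considers it "used".
template<class T>
__host__ __device__
void workaround_unused_variable_warning(T&&) noexcept {}

// Sketch: first_type<Ts...>::type names the first type in a parameter pack.
template<class T, class... Ts>
struct first_type
{
  using type = T;
};

// Sketch: return a reference to the first argument of a pack.
template<class Arg, class... Args>
__host__ __device__
const Arg& first_parameter(const Arg& arg, const Args&...)
{
  return arg;
}

// Sketch: host-side recursion over the pack, pushing each argument into
// the pending launch configuration at its byte offset. A production
// version would align `offset` to alignof(Arg) before each call.
inline void setup_kernel_arguments(std::size_t) {}

template<class Arg, class... Args>
void setup_kernel_arguments(std::size_t offset, const Arg& arg, const Args&... args)
{
  cudaSetupArgument(arg, offset);
  setup_kernel_arguments(offset + sizeof(Arg), args...);
}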
__host__ __device__
static void supported_path(unsigned int num_blocks, unsigned int block_size, size_t num_dynamic_smem_bytes, cudaStream_t stream, task_type task)
{
#if __BULK_HAS_CUDART__
#  ifndef __CUDA_ARCH__
  // host path: configure, marshal the single task argument, and launch
  cudaConfigureCall(dim3(num_blocks), dim3(block_size), num_dynamic_smem_bytes, stream);
  cudaSetupArgument(task, 0);
  bulk::detail::throw_on_error(cudaLaunch(super_t::global_function_pointer()),
                               "after cudaLaunch in triple_chevron_launcher::launch()");
#  else
  // device path: copy the task into a parameter buffer and launch
  // through the device runtime
  void* param_buffer = cudaGetParameterBuffer(alignment_of<task_type>::value, sizeof(task_type));
  std::memcpy(param_buffer, &task, sizeof(task_type));

  bulk::detail::throw_on_error(cudaLaunchDevice(reinterpret_cast<void*>(super_t::global_function_pointer()),
                                                param_buffer,
                                                dim3(num_blocks),
                                                dim3(block_size),
                                                num_dynamic_smem_bytes,
                                                stream),
                               "after cudaLaunchDevice in triple_chevron_launcher::launch()");
#  endif // __CUDA_ARCH__
#endif // __BULK_HAS_CUDART__
}
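Unlike `triple_chevrons`, which returns a `cudaError_t`, `supported_path` reports failures through `bulk::detail::throw_on_error`, whose definition is not shown. Below is a minimal sketch of the idea, assuming the usual shape of such a helper; Bulk's actual implementation differs in detail. The key constraint it illustrates: exceptions cannot propagate from device code, so the device branch can only emit a diagnostic and trap, while the host branch can throw.

#include <cstdio>
#include <stdexcept>
#include <string>

// Sketch only: convert a non-success cudaError_t into an exception on
// the host, or a printed diagnostic plus trap on the device.
__host__ __device__
inline void throw_on_error(cudaError_t error, const char* message)
{
  if(error != cudaSuccess)
  {
#ifndef __CUDA_ARCH__
    // host: surface the failure as an exception
    throw std::runtime_error(std::string(message) + ": " + cudaGetErrorString(error));
#else
    // device: no exceptions available; report the numeric code and trap
    printf("%s: CUDA error %d\n", message, static_cast<int>(error));
    asm("trap;");
#endif
  }
}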