template<class... Args>
__host__ __device__
cudaError_t triple_chevrons(void* kernel, ::dim3 grid_dim, ::dim3 block_dim, int shared_memory_size, cudaStream_t stream, const Args&... args)
{
  // reference the kernel to encourage the compiler not to optimize it away
  workaround_unused_variable_warning(kernel);

#if __cuda_lib_has_cudart
#  ifndef __CUDA_ARCH__
  // host path: configure the launch, marshal the arguments, then launch
  cudaConfigureCall(grid_dim, block_dim, shared_memory_size, stream);
  setup_kernel_arguments(0, args...);
  return cudaLaunch(kernel);
#  else
  // device path
  // XXX generalize to multiple arguments
  if(sizeof...(Args) != 1)
  {
    return cudaErrorNotSupported;
  }

  using Arg = typename first_type<Args...>::type;
  void* param_buffer = cudaGetParameterBuffer(std::alignment_of<Arg>::value, sizeof(Arg));
  std::memcpy(param_buffer, &first_parameter(args...), sizeof(Arg));
  return cudaLaunchDevice(kernel, param_buffer, grid_dim, block_dim, shared_memory_size, stream);
#  endif // __CUDA_ARCH__
#else // __cuda_lib_has_cudart
  return cudaErrorNotSupported;
#endif
}
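/*
 * For comparison: CUDA 7+ replaces the deprecated cudaConfigureCall /
 * setup-arguments / cudaLaunch sequence used on the host path above with a
 * single cudaLaunchKernel call. A minimal sketch under that API; the kernel
 * my_kernel and the wrapper name are hypothetical, not part of the code above.
 */
__global__ void my_kernel(int value)
{
  // placeholder body
}

cudaError_t launch_via_cudaLaunchKernel(::dim3 grid_dim, ::dim3 block_dim,
                                        size_t shared_memory_size,
                                        cudaStream_t stream, int value)
{
  // cudaLaunchKernel takes an array of pointers to each kernel parameter
  void* kernel_args[] = { &value };
  return cudaLaunchKernel(reinterpret_cast<void const*>(my_kernel),
                          grid_dim, block_dim, kernel_args,
                          shared_memory_size, stream);
}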
__host__ __device__
static void supported_path(unsigned int num_blocks, unsigned int block_size, size_t num_dynamic_smem_bytes, cudaStream_t stream, task_type task)
{
#if __BULK_HAS_CUDART__
#  ifndef __CUDA_ARCH__
  // host path: deprecated execution-control API
  cudaConfigureCall(dim3(num_blocks), dim3(block_size), num_dynamic_smem_bytes, stream);
  cudaSetupArgument(task, 0);
  bulk::detail::throw_on_error(cudaLaunch(super_t::global_function_pointer()),
                               "after cudaLaunch in triple_chevron_launcher::launch()");
#  else
  // device path: marshal the task through the dynamic-parallelism parameter buffer
  void* param_buffer = cudaGetParameterBuffer(alignment_of<task_type>::value, sizeof(task_type));
  std::memcpy(param_buffer, &task, sizeof(task_type));
  bulk::detail::throw_on_error(cudaLaunchDevice(reinterpret_cast<void*>(super_t::global_function_pointer()),
                                                param_buffer,
                                                dim3(num_blocks),
                                                dim3(block_size),
                                                num_dynamic_smem_bytes,
                                                stream),
                               "after cudaLaunchDevice in triple_chevron_launcher::launch()");
#  endif // __CUDA_ARCH__
#endif // __BULK_HAS_CUDART__
}
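/*
 * Context: the device branch above is what a device-side triple-chevron
 * launch compiles down to under CUDA Dynamic Parallelism; nvcc lowers
 * child_kernel<<<...>>>(...) into cudaGetParameterBuffer plus
 * cudaLaunchDevice. A minimal sketch of the source-level form (kernel names
 * are hypothetical; requires compute capability 3.5+ and -rdc=true):
 */
__global__ void child_kernel(int value)
{
  // placeholder body
}

__global__ void parent_kernel(int value)
{
  if (threadIdx.x == 0)
  {
    // device-side launch; the compiler emits the parameter-buffer calls
    child_kernel<<<1, 32>>>(value);
  }
}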
void masterKendall(const float * x, size_t nx, const float * y, size_t ny,
                   size_t sampleSize, double * results)
{
    size_t
        outputLength = nx * ny,
        outputBytes = outputLength * sizeof(double),
        xBytes = nx * sampleSize * sizeof(float),
        yBytes = ny * sampleSize * sizeof(float);
    float *gpux, *gpuy;
    double *gpuResults;
    dim3 grid(nx, ny), block(NUMTHREADS, NUMTHREADS);

    cudaMalloc((void **)&gpux, xBytes);
    cudaMalloc((void **)&gpuy, yBytes);
    checkCudaError("input vector space allocation");

    cudaMemcpy(gpux, x, xBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(gpuy, y, yBytes, cudaMemcpyHostToDevice);
    checkCudaError("copying input vectors to gpu");

    cudaMalloc((void **)&gpuResults, outputBytes);
    checkCudaError("allocation of space for result matrix");

    void *args[] = { &gpux, &nx, &gpuy, &ny, &sampleSize, &gpuResults };
    /* cudaLaunch() takes only the kernel entry and cannot pass an argument
     * array; launching with explicit arguments requires cudaLaunchKernel,
     * which takes the kernel symbol (assumed visible here) rather than its
     * name as a string */
    cudaLaunchKernel((const void *)gpuKendall, grid, block, args, 0, NULL);

    cudaFree(gpux);
    cudaFree(gpuy);
    cudaMemcpy(results, gpuResults, outputBytes, cudaMemcpyDeviceToHost);
    cudaFree(gpuResults);
    checkCudaError("copying results from gpu and cleaning up");
}
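/*
 * checkCudaError is called above but not defined in this snippet. A minimal
 * sketch of what such a helper typically looks like (hypothetical; the real
 * definition in the surrounding code base may differ):
 */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

void checkCudaError(const char *msg)
{
    // cudaGetLastError also picks up failures from earlier async calls
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}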
cudaError_t WINAPI wine_cudaLaunch(const char *entry) {
    WINE_TRACE("%p\n", entry);

    if (QUEUE_MAX == numQueued) {
        cudaError_t evtErr;

        if (WINE_TRACE_ON(cuda)) {
            /* print out if event was recorded or not */
            WINE_TRACE("check event recorded %s\n", debug_cudaError(cudaEventQuery(event)));
        }

        /* wait for the event, sleeping between polls; SLEEPTIME overrides the
         * default pause of 300000 ns (nanosleep takes a struct timespec, not
         * a plain integer) */
        unsigned int sleepCount = 0;
        struct timespec pause = {0, 300000};
        char *SLTIME = getenv("SLEEPTIME");
        if (SLTIME != NULL) {
            pause.tv_nsec = atol(SLTIME);
        }
        while (cudaEventQuery(event) != cudaSuccess) {
            nanosleep(&pause, NULL);
            sleepCount++;
        }
        WINE_TRACE("slept %u times\n", sleepCount);
        WINE_TRACE("event recorded, continuing\n");

        /* record a new event and subtract HALF_QUEUE_MAX from numQueued */
        numQueued = HALF_QUEUE_MAX;
        evtErr = cudaEventRecord(event, 0);
        if (evtErr) {
            WINE_ERR("cudaEventRecord: %s\n", debug_cudaError(evtErr));
        }
    }

    cudaError_t err = cudaLaunch(entry);

    if (!eventInitialized) {
        /* Create an event on the first cudaLaunch call. This is done here so the calling program
         * has a chance to select the GPU device with cudaSetDevice if desired. */
        cudaError_t evtErr = cudaEventCreate(&event);
        if (evtErr) {
            WINE_ERR("cudaEventCreate: %s\n", debug_cudaError(evtErr));
        }
        /* cudaEventCreate can return errors from previous asynchronous calls, so an error here
         * does not necessarily mean the event wasn't created. Assume it was created for now. */
        eventInitialized = TRUE;
        WINE_TRACE("created event %p\n", event);
    }

    /* record an event at HALF_QUEUE_MAX */
    if (HALF_QUEUE_MAX == ++numQueued) {
        cudaError_t evtErr = cudaEventRecord(event, 0); /* assuming everything uses stream 0 */
        if (evtErr) {
            WINE_ERR("cudaEventRecord: %s\n", debug_cudaError(evtErr));
        }
    }

    if (err) {
        WINE_TRACE("return %s\n", debug_cudaError(err));
    }
    return err;
}
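/*
 * The throttling state referenced above (QUEUE_MAX, HALF_QUEUE_MAX, numQueued,
 * event, eventInitialized) is declared elsewhere in the wrapper. A minimal
 * sketch of the supporting declarations; the QUEUE_MAX value here is an
 * assumption, not taken from the code above:
 */
#define QUEUE_MAX      20                /* throttle once this many launches are queued */
#define HALF_QUEUE_MAX (QUEUE_MAX / 2)   /* drain point: half the queue */

static unsigned int numQueued = 0;       /* launches issued since the last drain */
static cudaEvent_t  event;               /* marker event used to drain the queue */
static BOOL         eventInitialized = FALSE;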