template<class... Args>
__host__ __device__
cudaError_t triple_chevrons(void* kernel, ::dim3 grid_dim, ::dim3 block_dim,
                            int shared_memory_size, cudaStream_t stream,
                            const Args&... args)
{
  // reference the kernel to encourage the compiler not to optimize it away
  workaround_unused_variable_warning(kernel);

#if __cuda_lib_has_cudart
# ifndef __CUDA_ARCH__
  cudaConfigureCall(grid_dim, block_dim, shared_memory_size, stream);
  setup_kernel_arguments(0, args...);
  return cudaLaunch(kernel);
# else
  // XXX generalize to multiple arguments
  if(sizeof...(Args) != 1)
  {
    return cudaErrorNotSupported;
  }

  using Arg = typename first_type<Args...>::type;
  void* param_buffer = cudaGetParameterBuffer(std::alignment_of<Arg>::value,
                                              sizeof(Arg));
  std::memcpy(param_buffer, &first_parameter(args...), sizeof(Arg));
  return cudaLaunchDevice(kernel, param_buffer, grid_dim, block_dim,
                          shared_memory_size, stream);
# endif // __CUDA_ARCH__
#else // __cuda_lib_has_cudart
  return cudaErrorNotSupported;
#endif
}
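// A minimal sketch of how triple_chevrons might be called from host code;
// the kernel my_kernel and argument type my_arg_t are hypothetical, not part
// of the snippet above. Note that the device-side branch of triple_chevrons
// currently supports exactly one kernel argument.
struct my_arg_t
{
  int n;
  float* data;
};

__global__ void my_kernel(my_arg_t arg)
{
  // ... kernel body elided ...
}

cudaError_t launch_my_kernel(const my_arg_t& arg, cudaStream_t stream)
{
  return triple_chevrons(reinterpret_cast<void*>(my_kernel),
                         ::dim3(256), ::dim3(128), 0, stream, arg);
}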
__host__ __device__
static void supported_path(unsigned int num_blocks, unsigned int block_size,
                           size_t num_dynamic_smem_bytes, cudaStream_t stream,
                           task_type task)
{
#if __BULK_HAS_CUDART__
# ifndef __CUDA_ARCH__
  cudaConfigureCall(dim3(num_blocks), dim3(block_size),
                    num_dynamic_smem_bytes, stream);
  cudaSetupArgument(task, 0);
  bulk::detail::throw_on_error(
    cudaLaunch(super_t::global_function_pointer()),
    "after cudaLaunch in triple_chevron_launcher::launch()");
# else
  void* param_buffer = cudaGetParameterBuffer(alignment_of<task_type>::value,
                                              sizeof(task_type));
  std::memcpy(param_buffer, &task, sizeof(task_type));
  bulk::detail::throw_on_error(
    cudaLaunchDevice(reinterpret_cast<void*>(super_t::global_function_pointer()),
                     param_buffer, dim3(num_blocks), dim3(block_size),
                     num_dynamic_smem_bytes, stream),
    "after cudaLaunchDevice in triple_chevron_launcher::launch()");
# endif // __CUDA_ARCH__
#endif // __BULK_HAS_CUDART__
}
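// A minimal sketch of the device-side launch pattern used in the #else branch
// above (CUDA Dynamic Parallelism): arguments are marshalled into a
// runtime-provided parameter buffer and the kernel is launched with
// cudaLaunchDevice. The kernel child_kernel and helper launch_child are
// hypothetical. This requires sm_35 or newer, compilation with -rdc=true,
// and linking against cudadevrt.
__global__ void child_kernel(int value);

__device__ cudaError_t launch_child(int value, cudaStream_t stream)
{
  // the device runtime owns this buffer; it may return NULL on failure
  void* buffer = cudaGetParameterBuffer(alignof(int), sizeof(int));
  if(buffer == 0)
  {
    return cudaErrorLaunchOutOfResources;
  }

  memcpy(buffer, &value, sizeof(int));
  return cudaLaunchDevice(reinterpret_cast<void*>(child_kernel), buffer,
                          dim3(1), dim3(1), 0, stream);
}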
void HostReflectionHost::BootUp::_launchNextKernel()
{
  assert(!_launches.empty());
  KernelLaunch& launch = _launches.front();

  report(" launching kernel " << launch.ctas << " ctas, "
    << launch.threads << " threads, kernel: '" << launch.name
    << "' in module: '" << _module << "'");

  cudaConfigureCall(launch.ctas, launch.threads, 0, 0);
  cudaSetupArgument(&launch.arguments, sizeof(PayloadData), 0);
  ocelot::launch(_module, launch.name);

  report(" kernel '" << launch.name << "' finished");

  _launches.pop();
}
HostReflectionHost::BootUp::~BootUp()
{
  report("Destroying host reflection");

  // kill the thread
  _kill = true;
  _thread->join();
  delete _thread;

  // destroy the device queues
  cudaConfigureCall(dim3(1, 1, 1), dim3(1, 1, 1), 0, 0);
  ocelot::launch(_module, "_teardownHostReflection");
  cudaThreadSynchronize();

  // destroy the host queues
  delete _hostToDeviceQueue;
  delete _deviceToHostQueue;

  // delete the queue memory
  delete[] _deviceHostSharedMemory;
}
bool TestInstructionThroughput::testu32InstructionThroughput()
{
  unsigned int* input;
  unsigned int k = 5;

  cudaMalloc((void**)&input, sizeof(unsigned int));
  cudaMemcpy(input, &k, sizeof(unsigned int), cudaMemcpyHostToDevice);

  cudaConfigureCall(dim3(ctas, 1, 1), dim3(threads, 1, 1), 0, 0);

  // the kernel parameter is declared .u64 in the PTX below, so 8 bytes
  // are copied regardless of the host pointer size
  cudaSetupArgument(&input, sizeof(long long unsigned int), 0);

  std::stringstream program;

  program << ".version 2.1\n";
  program << ".target sm_21, map_f64_to_f32\n\n";
  program << ".entry testu32InstructionThroughput( .param .u64 input )\n";
  program << "{\n";
  program << "  .reg .u64 %r<7>;\n";
  program << "  .reg .u32 %sum;\n";
  program << "  .reg .u32 %initial;\n";
  program << "  .reg .pred %p0;\n";
  program << "  Entry:\n";
  program << "    ld.param.u64 %r0, [input];\n";
  program << "    ld.global.u32 %initial, [%r0];\n";
  program << "    mov.u64 %r2, " << iterations << ";\n";
  program << "    mov.u64 %r3, 0;\n";
  program << "    mov.u32 %sum, 0;\n";
  program << "    setp.eq.u64 %p0, %r3, %r2;\n";
  program << "    @%p0 bra Exit;\n";
  program << "  Begin_iter:\n";

  for(int i = 0; i < unroll; ++i)
  {
    program << "    add.u32 %sum, %sum, %initial;\n";
  }

  program << "    add.u64 %r3, %r3, 1;\n";
  program << "    setp.lt.u64 %p0, %r3, %r2;\n";
  program << "    @%p0 bra Begin_iter;\n";
  program << "  End_loop:\n";
  program << "    st.global.u32 [%r0], %sum;\n";
  program << "  Exit:\n";
  program << "    exit;\n";
  program << "}\n";

  ocelot::registerPTXModule(program, "u32throughput");

  hydrazine::Timer timer;

  timer.start();
  ocelot::launch("u32throughput", "testu32InstructionThroughput");
  cudaThreadSynchronize();
  timer.stop();

  status << "u32 throughput: "
    << ((threads * ctas * iterations * unroll) / timer.seconds())
    << " operations/sec\n";

  unsigned int result;
  cudaMemcpy(&result, input, sizeof(unsigned int), cudaMemcpyDeviceToHost);

  bool pass = true;

  if(result != k * iterations * unroll && threads == 1 && ctas == 1)
  {
    status << "Program generated incorrect output " << result
      << ", expecting " << (k * iterations * unroll) << "\n";
    pass = false;
  }

  cudaFree(input);

  return pass;
}
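// For reference, a CUDA C++ kernel roughly equivalent to the PTX generated
// above (hypothetical; the test deliberately emits PTX directly so the
// unrolled add sequence cannot be restructured or folded away by a compiler).
// In the PTX, iterations and unroll are baked in as immediates; here they are
// passed as parameters instead.
__global__ void u32ThroughputReference(unsigned int* input,
                                       unsigned long long iterations,
                                       int unroll)
{
  unsigned int initial = *input;
  unsigned int sum = 0;

  for(unsigned long long i = 0; i < iterations; ++i)
  {
    for(int j = 0; j < unroll; ++j)
    {
      sum += initial;
    }
  }

  // every thread stores the same value, matching the PTX above
  *input = sum;
}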
cudaError_t WINAPI wine_cudaConfigureCall(dim3 gridDim, dim3 blockDim,
                                          size_t sharedMem, cudaStream_t stream)
{
    WINE_TRACE("\n");
    return cudaConfigureCall(gridDim, blockDim, sharedMem, stream);
}
HostReflectionHost::BootUp::BootUp(const std::string& module)
: _module(module)
{
  report("Booting up host reflection...");

  // add message handlers
  _addMessageHandlers();

  // allocate memory for the queue
  size_t queueDataSize = maxMessageSize() * 2;
  size_t size = 2 * (queueDataSize + sizeof(QueueMetaData));

  _deviceHostSharedMemory = new char[size];

  // setup the queue meta data
  QueueMetaData* hostToDeviceMetaData =
    (QueueMetaData*)_deviceHostSharedMemory;
  QueueMetaData* deviceToHostMetaData =
    (QueueMetaData*)_deviceHostSharedMemory + 1;

  char* hostToDeviceData = _deviceHostSharedMemory +
    2 * sizeof(QueueMetaData);
  char* deviceToHostData = _deviceHostSharedMemory +
    2 * sizeof(QueueMetaData) + queueDataSize;

  hostToDeviceMetaData->hostBegin = hostToDeviceData;
  hostToDeviceMetaData->size      = queueDataSize;
  hostToDeviceMetaData->head      = 0;
  hostToDeviceMetaData->tail      = 0;
  hostToDeviceMetaData->mutex     = (size_t)-1;

  deviceToHostMetaData->hostBegin = deviceToHostData;
  deviceToHostMetaData->size      = queueDataSize;
  deviceToHostMetaData->head      = 0;
  deviceToHostMetaData->tail      = 0;
  deviceToHostMetaData->mutex     = (size_t)-1;

  // allocate the queues
  _hostToDeviceQueue = new HostQueue(hostToDeviceMetaData);
  _deviceToHostQueue = new HostQueue(deviceToHostMetaData);

  // map the memory onto the device
  cudaHostRegister(_deviceHostSharedMemory, size, 0);

  char* devicePointer = 0;
  cudaHostGetDevicePointer((void**)&devicePointer,
    _deviceHostSharedMemory, 0);

  // send the metadata to the device
  QueueMetaData* hostToDeviceMetaDataPointer =
    (QueueMetaData*)devicePointer;
  QueueMetaData* deviceToHostMetaDataPointer =
    (QueueMetaData*)devicePointer + 1;

  hostToDeviceMetaData->deviceBegin = devicePointer +
    2 * sizeof(QueueMetaData);
  deviceToHostMetaData->deviceBegin = devicePointer +
    2 * sizeof(QueueMetaData) + queueDataSize;

  cudaConfigureCall(dim3(1, 1, 1), dim3(1, 1, 1), 0, 0);

  // each argument is a 64-bit device pointer, hence the hard-coded
  // 8-byte sizes and offsets
  cudaSetupArgument(&hostToDeviceMetaDataPointer, 8, 0);
  cudaSetupArgument(&deviceToHostMetaDataPointer, 8, 8);

  ocelot::launch(_module, "_bootupHostReflection");

  // start up the host worker thread
  _kill   = false;
  _thread = new boost::thread(_runThread, this);
}
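// A minimal sketch of the zero-copy mapping pattern the constructor above
// relies on: host memory is pinned with cudaHostRegister and a device-visible
// alias is obtained with cudaHostGetDevicePointer, so host and device
// communicate through the same buffer. The helper name is illustrative and
// not part of the class above; error checking is omitted. Note that passing
// cudaHostRegisterMapped (or enabling the cudaDeviceMapHost device flag) is
// generally required for cudaHostGetDevicePointer to succeed.
#include <cuda_runtime.h>
#include <cstdlib>

char* map_shared_buffer(size_t size, void** devicePointer)
{
  // pin the host allocation and map it into the device address space
  char* host = static_cast<char*>(std::malloc(size));
  cudaHostRegister(host, size, cudaHostRegisterMapped);
  cudaHostGetDevicePointer(devicePointer, host, 0);
  return host;
}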