// Wraps a linear device buffer in a 1D texture object for cached reads.
// vector_size selects the fetch element type: 1 -> float, 2 -> float2, 4 -> float4.
// Throws neural_network_exception for any other vector_size.
// The stored smart pointer keeps the underlying buffer alive for the
// lifetime of the texture object.
cuda_texture::cuda_texture(
	cuda_linear_buffer_device::const_ptr dev_smart_ptr,
	int vector_size)
	: tex(0)
	, dev_smart_ptr(dev_smart_ptr)
{
	struct cudaResourceDesc res_desc;
	memset(&res_desc, 0, sizeof(res_desc));
	res_desc.resType = cudaResourceTypeLinear;
	res_desc.res.linear.devPtr = const_cast<void *>((const void *)(*dev_smart_ptr));
	switch (vector_size)
	{
	case 1:
		res_desc.res.linear.desc = cudaCreateChannelDesc<float>();
		break;
	case 2:
		res_desc.res.linear.desc = cudaCreateChannelDesc<float2>();
		break;
	case 4:
		res_desc.res.linear.desc = cudaCreateChannelDesc<float4>();
		break;
	default:
		// 3 is intentionally unsupported: there is no texture channel format for float3.
		// Fixed typo in the message: "vetor_size" -> "vector_size".
		throw neural_network_exception((boost::format("Invalid vector_size %1% for cuda_texture") % vector_size).str());
	}
	res_desc.res.linear.sizeInBytes = dev_smart_ptr->get_size();

	struct cudaTextureDesc tex_desc;
	memset(&tex_desc, 0, sizeof(tex_desc));
	tex_desc.addressMode[0] = cudaAddressModeBorder; // out-of-range fetches read as zero
	tex_desc.readMode = cudaReadModeElementType;     // no int->float conversion on fetch
	tex_desc.normalizedCoords = 0;                   // integer (unnormalized) addressing

	cuda_safe_call(cudaCreateTextureObject(&tex, &res_desc, &tex_desc, 0));
}
// Returns the size in bytes of the linear device buffer backing the texture
// object, as recorded in the texture's resource descriptor.
size_t cuda_texture::get_size() const
{
	cudaResourceDesc desc;
	cuda_safe_call(cudaGetTextureObjectResourceDesc(&desc, tex));
	return desc.res.linear.sizeInBytes;
}
// Queries the CUDA driver/runtime versions and the properties of the selected
// device (device_id), caching them into member fields, then makes the device
// current for this thread and creates the cuBLAS and cuSPARSE handles.
// Throws neural_network_exception when no CUDA device exists or when
// device_id is out of range.
void cuda_running_configuration::update_parameters()
{
	cuda_safe_call(cudaDriverGetVersion(&driver_version));
	cuda_safe_call(cudaRuntimeGetVersion(&runtime_version));

	int device_count;
	cuda_safe_call(cudaGetDeviceCount(&device_count));
	if (device_count <= 0)
		throw neural_network_exception("No CUDA capable devices are found");
	if (device_id >= device_count)
		throw neural_network_exception((boost::format("Device ID %1% specified while %2% devices are available") % device_id % device_count).str());

	cudaDeviceProp device_prop;
	cuda_safe_call(cudaGetDeviceProperties(&device_prop, device_id));
	// Cache the properties this configuration exposes as plain members.
	device_name = device_prop.name;
	compute_capability_major = device_prop.major;
	compute_capability_minor = device_prop.minor;
	clock_rate = device_prop.clockRate;               // reported by CUDA in kHz
	memory_clock_rate = device_prop.memoryClockRate;  // kHz
	memory_bus_width = device_prop.memoryBusWidth;    // bits
	global_memory_size = device_prop.totalGlobalMem;  // bytes
	ecc_enabled = (device_prop.ECCEnabled != 0);
	l2_cache_size = device_prop.l2CacheSize;
	multiprocessor_count = device_prop.multiProcessorCount;
	smem_per_block = device_prop.sharedMemPerBlock;
	max_threads_per_multiprocessor = device_prop.maxThreadsPerMultiProcessor;
	max_threads_per_block = device_prop.maxThreadsPerBlock;
	// Copy all dimensions (array sizes derived from the member arrays themselves).
	for(int i = 0; i < sizeof(max_threads_dim) / sizeof(max_threads_dim[0]); ++i)
		max_threads_dim[i] = device_prop.maxThreadsDim[i];
	for(int i = 0; i < sizeof(max_grid_size) / sizeof(max_grid_size[0]); ++i)
		max_grid_size[i] = device_prop.maxGridSize[i];
	max_texture_1d_linear = device_prop.maxTexture1DLinear;
	texture_alignment = device_prop.textureAlignment;
	pci_bus_id = device_prop.pciBusID;
	pci_device_id = device_prop.pciDeviceID;
#ifdef _WIN32
	// TCC vs WDDM driver mode is only meaningful (and only queried) on Windows.
	tcc_mode = (device_prop.tccDriver != 0);
#endif

	// Make the device current for this thread BEFORE creating the library
	// handles, so the handles are bound to the selected device.
	cuda_safe_call(cudaSetDevice(device_id));

	cublas_safe_call(cublasCreate(&cublas_handle));

	cusparse_safe_call(cusparseCreate(&cusparse_handle));
}
// Copies each data part from its corresponding device buffer back into
// host_data, using synchronous device-to-host copies. device_data[i] is
// expected to back the i-th part of host_data.
void layer_updater_cuda::get_data_from_device(const std::vector<cuda_linear_buffer_device::ptr>& device_data, layer_data::ptr host_data) const
{
	unsigned int part_id = 0;
	for(layer_data::iterator part_it = host_data->begin(); part_it != host_data->end(); ++part_it)
	{
		cuda_linear_buffer_device::const_ptr device_buf = device_data[part_id++];
		cuda_safe_call(cudaMemcpy(
			&(*part_it->begin()),
			*device_buf,
			part_it->size() * sizeof(float),
			cudaMemcpyDeviceToHost));
	}
}
// Reads up to entries_to_read_count entries from the supervised data reader
// into the host staging buffers (input/output), then enqueues asynchronous
// host-to-device copies of exactly the data read onto 'stream'.
// Returns the number of entries actually read (may be fewer if the reader
// runs out of data). On std::runtime_error the message is stored in *error
// and the count read so far is still returned.
// NOTE(review): cudaMemcpyAsync only overlaps if input/output are pinned
// host buffers - allocation is not visible here, confirm at the call site.
unsigned int supervised_data_reader_functor::operator()()
{
	unsigned int entries_read_count = 0;
	try
	{
		PUSH_RANGE("Reading supervised data", 0);
		unsigned int input_neuron_count = reader->get_input_configuration().get_neuron_count();
		unsigned int output_neuron_count = reader->get_output_configuration().get_neuron_count();
		// Input elements are not necessarily float - the element size comes
		// from the reader; output elements are float.
		size_t input_neuron_elem_size = reader->get_input_neuron_elem_size();
		while(entries_read_count < entries_to_read_count)
		{
			bool entry_read = reader->read(
				input + (input_neuron_count * entries_read_count * input_neuron_elem_size),
				output + (output_neuron_count * entries_read_count));
			if (!entry_read)
				break; // reader exhausted before the requested count
			entries_read_count++;
		}
		POP_RANGE;
		// Async copies sized by what was actually read; the caller is expected
		// to synchronize on 'stream' before consuming d_input/d_output.
		cuda_safe_call(cudaMemcpyAsync(
			d_input,
			input,
			entries_read_count * input_neuron_count * input_neuron_elem_size,
			cudaMemcpyHostToDevice,
			stream));
		cuda_safe_call(cudaMemcpyAsync(
			d_output,
			output,
			entries_read_count * output_neuron_count * sizeof(float),
			cudaMemcpyHostToDevice,
			stream));
	}
	catch (std::runtime_error& e)
	{
		// Report the failure to the owner thread; partial count is returned.
		*error = e.what();
	}
	return entries_read_count;
}
// Uploads each part of host_data to a freshly allocated device buffer and
// squares it element-wise in place (cuda_util::multiply_by_itself on the
// default stream). Returns one device buffer per data part.
std::vector<const_cuda_linear_buffer_device_smart_ptr> layer_hessian_cuda::get_data_squared(const_layer_data_smart_ptr host_data) const
{
	std::vector<const_cuda_linear_buffer_device_smart_ptr> res;

	for(std::vector<std::vector<float> >::const_iterator it = host_data->begin(); it != host_data->end(); ++it)
	{
		size_t buffer_size = it->size() * sizeof(float);
		cuda_linear_buffer_device_smart_ptr new_buf(new cuda_linear_buffer_device(buffer_size));
		// Synchronous upload, then enqueue the squaring kernel on the
		// default stream (stream 0).
		cuda_safe_call(cudaMemcpy(*new_buf, &(*it->begin()), buffer_size, cudaMemcpyHostToDevice));
		cuda_util::multiply_by_itself(
			*cuda_config,
			*new_buf,
			*new_buf,
			new_buf->get_size() / sizeof(float),
			0);
		res.push_back(new_buf);
	}

	// Single synchronization after all kernels are enqueued: every launch is
	// on the default stream, so one sync covers all parts. The original code
	// synchronized inside the loop, stalling the host once per part for no
	// correctness benefit (the synchronous cudaMemcpy already serializes with
	// prior default-stream work).
	cuda_safe_call(cudaStreamSynchronize(0));

	return res;
}
// Pretty-prints the cached CUDA driver/runtime versions, the selected
// device's properties, the configured settings, and the current free/total
// device memory (queried live via cudaMemGetInfo).
std::ostream& operator<< (std::ostream& out, const cuda_running_configuration& running_configuration)
{
	out << "--- CUDA versions ---" << std::endl;
	// CUDA encodes versions as 1000*major + 10*minor (e.g. 6050 -> 6.5).
	out << "Driver version = " << running_configuration.driver_version / 1000 << "." << (running_configuration.driver_version % 100) / 10 << std::endl;
	out << "Runtime version = " << running_configuration.runtime_version / 1000 << "." << (running_configuration.runtime_version % 100) / 10 << std::endl;

	out << "--- Device ---" << std::endl;
	out << "Device Id = " << running_configuration.device_id << std::endl;
	out << "Device name = " << running_configuration.device_name << std::endl;
	out << "Compute capability = " << running_configuration.compute_capability_major << "." << running_configuration.compute_capability_minor << std::endl;
	out << "Clock rate = " << (running_configuration.clock_rate / 1000) << " MHz" << std::endl;
	out << "Memory clock rate = " << (running_configuration.memory_clock_rate / 1000) << " MHz" << std::endl;
	out << "Memory bus width = " << running_configuration.memory_bus_width << " bits" << std::endl;
	out << "Global memory size = " << running_configuration.global_memory_size / (1024 * 1024) << " MB" << std::endl;
	out << "ECC support = " << (running_configuration.ecc_enabled ? "Enabled" : "Disabled") << std::endl;
	out << "L2 cache size = " << running_configuration.l2_cache_size << " bytes" << std::endl;
	out << "Multiprocessor count = " << running_configuration.multiprocessor_count << std::endl;
	out << "Shared memory per block size = " << running_configuration.smem_per_block << " bytes" << std::endl;
	out << "Maximum number of threads per multiprocessor = " << running_configuration.max_threads_per_multiprocessor << std::endl;
	out << "Maximum number of threads per block = " << running_configuration.max_threads_per_block << std::endl;
	out << "Maximum sizes of each dimension of a block = " << running_configuration.max_threads_dim[0] << " x " << running_configuration.max_threads_dim[1] << " x " << running_configuration.max_threads_dim[2] << std::endl;
	out << "Maximum sizes of each dimension of a grid = " << running_configuration.max_grid_size[0] << " x " << running_configuration.max_grid_size[1] << " x " << running_configuration.max_grid_size[2] << std::endl;
	out << "Maximum size of 1D texture bound to linear memory = " << running_configuration.max_texture_1d_linear << std::endl;
	out << "Texture alignment = " << running_configuration.texture_alignment << " bytes" << std::endl;
	out << "PCI Bus ID = " << running_configuration.pci_bus_id << std::endl;
	out << "PCI Location ID = " << running_configuration.pci_device_id << std::endl;
#ifdef _WIN32
	// Fixed: guard was "#ifdef WIN32", but tcc_mode is populated under
	// "#ifdef _WIN32" in update_parameters. _WIN32 is the macro the compiler
	// itself defines; WIN32 is only defined by some build systems, so the
	// two guards could disagree and this line would read an unset member
	// (or silently vanish).
	out << "Driver mode = " << (running_configuration.tcc_mode ? "TCC" : "WDDM") << std::endl;
#endif

	out << "--- Settings ---" << std::endl;
	out << "Max global memory usage ratio = " << running_configuration.max_global_memory_usage_ratio << std::endl;

	out << "--- Status ---" << std::endl;
	size_t free_memory;
	size_t total_memory;
	cuda_safe_call(cudaMemGetInfo(&free_memory, &total_memory));
	out << "Free memory = " << free_memory / (1024 * 1024) << " MB" << std::endl;
	out << "Total memory = " << total_memory / (1024 * 1024) << " MB" << std::endl;

	return out;
}
// Allocates one device buffer per part of host_data and uploads the host
// contents with synchronous host-to-device copies.
std::vector<cuda_linear_buffer_device::ptr> layer_updater_cuda::get_data(layer_data::const_ptr host_data) const
{
	std::vector<cuda_linear_buffer_device::ptr> res;

	for(std::vector<std::vector<float> >::const_iterator part_it = host_data->begin(); part_it != host_data->end(); ++part_it)
	{
		size_t byte_count = part_it->size() * sizeof(float);
		cuda_linear_buffer_device::ptr device_buf(new cuda_linear_buffer_device(byte_count));
		cuda_safe_call(cudaMemcpy(
			*device_buf,
			&(*part_it->begin()),
			byte_count,
			cudaMemcpyHostToDevice));
		res.push_back(device_buf);
	}

	return res;
}
// Notifies the layer of the custom (integer) data, then uploads each part to
// a freshly allocated device buffer; returns the buffers as const pointers.
std::vector<cuda_linear_buffer_device::const_ptr> layer_updater_cuda::set_get_data_custom(layer_data_custom::const_ptr host_data_custom)
{
	notify_data_custom(host_data_custom);

	std::vector<cuda_linear_buffer_device::const_ptr> res;

	for(std::vector<std::vector<int> >::const_iterator part_it = host_data_custom->begin(); part_it != host_data_custom->end(); ++part_it)
	{
		size_t byte_count = part_it->size() * sizeof(int);
		cuda_linear_buffer_device::ptr device_buf(new cuda_linear_buffer_device(byte_count));
		cuda_safe_call(cudaMemcpy(
			*device_buf,
			&(*part_it->begin()),
			byte_count,
			cudaMemcpyHostToDevice));
		res.push_back(device_buf);
	}

	return res;
}
// Binds the configured device (device_id) to the calling host thread;
// subsequent CUDA runtime calls on this thread target that device.
void cuda_running_configuration::set_device() const
{
	cuda_safe_call(cudaSetDevice(device_id));
}