void import( xpattern<value_type> const& pat )
{
    std::vector<size_type> v_dims;

    for ( size_type index = 0; index != config.tilt_size; ++index )
    {
        v_dims.push_back( pat.diag[index].size() );

        //ar
        size_type const ar_offset = index * config.max_dim * config.max_dim;
        size_type const ar_size = pat.ar[index].size() * sizeof( size_type );
        cuda_assert( cudaMemcpy( reinterpret_cast<void*>(data.ar + ar_offset), reinterpret_cast<const void*>(pat.ar[index].data()), ar_size, cudaMemcpyHostToDevice ) );

        //diag
        size_type const diag_offset = index * config.max_dim;
        size_type const diag_size = pat.diag[index].size() * sizeof( value_type );
        cuda_assert( cudaMemcpy( reinterpret_cast<void*>(data.diag + diag_offset), reinterpret_cast<const void*>(pat.diag[index].data()), diag_size, cudaMemcpyHostToDevice ) );

        //intensity
        size_type const I_exp_offset = index * config.max_dim;
        size_type const I_exp_size = pat.intensity[index].size() * sizeof( value_type );
        cuda_assert( cudaMemcpy( reinterpret_cast<void*>(data.I_exp + I_exp_offset), reinterpret_cast<const void*>(pat.intensity[index].data()), I_exp_size, cudaMemcpyHostToDevice ) );
    }

    cuda_assert( cudaMemcpy( reinterpret_cast<void*>(data.thickness_array), reinterpret_cast<const void*>(pat.thickness_array.data()), sizeof(value_type) * pat.tilt_size, cudaMemcpyHostToDevice ) );
    cuda_assert( cudaMemcpy( reinterpret_cast<void*>(data.dim), reinterpret_cast<const void*>(v_dims.data()), sizeof(size_type) * v_dims.size(), cudaMemcpyHostToDevice ) );
}
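// cuda_assert and cublas_assert are used throughout this code but are not
// defined in this section. A minimal sketch of what they are assumed to look
// like for the runtime-API and cuBLAS calls (the exact definitions are an
// assumption, not the original macros):
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#define cuda_assert( call )                                                   \
    do {                                                                      \
        cudaError_t const cuda_assert_err_ = ( call );                        \
        if ( cuda_assert_err_ != cudaSuccess ) {                              \
            std::fprintf( stderr, "CUDA error '%s' at %s:%d\n",               \
                          cudaGetErrorString( cuda_assert_err_ ),             \
                          __FILE__, __LINE__ );                               \
            std::abort();                                                     \
        }                                                                     \
    } while ( 0 )

#define cublas_assert( call )                                                 \
    do {                                                                      \
        cublasStatus_t const cublas_assert_st_ = ( call );                    \
        if ( cublas_assert_st_ != CUBLAS_STATUS_SUCCESS ) {                   \
            std::fprintf( stderr, "cuBLAS error %d at %s:%d\n",               \
                          static_cast<int>( cublas_assert_st_ ),              \
                          __FILE__, __LINE__ );                               \
            std::abort();                                                     \
        }                                                                     \
    } while ( 0 )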
cuda_pattern( xpattern<value_type> const& pat, int device_id = 0 ) : config{ make_cuda_xpattern_config( pat, device_id ) }, data{ config }
{
    int current_id;
    cuda_assert( cudaGetDevice( &current_id ) );
    if ( current_id != config.device_id )
        cuda_assert( cudaSetDevice( config.device_id ) );
    import( pat );
}
hardware_concurrency( size_type the_device_id = 0 )
{
    int num_devices = 0;
    cuda_assert( cudaGetDeviceCount( &num_devices ) );
    assert( static_cast<size_type>(num_devices) > the_device_id );

    cudaDeviceProp properties;
    cuda_assert( cudaGetDeviceProperties( &properties, static_cast<int>(the_device_id) ) );
    multiprocessor_number = properties.multiProcessorCount;
}//ctor
void const_copy_to(const char *name, void *host, size_t size)
{
    CUdeviceptr mem;
    size_t bytes;

    cuda_push_context();
    cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
    //assert(bytes == size);
    cuda_assert(cuMemcpyHtoD(mem, host, size));
    cuda_pop_context();
}
cuda_pattern( pattern<value_type> const& pat, int device_id, std::string const& ug_path ) : config{ make_cuda_pattern_config( pat, device_id ) }, data{ config }
{
    int current_id;
    cuda_assert( cudaGetDevice( &current_id ) );
    if ( current_id != config.device_id )
        cuda_assert( cudaSetDevice( config.device_id ) );
    import( pat );
    sim_ug.load( ug_path );
    update_ug();
}
value_type square_residual( value_type* ug, value_type thickness )
{
    int current_id;
    cuda_assert( cudaGetDevice( &current_id ) );
    if ( current_id != config.device_id )
        cuda_assert( cudaSetDevice( config.device_id ) );

    update_I_diff( ug, thickness );

    // the residual is || I_diff ||^2, computed as the dot product of I_diff with itself
    value_type residual;
    cublasHandle_t handle;
    cublas_assert( cublasCreate_v2( &handle ) );
    cublas_assert( cublasDdot_v2( handle, static_cast<int>(config.max_dim * config.tilt_size), data.I_diff, 1, data.I_diff, 1, &residual ) );
    cublas_assert( cublasDestroy_v2( handle ) );
    return residual;
}
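// A usage sketch, assuming a cuda_pattern<double>-like object named cp and a
// host-side structure-factor buffer of 2 * ug_size values (all names here are
// hypothetical, not from the original code):
//
//   std::vector<double> ug_host( 2 * n_ug, 0.0 );
//   double const r = cp.square_residual( ug_host.data(), /*thickness*/ 120.0 );
//
// square_residual returns || I_diff ||^2: update_I_diff fills data.I_diff with
// the differences between simulated and experimental intensities, and the
// cublasDdot_v2 call dots that vector with itself.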
void update_ug()
{
    // pack the structure factors into a two-column [real, imag] layout;
    // the imaginary column of the staging matrix is zeroed before upload
    // (the original filled a column of sim_ug instead, leaving sim_ug_'s
    // second column uninitialized)
    matrix<double> sim_ug_( sim_ug.row(), 2 );
    std::copy( sim_ug.col_begin(1), sim_ug.col_end(1), sim_ug_.col_begin(0) );
    std::fill( sim_ug_.col_begin(1), sim_ug_.col_end(1), 0.0 );
    cuda_assert( cudaMemcpy( reinterpret_cast<void*>(data.ug), reinterpret_cast<const void*>(sim_ug_.data()), config.ug_size * sizeof(value_type) * 2, cudaMemcpyHostToDevice ) );
}
void mem_copy_to(device_memory& mem)
{
    cuda_push_context();
    if(mem.device_pointer)
        cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()));
    cuda_pop_context();
}
void mem_alloc(device_memory& mem, MemoryType type)
{
    cuda_push_context();
    CUdeviceptr device_pointer;
    cuda_assert(cuMemAlloc(&device_pointer, mem.memory_size()));
    mem.device_pointer = (device_ptr)device_pointer;
    cuda_pop_context();
}
void mem_zero(device_memory& mem)
{
    memset((void*)mem.data_pointer, 0, mem.memory_size());

    cuda_push_context();
    cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
    cuda_pop_context();
}
void mem_free(device_memory& mem)
{
    if(mem.device_pointer) {
        cuda_push_context();
        cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)));
        cuda_pop_context();

        mem.device_pointer = 0;
    }
}
fastest_device() : the_fastest_device_id(0)
{
    int num_devices = 0;
    cuda_assert( cudaGetDeviceCount( &num_devices ) );
    assert( !!num_devices );

    int max_multiprocessors = 0;
    for ( int device = 0; device != num_devices; ++device )
    {
        cudaDeviceProp properties;
        cuda_assert( cudaGetDeviceProperties( &properties, device ) );
        if ( max_multiprocessors < properties.multiProcessorCount )
        {
            max_multiprocessors = properties.multiProcessorCount;
            the_fastest_device_id = device;
        }
    }
}//ctor
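// A minimal usage sketch: pick the device with the most multiprocessors and
// make it current (assuming the_fastest_device_id is an accessible member):
//
//   fastest_device fd;
//   cuda_assert( cudaSetDevice( static_cast<int>( fd.the_fastest_device_id ) ) );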
void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
{
    size_t offset = elem*y*w;
    size_t size = elem*w*h;

    cuda_push_context();
    cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset, (CUdeviceptr)((uchar*)mem.device_pointer + offset), size));
    cuda_pop_context();
}
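// cuda_device_ptr is used by the mem_* helpers above but not defined in this
// section. A plausible one-line definition, assuming device_ptr is an integer
// handle wide enough to hold a CUdeviceptr:
static inline CUdeviceptr cuda_device_ptr(device_ptr mem)
{
    return (CUdeviceptr)mem;
}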
void update_ug( value_type* ug )
{
    cuda_assert( cudaMemcpy( reinterpret_cast<void*>(data.ug), reinterpret_cast<const void*>(ug), config.ug_size * sizeof(value_type) * 2, cudaMemcpyHostToDevice ) );
}
cuda_xpattern_data( cuda_xpattern_config const& cpc )
{
    device_id = cpc.device_id;
    int current_id;
    cuda_assert( cudaGetDevice( &current_id ) );
    if ( current_id != device_id )
        cuda_assert( cudaSetDevice( device_id ) );

    size_type const ug_size = sizeof(value_type) * cpc.ug_size * 2;
    cuda_assert( cudaMalloc( reinterpret_cast<void**>(&ug), ug_size ) );
    cuda_assert( cudaMemset( reinterpret_cast<void*>(ug), 0, ug_size ) );

    size_type const ar_size = sizeof(size_type) * cpc.tilt_size * cpc.max_dim * cpc.max_dim;
    cuda_assert( cudaMalloc( reinterpret_cast<void**>(&ar), ar_size ) );
    cuda_assert( cudaMemset( reinterpret_cast<void*>(ar), 0, ar_size ) );

    size_type const diag_size = sizeof(value_type) * cpc.tilt_size * cpc.max_dim;
    cuda_assert( cudaMalloc( reinterpret_cast<void**>(&diag), diag_size ) );
    cuda_assert( cudaMemset( reinterpret_cast<void*>(diag), 0, diag_size ) );

    size_type const dim_size = sizeof(size_type) * cpc.tilt_size;
    cuda_assert( cudaMalloc( reinterpret_cast<void**>(&dim), dim_size ) );
    cuda_assert( cudaMemset( reinterpret_cast<void*>(dim), 0, dim_size ) );

    size_type const I_exp_size = sizeof(value_type) * cpc.tilt_size * cpc.max_dim;
    cuda_assert( cudaMalloc( reinterpret_cast<void**>(&I_exp), I_exp_size ) );
    cuda_assert( cudaMemset( reinterpret_cast<void*>(I_exp), 0, I_exp_size ) );

    size_type const I_diff_size = sizeof(value_type) * cpc.tilt_size * cpc.max_dim;
    cuda_assert( cudaMalloc( reinterpret_cast<void**>(&I_diff), I_diff_size ) );
    cuda_assert( cudaMemset( reinterpret_cast<void*>(I_diff), 0, I_diff_size ) );

    size_type const I_zigmoid_size = sizeof(value_type) * cpc.tilt_size * cpc.max_dim;
    cuda_assert( cudaMalloc( reinterpret_cast<void**>(&I_zigmoid), I_zigmoid_size ) );
    cuda_assert( cudaMemset( reinterpret_cast<void*>(I_zigmoid), 0, I_zigmoid_size ) );

    size_type const thickness_array_size = sizeof(value_type) * cpc.tilt_size;
    cuda_assert( cudaMalloc( reinterpret_cast<void**>(&thickness_array), thickness_array_size ) );
    cuda_assert( cudaMemset( reinterpret_cast<void*>(thickness_array), 0, thickness_array_size ) );

    size_type const cache_size = sizeof(complex_type) * cpc.tilt_size * cpc.max_dim * cpc.max_dim * 6;
    cuda_assert( cudaMalloc( reinterpret_cast<void**>(&cache), cache_size ) );
    cuda_assert( cudaMemset( reinterpret_cast<void*>(cache), 0, cache_size ) );
}
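// The constructor above repeats the same cudaMalloc + cudaMemset pair for
// every buffer. A hypothetical helper (not part of the original code) that
// would collapse each pair into a single call:
template <typename T>
void cuda_alloc_zeroed( T*& ptr, size_type const size_in_bytes )
{
    cuda_assert( cudaMalloc( reinterpret_cast<void**>(&ptr), size_in_bytes ) );
    cuda_assert( cudaMemset( reinterpret_cast<void*>(ptr), 0, size_in_bytes ) );
}
// e.g. cuda_alloc_zeroed( ug, sizeof(value_type) * cpc.ug_size * 2 );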
void cuda_pop_context()
{
    cuda_assert(cuCtxSetCurrent(NULL));
}
#define cuda_error(stmt) cuda_error_(stmt, #stmt)

void cuda_error_message(const string& message)
{
    if(error_msg == "")
        error_msg = message;

    fprintf(stderr, "%s\n", message.c_str());
    cuda_error_documentation();
}

void cuda_push_context()
{
    cuda_assert(cuCtxSetCurrent(cuContext));
}

CUDADevice(DeviceInfo& info, Stats &stats, bool background_) : Device(stats)
{
    first_error = true;
    background = background_;

    cuDevId = info.num;
    cuDevice = 0;
    cuContext = 0;
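// The driver-API code in this section applies cuda_assert to CUresult values,
// which suggests a statement-like macro distinct from the runtime-API variant
// sketched earlier. One plausible shape, given the cuda_error_message
// machinery above (an assumption, not the original definition):
#define cuda_assert(stmt) \
    { \
        CUresult result_ = (stmt); \
        if(result_ != CUDA_SUCCESS) { \
            const char *msg_ = NULL; \
            cuGetErrorString(result_, &msg_); \
            fprintf(stderr, "CUDA error: %s failed: %s\n", #stmt, msg_ ? msg_ : "unknown"); \
        } \
    }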
~cuda_xpattern_data()
{
    int current_id;
    cuda_assert( cudaGetDevice( &current_id ) );
    if ( current_id != device_id )
        cuda_assert( cudaSetDevice( device_id ) );

    if ( ar ) cuda_assert( cudaFree(ar) );
    if ( dim ) cuda_assert( cudaFree(dim) );
    if ( I_diff ) cuda_assert( cudaFree(I_diff) );
    if ( I_exp ) cuda_assert( cudaFree(I_exp) );
    if ( I_zigmoid ) cuda_assert( cudaFree(I_zigmoid) );
    if ( diag ) cuda_assert( cudaFree(diag) );
    if ( ug ) cuda_assert( cudaFree(ug) );
    if ( thickness_array ) cuda_assert( cudaFree(thickness_array) );
    if ( cache ) cuda_assert( cudaFree(cache) );

    ar = 0; dim = 0; I_diff = 0; I_exp = 0; I_zigmoid = 0; diag = 0; ug = 0; thickness_array = 0; cache = 0;
}
void update_kt_factor( value_type* host_kt_factor )
{
    cuda_assert( cudaMemcpy( reinterpret_cast<void*>(data.kt_factor), reinterpret_cast<const void*>(host_kt_factor), config.tilt_size * sizeof(value_type) * 3, cudaMemcpyHostToDevice ) );
}
~cuda_pattern_data()
{
    int current_id;
    cuda_assert( cudaGetDevice( &current_id ) );
    if ( current_id != device_id )
        cuda_assert( cudaSetDevice( device_id ) );

    if ( ar ) cuda_assert( cudaFree(ar) );
    if ( dim ) cuda_assert( cudaFree(dim) );
    if ( I_diff ) cuda_assert( cudaFree(I_diff) );
    if ( I_exp ) cuda_assert( cudaFree(I_exp) );
    if ( I_zigmoid ) cuda_assert( cudaFree(I_zigmoid) );
    if ( diag ) cuda_assert( cudaFree(diag) );
    if ( ug ) cuda_assert( cudaFree(ug) );
    if ( cache ) cuda_assert( cudaFree(cache) );
    if ( beams ) cuda_assert( cudaFree(beams) );
    if ( kt_factor ) cuda_assert( cudaFree(kt_factor) );
    if ( gvec ) cuda_assert( cudaFree(gvec) );
    if ( tilt ) cuda_assert( cudaFree(tilt) );

    ar = 0; dim = 0; I_diff = 0; I_exp = 0; I_zigmoid = 0; diag = 0; ug = 0; cache = 0; beams = 0; kt_factor = 0; gvec = 0; tilt = 0;
}
cuda_pattern_data( cuda_pattern_config const& cpc )
{
    device_id = cpc.device_id;
    int current_id;
    cuda_assert( cudaGetDevice( &current_id ) );
    if ( current_id != device_id )
        cuda_assert( cudaSetDevice( device_id ) );

    size_type const kt_factor_size = sizeof(value_type) * cpc.tilt_size * 3;
    cuda_assert( cudaMalloc( reinterpret_cast<void**>(&kt_factor), kt_factor_size ) );
    cuda_assert( cudaMemset( reinterpret_cast<void*>(kt_factor), 0, kt_factor_size ) );

    size_type const beams_size = sizeof(value_type) * cpc.tilt_size * 10;
    cuda_assert( cudaMalloc( reinterpret_cast<void**>(&beams), beams_size ) );
    cuda_assert( cudaMemset( reinterpret_cast<void*>(beams), 0, beams_size ) );

    size_type const ug_size = sizeof(value_type) * cpc.ug_size * 2;
    cuda_assert( cudaMalloc( reinterpret_cast<void**>(&ug), ug_size ) );
    cuda_assert( cudaMemset( reinterpret_cast<void*>(ug), 0, ug_size ) );

    size_type const ar_size = sizeof(size_type) * cpc.tilt_size * cpc.max_dim * cpc.max_dim;
    cuda_assert( cudaMalloc( reinterpret_cast<void**>(&ar), ar_size ) );
    cuda_assert( cudaMemset( reinterpret_cast<void*>(ar), 0, ar_size ) );

    size_type const diag_size = sizeof(value_type) * cpc.tilt_size * cpc.max_dim;
    cuda_assert( cudaMalloc( reinterpret_cast<void**>(&diag), diag_size ) );
    cuda_assert( cudaMemset( reinterpret_cast<void*>(diag), 0, diag_size ) );

    size_type const dim_size = sizeof(size_type) * cpc.tilt_size;
    cuda_assert( cudaMalloc( reinterpret_cast<void**>(&dim), dim_size ) );
    cuda_assert( cudaMemset( reinterpret_cast<void*>(dim), 0, dim_size ) );

    size_type const I_exp_size = sizeof(value_type) * cpc.tilt_size * cpc.max_dim;
    cuda_assert( cudaMalloc( reinterpret_cast<void**>(&I_exp), I_exp_size ) );
    cuda_assert( cudaMemset( reinterpret_cast<void*>(I_exp), 0, I_exp_size ) );

    size_type const I_diff_size = sizeof(value_type) * cpc.tilt_size * cpc.max_dim;
    cuda_assert( cudaMalloc( reinterpret_cast<void**>(&I_diff), I_diff_size ) );
    cuda_assert( cudaMemset( reinterpret_cast<void*>(I_diff), 0, I_diff_size ) );

    size_type const I_zigmoid_size = sizeof(value_type) * cpc.tilt_size * cpc.max_dim;
    cuda_assert( cudaMalloc( reinterpret_cast<void**>(&I_zigmoid), I_zigmoid_size ) );
    cuda_assert( cudaMemset( reinterpret_cast<void*>(I_zigmoid), 0, I_zigmoid_size ) );

    size_type const cache_size = sizeof(complex_type) * cpc.tilt_size * cpc.max_dim * cpc.max_dim * 6;
    cuda_assert( cudaMalloc( reinterpret_cast<void**>(&cache), cache_size ) );
    cuda_assert( cudaMemset( reinterpret_cast<void*>(cache), 0, cache_size ) );

    size_type const gvec_size = sizeof(value_type) * cpc.ug_size * 2;
    cuda_assert( cudaMalloc( reinterpret_cast<void**>(&gvec), gvec_size ) );
    cuda_assert( cudaMemset( reinterpret_cast<void*>(gvec), 0, gvec_size ) );

    size_type const tilt_size = sizeof(value_type) * cpc.tilt_size * 2;
    cuda_assert( cudaMalloc( reinterpret_cast<void**>(&tilt), tilt_size ) );
    cuda_assert( cudaMemset( reinterpret_cast<void*>(tilt), 0, tilt_size ) );
}