int main(int argc, char **argv) {
    cudaError_t err = cudaSuccess;
    int deviceCount = 0;
    size_t totalDevMem, freeDevMem;
    size_t lastLineLength = 0; // MUST be initialized to zero

    signal(SIGTERM, signalHandler);
    signal(SIGQUIT, signalHandler);
    signal(SIGINT, signalHandler);
    signal(SIGHUP, signalHandler);

    writeLine(lastLineLength, "Preparing...");

    err = cudaGetDeviceCount(&deviceCount);
    if (err != cudaSuccess) {
        std::cerr << "ERROR: " << cudaGetErrorString(err) << std::endl;
    }

    while (err == cudaSuccess && gRun) {
        std::ostringstream stream;
        for (int i = 0; i < deviceCount; ++i) {
            if (err == cudaSuccess) {
                err = cudaSetDevice(i);
                if (err == cudaSuccess) {
                    cudaMemGetInfo(&freeDevMem, &totalDevMem);
                    if (i != 0)
                        stream << " : ";
                    stream << "Dev " << i << " (" << (freeDevMem / 1024) << " KB of "
                           << (totalDevMem / 1048576) << " MB free)";
                }
            }
        }
        if (err == cudaSuccess) {
            writeLine(lastLineLength, stream.str());
        }
        sleep(5); // TODO - make the cycle time an optional command line flag...
    }

    cudaThreadExit();
    std::cout << std::endl;
    return 0;
}
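// The monitor above assumes three external symbols (gRun, signalHandler,
// writeLine) that are not shown. A minimal sketch of what they might look
// like -- the behavior is an assumption, not the original implementation:
#include <csignal>
#include <iostream>
#include <string>

static volatile sig_atomic_t gRun = 1;

static void signalHandler(int /*signum*/) {
    gRun = 0; // ask the monitoring loop to exit cleanly
}

// Rewrites the current console line in place, padding with spaces so a
// shorter message fully overwrites the previous (longer) one.
static void writeLine(size_t &lastLineLength, const std::string &msg) {
    std::cout << '\r' << msg;
    for (size_t i = msg.size(); i < lastLineLength; ++i) std::cout << ' ';
    std::cout << std::flush;
    lastLineLength = msg.size();
}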
static CUT_THREADPROC solverThread(TOptionPlan *plan) {
    // Init GPU
    cutilSafeCall(cudaSetDevice(plan->device));

    cudaDeviceProp deviceProp;
    cutilSafeCall(cudaGetDeviceProperties(&deviceProp, plan->device));
    int version = deviceProp.major * 10 + deviceProp.minor;
    if (useDoublePrecision && version < 13) {
        printf("Double precision is not supported on device %i.\n", plan->device);
        exit(0);
    }

    // Allocate memory for normally distributed samples
    cutilSafeCall(cudaMalloc((void **)&plan->d_Samples, plan->pathN * sizeof(float)));

    // Generate normally distributed samples
    if (useDoublePrecision)
        inverseCND_SM13(plan->d_Samples, NULL, plan->pathN);
    else
        inverseCND_SM10(plan->d_Samples, NULL, plan->pathN);

    // Allocate intermediate memory for MC integrator
    if (useDoublePrecision)
        initMonteCarlo_SM13(plan);
    else
        initMonteCarlo_SM10(plan);

    // Main computations
    if (useDoublePrecision)
        MonteCarlo_SM13(plan);
    else
        MonteCarlo_SM10(plan);
    cutilSafeCall(cudaThreadSynchronize());

    // Shut down this GPU
    if (useDoublePrecision)
        closeMonteCarlo_SM13(plan);
    else
        closeMonteCarlo_SM10(plan);
    cutilSafeCall(cudaFree(plan->d_Samples));

    cudaThreadExit();
    CUT_THREADEND;
}
value_type abs_residual(value_type* ug, value_type thickness) {
    int current_id;
    cuda_assert(cudaGetDevice(&current_id));
    if (current_id != config.device_id)
        cuda_assert(cudaSetDevice(config.device_id));

    update_I_diff(ug, thickness);

    value_type residual;
    cublasHandle_t handle;
    cublas_assert(cublasCreate_v2(&handle));
    // cublasDasum operates on doubles, so value_type is evidently double here
    cublas_assert(cublasDasum_v2(handle,
                                 static_cast<int>(config.max_dim * config.tilt_size),
                                 data.I_diff, 1, &residual));
    cublas_assert(cublasDestroy_v2(handle));
    return residual;
}
// void device_init_(int *icuda) {
int device_init_(long *icuda, long *cuda_device_number) {
    /* Set device_id */
    int dev_count_check = 0;
    device_id = util_my_smp_index();
    cudaGetDeviceCount(&dev_count_check);
    if (dev_count_check < *icuda) {
        printf("Warning: Please check whether you have %ld cuda devices per node\n", *icuda);
        fflush(stdout);
        *cuda_device_number = 30; // out-of-range value reports the problem to the caller
    } else {
        cudaSetDevice(device_id);
    }
    return 1;
}
void Solver::Reduce(int device, Caffe::Brew mode, uint64_t random_seed,
                    int solver_count, bool root_solver) {
  Caffe::set_mode(mode);
#ifndef CPU_ONLY
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaSetDevice(device));
#ifndef NO_NVML
    nvml::setCpuAffinity(rank_);
#endif
  }
#endif
  Caffe::set_random_seed(random_seed);
  Caffe::set_solver_count(solver_count);
  Caffe::set_root_solver(root_solver);
  net_->ReduceAndUpdate();
}
bool initCuda() {
    int devID = 0;
    int device_count = 0;
    cudaGetDeviceCount(&device_count);
    if (device_count < 1) {
        return false;
    }
    cudaSetDevice(devID);
    return true;
}
bool InitCUDA(int i) {
    int count = 0;
    cudaGetDeviceCount(&count);
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA.\n");
        return false;
    }
    // Note: the modulo wraps any other out-of-range index back into
    // [0, count), so only i == count is rejected above.
    cudaSetDevice(i % count);
    return true;
}
bool setupFirstCuda2xDevice() {
    int numberOfDevices = 0;
    if (cudaSuccess != cudaGetDeviceCount(&numberOfDevices)) {
        return false;
    }
    for (int d = 0; d < numberOfDevices; ++d) {
        cudaDeviceProp properties;
        if (cudaSuccess != cudaGetDeviceProperties(&properties, d)) {
            continue;
        }
        if ((2 == properties.major) && (cudaSuccess == cudaSetDevice(d))) {
            return true;
        }
    }
    return false;
}
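// Minimal usage sketch for the helper above; the surrounding main() is an
// assumption, only setupFirstCuda2xDevice comes from the original:
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    if (!setupFirstCuda2xDevice()) {
        fprintf(stderr, "No compute-capability-2.x device available.\n");
        return 1;
    }
    int dev = -1;
    cudaGetDevice(&dev); // the helper left the chosen device current
    printf("Using CUDA device %d\n", dev);
    return 0;
}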
PtrFreeScene* PtrFreeScene::to_device(int device_id) const {
    cudaSetDevice(device_id);

    PtrFreeScene* scene = new PtrFreeScene;
    memcpy(scene, this, sizeof(PtrFreeScene));
    scene->original_scene = NULL;
    scene->data_set = NULL;

    /* Point* vertexes;
       cudaMalloc(&vertexes, sizeof(Point) * vertex_count);
       cudaMemcpy(vertexes, this->vertexes, sizeof(Point) * vertex_count, cudaMemcpyHostToDevice);
       test_kernel<<<1, 1>>>(vertexes, vertex_count);
       Point new_vertexes[2];
       cudaMemcpy(&new_vertexes, vertexes, 2 * sizeof(Point), cudaMemcpyDeviceToHost);
       printf("%f\n", new_vertexes[0].x); */

    CUDA_SAFE(alloc_copy_to_cuda(&scene->vertexes, this->vertexes, vertex_count));
    CUDA_SAFE(alloc_copy_to_cuda(&scene->normals, this->normals, normals_count));
    CUDA_SAFE(alloc_copy_to_cuda(&scene->colors, this->colors, colors_count));
    CUDA_SAFE(alloc_copy_to_cuda(&scene->uvs, this->uvs, uvs_count));
    CUDA_SAFE(alloc_copy_to_cuda(&scene->triangles, this->triangles, triangles_count));
    CUDA_SAFE(alloc_copy_to_cuda(&scene->mesh_descs, this->mesh_descs, mesh_descs_count));
    CUDA_SAFE(alloc_copy_to_cuda(&scene->mesh_ids, this->mesh_ids, data_set->totalTriangleCount));
    CUDA_SAFE(alloc_copy_to_cuda(&scene->mesh_first_triangle_offset, this->mesh_first_triangle_offset, mesh_count));
    CUDA_SAFE(alloc_copy_to_cuda(&scene->compiled_materials, this->compiled_materials, ppm::MAT_MAX));
    CUDA_SAFE(alloc_copy_to_cuda(&scene->materials, this->materials, materials_count));
    CUDA_SAFE(alloc_copy_to_cuda(&scene->mesh_materials, this->mesh_materials, mesh_materials_count));
    CUDA_SAFE(alloc_copy_to_cuda(&scene->area_lights, this->area_lights, area_lights_count));
    CUDA_SAFE(alloc_copy_to_cuda(&scene->infinite_light_map, this->infinite_light_map,
                                 this->infinite_light.width * this->infinite_light.height));
    CUDA_SAFE(alloc_copy_to_cuda(&scene->tex_maps, this->tex_maps, tex_maps_count));
    CUDA_SAFE(alloc_copy_to_cuda(&scene->rgb_tex, this->rgb_tex, rgb_tex_count));
    CUDA_SAFE(alloc_copy_to_cuda(&scene->alpha_tex, this->alpha_tex, alpha_tex_count));
    CUDA_SAFE(alloc_copy_to_cuda(&scene->mesh_texs, this->mesh_texs, mesh_materials_count));
    CUDA_SAFE(alloc_copy_to_cuda(&scene->bump_map, this->bump_map, mesh_materials_count));
    CUDA_SAFE(alloc_copy_to_cuda(&scene->bump_map_scales, this->bump_map_scales, mesh_materials_count));
    CUDA_SAFE(alloc_copy_to_cuda(&scene->normal_map, this->normal_map, mesh_materials_count));
    CUDA_SAFE(alloc_copy_to_cuda(&scene->nodes, (QBVHNode*) this->data_set->GetAccelerator()->GetNodes(), n_nodes));
    CUDA_SAFE(alloc_copy_to_cuda(&scene->prims, (QuadTriangle*) this->data_set->GetAccelerator()->GetPrims(), n_prims));

    // Finally copy the host-side mirror struct itself to the device.
    PtrFreeScene* cuda_scene;
    CUDA_SAFE(alloc_copy_to_cuda(&cuda_scene, scene, 1));
    CUDA_SAFE(cudaDeviceSynchronize());
    return cuda_scene;
}
int main(int argc, char **argv) {
    //test_resize("data/bad.jpg");
    //test_box();
    //test_convolutional_layer();
    if (argc < 2) {
        fprintf(stderr, "usage: %s <function>\n", argv[0]);
        return 0;
    }

    gpu_index = find_int_arg(argc, argv, "-i", 0);
    if (find_arg(argc, argv, "-nogpu")) gpu_index = -1;

#ifndef GPU
    gpu_index = -1;
#else
    if (gpu_index >= 0) {
        cudaSetDevice(gpu_index);
    }
#endif

    if (0 == strcmp(argv[1], "imagenet")) {
        run_imagenet(argc, argv);
    } else if (0 == strcmp(argv[1], "detection")) {
        run_detection(argc, argv);
    } else if (0 == strcmp(argv[1], "writing")) {
        run_writing(argc, argv);
    } else if (0 == strcmp(argv[1], "test")) {
        test_resize(argv[2]);
    } else if (0 == strcmp(argv[1], "captcha")) {
        run_captcha(argc, argv);
    } else if (0 == strcmp(argv[1], "nightmare")) {
        run_nightmare(argc, argv);
    } else if (0 == strcmp(argv[1], "change")) {
        change_rate(argv[2], atof(argv[3]), (argc > 4) ? atof(argv[4]) : 0);
    } else if (0 == strcmp(argv[1], "rgbgr")) {
        rgbgr_net(argv[2], argv[3], argv[4]);
    } else if (0 == strcmp(argv[1], "partial")) {
        partial(argv[2], argv[3], argv[4], atoi(argv[5]));
    } else if (0 == strcmp(argv[1], "visualize")) {
        visualize(argv[2], (argc > 3) ? argv[3] : 0);
    } else if (0 == strcmp(argv[1], "imtest")) {
        test_resize(argv[2]);
    } else {
        fprintf(stderr, "Not an option: %s\n", argv[1]);
    }
    return 0;
}
void Caffe::SetDevice(const int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_));
  if (Get().curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_));
  }
  CUDA_CHECK(cudaSetDevice(device_id));
  CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_));
  CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_,
                                     CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_,
                                                  cluster_seedgen()));
}
// Alloc a GPU segment,
// then register it with all nodes,
// then delete it.
int main(int argc, char *argv[]) {
    TSUITE_INIT(argc, argv);
    ASSERT(gaspi_proc_init(GASPI_BLOCK));
    ASSERT(gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));

    gaspi_rank_t rank, nprocs, i;
    gaspi_number_t seg_max;
    gaspi_gpu_id_t gpus[8];
    gaspi_number_t nGPUs;

    ASSERT(gaspi_proc_num(&nprocs));
    ASSERT(gaspi_proc_rank(&rank));
    ASSERT(gaspi_gpu_init());

    seg_max = 1;

    ASSERT(gaspi_gpu_number(&nGPUs));
    ASSERT(gaspi_gpu_ids(gpus));

    cudaSetDevice(gpus[0]);

    ASSERT(gaspi_segment_alloc(0, 1024, GASPI_MEM_INITIALIZED | GASPI_MEM_GPU));
    ASSERT(gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));

    for (i = 0; i < nprocs; i++) {
        if (i == rank) continue;
        ASSERT(gaspi_segment_register(0, i, GASPI_BLOCK));
    }

    ASSERT(gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));
    ASSERT(gaspi_segment_delete(0));
    ASSERT(gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));
    ASSERT(gaspi_proc_term(GASPI_BLOCK));

    return EXIT_SUCCESS;
}
void initializeCUDA() {
    cudaError_t error;
    int devID = 0;

    error = cudaSetDevice(devID);
    if (error != cudaSuccess) {
        printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaGetDevice(&devID);
    if (error != cudaSuccess) {
        printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }
    // printf("Device ID is %d\n", devID);

    cudaDeviceProp deviceProp;
    error = cudaGetDeviceProperties(&deviceProp, devID);
    if (error != cudaSuccess) {
        printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }
    // printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
    //        devID, deviceProp.name, deviceProp.major, deviceProp.minor);

    // use larger block size for Fermi and above
    block_size = (deviceProp.major < 2) ? 16 : 32;
}
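// The repeated check-and-exit pattern above is often factored into a macro.
// A minimal sketch -- the macro name and the block_size parameter are
// assumptions added for illustration, not part of the original:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CHECK_CUDA(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "%s returned %d (%s) at %s:%d\n", #call, err_,   \
                    cudaGetErrorString(err_), __FILE__, __LINE__);           \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// The same initialization, rewritten with the macro:
void initializeCUDA_checked(int &block_size) {
    int devID = 0;
    CHECK_CUDA(cudaSetDevice(devID));
    CHECK_CUDA(cudaGetDevice(&devID));
    cudaDeviceProp deviceProp;
    CHECK_CUDA(cudaGetDeviceProperties(&deviceProp, devID));
    // use larger block size for Fermi and above
    block_size = (deviceProp.major < 2) ? 16 : 32;
}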
void cutilDeviceInit(int argc, char **argv) {
    int deviceCount;
    cutilSafeCall(cudaGetDeviceCount(&deviceCount));
    if (deviceCount == 0) {
        printf("cutil error: no devices supporting CUDA\n");
        exit(-1);
    }
    cudaDeviceProp deviceProp; // the runtime type is cudaDeviceProp, not cudaDeviceProp_t
    cutilSafeCall(cudaGetDeviceProperties(&deviceProp, 0));
    printf("\n Using CUDA device: %s\n", deviceProp.name);
    cutilSafeCall(cudaSetDevice(0));
}
void cuda_running_configuration::update_parameters() {
    cuda_safe_call(cudaDriverGetVersion(&driver_version));
    cuda_safe_call(cudaRuntimeGetVersion(&runtime_version));

    int device_count;
    cuda_safe_call(cudaGetDeviceCount(&device_count));
    if (device_count <= 0)
        throw neural_network_exception("No CUDA capable devices are found");
    if (device_id >= device_count)
        throw neural_network_exception((boost::format("Device ID %1% specified while %2% devices are available") % device_id % device_count).str());

    cudaDeviceProp device_prop;
    cuda_safe_call(cudaGetDeviceProperties(&device_prop, device_id));
    device_name = device_prop.name;
    compute_capability_major = device_prop.major;
    compute_capability_minor = device_prop.minor;
    clock_rate = device_prop.clockRate;
    memory_clock_rate = device_prop.memoryClockRate;
    memory_bus_width = device_prop.memoryBusWidth;
    global_memory_size = device_prop.totalGlobalMem;
    ecc_enabled = (device_prop.ECCEnabled != 0);
    l2_cache_size = device_prop.l2CacheSize;
    multiprocessor_count = device_prop.multiProcessorCount;
    smem_per_block = device_prop.sharedMemPerBlock;
    max_threads_per_multiprocessor = device_prop.maxThreadsPerMultiProcessor;
    max_threads_per_block = device_prop.maxThreadsPerBlock;
    for (int i = 0; i < sizeof(max_threads_dim) / sizeof(max_threads_dim[0]); ++i)
        max_threads_dim[i] = device_prop.maxThreadsDim[i];
    for (int i = 0; i < sizeof(max_grid_size) / sizeof(max_grid_size[0]); ++i)
        max_grid_size[i] = device_prop.maxGridSize[i];
    max_texture_1d_linear = device_prop.maxTexture1DLinear;
    texture_alignment = device_prop.textureAlignment;
    pci_bus_id = device_prop.pciBusID;
    pci_device_id = device_prop.pciDeviceID;
#ifdef _WIN32
    tcc_mode = (device_prop.tccDriver != 0);
#endif

    cuda_safe_call(cudaSetDevice(device_id));
    cublas_safe_call(cublasCreate(&cublas_handle));
    cusparse_safe_call(cusparseCreate(&cusparse_handle));
}
void DialogSelectHardware::ChangeText(int indexDevice) {
    int driverVersion = 0, runtimeVersion = 0;

    cudaSetDevice(indexDevice);
    cudaGetDeviceProperties(deviceProp, indexDevice);
    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);

    char msg[256];
    SPRINTF(msg, "%.0f MBytes (%llu bytes)\n",
            (float)deviceProp->totalGlobalMem / 1048576.0f,
            (unsigned long long)deviceProp->totalGlobalMem);

    ui->tableWidget->clear();
    addItem(QString("Device " + QString::number(indexDevice).append(" : ") + deviceProp->name), 0, 0);
    addItem((selectDevice == indexDevice) ? "Selected device" : " ", 0, 1);
    addItem("CUDA Driver Version / Runtime Version", 1, 0);
    addItem(QString("%1.%2 / %3.%4").arg(driverVersion / 1000).arg((driverVersion % 100) / 10)
                                    .arg(runtimeVersion / 1000).arg((runtimeVersion % 100) / 10), 1, 1);
    addItem("CUDA Capability Major/Minor version number:", 2, 0);
    addItem(QString("%1.%2").arg(deviceProp->major).arg(deviceProp->minor), 2, 1);
    addItem("Total amount of global memory:", 3, 0);
    addItem(msg, 3, 1);
    addItem(QString("(%1) Multiprocessors, (%2) CUDA Cores/MP: %3 CUDA Cores").arg(
                deviceProp->multiProcessorCount).arg(
                _ConvertSMVer2Cores(deviceProp->major, deviceProp->minor)).arg(
                _ConvertSMVer2Cores(deviceProp->major, deviceProp->minor) * deviceProp->multiProcessorCount), 4, 0);
    addItem("Total amount of constant memory:", 5, 0);
    addItem(QString("%1 bytes").arg(deviceProp->totalConstMem), 5, 1);
    addItem("Total amount of shared memory per block:", 6, 0);
    addItem(QString("%1 bytes").arg(deviceProp->sharedMemPerBlock), 6, 1);
    addItem("Total number of registers available per block:", 7, 0);
    addItem(QString("%1").arg(deviceProp->regsPerBlock), 7, 1);
    addItem("Warp size:", 8, 0);
    addItem(QString("%1").arg(deviceProp->warpSize), 8, 1);
    addItem("Maximum number of threads per multiprocessor:", 9, 0);
    addItem(QString("%1").arg(deviceProp->maxThreadsPerMultiProcessor), 9, 1);
    addItem("Maximum number of threads per block:", 10, 0);
    addItem(QString("%1").arg(deviceProp->maxThreadsPerBlock), 10, 1);
    addItem("Max dimension size of a thread block (x,y,z):", 11, 0);
    addItem(QString("(%1, %2, %3)").arg(deviceProp->maxThreadsDim[0]).arg(
                deviceProp->maxThreadsDim[1]).arg(deviceProp->maxThreadsDim[2]), 11, 1);
    addItem("Max dimension size of a grid size (x,y,z):", 12, 0);
    addItem(QString("(%1, %2, %3)\n").arg(deviceProp->maxGridSize[0]).arg(
                deviceProp->maxGridSize[1]).arg(deviceProp->maxGridSize[2]), 12, 1);
    addItem("Run time limit on kernels:", 13, 0);
    addItem(QString("%1\n").arg(deviceProp->kernelExecTimeoutEnabled ? "Yes" : "No"), 13, 1);
    addItem("Integrated GPU sharing Host Memory:", 14, 0);
    addItem(QString("%1\n").arg(deviceProp->integrated ? "Yes" : "No"), 14, 1);

    ui->tableWidget->resizeColumnsToContents();
    ui->tableWidget->resizeRowsToContents();
}
int main(int argc, char **argv) {
    // define the pointers
    int size = WIDTH * HEIGHT;
    float *h_data, *d_send_data, *d_recv_data;
    bool use_cuda_time = true;

    if (argc != 3) {
        std::cout << "expected exactly 2 arguments" << std::endl;
        std::cout << "e.g.: bandwidth_test_between2gpu 0 1" << std::endl;
        return 1;
    }

    int id0 = atoi(argv[1]);
    int id1 = atoi(argv[2]);
    std::cout << "id0=" << id0 << ", id1=" << id1 << std::endl;

    // h_data = new float[size];
    cudaMallocHost(&h_data, size * sizeof(float)); // pinned host memory
    init_data(h_data, size);

    cudaSetDevice(id0);
    cudaMalloc(&d_send_data, size * sizeof(float));
    cudaSetDevice(id1);
    cudaMalloc(&d_recv_data, size * sizeof(float));
    cudaMemcpy(d_send_data, h_data, size * sizeof(float), cudaMemcpyHostToDevice);

    int can_access_peer_0_1, can_access_peer_1_0;
    cudaSetDevice(id0);
    CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer_0_1, id0, id1));
    CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer_1_0, id1, id0));
    if (can_access_peer_0_1 && can_access_peer_1_0) {
        std::cout << "GPU" << id0 << " and GPU" << id1 << " can access each other: Yes" << std::endl;
        cudaSetDevice(id0);
        CUDA_CHECK(cudaDeviceEnablePeerAccess(id1, 0));
        cudaSetDevice(id1);
        CUDA_CHECK(cudaDeviceEnablePeerAccess(id0, 0));
    } else {
        std::cout << "GPU" << id0 << " and GPU" << id1 << " can access each other: No" << std::endl;
    }

    cudaSetDevice(id1);
    use_cuda_time = false;
    // use_cuda_time = true;
    test_2gpu(d_send_data, d_recv_data, size, id0, id1, use_cuda_time);

    cudaFreeHost(h_data);
    cudaFree(d_send_data);
    cudaFree(d_recv_data);
    return 0;
}
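// test_2gpu is not shown in the original. A minimal sketch of one plausible
// implementation, timing a device-to-device copy with CUDA events; all of
// this is an assumption, not the original helper:
#include <cstdio>
#include <cuda_runtime.h>

static void test_2gpu(float *d_send, float *d_recv, int n,
                      int id0, int id1, bool /*use_cuda_time*/) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    // Copies between devices; uses peer access when enabled, otherwise
    // the runtime stages the transfer through host memory.
    cudaMemcpyPeer(d_recv, id1, d_send, id0, n * sizeof(float));
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    double gb = (double)n * sizeof(float) / (1024.0 * 1024.0 * 1024.0);
    printf("copied %.3f GB in %.3f ms (%.2f GB/s)\n", gb, ms, gb / (ms / 1000.0));

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}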
TEST(PeerAccess, DeviceReset) {
    cudaError_t ret;
    int devices;

    ret = cudaGetDeviceCount(&devices);
    ASSERT_EQ(cudaSuccess, ret);

    if (devices <= 1) {
        return;
    }

    bool found = false;
    int dj;

    for (int i = 0; i < devices && !(found); i++) {
        ret = cudaSetDevice(i);
        ASSERT_EQ(cudaSuccess, ret);

        for (int j = 0; j < devices; j++) {
            int peer;
            ret = cudaDeviceCanAccessPeer(&peer, i, j);
            ASSERT_EQ(cudaSuccess, ret);

            if (peer) {
                ret = cudaDeviceEnablePeerAccess(j, 0);
                ASSERT_EQ(cudaSuccess, ret);
                found = true;
                dj = j;
                break;
            }
        }
    }

    if (!(found)) {
        return;
    }

    /* Perform a device reset. */
    ret = cudaDeviceReset();
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaDeviceDisablePeerAccess(dj);
    EXPECT_EQ(cudaErrorPeerAccessNotEnabled, ret);
}
// General GPU Device CUDA Initialization
inline int gpuDeviceInit(int devID) {
    int device_count;
    checkCudaErrors(cudaGetDeviceCount(&device_count));

    if (device_count == 0) {
        fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }

    if (devID < 0) {
        devID = 0;
    }

    if (devID > device_count - 1) {
        fprintf(stderr, "\n");
        fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", device_count);
        fprintf(stderr, ">> gpuDeviceInit (-device=%d) is not a valid GPU device. <<\n", devID);
        fprintf(stderr, "\n");
        return -devID;
    }

    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

    if (deviceProp.computeMode == cudaComputeModeProhibited) {
        fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
        return -1;
    }

    if (deviceProp.major < 1) {
        fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
        exit(EXIT_FAILURE);
    }

    checkCudaErrors(cudaSetDevice(devID));
    printf("gpuDeviceInit() CUDA Device [%d]: \"%s\"\n", devID, deviceProp.name);

    return devID;
}
ContextPtr CudaDevice::Create(int ordinal, bool stream) {
    // Create the device.
    DevicePtr device(new CudaDevice);
    cudaError_t error = cudaGetDeviceProperties(&device->_prop, ordinal);
    if (cudaSuccess != error) {
        fprintf(stderr, "FAILURE TO CREATE DEVICE %d\n", ordinal);
        exit(1); // exit with a failure status, not 0
    }

    // Set this device as the active one on the thread.
    device->_ordinal = ordinal;
    cudaSetDevice(ordinal);
    AllocPtr alloc = device->CreateDefaultAlloc();

    // Create the context.
    return device->CreateStream(stream, alloc.get());
}
void _set_cuda_device(QSP_ARG_DECL Cuda_Device *cdp) {
#ifdef HAVE_CUDA
    cudaError_t e;

    if (curr_cdp == cdp) {
        sprintf(ERROR_STRING, "set_cuda_device: current device is already %s!?", cdp->cudev_name);
        warn(ERROR_STRING);
        return;
    }

    e = cudaSetDevice(cdp->cudev_index);
    if (e != cudaSuccess)
        describe_cuda_driver_error2("set_cuda_device", "cudaSetDevice", e);
    else
        curr_cdp = cdp;
#endif // HAVE_CUDA
}
void InternalThread::entry(int device, Caffe::Brew mode, int rand_seed,
                           int solver_count, bool root_solver) {
#ifndef CPU_ONLY
  CUDA_CHECK(cudaSetDevice(device));
#endif
  Caffe::set_mode(mode);
  Caffe::set_random_seed(rand_seed);
  Caffe::set_solver_count(solver_count);
  Caffe::set_root_solver(root_solver);
#ifdef _OPENMP
  caffe::cpu::OpenMpManager::bindCurrentThreadToNonPrimaryCoreIfPossible();
#endif
  SetThreadAffinity();
  InternalThreadEntry();
}
RemoteCUDARunner::RemoteCUDARunner()
    : GPURunner<unsigned long, int>(TYPE_CUDA), m_metahashsize(0) {
    m_in = 0;
    m_devin = 0;
    m_out = 0;
    m_devout = 0;
    m_metahash = 0;
    m_devmetahash = 0;

    cutilSafeCall(cudaGetDeviceCount(&m_devicecount));

    if (m_devicecount > 0) {
        if (m_deviceindex < 0 || m_deviceindex >= m_devicecount) {
            m_deviceindex = cutGetMaxGflopsDeviceId();
            std::cout << "Setting CUDA device to Max GFlops device at index " << m_deviceindex << std::endl;
        } else {
            std::cout << "Setting CUDA device to device at index " << m_deviceindex << std::endl;
        }

        cudaDeviceProp props;
        cudaGetDeviceProperties(&props, m_deviceindex);

        std::cout << "Device info for " << props.name << " :" << std::endl;
        std::cout << "Compute Capability : " << props.major << "." << props.minor << std::endl;
        // cudaDeviceProp::clockRate is reported in kHz
        std::cout << "Clock Rate (kHz) : " << props.clockRate << std::endl;

        if (props.major > 999) {
            std::cout << "CUDA seems to be running in CPU emulation mode" << std::endl;
        }

        cutilSafeCall(cudaSetDevice(m_deviceindex));
    } else {
        m_deviceindex = -1;
        std::cout << "No CUDA capable device detected" << std::endl;
    }
}
cuda_xpattern_data(cuda_xpattern_config const& cpc) {
    device_id = cpc.device_id;

    int current_id;
    cuda_assert(cudaGetDevice(&current_id));
    if (current_id != device_id)
        cuda_assert(cudaSetDevice(device_id));

    size_type const ug_size = sizeof(value_type) * cpc.ug_size * 2;
    cuda_assert(cudaMalloc(reinterpret_cast<void**>(&ug), ug_size));
    cuda_assert(cudaMemset(reinterpret_cast<void*>(ug), 0, ug_size));

    size_type const ar_size = sizeof(size_type) * cpc.tilt_size * cpc.max_dim * cpc.max_dim;
    cuda_assert(cudaMalloc(reinterpret_cast<void**>(&ar), ar_size));
    cuda_assert(cudaMemset(reinterpret_cast<void*>(ar), 0, ar_size));

    size_type const diag_size = sizeof(value_type) * cpc.tilt_size * cpc.max_dim;
    cuda_assert(cudaMalloc(reinterpret_cast<void**>(&diag), diag_size));
    cuda_assert(cudaMemset(reinterpret_cast<void*>(diag), 0, diag_size));

    size_type const dim_size = sizeof(size_type) * cpc.tilt_size;
    cuda_assert(cudaMalloc(reinterpret_cast<void**>(&dim), dim_size));
    cuda_assert(cudaMemset(reinterpret_cast<void*>(dim), 0, dim_size));

    size_type const I_exp_size = sizeof(value_type) * cpc.tilt_size * cpc.max_dim;
    cuda_assert(cudaMalloc(reinterpret_cast<void**>(&I_exp), I_exp_size));
    cuda_assert(cudaMemset(reinterpret_cast<void*>(I_exp), 0, I_exp_size));

    size_type const I_diff_size = sizeof(value_type) * cpc.tilt_size * cpc.max_dim;
    cuda_assert(cudaMalloc(reinterpret_cast<void**>(&I_diff), I_diff_size));
    cuda_assert(cudaMemset(reinterpret_cast<void*>(I_diff), 0, I_diff_size));

    size_type const I_zigmoid_size = sizeof(value_type) * cpc.tilt_size * cpc.max_dim;
    cuda_assert(cudaMalloc(reinterpret_cast<void**>(&I_zigmoid), I_zigmoid_size));
    cuda_assert(cudaMemset(reinterpret_cast<void*>(I_zigmoid), 0, I_zigmoid_size));

    size_type const thickness_array_size = sizeof(value_type) * cpc.tilt_size;
    cuda_assert(cudaMalloc(reinterpret_cast<void**>(&thickness_array), thickness_array_size));
    cuda_assert(cudaMemset(reinterpret_cast<void*>(thickness_array), 0, thickness_array_size));

    size_type const cache_size = sizeof(complex_type) * cpc.tilt_size * cpc.max_dim * cpc.max_dim * 6;
    cuda_assert(cudaMalloc(reinterpret_cast<void**>(&cache), cache_size));
    cuda_assert(cudaMemset(reinterpret_cast<void*>(cache), 0, cache_size));
}
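// The malloc-then-memset pairs above are identical in shape and could be
// factored into one helper. A minimal sketch; the helper name is an
// assumption, not part of the original class, and it reuses the original's
// cuda_assert wrapper:
template <typename T>
void cuda_alloc_zeroed(T** ptr, size_t bytes) {
    cuda_assert(cudaMalloc(reinterpret_cast<void**>(ptr), bytes));
    cuda_assert(cudaMemset(reinterpret_cast<void*>(*ptr), 0, bytes));
}

// e.g. cuda_alloc_zeroed(&ug, sizeof(value_type) * cpc.ug_size * 2);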
void Engine::SetDevice(const int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  // The call to cudaSetDevice must come before any calls to Get, which
  // may perform initialization using the GPU.
  CUDA_CHECK(cudaSetDevice(device_id));
  if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_));
  if (Get().curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_));
  }
  CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_));
  CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_,
                                     CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_,
                                                  cluster_seedgen()));
}
// selects GPU to use and returns gpu ID or -1 if using CPU
int init_cuda() {
    // Select the proper device
    const char* devstr = getenv("CUDA_DEVICE");
    const int env_dev = (devstr != NULL) ? atoi(devstr) : 0;
    int dev = env_dev;

    int devcnt;
    ebf::cudaErrCheck(cudaGetDeviceCount(&devcnt));
    if (dev >= 0 && dev < devcnt) {
        ebf::cudaErrCheck(cudaSetDevice(dev));
        cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
    } else {
        dev = -1;
        std::cerr << "# Cannot select the CUDA device. Using CPU!" << std::endl;
    }
    return dev;
}
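// Usage sketch for init_cuda() above; the surrounding main() is an
// assumption. Device selection is driven by the CUDA_DEVICE environment
// variable, e.g. `CUDA_DEVICE=1 ./app`:
#include <cstdio>

int main() {
    int dev = init_cuda();
    if (dev < 0) {
        printf("Running on the CPU fallback path.\n");
    } else {
        printf("Running on CUDA device %d (L1-preferred cache config).\n", dev);
    }
    return 0;
}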
inline void InitTensorEngine(int dev_id) {
    cudaDeviceProp prop;
    int device_id = 0;
    int device_count = 0;
    cudaGetDeviceCount(&device_count);

    if (dev_id < 0) {
#if (MSHADOW_USE_NVML)
        device_id = AutoSelectDevice(device_count);
#endif
    } else {
        device_id = dev_id;
    }

    utils::Assert(device_id < device_count, "Incorrect Device ID");
    utils::Assert(cudaSetDevice(device_id) == cudaSuccess, "cannot set device");
    cudaGetDeviceProperties(&prop, device_id);
    printf("Use CUDA Device %d: %s\n", device_id, prop.name);
    cublasInit();
}
void InitCUDA(int device) {
    ///////////////////////////
    // CUDA initialisation
    ///////////////////////////
    int deviceCount;
    CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount));
    if (deviceCount == 0)
        std::cout << "There is no device supporting CUDA" << std::endl;

    CUDA_SAFE_CALL(cudaSetDevice(device));

    cudaDeviceProp deviceProp;
    CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, device));
    std::cout << "Device " << device << ": " << deviceProp.name << std::endl;

    // or
    // CUT_DEVICE_INIT();
    // with --device=1 (the chosen device number)
}
void initCuda(int deviceId) {
    // Check that deviceId is in range
    int nbDevice = Device::getDeviceCount();
    assert(deviceId >= 0 && deviceId < nbDevice);

    // Choose the current device (state of the host thread)
    HANDLE_ERROR(cudaSetDevice(deviceId));

    // Enable OpenGL interoperability:
    // - Creates a CUDA-specific context, shared between CUDA and GL
    // - Must be called before the first kernel launch
    // - cudaSetDevice and cudaGLSetGLDevice are mutually exclusive
    HANDLE_ERROR(cudaGLSetGLDevice(deviceId));

    // It can be useful to preload the driver, for example when benchmarking
    // (driver load is sometimes slow under Linux)
    Device::loadCudaDriver(deviceId);
    // Device::loadCudaDriverAll(); // force the driver to load for all GPUs
}
/*
   Usage:
   cutorch.streamBarrierMultiDevice({[gpu1]={stream1_1, ..., stream1_N},
                                     [gpuK]={streamK_1, ..., streamK_M}})
   with a specified GPU per each list of streams.
   Each stream (gpu1, stream1_1), ..., (gpu1, stream1_N), ...,
   (gpuK, streamK_1), ..., (gpuK, streamK_M) will wait for all others to
   complete fully. Streams are bucketed per device. Equivalent to
   streamBarrier() if only one GPU is specified.
*/
static int cutorch_streamBarrierMultiDevice(lua_State *L) {
    THCState *state = cutorch_getstate(L);
    int prevDev = -1;
    THCudaCheck(cudaGetDevice(&prevDev));

    /* Validate and count the set of {gpu={streams...}} that are mutually waiting */
    int gpus = 0;
    int streams = 0;
    checkAndCountListOfGPUStreamPairs(L, state, 1, &gpus, &streams);

    if (streams < 2) {
        /* nothing to synchronize together */
        return 0;
    }

    /*
       Events can only be recorded on the same device on which they are created.
       - For each GPU, create an event, and record that event on each stream
         given for that GPU.
       - For each GPU, for each stream, wait on the event created by each
         other GPU.
    */
    cudaEvent_t* events = (cudaEvent_t*) malloc(sizeof(cudaEvent_t) * streams);

    /* First, create an event per GPU and record events for the specified
       stream on that GPU */
    createMultiDeviceEvents(L, state, 1, events);

    /* Then, wait on the events. Each stream is actually waiting on itself
       here too, but that's harmless and isn't worth weeding out. */
    waitMultiDeviceEvents(L, state, 1, events, streams);

    /* Clean up events */
    for (int i = 0; i < streams; ++i) {
        THCudaCheck(cudaEventDestroy(events[i]));
    }
    free(events);

    THCudaCheck(cudaSetDevice(prevDev));
    return 0;
}
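// The record-then-wait pattern described in the comments above, reduced to
// plain CUDA runtime calls. An illustrative sketch for two devices with one
// stream each -- not the cutorch implementation:
#include <cuda_runtime.h>

void stream_barrier_2dev(int dev0, cudaStream_t s0, int dev1, cudaStream_t s1) {
    cudaEvent_t e0, e1;

    // Create and record one event per device, on that device's stream.
    cudaSetDevice(dev0);
    cudaEventCreateWithFlags(&e0, cudaEventDisableTiming);
    cudaEventRecord(e0, s0);
    cudaSetDevice(dev1);
    cudaEventCreateWithFlags(&e1, cudaEventDisableTiming);
    cudaEventRecord(e1, s1);

    // Each stream waits on the other device's event; neither proceeds past
    // this point until both have reached their recorded event.
    cudaSetDevice(dev0);
    cudaStreamWaitEvent(s0, e1, 0);
    cudaSetDevice(dev1);
    cudaStreamWaitEvent(s1, e0, 0);

    cudaEventDestroy(e0);
    cudaEventDestroy(e1);
}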