Esempio n. 1
0
int main(int argc, char **argv)
{
  cudaError_t err = cudaSuccess;
  int deviceCount = 0;
  size_t totalDevMem, freeDevMem;
  size_t lastLineLength = 0; // MUST be initialized to zero

  signal(SIGTERM, signalHandler);
  signal(SIGQUIT, signalHandler);
  signal(SIGINT, signalHandler);
  signal(SIGHUP, signalHandler);

  writeLine(lastLineLength, "Preparing...");

  err = cudaGetDeviceCount(&deviceCount);

  if (err != cudaSuccess) {
   std::cerr << "ERROR: " << cudaGetErrorString(err) << std::endl; 
  }

  while (err == cudaSuccess && gRun) {
    
    std::ostringstream stream;

    for (int i=0; i < deviceCount; ++i) {
      if (err == cudaSuccess) {
	err = cudaSetDevice(i);
	if (err == cudaSuccess) {
	  cudaMemGetInfo(&freeDevMem, &totalDevMem);
	  if (i != 0)
	    stream << " : ";
	  stream << "Dev " << i << " (" << (freeDevMem/1024) << " KB of " << (totalDevMem/1048576) << " MB free)";
	}
      }
    }
    if (err == cudaSuccess) {
      writeLine(lastLineLength, stream.str());
    }
    
    sleep(5); // TODO - make the cycle time an optional command line flag...
  }

  cudaThreadExit();

  std::cout << std::endl;

  return 0;
}
Esempio n. 2
0
static CUT_THREADPROC solverThread(TOptionPlan *plan) {
    //Init GPU
    cutilSafeCall( cudaSetDevice(plan->device) );

    cudaDeviceProp deviceProp;
    cutilSafeCall(cudaGetDeviceProperties(&deviceProp, plan->device));
    int version = deviceProp.major * 10 + deviceProp.minor;
    if(useDoublePrecision && version < 13) {
        printf("Double precision is not supported on device %i.\n", plan->device);
        exit(0);
    }

    //Allocate memory for normally distributed samples
    cutilSafeCall( cudaMalloc(
                       (void **)&plan->d_Samples,
                       plan->pathN * sizeof(float)
                   ) );

    //Generate normally distributed samples
    if(useDoublePrecision)
        inverseCND_SM13(plan->d_Samples, NULL, plan->pathN);
    else
        inverseCND_SM10(plan->d_Samples, NULL, plan->pathN);

    //Allocate intermediate memory for MC integrator
    if(useDoublePrecision)
        initMonteCarlo_SM13(plan);
    else
        initMonteCarlo_SM10(plan);

    //Main computations
    if(useDoublePrecision)
        MonteCarlo_SM13(plan);
    else
        MonteCarlo_SM10(plan);
    cutilSafeCall( cudaThreadSynchronize() );

    //Shut down this GPU
    if(useDoublePrecision)
        closeMonteCarlo_SM13(plan);
    else
        closeMonteCarlo_SM10(plan);
    cutilSafeCall( cudaFree(plan->d_Samples) );

    cudaThreadExit();

    CUT_THREADEND;
}
        value_type abs_residual( value_type* ug, value_type thickness )
        {
            int current_id;
            cuda_assert( cudaGetDevice(&current_id) );
            if ( current_id != config.device_id ) cuda_assert( cudaSetDevice( config.device_id ) );

            update_I_diff(ug, thickness);

            value_type residual;
            cublasHandle_t handle;
            cublas_assert( cublasCreate_v2(&handle) );
            cublas_assert( cublasDasum_v2( handle, static_cast<int>(config.max_dim*config.tilt_size), data.I_diff, 1, &residual ) );
            cublas_assert( cublasDestroy_v2(handle) );

            return residual;
        }
Esempio n. 4
0
//void device_init_(int *icuda) {
int device_init_(long *icuda,long *cuda_device_number ) {
  /* Set device_id */
  
  int dev_count_check=0;
  device_id = util_my_smp_index();
  cudaGetDeviceCount(&dev_count_check);
  if(dev_count_check < *icuda){
    printf("Warning: Please check whether you have %ld cuda devices per node\n",*icuda);
    fflush(stdout);
    *cuda_device_number = 30;
  }
  else {
    cudaSetDevice(device_id);
  }
  return 1;
}
Esempio n. 5
0
void Solver::Reduce(int device, Caffe::Brew mode, uint64_t random_seed,
    int solver_count, bool root_solver) {
  Caffe::set_mode(mode);
#ifndef CPU_ONLY
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaSetDevice(device));
#ifndef NO_NVML
    nvml::setCpuAffinity(rank_);
#endif
  }
#endif
  Caffe::set_random_seed(random_seed);
  Caffe::set_solver_count(solver_count);
  Caffe::set_root_solver(root_solver);
  net_->ReduceAndUpdate();
}
Esempio n. 6
0
bool initCuda()
{
	int devID = 0;
	int device_count= 0;

	cudaGetDeviceCount(&device_count);

	if (device_count < 1)
	{
		return false;
	}

	cudaSetDevice(devID);

	return true;
}
Esempio n. 7
0
bool InitCUDA(int i)
{
    int count = 0;

    cudaGetDeviceCount(&count);
    if(count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    if(i == count) {
        fprintf(stderr, "There is no device supporting CUDA.\n");
        return false;
    }
    cudaSetDevice(i%count);
    return true;
}
bool setupFirstCuda2xDevice () {
    int numberOfDevices = 0;
    if (cudaSuccess != cudaGetDeviceCount (&numberOfDevices)) {
        return false;
    }
    for (int d = 0; d < numberOfDevices; ++d) {
        cudaDeviceProp properties;
        if (cudaSuccess != cudaGetDeviceProperties (&properties, d)) {
            continue;
        }
        if ((2 == properties.major) && (cudaSuccess == cudaSetDevice(d))) {
            return true;
        }
    }
    return false;
}
Esempio n. 9
0
PtrFreeScene* PtrFreeScene :: to_device(int device_id) const {
  cudaSetDevice(device_id);

  PtrFreeScene* scene = new PtrFreeScene;
  memcpy(scene, this, sizeof(PtrFreeScene));
  scene->original_scene = NULL;
  scene->data_set = NULL;

  /*Point* vertexes;
  cudaMalloc(&vertexes, sizeof(Point) * vertex_count);
  cudaMemcpy(vertexes, this->vertexes, sizeof(Point) * vertex_count, cudaMemcpyHostToDevice);

  test_kernel<<<1, 1>>>(vertexes, vertex_count);
  Point new_vertexes[2];
  cudaMemcpy(&new_vertexes, vertexes, 2 * sizeof(Point), cudaMemcpyDeviceToHost);
  printf("%f\n", new_vertexes[0].x);*/

  CUDA_SAFE(alloc_copy_to_cuda(&scene->vertexes,   this->vertexes,   vertex_count));
  CUDA_SAFE(alloc_copy_to_cuda(&scene->normals,    this->normals,    normals_count));
  CUDA_SAFE(alloc_copy_to_cuda(&scene->colors,     this->colors,     colors_count));
  CUDA_SAFE(alloc_copy_to_cuda(&scene->uvs,        this->uvs,        uvs_count));
  CUDA_SAFE(alloc_copy_to_cuda(&scene->triangles,  this->triangles,  triangles_count));
  CUDA_SAFE(alloc_copy_to_cuda(&scene->mesh_descs, this->mesh_descs, mesh_descs_count));
  CUDA_SAFE(alloc_copy_to_cuda(&scene->mesh_ids,   this->mesh_ids,   data_set->totalTriangleCount));
  CUDA_SAFE(alloc_copy_to_cuda(&scene->mesh_first_triangle_offset, this->mesh_first_triangle_offset, mesh_count));
  CUDA_SAFE(alloc_copy_to_cuda(&scene->compiled_materials, this->compiled_materials, ppm::MAT_MAX));
  CUDA_SAFE(alloc_copy_to_cuda(&scene->materials,          this->materials,          materials_count));
  CUDA_SAFE(alloc_copy_to_cuda(&scene->mesh_materials,     this->mesh_materials,     mesh_materials_count));
  CUDA_SAFE(alloc_copy_to_cuda(&scene->area_lights,        this->area_lights,        area_lights_count));
  CUDA_SAFE(alloc_copy_to_cuda(&scene->infinite_light_map, this->infinite_light_map, this->infinite_light.width * this->infinite_light.height));
  CUDA_SAFE(alloc_copy_to_cuda(&scene->tex_maps,           this->tex_maps,         tex_maps_count));
  CUDA_SAFE(alloc_copy_to_cuda(&scene->rgb_tex,            this->rgb_tex,          rgb_tex_count));
  CUDA_SAFE(alloc_copy_to_cuda(&scene->alpha_tex,          this->alpha_tex,        alpha_tex_count));
  CUDA_SAFE(alloc_copy_to_cuda(&scene->mesh_texs,          this->mesh_texs,        mesh_materials_count));
  CUDA_SAFE(alloc_copy_to_cuda(&scene->bump_map,           this->bump_map,         mesh_materials_count));
  CUDA_SAFE(alloc_copy_to_cuda(&scene->bump_map_scales,    this->bump_map_scales,  mesh_materials_count));
  CUDA_SAFE(alloc_copy_to_cuda(&scene->normal_map,         this->normal_map,       mesh_materials_count));

  CUDA_SAFE(alloc_copy_to_cuda(&scene->nodes, (QBVHNode*)    this->data_set->GetAccelerator()->GetNodes(), n_nodes));
  CUDA_SAFE(alloc_copy_to_cuda(&scene->prims, (QuadTriangle*)this->data_set->GetAccelerator()->GetPrims(), n_prims));

  PtrFreeScene* cuda_scene;
  CUDA_SAFE(alloc_copy_to_cuda(&cuda_scene, scene, 1));
  CUDA_SAFE(cudaDeviceSynchronize());

  return cuda_scene;
}
Esempio n. 10
0
int main(int argc, char **argv)
{
    //test_resize("data/bad.jpg");
    //test_box();
    //test_convolutional_layer();
    if(argc < 2){
        fprintf(stderr, "usage: %s <function>\n", argv[0]);
        return 0;
    }
    gpu_index = find_int_arg(argc, argv, "-i", 0);
    if(find_arg(argc, argv, "-nogpu")) gpu_index = -1;

#ifndef GPU
    gpu_index = -1;
#else
    if(gpu_index >= 0){
        cudaSetDevice(gpu_index);
    }
#endif

    if(0==strcmp(argv[1], "imagenet")){
        run_imagenet(argc, argv);
    } else if (0 == strcmp(argv[1], "detection")){
        run_detection(argc, argv);
    } else if (0 == strcmp(argv[1], "writing")){
        run_writing(argc, argv);
    } else if (0 == strcmp(argv[1], "test")){
        test_resize(argv[2]);
    } else if (0 == strcmp(argv[1], "captcha")){
        run_captcha(argc, argv);
    } else if (0 == strcmp(argv[1], "nightmare")){
        run_nightmare(argc, argv);
    } else if (0 == strcmp(argv[1], "change")){
        change_rate(argv[2], atof(argv[3]), (argc > 4) ? atof(argv[4]) : 0);
    } else if (0 == strcmp(argv[1], "rgbgr")){
        rgbgr_net(argv[2], argv[3], argv[4]);
    } else if (0 == strcmp(argv[1], "partial")){
        partial(argv[2], argv[3], argv[4], atoi(argv[5]));
    } else if (0 == strcmp(argv[1], "visualize")){
        visualize(argv[2], (argc > 3) ? argv[3] : 0);
    } else if (0 == strcmp(argv[1], "imtest")){
        test_resize(argv[2]);
    } else {
        fprintf(stderr, "Not an option: %s\n", argv[1]);
    }
    return 0;
}
Esempio n. 11
0
void Caffe::SetDevice(const int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_));
  if (Get().curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_));
  }
  CUDA_CHECK(cudaSetDevice(device_id));
  CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_));
  CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_,
      CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_,
      cluster_seedgen()));
}
Esempio n. 12
0
//alloc GPU segemnt
//and then register it with all nodes
//then delete it 
int main(int argc, char *argv[])
{
  TSUITE_INIT(argc, argv);

  ASSERT (gaspi_proc_init(GASPI_BLOCK));

  ASSERT(gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));

  gaspi_rank_t rank, nprocs, i;
  gaspi_number_t seg_max;

  gaspi_gpu_id_t gpus[8]; 
  gaspi_number_t nGPUs;

  ASSERT(gaspi_proc_num(&nprocs));
  ASSERT (gaspi_proc_rank(&rank));
  ASSERT(gaspi_gpu_init());
  seg_max = 1;
  ASSERT (gaspi_gpu_number(&nGPUs));
  ASSERT (gaspi_gpu_ids(gpus));

  cudaSetDevice(gpus[0]);

  ASSERT (gaspi_segment_alloc(0, 1024, GASPI_MEM_INITIALIZED|GASPI_MEM_GPU));

  ASSERT(gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));

  for (i = 0; i < nprocs; i++)
  {
    if(i == rank)
      continue;

    ASSERT( gaspi_segment_register(0, i, GASPI_BLOCK));

  }

  ASSERT(gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));

  ASSERT (gaspi_segment_delete(0));

  ASSERT(gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));

  ASSERT (gaspi_proc_term(GASPI_BLOCK));

  return EXIT_SUCCESS;
}
Esempio n. 13
0
void initializeCUDA() {
  cudaError_t error;
  int devID = 0;

  error = cudaSetDevice(devID); if (error != cudaSuccess){printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
  error = cudaGetDevice(&devID); if (error != cudaSuccess){printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}

  //  printf("Device ID is %d\n",devID);

  cudaDeviceProp deviceProp;
  error = cudaGetDeviceProperties(&deviceProp,devID); if (error != cudaSuccess){printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}

  //  printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);

  // use larger block size for Fermi and above
  block_size = (deviceProp.major < 2) ? 16 : 32;
}
Esempio n. 14
0
void
cutilDeviceInit ( int argc, char ** argv )
{
  int deviceCount;
  cutilSafeCall ( cudaGetDeviceCount ( &deviceCount ) );
  if ( deviceCount == 0 )
  {
    printf ( "cutil error: no devices supporting CUDA\n" );
    exit ( -1 );
  }

  cudaDeviceProp_t deviceProp;
  cutilSafeCall ( cudaGetDeviceProperties ( &deviceProp, 0 ) );

  printf ( "\n Using CUDA device: %s\n", deviceProp.name );
  cutilSafeCall ( cudaSetDevice ( 0 ) );
}
		void cuda_running_configuration::update_parameters()
		{
	        cuda_safe_call(cudaDriverGetVersion(&driver_version));
	        cuda_safe_call(cudaRuntimeGetVersion(&runtime_version));

			int device_count;
		    cuda_safe_call(cudaGetDeviceCount(&device_count));
			if (device_count <= 0)
				throw neural_network_exception("No CUDA capable devices are found");

			if (device_id >= device_count)
				throw neural_network_exception((boost::format("Device ID %1% specified while %2% devices are available") % device_id % device_count).str());

			cudaDeviceProp device_prop;
			cuda_safe_call(cudaGetDeviceProperties(&device_prop, device_id));
			device_name = device_prop.name;
			compute_capability_major = device_prop.major;
			compute_capability_minor = device_prop.minor;
			clock_rate = device_prop.clockRate;
			memory_clock_rate = device_prop.memoryClockRate;
			memory_bus_width = device_prop.memoryBusWidth;
			global_memory_size = device_prop.totalGlobalMem;
			ecc_enabled = (device_prop.ECCEnabled != 0);
			l2_cache_size = device_prop.l2CacheSize;
			multiprocessor_count = device_prop.multiProcessorCount;
			smem_per_block = device_prop.sharedMemPerBlock;
			max_threads_per_multiprocessor = device_prop.maxThreadsPerMultiProcessor;
			max_threads_per_block = device_prop.maxThreadsPerBlock;
			for(int i = 0; i < sizeof(max_threads_dim) / sizeof(max_threads_dim[0]); ++i)
				max_threads_dim[i] = device_prop.maxThreadsDim[i];
			for(int i = 0; i < sizeof(max_grid_size) / sizeof(max_grid_size[0]); ++i)
				max_grid_size[i] = device_prop.maxGridSize[i];
			max_texture_1d_linear = device_prop.maxTexture1DLinear;
			texture_alignment = device_prop.textureAlignment;
			pci_bus_id = device_prop.pciBusID;
			pci_device_id = device_prop.pciDeviceID;
		#ifdef _WIN32
			tcc_mode = (device_prop.tccDriver != 0);
		#endif

			cuda_safe_call(cudaSetDevice(device_id));

			cublas_safe_call(cublasCreate(&cublas_handle));

			cusparse_safe_call(cusparseCreate(&cusparse_handle));
		}
void DialogSelectHardware::ChangeText(int indexDevice)
{
    int  driverVersion = 0, runtimeVersion = 0;
    cudaSetDevice(indexDevice);
    cudaGetDeviceProperties(deviceProp, indexDevice);
    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);

    char msg[256];
    SPRINTF(msg,"%.0f MBytes (%llu bytes)\n",
            (float)deviceProp->totalGlobalMem/1048576.0f, (unsigned long long) deviceProp->totalGlobalMem);

    ui->tableWidget->clear();
    addItem(QString ("Device "+QString::number(indexDevice).append(" : ")+ deviceProp->name),0,0);
    addItem((selectDevice == indexDevice) ? "Dispositivo Seleccionado " : " ",0,1);
    addItem("CUDA Driver Version / Runtime Version",1,0);
    addItem(QString ("%1.%2  /  %3.%4").arg(driverVersion/1000).arg((driverVersion%100)/10).arg( runtimeVersion/1000).arg((runtimeVersion%100)/10),1,1);
    addItem("CUDA Capability Major/Minor version number: ",2,0);
    addItem(QString ("%1.%2").arg(deviceProp->major).arg(deviceProp->minor),2,1);
    addItem("Total amount of global memory:",3,0);
    addItem(msg,3,1);
    addItem(QString ("(%1) Multiprocessors, (%2) CUDA Cores/MP:%3 CUDA Cores").arg( deviceProp->multiProcessorCount).arg( _ConvertSMVer2Cores(deviceProp->major, deviceProp->minor)).arg( _ConvertSMVer2Cores(deviceProp->major, deviceProp->minor) * deviceProp->multiProcessorCount),4,0);
    addItem("Total amount of constant memory:",5,0);
    addItem(QString ("%1 bytes").arg(deviceProp->totalConstMem),5,1);
    addItem("Total amount of shared memory per block:",6,0);
    addItem(QString ("%1 bytes").arg(deviceProp->sharedMemPerBlock),6,1);
    addItem("Total number of registers available per block:",7,0);
    addItem(QString ("%1").arg(deviceProp->regsPerBlock),7,1);
    addItem("Warp size:",8,0);
    addItem(QString ("%1").arg(deviceProp->warpSize),8,1);
    addItem("Maximum number of threads per multiprocessor:",9,0);
    addItem(QString ("%1").arg(deviceProp->maxThreadsPerMultiProcessor),9,1);
    addItem("Maximum number of threads per block:",10,0);
    addItem(QString ("%1").arg(deviceProp->maxThreadsPerBlock),10,1);
    addItem("Max dimension size of a thread block (x,y,z):",11,0);
    addItem(QString ("(%1, %2, %3)").arg(deviceProp->maxThreadsDim[0]).arg(  deviceProp->maxThreadsDim[1]).arg(  deviceProp->maxThreadsDim[2]),11,1);
    addItem("Max dimension size of a grid size    (x,y,z):",12,0);
    addItem(QString ("(%1, %2, %3)\n").arg(deviceProp->maxGridSize[0]).arg(deviceProp->maxGridSize[1]).arg(deviceProp->maxGridSize[2]),12,1);
    addItem("Run time limit on kernels: ",13,0);
    addItem(QString ("%1\n").arg(deviceProp->kernelExecTimeoutEnabled ? "Yes" : "No"),13,1);
    addItem("Integrated GPU sharing Host Memory: ",14,0);
    addItem( QString ("%1\n").arg(deviceProp->integrated ? "Yes" : "No"),14,1);

    ui->tableWidget->resizeColumnsToContents();
    ui->tableWidget->resizeRowsToContents();
}
int main(int argc, char **argv)
{
	// define the ptr
	int size = WIDTH * HEIGHT;
	float *h_data, *d_send_data, *d_recv_data; 
	bool use_cuda_time = true;

	if(argc != 3) {
		std::cout << "the number of paramter should be equal 2" << std::endl;
		std::cout << "egs: bandwidth_test_between2gpu 0 1" << std::endl;
		return 1;
	}
	//std::cout << "debug 1" << std::endl;
	int id0 = atoi(argv[1]);
	int id1 = atoi(argv[2]);
	std::cout << "id0=" << id0 << ", id1=" << id1 << std::endl;

	//h_data = new float[size];
	cudaMallocHost(&h_data, size*sizeof(float));
	init_data(h_data, size);

	cudaSetDevice(id0);
	cudaMalloc(&d_send_data, size*sizeof(float));
	cudaSetDevice(id1);
	cudaMalloc(&d_recv_data, size*sizeof(float));
	cudaMemcpy(d_send_data, h_data, size*sizeof(float), cudaMemcpyHostToDevice);

	int can_access_peer_0_1, can_access_peer_1_0;
	cudaSetDevice(id0);
	CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer_0_1, id0, id1));
	CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer_1_0, id1, id0));

	if(can_access_peer_0_1 && can_access_peer_1_0) {
		std::cout << "can GPU" << id0 << "access from GPU" << id1 << ": Yes" << std::endl;
		cudaSetDevice(id0);
		CUDA_CHECK(cudaDeviceEnablePeerAccess(id1, 0));
		cudaSetDevice(id1);
		CUDA_CHECK(cudaDeviceEnablePeerAccess(id0, 0));
	} else {
		std::cout << "can GPU" << id0 << "access from GPU" << id1 << ": No" << std::endl;
	}

	cudaSetDevice(id1);
	use_cuda_time = false;
	//use_cuda_time = true;
	test_2gpu(d_send_data, d_recv_data, size, id0, id1, use_cuda_time);

	cudaFreeHost(h_data);
	cudaFree(d_send_data);
	cudaFree(d_recv_data);

	return 0;
}
Esempio n. 18
0
TEST(PeerAccess, DeviceReset) {
    cudaError_t ret;

    int devices;
    ret = cudaGetDeviceCount(&devices);
    ASSERT_EQ(cudaSuccess, ret);

    if (devices <= 1) {
        return;
    }

    bool found = false;
    int dj;

    for (int i = 0; i < devices && !(found); i++) {
        ret = cudaSetDevice(i);
        ASSERT_EQ(cudaSuccess, ret);

        for (int j = 0; j < devices; j++) {
            int peer;
            ret = cudaDeviceCanAccessPeer(&peer, i, j);
            ASSERT_EQ(cudaSuccess, ret);

            if (peer) {
                ret = cudaDeviceEnablePeerAccess(j, 0);
                ASSERT_EQ(cudaSuccess, ret);

                found = true;
                dj = j;
                break;
            }
        }
    }

    if (!(found)) {
        return;
    }

    /* Perform a device reset. */
    ret = cudaDeviceReset();
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaDeviceDisablePeerAccess(dj);
    EXPECT_EQ(cudaErrorPeerAccessNotEnabled, ret);
}
Esempio n. 19
0
// General GPU Device CUDA Initialization
inline int gpuDeviceInit(int devID)
{
    int device_count;
    checkCudaErrors(cudaGetDeviceCount(&device_count));

    if (device_count == 0)
    {
        fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }

    if (devID < 0)
    {
        devID = 0;
    }

    if (devID > device_count-1)
    {
        fprintf(stderr, "\n");
        fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", device_count);
        fprintf(stderr, ">> gpuDeviceInit (-device=%d) is not a valid GPU device. <<\n", devID);
        fprintf(stderr, "\n");
        return -devID;
    }

    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

    if (deviceProp.computeMode == cudaComputeModeProhibited)
    {
        fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
        return -1;
    }

    if (deviceProp.major < 1)
    {
        fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
        exit(EXIT_FAILURE);
    }

    checkCudaErrors(cudaSetDevice(devID));
    printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name);

    return devID;
}
Esempio n. 20
0
ContextPtr CudaDevice::Create(int ordinal, bool stream) {
	// Create the device.
	DevicePtr device(new CudaDevice);
	cudaError_t error = cudaGetDeviceProperties(&device->_prop, ordinal);
	if(cudaSuccess != error) {
		fprintf(stderr, "FAILURE TO CREATE DEVICE %d\n", ordinal);
		exit(0);
	}

	// Set this device as the active one on the thread.
	device->_ordinal = ordinal;
	cudaSetDevice(ordinal);

	AllocPtr alloc = device->CreateDefaultAlloc();

	// Create the context.
	return device->CreateStream(stream, alloc.get());
}
Esempio n. 21
0
void _set_cuda_device(QSP_ARG_DECL   Cuda_Device *cdp )
{
#ifdef HAVE_CUDA
	cudaError_t e;

	if( curr_cdp == cdp ){
		sprintf(ERROR_STRING,"set_cuda_device:  current device is already %s!?",cdp->cudev_name);
		warn(ERROR_STRING);
		return;
	}

	e = cudaSetDevice( cdp->cudev_index );
	if( e != cudaSuccess )
		describe_cuda_driver_error2("set_cuda_device","cudaSetDevice",e);
	else
		curr_cdp = cdp;
#endif //  HAVE_CUDA
}
Esempio n. 22
0
void InternalThread::entry(int device, Caffe::Brew mode, int rand_seed,
    int solver_count, bool root_solver) {
#ifndef CPU_ONLY
  CUDA_CHECK(cudaSetDevice(device));
#endif
  Caffe::set_mode(mode);
  Caffe::set_random_seed(rand_seed);
  Caffe::set_solver_count(solver_count);
  Caffe::set_root_solver(root_solver);

#ifdef _OPENMP
  caffe::cpu::OpenMpManager::bindCurrentThreadToNonPrimaryCoreIfPossible();
#endif

  SetThreadAffinity();
  
  InternalThreadEntry();
}
RemoteCUDARunner::RemoteCUDARunner():GPURunner<unsigned long,int>(TYPE_CUDA),m_metahashsize(0)
{
	m_in=0;
	m_devin=0;
	m_out=0;
	m_devout=0;
	m_metahash=0;
	m_devmetahash=0;

	cutilSafeCall(cudaGetDeviceCount(&m_devicecount));

	if(m_devicecount>0)
	{
		if(m_deviceindex<0 || m_deviceindex>=m_devicecount)
		{
			m_deviceindex=cutGetMaxGflopsDeviceId();
			std::cout << "Setting CUDA device to Max GFlops device at index " << m_deviceindex << std::endl;
		}
		else
		{
			std::cout << "Setting CUDA device to device at index " << m_deviceindex << std::endl;
		}
		
		cudaDeviceProp props;
		cudaGetDeviceProperties(&props,m_deviceindex);

		std::cout << "Device info for " << props.name << " :" << std::endl;
		std::cout << "Compute Capability : " << props.major << "." << props.minor << std::endl;
		std::cout << "Clock Rate (hz) : " << props.clockRate << std::endl;

		if(props.major>999)
		{
			std::cout << "CUDA seems to be running in CPU emulation mode" << std::endl;
		}

		cutilSafeCall(cudaSetDevice(m_deviceindex));

	}
	else
	{
		m_deviceindex=-1;
		std::cout << "No CUDA capable device detected" << std::endl;
	}
}
        cuda_xpattern_data( cuda_xpattern_config const& cpc )
        {
            device_id = cpc.device_id;

            int current_id;
            cuda_assert( cudaGetDevice(&current_id) );
            if ( current_id != device_id ) cuda_assert( cudaSetDevice( device_id ) );

            size_type const ug_size = sizeof(value_type) * cpc.ug_size * 2;
            cuda_assert( cudaMalloc( reinterpret_cast<void**>(&ug), ug_size ) );
            cuda_assert( cudaMemset( reinterpret_cast<void*>(ug), 0, ug_size ) );

            size_type const ar_size = sizeof(size_type) * cpc.tilt_size * cpc.max_dim * cpc.max_dim;
            cuda_assert( cudaMalloc( reinterpret_cast<void**>(&ar), ar_size ) );
            cuda_assert( cudaMemset( reinterpret_cast<void*>(ar), 0, ar_size ) );

            size_type const diag_size = sizeof(value_type) * cpc.tilt_size * cpc.max_dim;
            cuda_assert( cudaMalloc( reinterpret_cast<void**>(&diag), diag_size ) );
            cuda_assert( cudaMemset( reinterpret_cast<void*>(diag), 0, diag_size ) );

            size_type const dim_size = sizeof(size_type) * cpc.tilt_size;
            cuda_assert( cudaMalloc( reinterpret_cast<void**>(&dim), dim_size ) );
            cuda_assert( cudaMemset( reinterpret_cast<void*>(dim), 0, dim_size ) );

            size_type const I_exp_size = sizeof(value_type) * cpc.tilt_size * cpc.max_dim;
            cuda_assert( cudaMalloc( reinterpret_cast<void**>(&I_exp), I_exp_size ) );
            cuda_assert( cudaMemset( reinterpret_cast<void*>(I_exp), 0, I_exp_size ) );

            size_type const I_diff_size = sizeof(value_type) * cpc.tilt_size * cpc.max_dim;
            cuda_assert( cudaMalloc( reinterpret_cast<void**>(&I_diff), I_diff_size ) );
            cuda_assert( cudaMemset( reinterpret_cast<void*>(I_diff), 0, I_diff_size ) );

            size_type const I_zigmoid_size = sizeof(value_type) * cpc.tilt_size * cpc.max_dim;
            cuda_assert( cudaMalloc( reinterpret_cast<void**>(&I_zigmoid), I_zigmoid_size ) );
            cuda_assert( cudaMemset( reinterpret_cast<void*>(I_zigmoid), 0, I_zigmoid_size ) );

            size_type const thickness_array_size = sizeof(value_type) * cpc.tilt_size;
            cuda_assert( cudaMalloc( reinterpret_cast<void**>(&thickness_array), thickness_array_size ) );
            cuda_assert( cudaMemset( reinterpret_cast<void*>(thickness_array), 0, thickness_array_size ) );

            size_type const cache_size = sizeof(complex_type) * cpc.tilt_size * cpc.max_dim * cpc.max_dim * 6;
            cuda_assert( cudaMalloc( reinterpret_cast<void**>(&cache), cache_size ) );
            cuda_assert( cudaMemset( reinterpret_cast<void*>(cache), 0, cache_size ) );
        }
Esempio n. 25
0
void Engine::SetDevice(const int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  // The call to cudaSetDevice must come before any calls to Get, which
  // may perform initialization using the GPU.
  CUDA_CHECK(cudaSetDevice(device_id));
  if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_));
  if (Get().curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_));
  }
  CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_));
  CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_,
      CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_,
      cluster_seedgen()));
}
Esempio n. 26
0
// selects GPU to use and returns gpu ID or -1 if using CPU
int init_cuda() 
{ 
    // Select the proper device
    const char* devstr = getenv("CUDA_DEVICE");
    const int env_dev = (devstr != NULL) ? atoi(devstr) : 0;
    int dev = env_dev;
    int devcnt; ebf::cudaErrCheck( cudaGetDeviceCount(&devcnt) );
    if( dev >= 0 && dev < devcnt )
       { 
       ebf::cudaErrCheck( cudaSetDevice(dev) ); 
       cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
       }
    else
       {
        dev = -1;
       	std::cerr << "# Cannot select the CUDA device. Using CPU!" << std::endl;
	}
    return dev;
}
Esempio n. 27
0
inline void InitTensorEngine(int dev_id){
  cudaDeviceProp prop;
  int device_id = 0;
  int device_count = 0;
  cudaGetDeviceCount(&device_count);

  if (dev_id < 0) {
#if (MSHADOW_USE_NVML)
    device_id = AutoSelectDevice(device_count);
#endif
  } else {
    device_id = dev_id;
  }
  utils::Assert( device_id < device_count, "Incorrect Device ID" );
  utils::Assert( cudaSetDevice(device_id) == cudaSuccess, "cannot set device" );
  cudaGetDeviceProperties(&prop, device_id);
  printf("Use CUDA Device %d: %s\n", device_id, prop.name);
  cublasInit();
}
Esempio n. 28
0
void InitCUDA(int device)
{
    ///////////////////////////
    // CUDA initialisation
    ///////////////////////////

    int deviceCount;
    CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount));

    if (deviceCount == 0) std::cout << "There is no device supporting CUDA" << std::endl;

    CUDA_SAFE_CALL(cudaSetDevice(device));
    cudaDeviceProp deviceProp;
    CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, device));
    std::cout << "Device " << device << ": " << deviceProp.name << std::endl;

    // or
    // CUT_DEVICE_INIT(); // with --device=1 (num device chosen)
}
Esempio n. 29
0
void initCuda(int deviceId)
    {
    // Check deviceId area
    int nbDevice = Device::getDeviceCount();
    assert(deviceId >= 0 && deviceId < nbDevice);

    // Choose current device  (state of host-thread)
    HANDLE_ERROR(cudaSetDevice(deviceId));

    // Enable Interoperabilité OpenGL:
    //		- Create a cuda specifique contexte, shared between Cuda and GL
    //		- To be called before first call to kernel
    //		- cudaSetDevice ou cudaGLSetGLDevice are mutualy exclusive
    HANDLE_ERROR(cudaGLSetGLDevice(deviceId));

    // It can be usefull to preload driver, by example to practice benchmarking! (sometimes slow under linux)
    Device::loadCudaDriver(deviceId);
    // Device::loadCudaDriverAll();// Force driver to be load for all GPU
    }
Esempio n. 30
0
/* usage:
   cutorch.streamBarrierMultiDevice({[gpu1]={stream1_1, ..., stream1_N},
                                     [gpuK]={streamK_1, ..., streamK_M}})
   with a specified GPU per each list of streams.
   Each stream (gpu1, stream1_1), ..., (gpu1, stream1_N), ...,
               (gpuK, streamK_1), ..., (gpuK, streamK_M) will wait
   for all others to complete fully.
   Streams are bucketed per device. Equivalent to streamBarrier() if only
   one GPU is specified.
 */
static int cutorch_streamBarrierMultiDevice(lua_State *L)
{
  THCState *state = cutorch_getstate(L);

  int prevDev = -1;
  THCudaCheck(cudaGetDevice(&prevDev));

  /* Validate and count set of {gpu={streams...}} that are mutually waiting */
  int gpus = 0;
  int streams = 0;
  checkAndCountListOfGPUStreamPairs(L, state, 1, &gpus, &streams);

  if (streams < 2) {
    /* nothing to synchronize together */
    return 0;
  }

  /*
     Events can only be recorded on the same device on which they are created.
     -For each GPU, create an event, and record that event on each stream given
     for that GPU.
     -For each GPU, for each stream, wait on the event created by each other
     GPU.
  */
  cudaEvent_t* events = (cudaEvent_t*) malloc(sizeof(cudaEvent_t) * streams);

  /* First, create an event per GPU and record events for the specified stream
     on that GPU */
  createMultiDeviceEvents(L, state, 1, events);

  /* Then, wait on the events. Each stream is actually waiting on itself here
     too, but that's harmless and isn't worth weeding out. */
  waitMultiDeviceEvents(L, state, 1, events, streams);

  /* Clean up events */
  for (int i = 0; i < streams; ++i) {
    THCudaCheck(cudaEventDestroy(events[i]));
  }
  free(events);
  THCudaCheck(cudaSetDevice(prevDev));

  return 0;
}