Example #1
int main(int argc, char ** argv) {
    int deviceCount;

    wbArg_read(argc, argv);

    cudaGetDeviceCount(&deviceCount);

    wbTime_start(GPU, "Getting GPU Data."); //@@ start a timer

    for (int dev = 0; dev < deviceCount; dev++) {
        cudaDeviceProp deviceProp;

        cudaGetDeviceProperties(&deviceProp, dev);

        if (dev == 0) {
            if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
                wbLog(TRACE, "No CUDA GPU has been detected");
                return -1;
            } else if (deviceCount == 1) {
                //@@ wbLog is a provided logging API (similar to Log4J).
                //@@ The logging function wbLog takes a level which is either
                //@@ OFF, FATAL, ERROR, WARN, INFO, DEBUG, or TRACE and a
                //@@ message to be printed.
                wbLog(TRACE, "There is 1 device supporting CUDA");
            } else {
                wbLog(TRACE, "There are ", deviceCount, " devices supporting CUDA");
            }
        }

        wbLog(TRACE, "Device ", dev, " name: ", deviceProp.name);
        wbLog(TRACE, " Computational Capabilities: ", deviceProp.major, ".", deviceProp.minor);
        wbLog(TRACE, " Maximum global memory size: ", deviceProp.totalGlobalMem);
        wbLog(TRACE, " Maximum constant memory size: ", deviceProp.totalConstMem);
        wbLog(TRACE, " Maximum shared memory size per block: ", deviceProp.sharedMemPerBlock);
        wbLog(TRACE, " Maximum block dimensions: ", deviceProp.maxThreadsDim[0], " x ",
                                                    deviceProp.maxThreadsDim[1], " x ",
                                                    deviceProp.maxThreadsDim[2]);
        wbLog(TRACE, " Maximum grid dimensions: ", deviceProp.maxGridSize[0], " x ",
                                                   deviceProp.maxGridSize[1], " x ",
                                                   deviceProp.maxGridSize[2]);
        wbLog(TRACE, " Warp size: ", deviceProp.warpSize);
    }

    wbTime_stop(GPU, "Getting GPU Data."); //@@ stop the timer

    return 0;
}
void initializeCUDA() {
  cudaError_t error;
  int devID = 0;

  error = cudaSetDevice(devID);
  if (error != cudaSuccess) {
    printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);
    exit(EXIT_FAILURE);
  }

  error = cudaGetDevice(&devID);
  if (error != cudaSuccess) {
    printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
    exit(EXIT_FAILURE);
  }

  //  printf("Device ID is %d\n",devID);

  cudaDeviceProp deviceProp;
  error = cudaGetDeviceProperties(&deviceProp, devID);
  if (error != cudaSuccess) {
    printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
    exit(EXIT_FAILURE);
  }

  //  printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);

  // use larger block size for Fermi and above
  block_size = (deviceProp.major < 2) ? 16 : 32;
}
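The heuristic above only selects a square tile width; the minimal sketch below shows how such a block_size might then feed a 2-D launch configuration. It is illustrative only: dummyKernel, width, height, and d_out are placeholder names, not part of the snippet above.

__global__ void dummyKernel(float *out) { /* ... */ }

void launchWithBlockSize(int block_size, int width, int height, float *d_out)
{
    // Round the grid up so every element is covered by some block.
    dim3 threads(block_size, block_size);
    dim3 grid((width  + block_size - 1) / block_size,
              (height + block_size - 1) / block_size);
    dummyKernel<<<grid, threads>>>(d_out);
}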
xdl_int XdevLCudaImpl::init() {
	TiXmlDocument xmlDocument;

	if (!xmlDocument.LoadFile(getMediator()->getXmlFilename())) {
		XDEVL_MODULE_ERROR("Could not parse xml file: " << getMediator()->getXmlFilename() << std::endl);
		return ERR_ERROR;
	}

	if (readModuleInformation(&xmlDocument) != ERR_OK)
		return ERR_ERROR;


	cudaGetDevice(&m_devID);
	cudaGetDeviceProperties(&m_prop, m_devID);

	return ERR_OK;
}
		void cuda_running_configuration::update_parameters()
		{
			cuda_safe_call(cudaDriverGetVersion(&driver_version));
			cuda_safe_call(cudaRuntimeGetVersion(&runtime_version));

			int device_count;
			cuda_safe_call(cudaGetDeviceCount(&device_count));
			if (device_count <= 0)
				throw neural_network_exception("No CUDA capable devices are found");

			if (device_id >= device_count)
				throw neural_network_exception((boost::format("Device ID %1% specified while %2% devices are available") % device_id % device_count).str());

			cudaDeviceProp device_prop;
			cuda_safe_call(cudaGetDeviceProperties(&device_prop, device_id));
			device_name = device_prop.name;
			compute_capability_major = device_prop.major;
			compute_capability_minor = device_prop.minor;
			clock_rate = device_prop.clockRate;
			memory_clock_rate = device_prop.memoryClockRate;
			memory_bus_width = device_prop.memoryBusWidth;
			global_memory_size = device_prop.totalGlobalMem;
			ecc_enabled = (device_prop.ECCEnabled != 0);
			l2_cache_size = device_prop.l2CacheSize;
			multiprocessor_count = device_prop.multiProcessorCount;
			smem_per_block = device_prop.sharedMemPerBlock;
			max_threads_per_multiprocessor = device_prop.maxThreadsPerMultiProcessor;
			max_threads_per_block = device_prop.maxThreadsPerBlock;
			for(int i = 0; i < sizeof(max_threads_dim) / sizeof(max_threads_dim[0]); ++i)
				max_threads_dim[i] = device_prop.maxThreadsDim[i];
			for(int i = 0; i < sizeof(max_grid_size) / sizeof(max_grid_size[0]); ++i)
				max_grid_size[i] = device_prop.maxGridSize[i];
			max_texture_1d_linear = device_prop.maxTexture1DLinear;
			texture_alignment = device_prop.textureAlignment;
			pci_bus_id = device_prop.pciBusID;
			pci_device_id = device_prop.pciDeviceID;
		#ifdef _WIN32
			tcc_mode = (device_prop.tccDriver != 0);
		#endif

			cuda_safe_call(cudaSetDevice(device_id));

			cublas_safe_call(cublasCreate(&cublas_handle));

			cusparse_safe_call(cusparseCreate(&cusparse_handle));
		}
void DialogSelectHardware::setListDevice()
{
    cudaGetDeviceCount(&deviceCount);

    QString text("Detectados "+QString::number(deviceCount)+" Dispositivos Compatibles con CUDA");
    QMessageBox::information(0,"Dispositivos Detectados",text,QMessageBox::Ok);

    deviceProp = new cudaDeviceProp;

    for (int dev = 0; dev < deviceCount; ++dev)
    {
        cudaGetDeviceProperties(deviceProp, dev);

        QString text("Device "+QString::number(dev).append(" : ")+ deviceProp->name);
        ui->deviceComboBox->addItem(text);
    }
}
void DialogSelectHardware::ChangeText(int indexDevice)
{
    int  driverVersion = 0, runtimeVersion = 0;
    cudaSetDevice(indexDevice);
    cudaGetDeviceProperties(deviceProp, indexDevice);
    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);

    char msg[256];
    SPRINTF(msg,"%.0f MBytes (%llu bytes)\n",
            (float)deviceProp->totalGlobalMem/1048576.0f, (unsigned long long) deviceProp->totalGlobalMem);

    ui->tableWidget->clear();
    addItem(QString ("Device "+QString::number(indexDevice).append(" : ")+ deviceProp->name),0,0);
    addItem((selectDevice == indexDevice) ? "Selected device " : " ",0,1);
    addItem("CUDA Driver Version / Runtime Version",1,0);
    addItem(QString ("%1.%2  /  %3.%4").arg(driverVersion/1000).arg((driverVersion%100)/10).arg( runtimeVersion/1000).arg((runtimeVersion%100)/10),1,1);
    addItem("CUDA Capability Major/Minor version number: ",2,0);
    addItem(QString ("%1.%2").arg(deviceProp->major).arg(deviceProp->minor),2,1);
    addItem("Total amount of global memory:",3,0);
    addItem(msg,3,1);
    addItem(QString ("(%1) Multiprocessors, (%2) CUDA Cores/MP:%3 CUDA Cores").arg( deviceProp->multiProcessorCount).arg( _ConvertSMVer2Cores(deviceProp->major, deviceProp->minor)).arg( _ConvertSMVer2Cores(deviceProp->major, deviceProp->minor) * deviceProp->multiProcessorCount),4,0);
    addItem("Total amount of constant memory:",5,0);
    addItem(QString ("%1 bytes").arg(deviceProp->totalConstMem),5,1);
    addItem("Total amount of shared memory per block:",6,0);
    addItem(QString ("%1 bytes").arg(deviceProp->sharedMemPerBlock),6,1);
    addItem("Total number of registers available per block:",7,0);
    addItem(QString ("%1").arg(deviceProp->regsPerBlock),7,1);
    addItem("Warp size:",8,0);
    addItem(QString ("%1").arg(deviceProp->warpSize),8,1);
    addItem("Maximum number of threads per multiprocessor:",9,0);
    addItem(QString ("%1").arg(deviceProp->maxThreadsPerMultiProcessor),9,1);
    addItem("Maximum number of threads per block:",10,0);
    addItem(QString ("%1").arg(deviceProp->maxThreadsPerBlock),10,1);
    addItem("Max dimension size of a thread block (x,y,z):",11,0);
    addItem(QString ("(%1, %2, %3)").arg(deviceProp->maxThreadsDim[0]).arg(  deviceProp->maxThreadsDim[1]).arg(  deviceProp->maxThreadsDim[2]),11,1);
    addItem("Max dimension size of a grid size    (x,y,z):",12,0);
    addItem(QString ("(%1, %2, %3)\n").arg(deviceProp->maxGridSize[0]).arg(deviceProp->maxGridSize[1]).arg(deviceProp->maxGridSize[2]),12,1);
    addItem("Run time limit on kernels: ",13,0);
    addItem(QString ("%1\n").arg(deviceProp->kernelExecTimeoutEnabled ? "Yes" : "No"),13,1);
    addItem("Integrated GPU sharing Host Memory: ",14,0);
    addItem( QString ("%1\n").arg(deviceProp->integrated ? "Yes" : "No"),14,1);

    ui->tableWidget->resizeColumnsToContents();
    ui->tableWidget->resizeRowsToContents();
}
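As a worked example of the version decoding used above: a raw value of 10020 reported by cudaDriverGetVersion is displayed as 10.2, since 10020/1000 = 10 and (10020 % 100)/10 = 2.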
void
cutilDeviceInit ( int argc, char ** argv )
{
  int deviceCount;
  cutilSafeCall ( cudaGetDeviceCount ( &deviceCount ) );
  if ( deviceCount == 0 )
  {
    printf ( "cutil error: no devices supporting CUDA\n" );
    exit ( -1 );
  }

  cudaDeviceProp deviceProp;
  cutilSafeCall ( cudaGetDeviceProperties ( &deviceProp, 0 ) );

  printf ( "\n Using CUDA device: %s\n", deviceProp.name );
  cutilSafeCall ( cudaSetDevice ( 0 ) );
}
Example #8
void printCudaDeviceInfo(int deviceId) {

    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, deviceId);

    printf("CUDA Device Information:\n");
    printf("Device %d: \"%s\"\n", deviceId, deviceProp.name);
    printf("  Integrated:                                    %d\n", deviceProp.integrated);
    printf("  Can map host mem:                              %d\n", deviceProp.canMapHostMemory);
    printf("  Number of cores:                               %d\n",
        ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
    printf("  Clock rate:                                    %.2f GHz\n", deviceProp.clockRate * 1e-6f);
    printf("  Performance Number:                            %d\n",
        ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount *
            (deviceProp.clockRate / 1000));
    printf("  Note: Performance number is clock in mhz * core count, for comparing devices.\n");
}
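For instance, a hypothetical device with 15 multiprocessors, 192 cores per multiprocessor, and a 1000 MHz clock would report 15 * 192 * 1000 = 2,880,000 as its performance number.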
////////////////////////////////////////////////////////////////////////////////
//! Run test
////////////////////////////////////////////////////////////////////////////////
void runAutoTest(int argc, char** argv)
{
    printf("[%s]\n", sSDKsample);

    // Cuda init
    int dev = cutilChooseCudaDevice(argc, argv);

    cudaDeviceProp deviceProp;
    cutilSafeCall(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Compute capability %d.%d\n", deviceProp.major, deviceProp.minor);
    int version = deviceProp.major*10 + deviceProp.minor;
    g_hasDouble = (version >= 13);
    if (inEmulationMode()) {
        // workaround since SM13 kernel doesn't produce correct output in emulation mode
        g_hasDouble = false;
    }

    // create FFT plan
    CUFFT_SAFE_CALL(cufftPlan2d(&fftPlan, meshW, meshH, CUFFT_C2R) );

    // allocate memory
    fftInputW = (meshW / 2)+1;
    fftInputH = meshH;
    fftInputSize = (fftInputW*fftInputH)*sizeof(float2);

    cutilSafeCall(cudaMalloc((void **)&d_h0, fftInputSize) );
    cutilSafeCall(cudaMalloc((void **)&d_ht, fftInputSize) );
    h_h0 = (float2 *) malloc(fftInputSize);
    generate_h0();
    cutilSafeCall(cudaMemcpy(d_h0, h_h0, fftInputSize, cudaMemcpyHostToDevice) );

    cutilSafeCall(cudaMalloc((void **)&d_slope, meshW*meshH*sizeof(float2)) );

    cutCreateTimer(&timer);
    cutStartTimer(timer);
    prevTime = cutGetTimerValue(timer);

    // Creating the Auto-Validation Code
    g_CheckRender = new CheckBackBuffer(windowH, windowH, 4, false);
    g_CheckRender->setPixelFormat(GL_RGBA);
    g_CheckRender->setExecPath(argv[0]);
    g_CheckRender->EnableQAReadback(true);

    runCudaTest(g_hasDouble);
    cudaThreadExit();
}
Example #10
CudaDeviceDialog::CudaDeviceDialog(QWidget *parent) 
    : QDialog(parent)
{
    m = new Ui_CudaDeviceDialog;
    m->setupUi(this);

    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) == cudaSuccess) {
        for (int i = 0; i < deviceCount; ++i) {
            cudaDeviceProp p;
            cudaGetDeviceProperties(&p, i);
            m->comboBox->addItem(p.name);
        }
    }
    connect(m->comboBox, SIGNAL(currentIndexChanged(int)), this, SLOT(updateInfo(int)));
    updateInfo(0);
}
Example #11
/**
 * @brief This function is called immediately before the main Jacobi loop
 *
 * @param[in]	cartComm	The Cartesian communicator
 * @param[in]	rank		The rank of the calling MPI process
 * @param[in]	size		The total number of MPI processes available
 * @param[out]	timerStart	The Jacobi loop starting moment (measured as wall-time)
 */
void PreRunJacobi(MPI_Comm cartComm, int rank, int size, double * timerStart)
{
	struct cudaDeviceProp devProps;
	int crtDevice = 0, enabledECC = 0;

	// We get the properties of the current device, assuming all other devices are the same
	SafeCudaCall(cudaGetDevice(&crtDevice));
	SafeCudaCall(cudaGetDeviceProperties(&devProps, crtDevice));

	// Determine how many devices have ECC enabled (assuming exactly one process per device)
	MPI_Reduce(&devProps.ECCEnabled, &enabledECC, 1, MPI_INT, MPI_SUM, MPI_MASTER_RANK, cartComm);

	MPI_Barrier(cartComm);
	OnePrintf(rank == MPI_MASTER_RANK, "Starting Jacobi run with %d processes using \"%s\" GPUs (ECC enabled: %d / %d):\n", 
				size, devProps.name, enabledECC, size);
	*timerStart = MPI_Wtime();
}
Example #12
// General GPU Device CUDA Initialization
inline int gpuDeviceInit(int devID)
{
    int device_count;
    checkCudaErrors(cudaGetDeviceCount(&device_count));

    if (device_count == 0)
    {
        fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }

    if (devID < 0)
    {
        devID = 0;
    }

    if (devID > device_count-1)
    {
        fprintf(stderr, "\n");
        fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", device_count);
        fprintf(stderr, ">> gpuDeviceInit (-device=%d) is not a valid GPU device. <<\n", devID);
        fprintf(stderr, "\n");
        return -devID;
    }

    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

    if (deviceProp.computeMode == cudaComputeModeProhibited)
    {
        fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
        return -1;
    }

    if (deviceProp.major < 1)
    {
        fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
        exit(EXIT_FAILURE);
    }

    checkCudaErrors(cudaSetDevice(devID));
    printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name);

    return devID;
}
Example #13
void THCudaInit(THCState* state)
{
  int count = 0;
  THCudaCheck(cudaGetDeviceCount(&count));

  int device = 0;
  THCudaCheck(cudaGetDevice(&device));

  state->rngState = (THCRNGState*)malloc(sizeof(THCRNGState));
  THCRandom_init(state, count, device);

  state->blasState = (THCBlasState*)malloc(sizeof(THCBlasState));
  THCudaBlas_init(state, count, device);

  state->numDevices = count;
  state->deviceProperties =
    (struct cudaDeviceProp*)malloc(count * sizeof(struct cudaDeviceProp));

  THCState_setDeviceMode(state, THCStateDeviceModeManual);

  state->numUserStreams = 0;
  state->streamsPerDevice =
    (cudaStream_t**)malloc(count * sizeof(cudaStream_t*));

  /* Enable P2P access between all pairs, if possible */
  THCudaEnablePeerToPeerAccess(state);

  for (int i = 0; i < count; ++i)
  {
    THCudaCheck(cudaSetDevice(i));
    THCudaCheck(cudaGetDeviceProperties(&state->deviceProperties[i], i));
    /* Stream index 0 will be the default stream for convenience; by
       default no user streams are reserved */
    state->streamsPerDevice[i] =
      (cudaStream_t*)malloc(sizeof(cudaStream_t));
    state->streamsPerDevice[i][0] = NULL;
  }

  /* Restore to previous device */
  THCudaCheck(cudaSetDevice(device));

  /* Start in the default stream on the current device */
  state->currentPerDeviceStream = 0;
  state->currentStream = NULL;
}
Example #14
ContextPtr CudaDevice::Create(int ordinal, bool stream) {
	// Create the device.
	DevicePtr device(new CudaDevice);
	cudaError_t error = cudaGetDeviceProperties(&device->_prop, ordinal);
	if(cudaSuccess != error) {
		fprintf(stderr, "FAILURE TO CREATE DEVICE %d\n", ordinal);
		exit(1);
	}

	// Set this device as the active one on the thread.
	device->_ordinal = ordinal;
	cudaSetDevice(ordinal);

	AllocPtr alloc = device->CreateDefaultAlloc();

	// Create the context.
	return device->CreateStream(stream, alloc.get());
}
Example #15
bool checkCUDAProfile(int dev, int min_runtime, int min_compute)
{
    int runtimeVersion = 0;

    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);

    fprintf(stderr,"\nDevice %d: \"%s\"\n", dev, deviceProp.name);
    cudaRuntimeGetVersion(&runtimeVersion);
    fprintf(stderr,"  CUDA Runtime Version     :\t%d.%d\n", runtimeVersion/1000, (runtimeVersion%100)/10);
    fprintf(stderr,"  CUDA Compute Capability  :\t%d.%d\n", deviceProp.major, deviceProp.minor);

    if( runtimeVersion >= min_runtime && ((deviceProp.major<<4) + deviceProp.minor) >= min_compute ) {
        return true;
    } else {
        return false;
    }
}
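Note that min_compute is compared against a packed value: compute capability 2.0 packs to (2 << 4) + 0 = 32, so requiring at least a Fermi-class device means passing min_compute = 32.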
bool checkCUDAProfile(int dev)
{
    int runtimeVersion = 0;     

    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);

    fprintf(stderr,"\nDevice %d: \"%s\"\n", dev, deviceProp.name);
    cudaRuntimeGetVersion(&runtimeVersion);
    fprintf(stderr,"  CUDA Runtime Version:\t%d.%d\n", runtimeVersion/1000, (runtimeVersion%100)/10);
    fprintf(stderr,"  CUDA SM Capability  :\t%d.%d\n", deviceProp.major, deviceProp.minor);

    // Require CUDA runtime >= 3.1 and compute capability >= 2.0 (matches the message in findCapableDevice)
    if( runtimeVersion >= 3010 && deviceProp.major >= 2 ) {
        return true;
    } else {
        return false;
    }
}
RemoteCUDARunner::RemoteCUDARunner():GPURunner<unsigned long,int>(TYPE_CUDA),m_metahashsize(0)
{
	m_in=0;
	m_devin=0;
	m_out=0;
	m_devout=0;
	m_metahash=0;
	m_devmetahash=0;

	cutilSafeCall(cudaGetDeviceCount(&m_devicecount));

	if(m_devicecount>0)
	{
		if(m_deviceindex<0 || m_deviceindex>=m_devicecount)
		{
			m_deviceindex=cutGetMaxGflopsDeviceId();
			std::cout << "Setting CUDA device to Max GFlops device at index " << m_deviceindex << std::endl;
		}
		else
		{
			std::cout << "Setting CUDA device to device at index " << m_deviceindex << std::endl;
		}
		
		cudaDeviceProp props;
		cudaGetDeviceProperties(&props,m_deviceindex);

		std::cout << "Device info for " << props.name << " :" << std::endl;
		std::cout << "Compute Capability : " << props.major << "." << props.minor << std::endl;
		std::cout << "Clock Rate (hz) : " << props.clockRate << std::endl;

		if(props.major>999)
		{
			std::cout << "CUDA seems to be running in CPU emulation mode" << std::endl;
		}

		cutilSafeCall(cudaSetDevice(m_deviceindex));

	}
	else
	{
		m_deviceindex=-1;
		std::cout << "No CUDA capable device detected" << std::endl;
	}
}
int findCapableDevice(int argc, char **argv)
{
    int dev;
    int bestDev = -1;

    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount FAILED CUDA Driver and Runtime version may be mismatched.\n");
        fprintf(stderr, "\nFAILED\n");
        cudaThreadExit();
        cutilExit(argc, argv);
    }
    for (dev = 0; dev < deviceCount; ++dev) {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        if (dev == 0) {
            // This function call returns 9999 for both major & minor fields, if no CUDA capable devices are present
            if (deviceProp.major == 9999 && deviceProp.minor == 9999)
                fprintf(stderr,"There is no device supporting CUDA.\n");
            else if (deviceCount == 1)
                fprintf(stderr,"There is 1 device supporting CUDA\n");
            else
                fprintf(stderr,"There are %d devices supporting CUDA\n", deviceCount);
        }

        if( checkCUDAProfile( dev ) ) {
            fprintf(stderr,"\nFound capable device: %d\n", dev );
            if( bestDev == -1 ) { 
                bestDev = dev;
                fprintf(stderr, "Setting active device to %d\n", bestDev );
            }
        }
    }

    if( bestDev == -1 ) {
        fprintf(stderr, "\nNo configuration with available capabilities was found.  Test has been waived.\n");
        fprintf(stderr, "This sample requires:\n");
        fprintf(stderr, "\tGPU Device Compute   >= 2.0 is required\n");
        fprintf(stderr, "\tCUDA Runtime Version >= 3.1 is required\n");
        fprintf(stderr, "PASSED\n");
    }
    return bestDev;
}
Example #19
// TODO: Fix this for the new backend
void Caffe::DeviceQuery() {
  if (Get().default_device_context_->backend() == BACKEND_CUDA) {
#ifdef USE_CUDA
    cudaDeviceProp prop;
    int device;
    if (cudaSuccess != cudaGetDevice(&device)) {
      printf("No cuda device present.\n");
    } else {
      CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
      LOG(INFO)<< "Device id:                     " << device;
      LOG(INFO)<< "Major revision number:         " << prop.major;
      LOG(INFO)<< "Minor revision number:         " << prop.minor;
      LOG(INFO)<< "Name:                          " << prop.name;
      LOG(INFO)<< "Total global memory:           " << prop.totalGlobalMem;
      LOG(INFO)<< "Total shared memory per block: " << prop.sharedMemPerBlock;
      LOG(INFO)<< "Total registers per block:     " << prop.regsPerBlock;
      LOG(INFO)<< "Warp size:                     " << prop.warpSize;
      LOG(INFO)<< "Maximum memory pitch:          " << prop.memPitch;
      LOG(INFO)<< "Maximum threads per block:     " << prop.maxThreadsPerBlock;
      LOG(INFO)<< "Maximum dimension of block:    "
      << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", "
      << prop.maxThreadsDim[2];
      LOG(INFO)<< "Maximum dimension of grid:     "
      << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", "
      << prop.maxGridSize[2];
      LOG(INFO)<< "Clock rate:                    " << prop.clockRate;
      LOG(INFO)<< "Total constant memory:         " << prop.totalConstMem;
      LOG(INFO)<< "Texture alignment:             " << prop.textureAlignment;
      LOG(INFO)<< "Concurrent copy and execution: "
      << (prop.deviceOverlap ? "Yes" : "No");
      LOG(INFO)<< "Number of multiprocessors:     " << prop.multiProcessorCount;
      LOG(INFO)<< "Kernel execution timeout:      "
      << (prop.kernelExecTimeoutEnabled ? "Yes" : "No");
    }
#endif  // USE_CUDA
  } else {
#ifdef USE_GREENTEA
    // TODO: Complete OpenCL device information of current device
#endif  // USE_GREENTEA
  }

  return;
}
Example #20
inline void InitTensorEngine(int dev_id){
  cudaDeviceProp prop;
  int device_id = 0;
  int device_count = 0;
  cudaGetDeviceCount(&device_count);

  if (dev_id < 0) {
#if (MSHADOW_USE_NVML)
    device_id = AutoSelectDevice(device_count);
#endif
  } else {
    device_id = dev_id;
  }
  utils::Assert( device_id < device_count, "Incorrect Device ID" );
  utils::Assert( cudaSetDevice(device_id) == cudaSuccess, "cannot set device" );
  cudaGetDeviceProperties(&prop, device_id);
  printf("Use CUDA Device %d: %s\n", device_id, prop.name);
  cublasInit();
}
        fastest_device() : the_fastest_device_id(0)
        {
            int num_devices = 0;
            cuda_assert( cudaGetDeviceCount( &num_devices ) );
            assert( !!num_devices );

            size_type max_multiprocessors = 0;
            for ( size_type device = 0; device != num_devices; ++device )
            {
                cudaDeviceProp properties;
                cuda_assert( cudaGetDeviceProperties( &properties, device ) );

                if ( max_multiprocessors < properties.multiProcessorCount )
                {
                    max_multiprocessors = properties.multiProcessorCount;
                    the_fastest_device_id = device;
                }
            }
        }//ctor
Example #22
void InitCUDA(int device)
{
    ///////////////////////////
    // CUDA initialisation
    ///////////////////////////

    int deviceCount;
    CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount));

    if (deviceCount == 0) std::cout << "There is no device supporting CUDA" << std::endl;

    CUDA_SAFE_CALL(cudaSetDevice(device));
    cudaDeviceProp deviceProp;
    CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, device));
    std::cout << "Device " << device << ": " << deviceProp.name << std::endl;

    // or
    // CUT_DEVICE_INIT(); // with --device=1 (num device chosen)
}
Example #23
/**
 * @brief Compresses data stream
 *
 * Performs compression using a three stage pipeline consisting of the Burrows-Wheeler
 * transform, the move-to-front transform, and Huffman encoding.
 * The compression algorithms are described in our paper "Parallel Lossless
 * Data Compression on the GPU". (See the \ref references bibliography).
 *
 * - Only unsigned char type is supported.
 * - Currently, the input stream (d_uncompressed) must be a buffer of 1,048,576 (uchar) elements (~1MB).
 * - The BWT Index (d_bwtIndex) is an integer number (int). This is used during the reverse-BWT stage.
 * - The Histogram size pointer (d_histSize) can be ignored and can be passed a null pointer.
 * - The Histogram (d_hist) is a 256-entry (unsigned int) buffer. The histogram is used to
 * construct the Huffman tree during decoding.
 * - The Encoded offset table (d_encodeOffset) is a 256-entry (unsigned int) buffer. Since the input
 * stream is compressed in blocks of 4096 characters, the offset table gives the starting offset of
 * where each block starts in the compressed data (d_compressed). The very first uint at each starting offset
 * gives the size (in words) of that corresponding compressed block. This allows us to decompress each 4096
 * character-block in parallel.
 * - The size of compressed data (d_compressedSize) is a uint and gives the final size (in words)
 * of the compressed data.
 * - The compress data stream (d_compressed) is a uint buffer. The user should allocate enough
 * memory for worst-case (no compression occurs).
 * - \a numElements is a uint and must be set to 1048576.
 *
 * @param[out] d_bwtIndex BWT Index (int)
 * @param[out] d_histSize Histogram size (ignored, null ptr)
 * @param[out] d_hist Histogram (256-entry, uint)
 * @param[out] d_encodeOffset Encoded offset table (256-entry, uint)
 * @param[out] d_compressedSize Size of compressed data (uint)
 * @param[out] d_compressed Compressed data
 * @param[in] planHandle Handle to plan for compressor
 * @param[in] d_uncompressed Uncompressed data
 * @param[in] numElements Number of elements to compress
 * @returns CUDPPResult indicating success or error condition
 *
 * @see cudppPlan, CUDPPConfiguration, CUDPPAlgorithm
 */
CUDPP_DLL
CUDPPResult cudppCompress(CUDPPHandle planHandle,
                          unsigned char *d_uncompressed,
                          int *d_bwtIndex,
                          unsigned int *d_histSize,
                          unsigned int *d_hist,
                          unsigned int *d_encodeOffset,
                          unsigned int *d_compressedSize,
                          unsigned int *d_compressed,
                          size_t numElements)
{
    // first check: is this device >= 2.0? if not, return error
    int dev;
    cudaGetDevice(&dev);

    cudaDeviceProp devProps;
    cudaGetDeviceProperties(&devProps, dev);

    if((int)devProps.major < 2) {
        // Only supported on devices with compute
        // capability 2.0 or greater
        return CUDPP_ERROR_ILLEGAL_CONFIGURATION;
    }

    CUDPPCompressPlan * plan =
        (CUDPPCompressPlan *) getPlanPtrFromHandle<CUDPPCompressPlan>(planHandle);
    
    if(plan != NULL)
    {
        if (plan->m_config.algorithm != CUDPP_COMPRESS)
            return CUDPP_ERROR_INVALID_PLAN;
        if (plan->m_config.datatype != CUDPP_UCHAR)
            return CUDPP_ERROR_ILLEGAL_CONFIGURATION;
        if (numElements != 1048576)
            return CUDPP_ERROR_ILLEGAL_CONFIGURATION;

        cudppCompressDispatch(d_uncompressed, d_bwtIndex, d_histSize, d_hist, d_encodeOffset,
            d_compressedSize, d_compressed, numElements, plan);
        return CUDPP_SUCCESS;
    }
    else
        return CUDPP_ERROR_INVALID_HANDLE;
}
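The documentation block above fixes the buffer contract completely, so a host-side call can be sketched without reference to the rest of the library. The following is a minimal, illustrative sketch only: it assumes the CUDPP 2.x plan API (cudppCreate / cudppPlan / cudppDestroyPlan), assumes d_uncompressed already holds the 1,048,576-byte input on the device, and omits error checking on the cudaMalloc calls.

// Assumes: #include <cudpp.h>, <cuda_runtime.h>, <cstdio>
void compressOneMegabyte(unsigned char *d_uncompressed /* 1,048,576 device bytes */)
{
    const size_t numElements = 1048576;   // required input size (see notes above)

    // Device-side output buffers sized per the contract documented above.
    int *d_bwtIndex;
    unsigned int *d_hist, *d_encodeOffset, *d_compressedSize, *d_compressed;
    cudaMalloc((void**)&d_bwtIndex, sizeof(int));
    cudaMalloc((void**)&d_hist, 256 * sizeof(unsigned int));
    cudaMalloc((void**)&d_encodeOffset, 256 * sizeof(unsigned int));
    cudaMalloc((void**)&d_compressedSize, sizeof(unsigned int));
    // Worst case is "no compression occurs"; allocate generously.
    cudaMalloc((void**)&d_compressed, numElements * sizeof(unsigned int));

    // Build a compression plan for exactly numElements unsigned chars.
    CUDPPHandle cudpp, plan;
    cudppCreate(&cudpp);
    CUDPPConfiguration config;
    config.algorithm = CUDPP_COMPRESS;   // matches the plan check in cudppCompress above
    config.datatype  = CUDPP_UCHAR;      // only unsigned char is supported
    cudppPlan(cudpp, &plan, config, numElements, 1, 0);

    CUDPPResult res = cudppCompress(plan, d_uncompressed, d_bwtIndex,
                                    NULL /* d_histSize may be null */, d_hist,
                                    d_encodeOffset, d_compressedSize,
                                    d_compressed, numElements);
    if (res != CUDPP_SUCCESS)
        fprintf(stderr, "cudppCompress failed: %d\n", (int)res);

    cudppDestroyPlan(plan);
    cudppDestroy(cudpp);
    cudaFree(d_bwtIndex); cudaFree(d_hist); cudaFree(d_encodeOffset);
    cudaFree(d_compressedSize); cudaFree(d_compressed);
}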
Example #24
void computeNumCTAs(KernelPointer kernel, int smemDynamicBytes, bool bManualCoalesce)
{
    cudaDeviceProp devprop;
    int deviceID = -1;
    cudaError_t err = cudaGetDevice(&deviceID);
    assert(err == cudaSuccess);

    cudaGetDeviceProperties(&devprop, deviceID);

    // Determine the maximum number of CTAs that can be run simultaneously for each kernel
    // This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet
    const unsigned int regAllocationUnit = (devprop.major < 2 && devprop.minor < 2) ? 256 : 512; // in registers
    const unsigned int warpAllocationMultiple = 2;
    const unsigned int smemAllocationUnit = 512;                                                 // in bytes
    const unsigned int maxThreadsPerSM = bManualCoalesce ? 768 : 1024; // sm_12 GPUs increase threads/SM to 1024
    const unsigned int maxBlocksPerSM = 8;

    cudaFuncAttributes attr;
    err = cudaFuncGetAttributes(&attr, (const char*)kernel);
    assert(err == cudaSuccess);


    // Number of warps (round up to nearest whole multiple of warp size)
    size_t numWarps = multiple(RadixSort::CTA_SIZE, devprop.warpSize);
    // Round up to warp allocation multiple
    numWarps = ceiling(numWarps, warpAllocationMultiple);

    // Number of regs is regs per thread times number of warps times warp size
    size_t regsPerCTA = attr.numRegs * devprop.warpSize * numWarps;
    // Round up to multiple of register allocation unit size
    regsPerCTA = ceiling(regsPerCTA, regAllocationUnit);

    size_t smemBytes = attr.sharedSizeBytes + smemDynamicBytes;
    size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit);

    size_t ctaLimitRegs    = regsPerCTA > 0 ? devprop.regsPerBlock / regsPerCTA : maxBlocksPerSM;
    size_t ctaLimitSMem    = smemPerCTA > 0 ? devprop.sharedMemPerBlock      / smemPerCTA : maxBlocksPerSM;
    size_t ctaLimitThreads =                  maxThreadsPerSM                / RadixSort::CTA_SIZE;

    unsigned int numSMs = devprop.multiProcessorCount;
    int maxCTAs = numSMs * std::min<size_t>(ctaLimitRegs, std::min<size_t>(ctaLimitSMem, std::min<size_t>(ctaLimitThreads, maxBlocksPerSM)));
    setNumCTAs(kernel, maxCTAs);
}
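The arithmetic above reproduces the Occupancy Calculator by hand because it predates the runtime occupancy API. On CUDA 6.5 and later the same quantity can be obtained directly from the runtime; the sketch below is illustrative only (myKernel is a placeholder, not a kernel from this file).

__global__ void myKernel(float *data) { /* ... */ }

int maxResidentCTAs(int ctaSize, size_t smemDynamicBytes)
{
    int dev = 0;
    cudaDeviceProp devprop;
    cudaGetDevice(&dev);
    cudaGetDeviceProperties(&devprop, dev);

    // Ask the runtime how many CTAs of this kernel fit on one SM,
    // given the block size and dynamic shared memory per block.
    int blocksPerSM = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, myKernel,
                                                  ctaSize, smemDynamicBytes);
    return blocksPerSM * devprop.multiProcessorCount;
}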
Example #25
static void do_main()
{
  Matrix A;
  double *x, *b;

  printf("#############################################################################################\n");
  printf("**                           B I C G S T A B   S O L V E R                                 **\n");
  printf("#############################################################################################\n\n");

  cudaDeviceProp props;
  CUDA_SAFE_CALL( cudaGetDeviceProperties(&props, 0) );
  printf("** DEVICE      : %10s (ECC: %3s)                                                     **\n", props.name, props.ECCEnabled ? "ON" : "OFF");
  printf("\n#############################################################################################\n\n");

  Context ctx;
  ctx.read_from_file("config.txt");
  read_system_from_file(&ctx, "res/matrix.inp", &A, &x, &b);

  cudaEvent_t start, stop;
  CUDA_SAFE_CALL( cudaEventCreate(&start) );
  CUDA_SAFE_CALL( cudaEventCreate(&stop) );
  
  CUDA_SAFE_CALL( cudaEventRecord(start) );

  //Jacobi solver(0.6);
  Bicgstab solver;
  solver.setup(&ctx, &A);
  solver.solve(&ctx, &A, x, b);

  float elapsed_time = 0.0f;
  CUDA_SAFE_CALL( cudaEventRecord(stop) );
  CUDA_SAFE_CALL( cudaEventSynchronize(stop) );
  CUDA_SAFE_CALL( cudaEventElapsedTime(&elapsed_time, start, stop) );
  
  printf("** ELAPSED TIME: %9.3fms                                                               **\n", elapsed_time);
  printf("\n#############################################################################################\n\n");

  CUDA_SAFE_CALL( cudaEventDestroy(stop) );
  CUDA_SAFE_CALL( cudaEventDestroy(start) );

  CUDA_SAFE_CALL( cudaFree(x) );
  CUDA_SAFE_CALL( cudaFree(b) );
}
Example #26
char CudaBase::CheckCUDevice()
{
    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) {
        std::cout << "Cannot find CUDA device!";
        return 0;
    }

    if (deviceCount > 0) {
        std::cout << "Found " << deviceCount << " device(s)\n";
        int driverVersion = 0, runtimeVersion = 0;
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, 0);
        cudaDriverGetVersion(&driverVersion);
        cudaRuntimeGetVersion(&runtimeVersion);
        std::cout << "  Device name: " << deviceProp.name << "\n";
        std::cout << "  Driver Version: " << driverVersion << "\n";
        std::cout << "  Runtime Version: " << runtimeVersion << "\n";
        std::cout << "  Capability Major/Minor version number: " << deviceProp.major << "." << deviceProp.minor << "\n";
        std::cout << "  Total amount of global memory: " << (unsigned long long)deviceProp.totalGlobalMem << " bytes\n";
        std::cout << "  Total amount of constant memory: " << deviceProp.totalConstMem << " bytes\n";
        std::cout << "  Total amount of shared memory per block: " << deviceProp.sharedMemPerBlock << " bytes\n";
        std::cout << "  Total number of registers available per block: " << deviceProp.regsPerBlock << "\n";
        std::cout << "  Warp size: " << deviceProp.warpSize << "\n";

        std::stringstream sst;
        sst << "  Maximum sizes of each dimension of a grid: " << deviceProp.maxGridSize[0] << " x " << deviceProp.maxGridSize[1] << " x " << deviceProp.maxGridSize[2];
        std::cout << sst.str() << "\n";
        sst.str("");
        sst << "  Maximum sizes of each dimension of a block: " << deviceProp.maxThreadsDim[0] << " x " << deviceProp.maxThreadsDim[1] << " x " << deviceProp.maxThreadsDim[2];
        std::cout << sst.str() << "\n";
        std::cout << "  Maximum number of threads per block: " << deviceProp.maxThreadsPerBlock << "\n";

        MaxThreadPerBlock = deviceProp.maxThreadsPerBlock;
        MaxRegisterPerBlock = deviceProp.regsPerBlock;
        MaxSharedMemoryPerBlock = deviceProp.sharedMemPerBlock;
        WarpSize = deviceProp.warpSize;
        RuntimeVersion = runtimeVersion;
        return 1;
    }
    return 0;
}
Example #27
//set up context and queue on a device and retrieve 
//device information for other methods
dpClient::dpClient(int plat, int dev){
	platform = plat;
	device = dev;
	
	nameFixer(platform, device, platName, devName);
	fprintf(stderr,"On Platform %s\n", platName);
	fprintf(stderr,"using device %s\n", devName);
	
	//OpenCL
	if (platform != 3){
		cl_platform_id platform_ids[16];
		cl_device_id device_ids[16];
		unsigned int numDevices;
		int err;
		cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,0,0};
		clErrChk(clGetPlatformIDs(16, platform_ids, NULL));
		clErrChk(clGetDeviceIDs(platform_ids[platform], CL_DEVICE_TYPE_ALL, 16, device_ids, &numDevices));
		clErrChk(clGetDeviceInfo(device_ids[device], CL_DEVICE_MAX_WORK_GROUP_SIZE , sizeof(MaxWorkGroupSize), &MaxWorkGroupSize, NULL));
		clErrChk(clGetDeviceInfo(device_ids[device], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(MaxComputeUnits), &MaxComputeUnits, NULL));
		clErrChk(clGetDeviceInfo(device_ids[device], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(MaxWorkDim), &MaxWorkDim, NULL));
		clErrChk(clGetDeviceInfo(device_ids[device], CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(MaxMemAlloc), &MaxMemAlloc, NULL));
		strcpy(type, "OpenCL");
		
		props[1] = (cl_context_properties) platform_ids[platform];
		context = clCreateContext(props, 1, &device_ids[device], NULL, NULL, &err); 
		clErrChk(err);
		queue = clCreateCommandQueue( context, device_ids[device], 0, &err); 
		clErrChk(err);
	}
	
	//CUDA
	else{
		//CLIENT:
		cudaDeviceProp properties;
		cudaGetDeviceProperties(&properties, device);
		cudaSetDevice(device);
		MaxWorkGroupSize = 1;
		MaxMemAlloc = 100000;
		strcpy(type, "CUDA");
	}
	
}
/// Utility function to tweak problem size for small GPUs
int adjustProblemSize(int GPU_N, int default_nOptions)
{
    int nOptions = default_nOptions;

    // select problem size
    for (int i=0; i<GPU_N; i++)
    {
        cudaDeviceProp deviceProp;
        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i));
        int cudaCores = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)
                        * deviceProp.multiProcessorCount;

        if (cudaCores <= 32)
        {
            nOptions = (nOptions < cudaCores/2 ? nOptions : cudaCores/2);
        }
    }

    return nOptions;
}
Example #29
void cuda_devicenames()
{
	cudaError_t err;
	int GPU_N;
	err = cudaGetDeviceCount(&GPU_N);
	if (err != cudaSuccess)
	{
		applog(LOG_ERR, "Unable to query number of CUDA devices! Is an nVidia driver installed?");
		exit(1);
	}

	for (int i = 0; i < GPU_N*opt_n_gputhreads; i++)
	{
		cudaDeviceProp props;
		cudaGetDeviceProperties(&props, device_map[i / opt_n_gputhreads]);

		device_name[i] = strdup(props.name);
		device_sm[i] = (props.major * 100 + props.minor * 10);
	}
}
void CudaUtils::printDevices()
{
    int count;
    if (cudaGetDeviceCount(&count))
        return;
    qDebug() << "Found" << count << "CUDA device(s)";
    for (int i=0; i < count; i++) {

        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        QString deviceString = QString("* %1, Compute capability: %2.%3").arg(prop.name).arg(prop.major).arg(prop.minor);
        QString propString1 = QString("  Global mem: %1M, Shared mem per block: %2k, Registers per block: %3").arg(prop.totalGlobalMem / 1024 / 1024)
                .arg(prop.sharedMemPerBlock / 1024).arg(prop.regsPerBlock);
        QString propString2 = QString("  Warp size: %1 threads, Max threads per block: %2, Multiprocessor count: %3")
                .arg(prop.warpSize).arg(prop.maxThreadsPerBlock).arg(prop.multiProcessorCount);
        qDebug() << deviceString;
        qDebug() << propString1;
        qDebug() << propString2;
    }
}