int main(int argc, char ** argv) { int deviceCount; wbArg_read(argc, argv); cudaGetDeviceCount(&deviceCount); wbTime_start(GPU, "Getting GPU Data."); //@@ start a timer for (int dev = 0; dev < deviceCount; dev++) { cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); if (dev == 0) { if (deviceProp.major == 9999 && deviceProp.minor == 9999) { wbLog(TRACE, "No CUDA GPU has been detected"); return -1; } else if (deviceCount == 1) { //@@ WbLog is a provided logging API (similar to Log4J). //@@ The logging function wbLog takes a level which is either //@@ OFF, FATAL, ERROR, WARN, INFO, DEBUG, or TRACE and a //@@ message to be printed. wbLog(TRACE, "There is 1 device supporting CUDA"); } else { wbLog(TRACE, "There are ", deviceCount, " devices supporting CUDA"); } } wbLog(TRACE, "Device ", dev, " name: ", deviceProp.name); wbLog(TRACE, " Computational Capabilities: ", deviceProp.major, ".", deviceProp.minor); wbLog(TRACE, " Maximum global memory size: ", deviceProp.totalGlobalMem); wbLog(TRACE, " Maximum constant memory size: ", deviceProp.totalConstMem); wbLog(TRACE, " Maximum shared memory size per block: ", deviceProp.sharedMemPerBlock); wbLog(TRACE, " Maximum block dimensions: ", deviceProp.maxThreadsDim[0], " x ", deviceProp.maxThreadsDim[1], " x ", deviceProp.maxThreadsDim[2]); wbLog(TRACE, " Maximum grid dimensions: ", deviceProp.maxGridSize[0], " x ", deviceProp.maxGridSize[1], " x ", deviceProp.maxGridSize[2]); wbLog(TRACE, " Warp size: ", deviceProp.warpSize); } wbTime_stop(GPU, "Getting GPU Data."); //@@ stop the timer return 0; }
void initializeCUDA() { cudaError_t error; int devID = 0; error = cudaSetDevice(devID); if (error != cudaSuccess){printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} error = cudaGetDevice(&devID); if (error != cudaSuccess){printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} // printf("Device ID is %d\n",devID); cudaDeviceProp deviceProp; error = cudaGetDeviceProperties(&deviceProp,devID); if (error != cudaSuccess){printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} // printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); // use larger block size for Fermi and above block_size = (deviceProp.major < 2) ? 16 : 32; }
/**
 * Initialises the module: parses the mediator's XML file, reads the module
 * information from it, then caches the current CUDA device id (m_devID) and
 * its properties (m_prop).
 *
 * @return ERR_OK on success; ERR_ERROR when the XML file cannot be parsed,
 *         the module information cannot be read, or either CUDA query fails.
 */
xdl_int XdevLCudaImpl::init() {
    TiXmlDocument xmlDocument;
    if (!xmlDocument.LoadFile(getMediator()->getXmlFilename())) {
        XDEVL_MODULE_ERROR("Could not parse xml file: " << getMediator()->getXmlFilename() << std::endl);
        return ERR_ERROR;
    }

    if (readModuleInformation(&xmlDocument) != ERR_OK)
        return ERR_ERROR;

    // Report CUDA failures instead of returning ERR_OK with m_devID / m_prop
    // left unfilled.
    cudaError_t status = cudaGetDevice(&m_devID);
    if (status != cudaSuccess) {
        XDEVL_MODULE_ERROR("cudaGetDevice failed: " << cudaGetErrorString(status) << std::endl);
        return ERR_ERROR;
    }

    status = cudaGetDeviceProperties(&m_prop, m_devID);
    if (status != cudaSuccess) {
        XDEVL_MODULE_ERROR("cudaGetDeviceProperties failed: " << cudaGetErrorString(status) << std::endl);
        return ERR_ERROR;
    }

    return ERR_OK;
}
void cuda_running_configuration::update_parameters() { cuda_safe_call(cudaDriverGetVersion(&driver_version)); cuda_safe_call(cudaRuntimeGetVersion(&runtime_version)); int device_count; cuda_safe_call(cudaGetDeviceCount(&device_count)); if (device_count <= 0) throw neural_network_exception("No CUDA capable devices are found"); if (device_id >= device_count) throw neural_network_exception((boost::format("Device ID %1% specified while %2% devices are available") % device_id % device_count).str()); cudaDeviceProp device_prop; cuda_safe_call(cudaGetDeviceProperties(&device_prop, device_id)); device_name = device_prop.name; compute_capability_major = device_prop.major; compute_capability_minor = device_prop.minor; clock_rate = device_prop.clockRate; memory_clock_rate = device_prop.memoryClockRate; memory_bus_width = device_prop.memoryBusWidth; global_memory_size = device_prop.totalGlobalMem; ecc_enabled = (device_prop.ECCEnabled != 0); l2_cache_size = device_prop.l2CacheSize; multiprocessor_count = device_prop.multiProcessorCount; smem_per_block = device_prop.sharedMemPerBlock; max_threads_per_multiprocessor = device_prop.maxThreadsPerMultiProcessor; max_threads_per_block = device_prop.maxThreadsPerBlock; for(int i = 0; i < sizeof(max_threads_dim) / sizeof(max_threads_dim[0]); ++i) max_threads_dim[i] = device_prop.maxThreadsDim[i]; for(int i = 0; i < sizeof(max_grid_size) / sizeof(max_grid_size[0]); ++i) max_grid_size[i] = device_prop.maxGridSize[i]; max_texture_1d_linear = device_prop.maxTexture1DLinear; texture_alignment = device_prop.textureAlignment; pci_bus_id = device_prop.pciBusID; pci_device_id = device_prop.pciDeviceID; #ifdef _WIN32 tcc_mode = (device_prop.tccDriver != 0); #endif cuda_safe_call(cudaSetDevice(device_id)); cublas_safe_call(cublasCreate(&cublas_handle)); cusparse_safe_call(cusparseCreate(&cusparse_handle)); }
void DialogSelectHardware::setListDevice() { cudaGetDeviceCount(&deviceCount); QString text("Detectados "+QString::number(deviceCount)+" Dispositivos Compatibles con CUDA"); QMessageBox::information(0,"Dispositivos Detectados",text,QMessageBox::Ok); deviceProp = new cudaDeviceProp; for (int dev = 0; dev < deviceCount; ++dev) { cudaGetDeviceProperties(deviceProp, dev); QString text("Device "+QString::number(dev).append(" : ")+ deviceProp->name); ui->deviceComboBox->addItem(text); } }
void DialogSelectHardware::ChangeText(int indexDevice) { int driverVersion = 0, runtimeVersion = 0; cudaSetDevice(indexDevice); cudaGetDeviceProperties(deviceProp, indexDevice); cudaDriverGetVersion(&driverVersion); cudaRuntimeGetVersion(&runtimeVersion); char msg[256]; SPRINTF(msg,"%.0f MBytes (%llu bytes)\n", (float)deviceProp->totalGlobalMem/1048576.0f, (unsigned long long) deviceProp->totalGlobalMem); ui->tableWidget->clear(); addItem(QString ("Device "+QString::number(indexDevice).append(" : ")+ deviceProp->name),0,0); addItem((selectDevice == indexDevice) ? "Dispositivo Seleccionado " : " ",0,1); addItem("CUDA Driver Version / Runtime Version",1,0); addItem(QString ("%1.%2 / %3.%4").arg(driverVersion/1000).arg((driverVersion%100)/10).arg( runtimeVersion/1000).arg((runtimeVersion%100)/10),1,1); addItem("CUDA Capability Major/Minor version number: ",2,0); addItem(QString ("%1.%2").arg(deviceProp->major).arg(deviceProp->minor),2,1); addItem("Total amount of global memory:",3,0); addItem(msg,3,1); addItem(QString ("(%1) Multiprocessors, (%2) CUDA Cores/MP:%3 CUDA Cores").arg( deviceProp->multiProcessorCount).arg( _ConvertSMVer2Cores(deviceProp->major, deviceProp->minor)).arg( _ConvertSMVer2Cores(deviceProp->major, deviceProp->minor) * deviceProp->multiProcessorCount),4,0); addItem("Total amount of constant memory:",5,0); addItem(QString ("%1 bytes").arg(deviceProp->totalConstMem),5,1); addItem("Total amount of shared memory per block:",6,0); addItem(QString ("%1 bytes").arg(deviceProp->sharedMemPerBlock),6,1); addItem("Total number of registers available per block:",7,0); addItem(QString ("%1").arg(deviceProp->regsPerBlock),7,1); addItem("Warp size:",8,0); addItem(QString ("%1").arg(deviceProp->warpSize),8,1); addItem("Maximum number of threads per multiprocessor:",9,0); addItem(QString ("%1").arg(deviceProp->maxThreadsPerMultiProcessor),9,1); addItem("Maximum number of threads per block:",10,0); addItem(QString ("%1").arg(deviceProp->maxThreadsPerBlock),10,1); 
addItem("Max dimension size of a thread block (x,y,z):",11,0); addItem(QString ("(%1, %2, %3)").arg(deviceProp->maxThreadsDim[0]).arg( deviceProp->maxThreadsDim[1]).arg( deviceProp->maxThreadsDim[2]),11,1); addItem("Max dimension size of a grid size (x,y,z):",12,0); addItem(QString ("(%1, %2, %3)\n").arg(deviceProp->maxGridSize[0]).arg(deviceProp->maxGridSize[1]).arg(deviceProp->maxGridSize[2]),12,1); addItem("Run time limit on kernels: ",13,0); addItem(QString ("%1\n").arg(deviceProp->kernelExecTimeoutEnabled ? "Yes" : "No"),13,1); addItem("Integrated GPU sharing Host Memory: ",14,0); addItem( QString ("%1\n").arg(deviceProp->integrated ? "Yes" : "No"),14,1); ui->tableWidget->resizeColumnsToContents(); ui->tableWidget->resizeRowsToContents(); }
void cutilDeviceInit ( int argc, char ** argv ) { int deviceCount; cutilSafeCall ( cudaGetDeviceCount ( &deviceCount ) ); if ( deviceCount == 0 ) { printf ( "cutil error: no devices supporting CUDA\n" ); exit ( -1 ); } cudaDeviceProp_t deviceProp; cutilSafeCall ( cudaGetDeviceProperties ( &deviceProp, 0 ) ); printf ( "\n Using CUDA device: %s\n", deviceProp.name ); cutilSafeCall ( cudaSetDevice ( 0 ) ); }
void printCudaDeviceInfo(int deviceId) { cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, deviceId); printf("CUDA Device Information:\n"); printf("Device %d: \"%s\"\n", deviceId, deviceProp.name); printf(" Integrated: %d\n", deviceProp.integrated); printf(" Can map host mem: %d\n", deviceProp.canMapHostMemory); printf(" Number of cores: %d\n", ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount); printf(" Clock rate: %.2f GHz\n", deviceProp.clockRate * 1e-6f); printf(" Performance Number: %d\n", ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount * (deviceProp.clockRate / 1000)); printf(" Note: Performance number is clock in mhz * core count, for comparing devices.\n"); }
//////////////////////////////////////////////////////////////////////////////// //! Run test //////////////////////////////////////////////////////////////////////////////// void runAutoTest(int argc, char** argv) { printf("[%s]\n", sSDKsample); // Cuda init int dev = cutilChooseCudaDevice(argc, argv); cudaDeviceProp deviceProp; cutilSafeCall(cudaGetDeviceProperties(&deviceProp, dev)); printf("Compute capability %d.%d\n", deviceProp.major, deviceProp.minor); int version = deviceProp.major*10 + deviceProp.minor; g_hasDouble = (version >= 13); if (inEmulationMode()) { // workaround since SM13 kernel doesn't produce correct output in emulation mode g_hasDouble = false; } // create FFT plan CUFFT_SAFE_CALL(cufftPlan2d(&fftPlan, meshW, meshH, CUFFT_C2R) ); // allocate memory fftInputW = (meshW / 2)+1; fftInputH = meshH; fftInputSize = (fftInputW*fftInputH)*sizeof(float2); cutilSafeCall(cudaMalloc((void **)&d_h0, fftInputSize) ); cutilSafeCall(cudaMalloc((void **)&d_ht, fftInputSize) ); h_h0 = (float2 *) malloc(fftInputSize); generate_h0(); cutilSafeCall(cudaMemcpy(d_h0, h_h0, fftInputSize, cudaMemcpyHostToDevice) ); cutilSafeCall(cudaMalloc((void **)&d_slope, meshW*meshH*sizeof(float2)) ); cutCreateTimer(&timer); cutStartTimer(timer); prevTime = cutGetTimerValue(timer); // Creating the Auto-Validation Code g_CheckRender = new CheckBackBuffer(windowH, windowH, 4, false); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); runCudaTest(g_hasDouble); cudaThreadExit(); }
/**
 * Builds the dialog UI, fills the combo box with the name of every CUDA
 * device (none if the device count cannot be queried), wires the combo box
 * selection to updateInfo(int) and shows the information for index 0.
 */
CudaDeviceDialog::CudaDeviceDialog(QWidget *parent)
    : QDialog(parent)
{
    m = new Ui_CudaDeviceDialog;
    m->setupUi(this);

    // A failed query is handled as "no devices to list".
    int numDevices = 0;
    if (cudaGetDeviceCount(&numDevices) != cudaSuccess) {
        numDevices = 0;
    }

    for (int dev = 0; dev < numDevices; ++dev) {
        cudaDeviceProp props;
        cudaGetDeviceProperties(&props, dev);
        m->comboBox->addItem(props.name);
    }

    connect(m->comboBox, SIGNAL(currentIndexChanged(int)), this, SLOT(updateInfo(int)));

    // updateInfo(0) runs unconditionally, even when no entry was added.
    updateInfo(0);
}
/**
 * @brief Runs immediately before the main Jacobi loop
 *
 * Reads the properties of the calling process's current device, sums the
 * per-process ECC flags onto the master rank, prints a start-up banner on
 * the master rank and records the loop's starting wall-clock time.
 *
 * @param[in]	cartComm	The Cartesian communicator
 * @param[in]	rank		The rank of the calling MPI process
 * @param[in]	size		The total number of MPI processes available
 * @param[out]	timerStart	The Jacobi loop starting moment (measured as wall-time)
 */
void PreRunJacobi(MPI_Comm cartComm, int rank, int size, double * timerStart)
{
	struct cudaDeviceProp props;
	int currentDevice = 0;
	int eccCount = 0;

	// Only the current device is queried; all other devices are assumed to be the same
	SafeCudaCall(cudaGetDevice(&currentDevice));
	SafeCudaCall(cudaGetDeviceProperties(&props, currentDevice));

	// Count the devices with ECC enabled (assuming exactly one process per device)
	MPI_Reduce(&props.ECCEnabled, &eccCount, 1, MPI_INT, MPI_SUM, MPI_MASTER_RANK, cartComm);

	MPI_Barrier(cartComm);
	OnePrintf(rank == MPI_MASTER_RANK,
		"Starting Jacobi run with %d processes using \"%s\" GPUs (ECC enabled: %d / %d):\n",
		size, props.name, eccCount, size);

	*timerStart = MPI_Wtime();
}
// General GPU Device CUDA Initialization inline int gpuDeviceInit(int devID) { int device_count; checkCudaErrors(cudaGetDeviceCount(&device_count)); if (device_count == 0) { fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n"); exit(EXIT_FAILURE); } if (devID < 0) { devID = 0; } if (devID > device_count-1) { fprintf(stderr, "\n"); fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", device_count); fprintf(stderr, ">> gpuDeviceInit (-device=%d) is not a valid GPU device. <<\n", devID); fprintf(stderr, "\n"); return -devID; } cudaDeviceProp deviceProp; checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); if (deviceProp.computeMode == cudaComputeModeProhibited) { fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n"); return -1; } if (deviceProp.major < 1) { fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); exit(EXIT_FAILURE); } checkCudaErrors(cudaSetDevice(devID)); printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name); return devID; }
/*
 * Initialises the THC state: RNG and BLAS sub-states, the per-device
 * property cache and the per-device stream tables, then restores the device
 * that was current on entry.
 */
void THCudaInit(THCState* state)
{
  int numDevices = 0;
  THCudaCheck(cudaGetDeviceCount(&numDevices));

  /* Remember the device that is current on entry so it can be restored. */
  int entryDevice = 0;
  THCudaCheck(cudaGetDevice(&entryDevice));

  state->rngState = (THCRNGState*)malloc(sizeof(THCRNGState));
  THCRandom_init(state, numDevices, entryDevice);

  state->blasState = (THCBlasState*)malloc(sizeof(THCBlasState));
  THCudaBlas_init(state, numDevices, entryDevice);

  state->numDevices = numDevices;
  state->deviceProperties =
    (struct cudaDeviceProp*)malloc(numDevices * sizeof(struct cudaDeviceProp));

  THCState_setDeviceMode(state, THCStateDeviceModeManual);

  state->numUserStreams = 0;
  state->streamsPerDevice =
    (cudaStream_t**)malloc(numDevices * sizeof(cudaStream_t*));

  /* Enable P2P access between all pairs, if possible */
  THCudaEnablePeerToPeerAccess(state);

  for (int dev = 0; dev < numDevices; ++dev)
  {
    THCudaCheck(cudaSetDevice(dev));
    THCudaCheck(cudaGetDeviceProperties(&state->deviceProperties[dev], dev));

    /* Each device starts with a one-entry stream table: index 0 is the
       default stream, and no user streams are reserved yet. */
    state->streamsPerDevice[dev] = (cudaStream_t*)malloc(sizeof(cudaStream_t));
    state->streamsPerDevice[dev][0] = NULL;
  }

  /* Restore to previous device */
  THCudaCheck(cudaSetDevice(entryDevice));

  /* Start in the default stream on the current device */
  state->currentPerDeviceStream = 0;
  state->currentStream = NULL;
}
/**
 * Creates a CudaDevice for the given ordinal, makes it the active device on
 * the calling thread and returns a context built on its default allocator.
 *
 * @param ordinal  CUDA device ordinal
 * @param stream   forwarded to CreateStream
 * @return the context produced by CreateStream
 *
 * Terminates the process with EXIT_FAILURE when the device properties cannot
 * be read or the device cannot be made current.
 */
ContextPtr CudaDevice::Create(int ordinal, bool stream) {
	// Create the device.
	DevicePtr device(new CudaDevice);
	cudaError_t error = cudaGetDeviceProperties(&device->_prop, ordinal);
	if(cudaSuccess != error) {
		fprintf(stderr, "FAILURE TO CREATE DEVICE %d\n", ordinal);
		// Report the failure through a non-zero exit status.
		exit(EXIT_FAILURE);
	}

	// Set this device as the active one on the thread.
	device->_ordinal = ordinal;
	error = cudaSetDevice(ordinal);
	if(cudaSuccess != error) {
		// Otherwise the allocator and stream below would be created on
		// whichever device happened to be current.
		fprintf(stderr, "FAILURE TO CREATE DEVICE %d\n", ordinal);
		exit(EXIT_FAILURE);
	}

	AllocPtr alloc = device->CreateDefaultAlloc();

	// Create the context.
	return device->CreateStream(stream, alloc.get());
}
bool checkCUDAProfile(int dev, int min_runtime, int min_compute) { int runtimeVersion = 0; cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); fprintf(stderr,"\nDevice %d: \"%s\"\n", dev, deviceProp.name); cudaRuntimeGetVersion(&runtimeVersion); fprintf(stderr," CUDA Runtime Version :\t%d.%d\n", runtimeVersion/1000, (runtimeVersion%100)/10); fprintf(stderr," CUDA Compute Capability :\t%d.%d\n", deviceProp.major, deviceProp.minor); if( runtimeVersion >= min_runtime && ((deviceProp.major<<4) + deviceProp.minor) >= min_compute ) { return true; } else { return false; } }
bool checkCUDAProfile(int dev) { int runtimeVersion = 0; cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); fprintf(stderr,"\nDevice %d: \"%s\"\n", dev, deviceProp.name); cudaRuntimeGetVersion(&runtimeVersion); fprintf(stderr," CUDA Runtime Version:\t%d.%d\n", runtimeVersion/1000, (runtimeVersion%100)/10); fprintf(stderr," CUDA SM Capability :\t%d.%d\n", deviceProp.major, deviceProp.minor); if( runtimeVersion/1000 >= 3 && runtimeVersion%100 >= 1 && deviceProp.major >= 2 ) { return true; } else { return false; } }
/**
 * Constructs the CUDA runner: clears all host/device buffer pointers, picks
 * a device (the configured m_deviceindex when it is in range, otherwise the
 * max-GFlops device), prints its basic properties and makes it current.
 * When no device is found, m_deviceindex is set to -1.
 */
RemoteCUDARunner::RemoteCUDARunner()
	: GPURunner<unsigned long,int>(TYPE_CUDA), m_metahashsize(0)
{
	m_in=0;
	m_devin=0;
	m_out=0;
	m_devout=0;
	m_metahash=0;
	m_devmetahash=0;

	cutilSafeCall(cudaGetDeviceCount(&m_devicecount));

	if(m_devicecount<=0)
	{
		m_deviceindex=-1;
		std::cout << "No CUDA capable device detected" << std::endl;
		return;
	}

	const bool indexInRange=(m_deviceindex>=0 && m_deviceindex<m_devicecount);
	if(indexInRange)
	{
		std::cout << "Setting CUDA device to device at index " << m_deviceindex << std::endl;
	}
	else
	{
		// Fall back to the device cutil rates as fastest.
		m_deviceindex=cutGetMaxGflopsDeviceId();
		std::cout << "Setting CUDA device to Max GFlops device at index " << m_deviceindex << std::endl;
	}

	cudaDeviceProp info;
	cudaGetDeviceProperties(&info,m_deviceindex);

	std::cout << "Device info for " << info.name << " :" << std::endl;
	std::cout << "Compute Capability : " << info.major << "." << info.minor << std::endl;
	std::cout << "Clock Rate (hz) : " << info.clockRate << std::endl;

	// A major version above 999 is reported as CPU emulation mode.
	if(info.major>999)
	{
		std::cout << "CUDA seems to be running in CPU emulation mode" << std::endl;
	}

	cutilSafeCall(cudaSetDevice(m_deviceindex));
}
int findCapableDevice(int argc, char **argv) { int dev; int bestDev = -1; int deviceCount = 0; if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) { fprintf(stderr, "cudaGetDeviceCount FAILED CUDA Driver and Runtime version may be mismatched.\n"); fprintf(stderr, "\nFAILED\n"); cudaThreadExit(); cutilExit(argc, argv); } for (dev = 0; dev < deviceCount; ++dev) { cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); if (dev == 0) { // This function call returns 9999 for both major & minor fields, if no CUDA capable devices are present if (deviceProp.major == 9999 && deviceProp.minor == 9999) fprintf(stderr,"There is no device supporting CUDA.\n"); else if (deviceCount == 1) fprintf(stderr,"There is 1 device supporting CUDA\n"); else fprintf(stderr,"There are %d devices supporting CUDA\n", deviceCount); } if( checkCUDAProfile( dev ) ) { fprintf(stderr,"\nFound capable device: %d\n", dev ); if( bestDev == -1 ) { bestDev = dev; fprintf(stderr, "Setting active device to %d\n", bestDev ); } } } if( bestDev == -1 ) { fprintf(stderr, "\nNo configuration with available capabilities was found. Test has been waived.\n"); fprintf(stderr, "This sample requires:\n"); fprintf(stderr, "\tGPU Device Compute >= 2.0 is required\n"); fprintf(stderr, "\tCUDA Runtime Version >= 3.1 is required\n"); fprintf(stderr, "PASSED\n"); } return bestDev; }
// TODO: Fix this for the new backend void Caffe::DeviceQuery() { if (Get().default_device_context_->backend() == BACKEND_CUDA) { #ifdef USE_CUDA cudaDeviceProp prop; int device; if (cudaSuccess != cudaGetDevice(&device)) { printf("No cuda device present.\n"); } else { CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); LOG(INFO)<< "Device id: " << device; LOG(INFO)<< "Major revision number: " << prop.major; LOG(INFO)<< "Minor revision number: " << prop.minor; LOG(INFO)<< "Name: " << prop.name; LOG(INFO)<< "Total global memory: " << prop.totalGlobalMem; LOG(INFO)<< "Total shared memory per block: " << prop.sharedMemPerBlock; LOG(INFO)<< "Total registers per block: " << prop.regsPerBlock; LOG(INFO)<< "Warp size: " << prop.warpSize; LOG(INFO)<< "Maximum memory pitch: " << prop.memPitch; LOG(INFO)<< "Maximum threads per block: " << prop.maxThreadsPerBlock; LOG(INFO)<< "Maximum dimension of block: " << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", " << prop.maxThreadsDim[2]; LOG(INFO)<< "Maximum dimension of grid: " << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", " << prop.maxGridSize[2]; LOG(INFO)<< "Clock rate: " << prop.clockRate; LOG(INFO)<< "Total constant memory: " << prop.totalConstMem; LOG(INFO)<< "Texture alignment: " << prop.textureAlignment; LOG(INFO)<< "Concurrent copy and execution: " << (prop.deviceOverlap ? "Yes" : "No"); LOG(INFO)<< "Number of multiprocessors: " << prop.multiProcessorCount; LOG(INFO)<< "Kernel execution timeout: " << (prop.kernelExecTimeoutEnabled ? "Yes" : "No"); } #endif // USE_CUDA } else { #ifdef USE_GREENTEA // TODO: Complete OpenCL device information of current device #endif // USE_GREENTEA } return; }
inline void InitTensorEngine(int dev_id){ cudaDeviceProp prop; int device_id = 0; int device_count = 0; cudaGetDeviceCount(&device_count); if (dev_id < 0) { #if (MSHADOW_USE_NVML) device_id = AutoSelectDevice(device_count); #endif } else { device_id = dev_id; } utils::Assert( device_id < device_count, "Incorrect Device ID" ); utils::Assert( cudaSetDevice(device_id) == cudaSuccess, "cannot set device" ); cudaGetDeviceProperties(&prop, device_id); printf("Use CUDA Device %d: %s\n", device_id, prop.name); cublasInit(); }
fastest_device() : the_fastest_device_id(0) { int num_devices = 0; cuda_assert( cudaGetDeviceCount( &num_devices ) ); assert( !!num_devices ); size_type max_multiprocessors = 0; for ( size_type device = 0; device != num_devices; ++device ) { cudaDeviceProp properties; cuda_assert( cudaGetDeviceProperties( &properties, device ) ); if ( max_multiprocessors < properties.multiProcessorCount ) { max_multiprocessors = properties.multiProcessorCount; the_fastest_device_id = device; } } }//ctor
void InitCUDA(int device) { /////////////////////////// // CUDA initialisation /////////////////////////// int deviceCount; CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount)); if (deviceCount == 0) std::cout << "There is no device supporting CUDA" << std::endl; CUDA_SAFE_CALL(cudaSetDevice(device)); cudaDeviceProp deviceProp; CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, device)); std::cout << "Device " << device << ": " << deviceProp.name << std::endl; // or // CUT_DEVICE_INIT(); // with --device=1 (num device chosen) }
/**
 * @brief Compresses data stream
 *
 * Compression is a three-stage pipeline: the Burrows-Wheeler transform, the
 * move-to-front transform, and Huffman encoding. The algorithms are
 * described in our paper "Parallel Lossless Data Compression on the GPU".
 * (See the \ref references bibliography).
 *
 * Requirements and buffer layout:
 * - Only the unsigned char datatype is supported.
 * - The input stream (d_uncompressed) must currently be a buffer of
 *   1,048,576 (uchar) elements (~1MB), and \a numElements must be set to
 *   1048576.
 * - The BWT index (d_bwtIndex) is a single int; it is used during the
 *   reverse-BWT stage.
 * - The histogram size pointer (d_histSize) can be ignored and may be a
 *   null pointer.
 * - The histogram (d_hist) is a 256-entry (unsigned int) buffer, used to
 *   construct the Huffman tree during decoding.
 * - The encoded offset table (d_encodeOffset) is a 256-entry (unsigned int)
 *   buffer. The input is compressed in blocks of 4096 characters, and the
 *   table gives the offset at which each block starts in the compressed
 *   data. The first uint at each starting offset gives the size (in words)
 *   of that compressed block, which allows each 4096-character block to be
 *   decompressed in parallel.
 * - The compressed size (d_compressedSize) is a uint giving the final size
 *   (in words) of the compressed data.
 * - The compressed stream (d_compressed) is a uint buffer. The user should
 *   allocate enough memory for the worst case (no compression occurs).
 *
 * The call fails with CUDPP_ERROR_ILLEGAL_CONFIGURATION when the current
 * device has compute capability below 2.0, when the plan's datatype is not
 * CUDPP_UCHAR, or when \a numElements is not 1048576; with
 * CUDPP_ERROR_INVALID_HANDLE when the handle does not resolve to a plan;
 * and with CUDPP_ERROR_INVALID_PLAN when the plan's algorithm is not
 * CUDPP_COMPRESS.
 *
 * @param[out] d_bwtIndex BWT Index (int)
 * @param[out] d_histSize Histogram size (ignored, null ptr)
 * @param[out] d_hist Histogram (256-entry, uint)
 * @param[out] d_encodeOffset Encoded offset table (256-entry, uint)
 * @param[out] d_compressedSize Size of compressed data (uint)
 * @param[out] d_compressed Compressed data
 * @param[in] planHandle Handle to plan for compressor
 * @param[in] d_uncompressed Uncompressed data
 * @param[in] numElements Number of elements to compress
 * @returns CUDPPResult indicating success or error condition
 *
 * @see cudppPlan, CUDPPConfiguration, CUDPPAlgorithm
 */
CUDPP_DLL
CUDPPResult cudppCompress(CUDPPHandle planHandle,
                          unsigned char *d_uncompressed,
                          int *d_bwtIndex,
                          unsigned int *d_histSize,
                          unsigned int *d_hist,
                          unsigned int *d_encodeOffset,
                          unsigned int *d_compressedSize,
                          unsigned int *d_compressed,
                          size_t numElements)
{
    // The compressor is only supported on devices with compute capability
    // 2.0 or greater, so check the current device first.
    int currentDevice;
    cudaGetDevice(&currentDevice);

    cudaDeviceProp props;
    cudaGetDeviceProperties(&props, currentDevice);

    if ((int)props.major < 2)
    {
        return CUDPP_ERROR_ILLEGAL_CONFIGURATION;
    }

    CUDPPCompressPlan *plan =
        (CUDPPCompressPlan *) getPlanPtrFromHandle<CUDPPCompressPlan>(planHandle);

    if (plan == NULL)
    {
        return CUDPP_ERROR_INVALID_HANDLE;
    }

    if (plan->m_config.algorithm != CUDPP_COMPRESS)
    {
        return CUDPP_ERROR_INVALID_PLAN;
    }

    if (plan->m_config.datatype != CUDPP_UCHAR)
    {
        return CUDPP_ERROR_ILLEGAL_CONFIGURATION;
    }

    if (numElements != 1048576)
    {
        return CUDPP_ERROR_ILLEGAL_CONFIGURATION;
    }

    cudppCompressDispatch(d_uncompressed, d_bwtIndex, d_histSize, d_hist,
                          d_encodeOffset, d_compressedSize, d_compressed,
                          numElements, plan);

    return CUDPP_SUCCESS;
}
/**
 * Computes how many CTAs of `kernel` can run simultaneously on the current
 * device and records the result through setNumCTAs.
 *
 * The calculation follows the CUDA Occupancy Calculator spreadsheet: the
 * per-SM CTA limit is the smallest of the register limit, the shared-memory
 * limit, the thread limit and the hard cap of blocks per SM; that limit is
 * then multiplied by the number of multiprocessors.
 *
 * @param kernel            kernel whose attributes are queried
 * @param smemDynamicBytes  dynamic shared memory added to the kernel's
 *                          static shared-memory size
 * @param bManualCoalesce   selects 768 (true) or 1024 (false) as the
 *                          maximum number of threads per SM
 */
void computeNumCTAs(KernelPointer kernel, int smemDynamicBytes, bool bManualCoalesce)
{
    cudaDeviceProp devprop;
    int currentDevice = -1;
    cudaError_t err = cudaGetDevice(&currentDevice);
    assert(err == cudaSuccess);
    cudaGetDeviceProperties(&devprop, currentDevice);

    // Allocation granularities and per-SM limits used by the calculation.
    const unsigned int regAllocationUnit =
        (devprop.major < 2 && devprop.minor < 2) ? 256 : 512;   // in registers
    const unsigned int warpAllocationMultiple = 2;
    const unsigned int smemAllocationUnit = 512;                // in bytes
    // sm_12 GPUs increase threads/SM to 1024
    const unsigned int maxThreadsPerSM = bManualCoalesce ? 768 : 1024;
    const unsigned int maxBlocksPerSM = 8;

    cudaFuncAttributes attr;
    err = cudaFuncGetAttributes(&attr, (const char*)kernel);
    assert(err == cudaSuccess);

    // Warps per CTA (round up to nearest whole multiple of warp size),
    // then rounded up to the warp allocation multiple.
    size_t warpsPerCTA = multiple(RadixSort::CTA_SIZE, devprop.warpSize);
    warpsPerCTA = ceiling(warpsPerCTA, warpAllocationMultiple);

    // Registers per CTA: regs per thread times warp size times warps,
    // rounded up to the register allocation unit.
    size_t regsPerCTA = attr.numRegs * devprop.warpSize * warpsPerCTA;
    regsPerCTA = ceiling(regsPerCTA, regAllocationUnit);

    // Shared memory per CTA: static plus dynamic, rounded up to the
    // shared-memory allocation unit.
    size_t smemBytes = attr.sharedSizeBytes + smemDynamicBytes;
    size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit);

    // A resource that is not used at all does not constrain the CTA count,
    // so its limit defaults to the blocks-per-SM cap.
    size_t ctaLimitRegs = maxBlocksPerSM;
    if (regsPerCTA > 0)
    {
        ctaLimitRegs = devprop.regsPerBlock / regsPerCTA;
    }

    size_t ctaLimitSMem = maxBlocksPerSM;
    if (smemPerCTA > 0)
    {
        ctaLimitSMem = devprop.sharedMemPerBlock / smemPerCTA;
    }

    size_t ctaLimitThreads = maxThreadsPerSM / RadixSort::CTA_SIZE;

    // Tightest of the four limits.
    size_t ctaLimit = std::min<size_t>(ctaLimitThreads, maxBlocksPerSM);
    ctaLimit = std::min<size_t>(ctaLimitSMem, ctaLimit);
    ctaLimit = std::min<size_t>(ctaLimitRegs, ctaLimit);

    unsigned int numSMs = devprop.multiProcessorCount;
    int maxCTAs = numSMs * ctaLimit;

    setNumCTAs(kernel, maxCTAs);
}
static void do_main() { Matrix A; double *x, *b; printf("#############################################################################################\n"); printf("** B I C G S T A B S O L V E R **\n"); printf("#############################################################################################\n\n"); cudaDeviceProp props; CUDA_SAFE_CALL( cudaGetDeviceProperties(&props, 0) ); printf("** DEVICE : %10s (ECC: %3s) **\n", props.name, props.ECCEnabled ? "ON" : "OFF"); printf("\n#############################################################################################\n\n"); Context ctx; ctx.read_from_file("config.txt"); read_system_from_file(&ctx, "res/matrix.inp", &A, &x, &b); cudaEvent_t start, stop; CUDA_SAFE_CALL( cudaEventCreate(&start) ); CUDA_SAFE_CALL( cudaEventCreate(&stop) ); CUDA_SAFE_CALL( cudaEventRecord(start) ); //Jacobi solver(0.6); Bicgstab solver; solver.setup(&ctx, &A); solver.solve(&ctx, &A, x, b); float elapsed_time = 0.0f; CUDA_SAFE_CALL( cudaEventRecord(stop) ); CUDA_SAFE_CALL( cudaEventSynchronize(stop) ); CUDA_SAFE_CALL( cudaEventElapsedTime(&elapsed_time, start, stop) ); printf("** ELAPSED TIME: %9.3fms **\n", elapsed_time); printf("\n#############################################################################################\n\n"); CUDA_SAFE_CALL( cudaEventDestroy(stop) ); CUDA_SAFE_CALL( cudaEventDestroy(start) ); CUDA_SAFE_CALL( cudaFree(x) ); CUDA_SAFE_CALL( cudaFree(b) ); }
char CudaBase::CheckCUDevice() { int deviceCount = 0; if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) { std::cout << "Cannot find CUDA device!"; return 0; } if(deviceCount>0) { std::cout << "Found " << deviceCount << " device(s)\n"; int driverVersion = 0, runtimeVersion = 0; cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, 0); cudaDriverGetVersion(&driverVersion); cudaRuntimeGetVersion(&runtimeVersion); std::cout << " Device name: " << deviceProp.name<<"\n"; std::cout << " Diver Version: " << driverVersion<<"\n"; std::cout << " Runtime Version: " << runtimeVersion<<"\n"; std::cout << " Capability Major/Minor version number: "<<deviceProp.major<<"."<<deviceProp.minor<<"\n"; std::cout << " Total amount of global memory: "<<(unsigned long long)deviceProp.totalGlobalMem<<" bytes\n"; std::cout << " Total amount of constant memory: "<<deviceProp.totalConstMem<<"bytes\n"; std::cout << " Total amount of shared memory per block: "<<deviceProp.sharedMemPerBlock<<" bytes\n"; std::cout << " Total number of registers available per block: "<<deviceProp.regsPerBlock<<"\n"; std::cout << " Warp size: "<<deviceProp.warpSize<<"\n"; std::stringstream sst; sst<<" Maximum sizes of each dimension of a grid: "<<deviceProp.maxGridSize[0]<<" x "<<deviceProp.maxGridSize[1]<<" x "<<deviceProp.maxGridSize[2]; std::cout<<sst.str()<<"\n"; sst.str(""); sst<<" Maximum sizes of each dimension of a block: "<<deviceProp.maxThreadsDim[0]<<" x "<<deviceProp.maxThreadsDim[1]<<" x "<<deviceProp.maxThreadsDim[2]; std::cout<<sst.str()<<"\n"; std::cout << " Maximum number of threads per block: " << deviceProp.maxThreadsPerBlock<<"\n"; MaxThreadPerBlock = deviceProp.maxThreadsPerBlock; MaxRegisterPerBlock = deviceProp.regsPerBlock; MaxSharedMemoryPerBlock = deviceProp.sharedMemPerBlock; WarpSize = deviceProp.warpSize; RuntimeVersion = runtimeVersion; return 1; } return 0; }
//set up context and queue on a device and retrieve //device information for other methods dpClient::dpClient(int plat, int dev){ platform = plat; device = dev; nameFixer(platform, device, platName, devName); fprintf(stderr,"On Platform %s\n", platName); fprintf(stderr,"using device %s\n", devName); //OpenCL if (platform != 3){ cl_platform_id platform_ids[16]; cl_device_id device_ids[16]; unsigned int numDevices; int err; cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,0,0}; clErrChk(clGetPlatformIDs(16, platform_ids, NULL)); clErrChk(clGetDeviceIDs(platform_ids[platform], CL_DEVICE_TYPE_ALL, 16, device_ids, &numDevices)); clErrChk(clGetDeviceInfo(device_ids[device], CL_DEVICE_MAX_WORK_GROUP_SIZE , sizeof(MaxWorkGroupSize), &MaxWorkGroupSize, NULL)); clErrChk(clGetDeviceInfo(device_ids[device], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(MaxComputeUnits), &MaxComputeUnits, NULL)); clErrChk(clGetDeviceInfo(device_ids[device], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(MaxWorkDim), &MaxWorkDim, NULL)); clErrChk(clGetDeviceInfo(device_ids[device], CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(MaxMemAlloc), &MaxMemAlloc, NULL)); strcpy(type, "OpenCL"); props[1] = (cl_context_properties) platform_ids[platform]; context = clCreateContext(props, 1, &device_ids[device], NULL, NULL, &err); clErrChk(err); queue = clCreateCommandQueue( context, device_ids[device], 0, &err); clErrChk(err); } //CUDA else{ //CLIENT: cudaDeviceProp properties; cudaGetDeviceProperties(&properties, device); cudaSetDevice(device); MaxWorkGroupSize = 1; MaxMemAlloc = 100000; strcpy(type, "CUDA"); } }
/// Utility function to tweak problem size for small GPUs int adjustProblemSize(int GPU_N, int default_nOptions) { int nOptions = default_nOptions; // select problem size for (int i=0; i<GPU_N; i++) { cudaDeviceProp deviceProp; checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i)); int cudaCores = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount; if (cudaCores <= 32) { nOptions = (nOptions < cudaCores/2 ? nOptions : cudaCores/2); } } return nOptions; }
void cuda_devicenames() { cudaError_t err; int GPU_N; err = cudaGetDeviceCount(&GPU_N); if (err != cudaSuccess) { applog(LOG_ERR, "Unable to query number of CUDA devices! Is an nVidia driver installed?"); exit(1); } for (int i = 0; i < GPU_N*opt_n_gputhreads; i++) { cudaDeviceProp props; cudaGetDeviceProperties(&props, device_map[i / opt_n_gputhreads]); device_name[i] = strdup(props.name); device_sm[i] = (props.major * 100 + props.minor * 10); } }
void CudaUtils::printDevices() { int count; if (cudaGetDeviceCount(&count)) return; qDebug() << "Found" << count << "CUDA device(s)"; for (int i=0; i < count; i++) { cudaDeviceProp prop; cudaGetDeviceProperties(&prop, i); QString deviceString = QString("* %1, Compute capability: %2.%3").arg(prop.name).arg(prop.major).arg(prop.minor); QString propString1 = QString(" Global mem: %1M, Shared mem per block: %2k, Registers per block: %3").arg(prop.totalGlobalMem / 1024 / 1024) .arg(prop.sharedMemPerBlock / 1024).arg(prop.regsPerBlock); QString propString2 = QString(" Warp size: %1 threads, Max threads per block: %2, Multiprocessor count: %3") .arg(prop.warpSize).arg(prop.maxThreadsPerBlock).arg(prop.multiProcessorCount); qDebug() << deviceString; qDebug() << propString1; qDebug() << propString2; } }