bool configureGpu(bool use_gpu_acceleration, std::vector<int> &valid_devices, int use_all_gpus, int &numBkgWorkers_gpu) { #ifdef ION_COMPILE_CUDA const unsigned long long gpu_mem = 2.5 * 1024 * 1024 * 1024; if (!use_gpu_acceleration) return false; // Get number of GPUs in system int num_gpus = 0; cudaError_t err = cudaGetDeviceCount( &num_gpus ); if (err != cudaSuccess) { printf("CUDA: No GPU device available. Defaulting to CPU only computation\n"); return false; } if ( use_all_gpus ) { // Add all GPUs to the valid device list for ( int dev = 0; dev < num_gpus; dev++ ) valid_devices.push_back(dev); } else { // Only add the highest compute devices to the compute list int version = 0; int major = 0; int minor = 0; cudaDeviceProp dev_props; // Iterate over GPUs to find the highest compute device for ( int dev = 0; dev < num_gpus; dev++ ) { cudaGetDeviceProperties( &dev_props, dev ); if ( (dev_props.major*10) + dev_props.minor > version ) { version = (dev_props.major*10) + dev_props.minor; major = dev_props.major; minor = dev_props.minor; } } for ( int dev = 0; dev < num_gpus; dev++ ) { cudaGetDeviceProperties(&dev_props, dev); if (dev_props.major == major && dev_props.minor == minor) { if (dev_props.totalGlobalMem > gpu_mem) { valid_devices.push_back(dev); } } } } // Set the number of GPU workers and tell CUDA about our list of valid devices if (valid_devices.size() > 0) { numBkgWorkers_gpu = int(valid_devices.size()); cudaSetValidDevices( &valid_devices[0], int( valid_devices.size() ) ); } else { printf("CUDA: No GPU device available. Defaulting to CPU only computation\n"); return false; } PoissonCDFApproxMemo poiss_cache; poiss_cache.Allocate (MAX_POISSON_TABLE_COL,MAX_POISSON_TABLE_ROW,POISSON_TABLE_STEP); poiss_cache.GenerateValues(); // fill out my table for(int i=valid_devices.size()-1 ; i >= 0; i--){ try{ //cudaSetDevice(valid_devices[i]); cout << "CUDA "<< valid_devices[i] << ": Creating Context and Constant memory on device with id: "<< valid_devices[i]<< endl; InitConstantMemoryOnGpu(valid_devices[i],poiss_cache); } catch(cudaException &e) { cout << "CUDA "<< valid_devices[i] << ": Context could not be created. removing device with id: "<< valid_devices[i] << " from valid device list" << endl; valid_devices.erase (valid_devices.begin()+i); numBkgWorkers_gpu -= 1; if(numBkgWorkers_gpu == 0) cout << "CUDA: no context could be created, defaulting to CPU only execution" << endl; } } if(numBkgWorkers_gpu == 0) return false; return true; #else return false; #endif }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { pArgc = &argc; pArgv = argv; printf("%s Starting...\n\n", argv[0]); printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n"); int deviceCount = 0; cudaError_t error_id = cudaGetDeviceCount(&deviceCount); if (error_id != cudaSuccess) { printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id)); exit(EXIT_FAILURE); } // This function call returns 0 if there are no CUDA capable devices. if (deviceCount == 0) { printf("There are no available device(s) that support CUDA\n"); } else { printf("Detected %d CUDA Capable device(s)\n", deviceCount); } int dev, driverVersion = 0, runtimeVersion = 0; for (dev = 0; dev < deviceCount; ++dev) { cudaSetDevice(dev); cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name); // Console log cudaDriverGetVersion(&driverVersion); cudaRuntimeGetVersion(&runtimeVersion); printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10); printf(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor); char msg[256]; sprintf(msg, " Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem); printf("%s", msg); printf(" (%2d) Multiprocessors x (%3d) CUDA Cores/MP: %d CUDA Cores\n", deviceProp.multiProcessorCount, _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount); printf(" GPU Clock rate: %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f); #if CUDART_VERSION >= 5000 // This is supported in CUDA 5.0 (runtime API device properties) printf(" Memory Clock rate: %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f); printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth); if (deviceProp.l2CacheSize) { printf(" L2 Cache Size: %d bytes\n", deviceProp.l2CacheSize); } #else // This only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API) int memoryClock; getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev); printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f); int memBusWidth; getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev); printf(" Memory Bus Width: %d-bit\n", memBusWidth); int L2CacheSize; getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev); if (L2CacheSize) { printf(" L2 Cache Size: %d bytes\n", L2CacheSize); } #endif printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", deviceProp.maxTexture1D , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1], deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]); printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1], deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]); printf(" Total amount of constant memory: %lu bytes\n", deviceProp.totalConstMem); printf(" Total amount of shared memory per block: %lu bytes\n", deviceProp.sharedMemPerBlock); printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock); printf(" Warp size: %d\n", deviceProp.warpSize); printf(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor); printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock); printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]); printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]); printf(" Maximum memory pitch: %lu bytes\n", deviceProp.memPitch); printf(" Texture alignment: %lu bytes\n", deviceProp.textureAlignment); printf(" Concurrent copy and kernel execution: %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount); printf(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No"); printf(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No"); printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No"); printf(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No"); printf(" Device has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled"); #ifdef WIN32 printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n", deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)"); #endif printf(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No"); printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", deviceProp.pciBusID, deviceProp.pciDeviceID); const char *sComputeMode[] = { "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", "Prohibited (no host thread can use ::cudaSetDevice() with this device)", "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", "Unknown", NULL }; printf(" Compute Mode:\n"); printf(" < %s >\n", sComputeMode[deviceProp.computeMode]); } // csv masterlog info // ***************************** // exe and CUDA driver name printf("\n"); std::string sProfileString = "deviceQuery, CUDA Driver = CUDART"; char cTemp[16]; // driver version sProfileString += ", CUDA Driver Version = "; #ifdef WIN32 sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10); #else sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10); #endif sProfileString += cTemp; // Runtime version sProfileString += ", CUDA Runtime Version = "; #ifdef WIN32 sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10); #else sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10); #endif sProfileString += cTemp; // Device count sProfileString += ", NumDevs = "; #ifdef WIN32 sprintf_s(cTemp, 10, "%d", deviceCount); #else sprintf(cTemp, "%d", deviceCount); #endif sProfileString += cTemp; // Print Out all device Names for (dev = 0; dev < deviceCount; ++dev) { #ifdef _WIN32 sprintf_s(cTemp, 13, ", Device%d = ", dev); #else sprintf(cTemp, ", Device%d = ", dev); #endif cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); sProfileString += cTemp; sProfileString += deviceProp.name; } sProfileString += "\n"; printf("%s", sProfileString.c_str()); // finish exit(EXIT_SUCCESS); }
void initQuda(int dev) { static int initialized = 0; if (initialized) { return; } initialized = 1; #if (CUDA_VERSION >= 4000) && defined(MULTI_GPU) //check if CUDA_NIC_INTEROP is set to 1 in the enviroment char* cni_str = getenv("CUDA_NIC_INTEROP"); if(cni_str == NULL){ errorQuda("Environment variable CUDA_NIC_INTEROP is not set\n"); } int cni_int = atoi(cni_str); if (cni_int != 1){ errorQuda("Environment variable CUDA_NIC_INTEROP is not set to 1\n"); } #endif int deviceCount; cudaGetDeviceCount(&deviceCount); if (deviceCount == 0) { errorQuda("No devices supporting CUDA"); } for(int i=0; i<deviceCount; i++) { cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, i); printfQuda("QUDA: Found device %d: %s\n", i, deviceProp.name); } #ifdef QMP_COMMS int ndim; const int *dim; if ( QMP_is_initialized() != QMP_TRUE ) { errorQuda("QMP is not initialized"); } num_QMP=QMP_get_number_of_nodes(); rank_QMP=QMP_get_node_number(); dev += rank_QMP % deviceCount; ndim = QMP_get_logical_number_of_dimensions(); dim = QMP_get_logical_dimensions(); #elif defined(MPI_COMMS) comm_init(); dev=comm_gpuid(); #else if (dev < 0) dev = deviceCount - 1; #endif // Used for applying the gauge field boundary condition if( commCoords(3) == 0 ) qudaPt0=true; else qudaPt0=false; if( commCoords(3) == commDim(3)-1 ) qudaPtNm1=true; else qudaPtNm1=false; cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); if (deviceProp.major < 1) { errorQuda("Device %d does not support CUDA", dev); } printfQuda("QUDA: Using device %d: %s\n", dev, deviceProp.name); cudaSetDevice(dev); #ifdef HAVE_NUMA if(numa_config_set){ if(gpu_affinity[dev] >=0){ printfQuda("Numa setting to cpu node %d\n", gpu_affinity[dev]); if(numa_run_on_node(gpu_affinity[dev]) != 0){ printfQuda("Warning: Setting numa to cpu node %d failed\n", gpu_affinity[dev]); } } } #endif initCache(); quda::initBlas(); }
/** Initialize hardware counters, setup the function vector table * and get hardware information, this routine is called when the * PAPI process is initialized (IE PAPI_library_init) */ int _papi_nvml_init_substrate( int cidx ) { nvmlReturn_t ret; cudaError_t cuerr; int cuda_count = 0; unsigned int nvml_count = 0; ret = nvmlInit(); if ( NVML_SUCCESS != ret ) { strcpy(_nvml_vector.cmp_info.disabled_reason, "The NVIDIA managament library failed to initialize."); goto disable; } cuerr = cuInit( 0 ); if ( CUDA_SUCCESS != cuerr ) { strcpy(_nvml_vector.cmp_info.disabled_reason, "The CUDA library failed to initialize."); goto disable; } /* Figure out the number of CUDA devices in the system */ ret = nvmlDeviceGetCount( &nvml_count ); if ( NVML_SUCCESS != ret ) { strcpy(_nvml_vector.cmp_info.disabled_reason, "Unable to get a count of devices from the NVIDIA managament library."); goto disable; } cuerr = cudaGetDeviceCount( &cuda_count ); if ( CUDA_SUCCESS != cuerr ) { strcpy(_nvml_vector.cmp_info.disabled_reason, "Unable to get a device count from CUDA."); goto disable; } /* We can probably recover from this, when we're clever */ if ( nvml_count != cuda_count ) { strcpy(_nvml_vector.cmp_info.disabled_reason, "Cuda and the NVIDIA managament library have different device counts."); goto disable; } device_count = cuda_count; /* A per device representation of what events are present */ features = (int*)papi_malloc(sizeof(int) * device_count ); /* Handles to each device */ devices = (nvmlDevice_t*)papi_malloc(sizeof(nvmlDevice_t) * device_count); /* Figure out what events are supported on each card. */ if ( (papi_errorcode = detectDevices( ) ) != PAPI_OK ) { papi_free(features); papi_free(devices); sprintf(_nvml_vector.cmp_info.disabled_reason, "An error occured in device feature detection, please check your NVIDIA Management Library and CUDA install." ); goto disable; } /* The assumption is that if everything went swimmingly in detectDevices, all nvml calls here should be fine. */ createNativeEvents( ); /* Export the total number of events available */ _nvml_vector.cmp_info.num_native_events = num_events; /* Export the component id */ _nvml_vector.cmp_info.CmpIdx = cidx; /* Export the number of 'counters' */ _nvml_vector.cmp_info.num_cntrs = num_events; return PAPI_OK; disable: _nvml_vector.cmp_info.num_cntrs = 0; return PAPI_OK; }
/** Documented at declaration */ int gpujpeg_init_device(int device_id, int flags) { int dev_count; cudaGetDeviceCount(&dev_count); if ( dev_count == 0 ) { fprintf(stderr, "[GPUJPEG] [Error] No CUDA enabled device\n"); return -1; } if ( device_id < 0 || device_id >= dev_count ) { fprintf(stderr, "[GPUJPEG] [Error] Selected device %d is out of bound. Devices on your system are in range %d - %d\n", device_id, 0, dev_count - 1); return -1; } struct cudaDeviceProp devProp; if ( cudaSuccess != cudaGetDeviceProperties(&devProp, device_id) ) { fprintf(stderr, "[GPUJPEG] [Error] Can't get CUDA device properties!\n" "[GPUJPEG] [Error] Do you have proper driver for CUDA installed?\n" ); return -1; } if ( devProp.major < 1 ) { fprintf(stderr, "[GPUJPEG] [Error] Device %d does not support CUDA\n", device_id); return -1; } if ( flags & GPUJPEG_OPENGL_INTEROPERABILITY ) { cudaGLSetGLDevice(device_id); gpujpeg_cuda_check_error("Enabling OpenGL interoperability"); } if ( flags & GPUJPEG_VERBOSE ) { int cuda_driver_version = 0; cudaDriverGetVersion(&cuda_driver_version); printf("CUDA driver version: %d.%d\n", cuda_driver_version / 1000, (cuda_driver_version % 100) / 10); int cuda_runtime_version = 0; cudaRuntimeGetVersion(&cuda_runtime_version); printf("CUDA runtime version: %d.%d\n", cuda_runtime_version / 1000, (cuda_runtime_version % 100) / 10); printf("Using Device #%d: %s (c.c. %d.%d)\n", device_id, devProp.name, devProp.major, devProp.minor); } cudaSetDevice(device_id); gpujpeg_cuda_check_error("Set CUDA device"); // Test by simple copying that the device is ready uint8_t data[] = {8}; uint8_t* d_data = NULL; cudaMalloc((void**)&d_data, 1); cudaMemcpy(d_data, data, 1, cudaMemcpyHostToDevice); cudaFree(d_data); cudaError_t error = cudaGetLastError(); if ( cudaSuccess != error ) { fprintf(stderr, "[GPUJPEG] [Error] Failed to initialize CUDA device.\n"); if ( flags & GPUJPEG_OPENGL_INTEROPERABILITY ) fprintf(stderr, "[GPUJPEG] [Info] OpenGL interoperability is used, is OpenGL context available?\n"); return -1; } return 0; }
// This function returns the best GPU (with maximum GFLOPS) inline int gpuGetMaxGflopsDeviceId() { int current_device = 0, sm_per_multiproc = 0; int max_perf_device = 0; int device_count = 0, best_SM_arch = 0; int devices_prohibited = 0; unsigned long long max_compute_perf = 0; cudaDeviceProp deviceProp; cudaGetDeviceCount(&device_count); checkCudaErrors(cudaGetDeviceCount(&device_count)); if (device_count == 0) { fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: no devices supporting CUDA.\n"); exit(EXIT_FAILURE); } // Find the best major SM Architecture GPU device while (current_device < device_count) { cudaGetDeviceProperties(&deviceProp, current_device); // If this GPU is not running on Compute Mode prohibited, then we can add it to the list if (deviceProp.computeMode != cudaComputeModeProhibited) { if (deviceProp.major > 0 && deviceProp.major < 9999) { best_SM_arch = MAX(best_SM_arch, deviceProp.major); } } else { devices_prohibited++; } current_device++; } if (devices_prohibited == device_count) { fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: all devices have compute mode prohibited.\n"); exit(EXIT_FAILURE); } // Find the best CUDA capable GPU device current_device = 0; while (current_device < device_count) { cudaGetDeviceProperties(&deviceProp, current_device); // If this GPU is not running on Compute Mode prohibited, then we can add it to the list if (deviceProp.computeMode != cudaComputeModeProhibited) { if (deviceProp.major == 9999 && deviceProp.minor == 9999) { sm_per_multiproc = 1; } else { sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor); } unsigned long long compute_perf = (unsigned long long) deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate; if (compute_perf > max_compute_perf) { // If we find GPU with SM major > 2, search only these if (best_SM_arch > 2) { // If our device==dest_SM_arch, choose this, or else pass if (deviceProp.major == best_SM_arch) { max_compute_perf = compute_perf; max_perf_device = current_device; } } else { max_compute_perf = compute_perf; max_perf_device = current_device; } } } ++current_device; } return max_perf_device; }
magma_int_t magma_zparse_opts( int argc, char** argv, magma_zopts *opts, int *matrices, magma_queue_t queue ) { // negative flag indicating -m, -n, -k not given int m = -1; int n = -1; int k = -1; // fill in default values opts->input_format = Magma_CSR; opts->blocksize = 8; opts->alignment = 8; opts->output_format = Magma_CSR; opts->input_location = Magma_CPU; opts->output_location = Magma_CPU; opts->scaling = Magma_NOSCALE; opts->solver_par.epsilon = 10e-16; opts->solver_par.maxiter = 1000; opts->solver_par.verbose = 0; opts->solver_par.version = 0; opts->solver_par.restart = 30; opts->solver_par.num_eigenvalues = 0; opts->precond_par.solver = Magma_JACOBI; opts->precond_par.epsilon = 0.01; opts->precond_par.maxiter = 100; opts->precond_par.restart = 10; opts->solver_par.solver = Magma_CG; printf( usage_sparse_short, argv[0] ); int ndevices; cudaGetDeviceCount( &ndevices ); int info; int ntest = 0; for( int i = 1; i < argc; ++i ) { if ( strcmp("--format", argv[i]) == 0 && i+1 < argc ) { info = atoi( argv[++i] ); switch( info ) { case 0: opts->output_format = Magma_CSR; break; case 1: opts->output_format = Magma_ELL; break; case 2: opts->output_format = Magma_SELLP; break; //case 2: opts->output_format = Magma_ELLRT; break; } } else if ( strcmp("--mscale", argv[i]) == 0 && i+1 < argc ) { info = atoi( argv[++i] ); switch( info ) { case 0: opts->scaling = Magma_NOSCALE; break; case 1: opts->scaling = Magma_UNITDIAG; break; case 2: opts->scaling = Magma_UNITROW; break; } } else if ( strcmp("--solver", argv[i]) == 0 && i+1 < argc ) { info = atoi( argv[++i] ); switch( info ) { case 0: opts->solver_par.solver = Magma_CG; break; case 1: opts->solver_par.solver = Magma_CGMERGE; break; case 2: opts->solver_par.solver = Magma_PCG; break; case 3: opts->solver_par.solver = Magma_BICGSTAB; break; case 4: opts->solver_par.solver = Magma_BICGSTABMERGE; break; case 5: opts->solver_par.solver = Magma_PBICGSTAB; break; case 6: opts->solver_par.solver = Magma_GMRES; break; case 7: opts->solver_par.solver = Magma_PGMRES; break; case 8: opts->solver_par.solver = Magma_LOBPCG; opts->solver_par.num_eigenvalues = 16;break; case 9: opts->solver_par.solver = Magma_JACOBI; break; case 10: opts->solver_par.solver = Magma_BAITER; break; case 21: opts->solver_par.solver = Magma_ITERREF; break; } } else if ( strcmp("--restart", argv[i]) == 0 && i+1 < argc ) { opts->solver_par.restart = atoi( argv[++i] ); } else if ( strcmp("--precond", argv[i]) == 0 && i+1 < argc ) { info = atoi( argv[++i] ); switch( info ) { case 0: opts->precond_par.solver = Magma_NONE; break; case 1: opts->precond_par.solver = Magma_JACOBI; break; case 2: opts->precond_par.solver = Magma_ILU; break; case 3: opts->precond_par.solver = Magma_CG; break; case 4: opts->precond_par.solver = Magma_BICGSTAB; break; case 5: opts->precond_par.solver = Magma_GMRES; break; case 6: opts->precond_par.solver = Magma_BAITER; break; } } else if ( strcmp("--ptol", argv[i]) == 0 && i+1 < argc ) { sscanf( argv[++i], "%lf", &opts->precond_par.epsilon ); } else if ( strcmp("--blocksize", argv[i]) == 0 && i+1 < argc ) { opts->blocksize = atoi( argv[++i] ); } else if ( strcmp("--alignment", argv[i]) == 0 && i+1 < argc ) { opts->alignment = atoi( argv[++i] ); } else if ( strcmp("--verbose", argv[i]) == 0 && i+1 < argc ) { opts->solver_par.verbose = atoi( argv[++i] ); } else if ( strcmp("--maxiter", argv[i]) == 0 && i+1 < argc ) { opts->solver_par.maxiter = atoi( argv[++i] ); } else if ( strcmp("--tol", argv[i]) == 0 && i+1 < argc ) { sscanf( argv[++i], "%lf", &opts->solver_par.epsilon ); } else if ( strcmp("--ev", argv[i]) == 0 && i+1 < argc ) { opts->solver_par.num_eigenvalues = atoi( argv[++i] ); } else if ( strcmp("--version", argv[i]) == 0 && i+1 < argc ) { opts->solver_par.version = atoi( argv[++i] ); } // ----- usage else if ( strcmp("-h", argv[i]) == 0 || strcmp("--help", argv[i]) == 0 ) { fprintf( stderr, usage_sparse, argv[0] ); return -1; } else { *matrices = i; break; } } // ensure to take a symmetric preconditioner for the PCG if ( opts->solver_par.solver == Magma_PCG && opts->precond_par.solver == Magma_ILU ) opts->precond_par.solver = Magma_ICC; return MAGMA_SUCCESS; }
// Parse command line clp_return_type parse_cmdline( int argc , char ** argv, CMD & cmdline, const Teuchos::Comm<int>& comm, const bool uq ) { Teuchos::ParameterList params; Teuchos::CommandLineProcessor clp(false); const int num_grouping_types = 4; const GroupingType grouping_values[] = { GROUPING_NATURAL, GROUPING_MAX_ANISOTROPY, GROUPING_MORTAN_Z, GROUPING_TASMANIAN_SURROGATE }; const char *grouping_names[] = { "natural", "max-anisotropy", "mortan-z", "tasmanian-surrogate" }; const int num_sampling_types = 3; const SamplingType sampling_values[] = { SAMPLING_STOKHOS, SAMPLING_TASMANIAN, SAMPLING_FILE }; const char *sampling_names[] = { "stokhos", "tasmanian", "file" }; clp.setOption("serial", "no-serial", &cmdline.USE_SERIAL, "use the serial device"); clp.setOption("threads", &cmdline.USE_THREADS, "number of pthreads threads"); clp.setOption("openmp", &cmdline.USE_OPENMP, "number of openmp threads"); clp.setOption("numa", &cmdline.USE_NUMA, "number of numa nodes"); clp.setOption("cores", &cmdline.USE_CORE_PER_NUMA, "cores per numa node"); clp.setOption("cuda", "no-cuda", &cmdline.USE_CUDA, "use the CUDA device"); clp.setOption("device", &cmdline.USE_CUDA_DEV, "CUDA device ID. Set to default of -1 to use the default device as determined by the local node MPI rank and --ngpus"); clp.setOption("ngpus", &cmdline.USE_NGPUS, "Number of GPUs per node for multi-GPU runs via MPI"); std::string fixtureSpec="2x2x2"; clp.setOption("fixture", &fixtureSpec, "fixture string: \"XxYxZ\""); clp.setOption("fixture-x", &cmdline.USE_FIXTURE_X, "fixture"); clp.setOption("fixture-y", &cmdline.USE_FIXTURE_Y, "fixture"); clp.setOption("fixture-z", &cmdline.USE_FIXTURE_Z, "fixture"); clp.setOption("fixture-quadratic", "no-fixture-quadratic", &cmdline.USE_FIXTURE_QUADRATIC, "quadratic"); clp.setOption("atomic", "no-atomic", &cmdline.USE_ATOMIC , "atomic"); clp.setOption("trials", &cmdline.USE_TRIALS, "trials"); clp.setOption("xml-file", &cmdline.USE_FENL_XML_FILE, "XML file containing solver parameters"); clp.setOption("belos", "no-belos", &cmdline.USE_BELOS , "use Belos solver"); clp.setOption("muelu", "no-muelu", &cmdline.USE_MUELU, "use MueLu preconditioner"); clp.setOption("mean-based", "no-mean-based", &cmdline.USE_MEANBASED, "use mean-based preconditioner"); if(cmdline.USE_MUELU || cmdline.USE_MEANBASED) cmdline.USE_BELOS = true; clp.setOption("sampling", &cmdline.USE_UQ_SAMPLING, num_sampling_types, sampling_values, sampling_names, "UQ sampling method"); clp.setOption("uq-fake", &cmdline.USE_UQ_FAKE, "setup a fake UQ problem of this size"); clp.setOption("uq-dim", &cmdline.USE_UQ_DIM, "UQ dimension"); clp.setOption("uq-order", &cmdline.USE_UQ_ORDER, "UQ order"); clp.setOption("uq-init-level", &cmdline.USE_UQ_INIT_LEVEL, "Initial adaptive sparse grid level"); clp.setOption("uq-max-level", &cmdline.USE_UQ_MAX_LEVEL, "Max adaptive sparse grid level"); clp.setOption("uq-max-samples", &cmdline.USE_UQ_MAX_SAMPLES, "Max number of samples to run"); clp.setOption("uq-tol", &cmdline.USE_UQ_TOL, "Adaptive sparse grid tolerance"); clp.setOption("diff-coeff-linear", &cmdline.USE_DIFF_COEFF_LINEAR, "Linear term in diffusion coefficient"); clp.setOption("diff-coeff-constant", &cmdline.USE_DIFF_COEFF_CONSTANT, "Constant term in diffusion coefficient"); clp.setOption("mean", &cmdline.USE_MEAN, "KL diffusion mean"); clp.setOption("var", &cmdline.USE_VAR, "KL diffusion variance"); clp.setOption("cor", &cmdline.USE_COR, "KL diffusion correlation"); clp.setOption("exponential", "no-exponential", &cmdline.USE_EXPONENTIAL, "take exponential of KL diffusion coefficient"); clp.setOption("exp-shift", &cmdline.USE_EXP_SHIFT, "Linear shift of exponential of KL diffusion coefficient"); clp.setOption("exp-scale", &cmdline.USE_EXP_SCALE, "Multiplicative scale of exponential of KL diffusion coefficient"); clp.setOption("discontinuous-exp-scale", "continuous-exp-scale", &cmdline.USE_DISC_EXP_SCALE, "use discontinuous scale factor on exponential"); clp.setOption("isotropic", "anisotropic", &cmdline.USE_ISOTROPIC, "use isotropic or anisotropic diffusion coefficient"); clp.setOption("coeff-src", &cmdline.USE_COEFF_SRC, "Coefficient for source term"); clp.setOption("coeff-adv", &cmdline.USE_COEFF_ADV, "Coefficient for advection term"); clp.setOption("sparse", "tensor", &cmdline.USE_SPARSE , "use sparse or tensor grid"); clp.setOption("ensemble", &cmdline.USE_UQ_ENSEMBLE, "UQ ensemble size. This needs to be a valid choice based on available instantiations."); clp.setOption("grouping", &cmdline.USE_GROUPING, num_grouping_types, grouping_values, grouping_names, "Sample grouping method for ensemble propagation"); clp.setOption("surrogate-grouping-level", &cmdline.TAS_GROUPING_INITIAL_LEVEL, "Starting level for surrogate-based grouping"); clp.setOption("vtune", "no-vtune", &cmdline.VTUNE , "connect to vtune"); clp.setOption("verbose", "no-verbose", &cmdline.VERBOSE, "print verbose intialization info"); clp.setOption("print", "no-print", &cmdline.PRINT, "print detailed test output"); clp.setOption("print-its", "no-print-its",&cmdline.PRINT_ITS, "print solver iterations after each sample"); clp.setOption("summarize", "no-summarize",&cmdline.SUMMARIZE, "summarize Teuchos timers at end of run"); bool doDryRun = false; clp.setOption("echo", "no-echo", &doDryRun, "dry-run only"); switch (clp.parse(argc, argv)) { case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED: return CLP_HELP; case Teuchos::CommandLineProcessor::PARSE_ERROR: case Teuchos::CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION: return CLP_ERROR; case Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL: break; } #if defined( KOKKOS_HAVE_CUDA ) // Set CUDA device based on local node rank if (cmdline.USE_CUDA && cmdline.USE_CUDA_DEV == -1) { int local_rank = 0; char *str; if ((str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK"))) local_rank = std::atoi(str); else if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK"))) local_rank = std::atoi(str); else if ((str = std::getenv("SLURM_LOCALID"))) local_rank = std::atoi(str); cmdline.USE_CUDA_DEV = local_rank % cmdline.USE_NGPUS; // Check device is valid int num_device; cudaGetDeviceCount(&num_device); TEUCHOS_TEST_FOR_EXCEPTION( cmdline.USE_CUDA_DEV >= cmdline.USE_NGPUS, std::logic_error, "Invalid device ID " << cmdline.USE_CUDA_DEV << ". You probably are trying" << " to run with too many GPUs per node"); } #endif sscanf( fixtureSpec.c_str() , "%dx%dx%d" , &cmdline.USE_FIXTURE_X , &cmdline.USE_FIXTURE_Y , &cmdline.USE_FIXTURE_Z ); cmdline.USE_UQ = uq; if (doDryRun) { print_cmdline( std::cout , cmdline ); cmdline.ECHO = 1; } else { cmdline.ECHO = 0; } cmdline.ERROR = 0 ; return CLP_OK; }
int main(int argc, char **argv) { TESTING_INIT(); magma_setdevice(0); magma_timestr_t start, end; float flops, magma_perf, cuda_perf, error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t n_local[4]; FILE *fp ; magma_int_t N, m, i, j, lda, LDA, M; magma_int_t matsize; magma_int_t vecsize; magma_int_t istart = 64; magma_int_t incx = 1; char uplo = MagmaLower; magmaFloatComplex alpha = MAGMA_C_MAKE(1., 0.); // MAGMA_C_MAKE( 1.5, -2.3 ); magmaFloatComplex beta = MAGMA_C_MAKE(0., 0.); // MAGMA_C_MAKE( -0.6, 0.8 ); magmaFloatComplex *A, *X, *Y[4], *Ycublas, *Ymagma; magmaFloatComplex *dA, *dX[4], *dY[4], *d_lA[4], *dYcublas ; magma_queue_t stream[4][10]; magmaFloatComplex *C_work; magmaFloatComplex *dC_work[4]; int max_num_gpus; magma_int_t num_gpus = 1, nb; magma_int_t blocks, lwork; magma_int_t offset = 0; M = 0; N = 0; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) { N = atoi(argv[++i]); istart = N; } else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-NGPU", argv[i])==0) num_gpus = atoi(argv[++i]); else if (strcmp("-offset", argv[i])==0) offset = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (M>0 && N>0) { printf(" testing_chemv_mgpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus); printf(" in %c side \n", uplo); } else { printf("\nUsage: \n"); printf(" testing_chemv_mgpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); exit(1); } } else { #if defined(PRECISION_z) M = N = 8000; #else M = N = 12480; #endif num_gpus = 2; offset = 0; printf("\nUsage: \n"); printf(" testing_chemv_mgpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus); } ////////////////////////////////////////////////////////////////////////// cudaGetDeviceCount(&max_num_gpus); if (num_gpus > max_num_gpus){ printf("More GPUs requested than available. Have to change it.\n"); num_gpus = max_num_gpus; } printf("Number of GPUs to be used = %d\n", (int) num_gpus); for(int i=0; i< num_gpus; i++) { magma_queue_create(&stream[i][0]); } LDA = ((N+31)/32)*32; matsize = N*LDA; vecsize = N*incx; nb = 32; //nb = 64; printf("block size = %d\n", (int) nb); TESTING_MALLOC_CPU( A, magmaFloatComplex, matsize ); TESTING_MALLOC_CPU( X, magmaFloatComplex, vecsize ); TESTING_MALLOC_CPU( Ycublas, magmaFloatComplex, vecsize ); TESTING_MALLOC_CPU( Ymagma, magmaFloatComplex, vecsize ); for(i=0; i<num_gpus; i++) { TESTING_MALLOC_CPU( Y[i], magmaFloatComplex, vecsize ); } magma_setdevice(0); TESTING_MALLOC_DEV( dA, magmaFloatComplex, matsize ); TESTING_MALLOC_DEV( dYcublas, magmaFloatComplex, vecsize ); for(i=0; i<num_gpus; i++) { n_local[i] = ((N/nb)/num_gpus)*nb; if (i < (N/nb)%num_gpus) n_local[i] += nb; else if (i == (N/nb)%num_gpus) n_local[i] += N%nb; magma_setdevice(i); TESTING_MALLOC_DEV( d_lA[i], magmaFloatComplex, LDA*n_local[i] );// potentially bugged TESTING_MALLOC_DEV( dX[i], magmaFloatComplex, vecsize ); TESTING_MALLOC_DEV( dY[i], magmaFloatComplex, vecsize ); printf("device %2d n_local = %4d\n", (int) i, (int) n_local[i]); } magma_setdevice(0); ////////////////////////////////////////////////////////////////////////// /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &matsize, A ); magma_cmake_hermitian( N, A, LDA ); blocks = N / nb + (N % nb != 0); lwork = LDA * (blocks + 1); TESTING_MALLOC_CPU( C_work, magmaFloatComplex, lwork ); for(i=0; i<num_gpus; i++){ magma_setdevice(i); TESTING_MALLOC_DEV( dC_work[i], magmaFloatComplex, lwork ); //fillZero(dC_work[i], lwork); } magma_setdevice(0); ////////////////////////////////////////////////////////////////////////// fp = fopen ("results_chemv_mgpu.csv", "w") ; if( fp == NULL ){ printf("Couldn't open output file\n"); exit(1);} printf("CHEMV magmaFloatComplex precision\n\n"); printf( " n CUBLAS,Gflop/s MAGMABLAS,Gflop/s \"error\"\n" "==============================================================\n"); fprintf(fp, " n CUBLAS,Gflop/s MAGMABLAS,Gflop/s \"error\"\n" "==============================================================\n"); // for( offset = 0; offset< N; offset ++ ) for(int size = istart ; size <= N ; size += 128) { // printf("offset = %d ", offset); m = size ; // m = N; // lda = ((m+31)/32)*32;// lda = LDA; flops = FLOPS( (float)m ) / 1e6; printf( "N %5d ", (int) m ); fprintf( fp, "%5d, ", (int) m ); vecsize = m * incx; lapackf77_clarnv( &ione, ISEED, &vecsize, X ); lapackf77_clarnv( &ione, ISEED, &vecsize, Y[0] ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_setdevice(0); magma_csetmatrix_1D_col_bcyclic(m, m, A, LDA, d_lA, lda, num_gpus, nb); magma_setdevice(0); magma_csetmatrix( m, m, A, LDA, dA, lda ); magma_csetvector( m, Y[0], incx, dYcublas, incx ); for(i=0; i<num_gpus; i++){ magma_setdevice(i); magma_csetvector( m, X, incx, dX[i], incx ); magma_csetvector( m, Y[0], incx, dY[i], incx ); blocks = m / nb + (m % nb != 0); magma_csetmatrix( lda, blocks, C_work, LDA, dC_work[i], lda ); } magma_setdevice(0); start = get_current_time(); cublasChemv( uplo, m-offset, alpha, dA + offset + offset * lda, lda, dX[0] + offset, incx, beta, dYcublas + offset, incx ); end = get_current_time(); magma_cgetvector( m, dYcublas, incx, Ycublas, incx ); cuda_perf = flops / GetTimerValue(start,end); printf( "%11.2f", cuda_perf ); fprintf(fp, "%11.2f,", cuda_perf ); magma_setdevice(0); start = get_current_time(); if(nb == 32) { magmablas_chemv2_mgpu_32_offset( uplo, m, alpha, d_lA, lda, dX, incx, beta, dY, incx, dC_work, lwork, num_gpus, nb, offset); } else // nb = 64 { magmablas_chemv2_mgpu_offset( uplo, m, alpha, d_lA, lda, dX, incx, beta, dY, incx, dC_work, lwork, num_gpus, nb, offset); } for(i=1; i<num_gpus; i++) { magma_setdevice(i); cudaDeviceSynchronize(); } end = get_current_time(); magma_perf = flops / GetTimerValue(start,end); printf( "%11.2f", magma_perf ); fprintf(fp, "%11.2f,", magma_perf ); for(i=0; i<num_gpus; i++) { magma_setdevice(i); magma_cgetvector( m, dY[i], incx, Y[i], incx ); } magma_setdevice(0); #ifdef validate for( j= offset;j<m;j++) { for(i=1; i<num_gpus; i++) { // printf("Y[%d][%d] = %15.14f\n", i, j, Y[i][j].x); #if defined(PRECISION_z) || defined(PRECISION_c) Y[0][j].x = Y[0][j].x + Y[i][j].x; Y[0][j].y = Y[0][j].y + Y[i][j].y; #else Y[0][j] = Y[0][j] + Y[i][j]; #endif } } /* #if defined(PRECISION_z) || defined(PRECISION_c) for( j=offset;j<m;j++) { if(Y[0][j].x != Ycublas[j].x) { printf("Y-multi[%d] = %f, %f\n", j, Y[0][j].x, Y[0][j].y ); printf("Ycublas[%d] = %f, %f\n", j, Ycublas[j].x, Ycublas[j].y); } } #else for( j=offset;j<m;j++) { if(Y[0][j] != Ycublas[j]) { printf("Y-multi[%d] = %f\n", j, Y[0][j] ); printf("Ycublas[%d] = %f\n", j, Ycublas[j]); } } #endif */ /* ===================================================================== Computing the Difference Cublas VS Magma =================================================================== */ magma_int_t nw = m - offset ; blasf77_caxpy( &nw, &c_neg_one, Y[0] + offset, &incx, Ycublas + offset, &incx); error = lapackf77_clange( "M", &nw, &ione, Ycublas + offset, &nw, work ); #if 0 printf( "\t\t %8.6e", error / m ); fprintf( fp, "\t\t %8.6e", error / m ); /* * Extra check with cblas vs magma */ cblas_ccopy( m, Y, incx, Ycublas, incx ); cblas_chemv( CblasColMajor, CblasLower, m, CBLAS_SADDR(alpha), A, LDA, X, incx, CBLAS_SADDR(beta), Ycublas, incx ); blasf77_caxpy( &m, &c_neg_one, Ymagma, &incx, Ycublas, &incx); error = lapackf77_clange( "M", &m, &ione, Ycublas, &m, work ); #endif printf( "\t\t %8.6e", error / m ); fprintf( fp, "\t\t %8.6e", error / m ); #endif printf("\n"); fprintf(fp, "\n"); } fclose( fp ) ; /* Free Memory */ TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_CPU( C_work ); magma_setdevice(0); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dYcublas ); for(i=0; i<num_gpus; i++) { TESTING_FREE_CPU( Y[i] ); magma_setdevice(i); TESTING_FREE_DEV( d_lA[i] ); TESTING_FREE_DEV( dX[i] ); TESTING_FREE_DEV( dY[i] ); TESTING_FREE_DEV( dC_work[i] ); } magma_setdevice(0); /////////////////////////////////////////////////////////// /* Free device */ TESTING_FINALIZE(); return 0; }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { pArgc = &argc; pArgv = argv; printf("%s Starting...\n\n", argv[0]); printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n"); int deviceCount = 0; cudaError_t error_id = cudaGetDeviceCount(&deviceCount); if (error_id != cudaSuccess) { printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id)); printf("Result = FAIL\n"); exit(EXIT_FAILURE); } // This function call returns 0 if there are no CUDA capable devices. if (deviceCount == 0) { printf("There are no available device(s) that support CUDA\n"); } else { printf("Detected %d CUDA Capable device(s)\n", deviceCount); } int dev, driverVersion = 0, runtimeVersion = 0; for (dev = 0; dev < deviceCount; ++dev) { cudaSetDevice(dev); cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name); // Console log cudaDriverGetVersion(&driverVersion); cudaRuntimeGetVersion(&runtimeVersion); printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10); printf(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor); char msg[256]; SPRINTF(msg, " Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem); printf("%s", msg); printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n", deviceProp.multiProcessorCount, _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount); printf(" GPU Max Clock rate: %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f); #if CUDART_VERSION >= 5000 // This is supported in CUDA 5.0 (runtime API device properties) printf(" Memory Clock rate: %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f); printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth); if (deviceProp.l2CacheSize) { printf(" L2 Cache Size: %d bytes\n", deviceProp.l2CacheSize); } #else // This only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API) int memoryClock; getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev); printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f); int memBusWidth; getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev); printf(" Memory Bus Width: %d-bit\n", memBusWidth); int L2CacheSize; getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev); if (L2CacheSize) { printf(" L2 Cache Size: %d bytes\n", L2CacheSize); } #endif printf(" Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n", deviceProp.maxTexture1D , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1], deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]); printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n", deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]); printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d layers\n", deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]); printf(" Total amount of constant memory: %lu bytes\n", deviceProp.totalConstMem); printf(" Total amount of shared memory per block: %lu bytes\n", deviceProp.sharedMemPerBlock); printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock); printf(" Warp size: %d\n", deviceProp.warpSize); printf(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor); printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock); printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]); printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]); printf(" Maximum memory pitch: %lu bytes\n", deviceProp.memPitch); printf(" Texture alignment: %lu bytes\n", deviceProp.textureAlignment); printf(" Concurrent copy and kernel execution: %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount); printf(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No"); printf(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No"); printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No"); printf(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No"); printf(" Device has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled"); #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n", deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)"); #endif printf(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No"); printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID); const char *sComputeMode[] = { "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", "Prohibited (no host thread can use ::cudaSetDevice() with this device)", "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", "Unknown", NULL }; printf(" Compute Mode:\n"); printf(" < %s >\n", sComputeMode[deviceProp.computeMode]); } // If there are 2 or more GPUs, query to determine whether RDMA is supported if (deviceCount >= 2) { cudaDeviceProp prop[64]; int gpuid[64]; // we want to find the first two GPUs that can support P2P int gpu_p2p_count = 0; for (int i=0; i < deviceCount; i++) { checkCudaErrors(cudaGetDeviceProperties(&prop[i], i)); // Only boards based on Fermi or later can support P2P if ((prop[i].major >= 2) #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) // on Windows (64-bit), the Tesla Compute Cluster driver for windows must be enabled to support this && prop[i].tccDriver #endif ) { // This is an array of P2P capable GPUs gpuid[gpu_p2p_count++] = i; } } // Show all the combinations of support P2P GPUs int can_access_peer; if (gpu_p2p_count >= 2) { for (int i = 0; i < gpu_p2p_count; i++) { for (int j = 0; j < gpu_p2p_count; j++) { if (gpuid[i] == gpuid[j]) { continue; } checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j])); printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[i]].name, gpuid[i], prop[gpuid[j]].name, gpuid[j] , can_access_peer ? "Yes" : "No"); } } } } // csv masterlog info // ***************************** // exe and CUDA driver name printf("\n"); std::string sProfileString = "deviceQuery, CUDA Driver = CUDART"; char cTemp[16]; // driver version sProfileString += ", CUDA Driver Version = "; #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10); #else sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10); #endif sProfileString += cTemp; // Runtime version sProfileString += ", CUDA Runtime Version = "; #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10); #else sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10); #endif sProfileString += cTemp; // Device count sProfileString += ", NumDevs = "; #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) sprintf_s(cTemp, 10, "%d", deviceCount); #else sprintf(cTemp, "%d", deviceCount); #endif sProfileString += cTemp; // Print Out all device Names for (dev = 0; dev < deviceCount; ++dev) { #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) sprintf_s(cTemp, 13, ", Device%d = ", dev); #else sprintf(cTemp, ", Device%d = ", dev); #endif cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); sProfileString += cTemp; sProfileString += deviceProp.name; } sProfileString += "\n"; printf("%s", sProfileString.c_str()); printf("Result = PASS\n"); // finish exit(EXIT_SUCCESS); }
PetscErrorCode PetscOptionsCheckInitial_Private(void) { char string[64],mname[PETSC_MAX_PATH_LEN],*f; MPI_Comm comm = PETSC_COMM_WORLD; PetscBool flg1 = PETSC_FALSE,flg2 = PETSC_FALSE,flg3 = PETSC_FALSE,flag; PetscErrorCode ierr; PetscReal si; PetscInt intensity; int i; PetscMPIInt rank; char version[256]; #if !defined(PETSC_HAVE_THREADSAFETY) PetscReal logthreshold; #endif #if defined(PETSC_USE_LOG) PetscViewerFormat format; PetscBool flg4 = PETSC_FALSE; #endif PetscFunctionBegin; ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr); #if !defined(PETSC_HAVE_THREADSAFETY) /* Setup the memory management; support for tracing malloc() usage */ ierr = PetscOptionsHasName(NULL,"-malloc_log",&flg3);CHKERRQ(ierr); logthreshold = 0.0; ierr = PetscOptionsGetReal(NULL,"-malloc_log_threshold",&logthreshold,&flg1);CHKERRQ(ierr); if (flg1) flg3 = PETSC_TRUE; #if defined(PETSC_USE_DEBUG) ierr = PetscOptionsGetBool(NULL,"-malloc",&flg1,&flg2);CHKERRQ(ierr); if ((!flg2 || flg1) && !petscsetmallocvisited) { if (flg2 || !(PETSC_RUNNING_ON_VALGRIND)) { /* turn off default -malloc if valgrind is being used */ ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr); } } #else ierr = PetscOptionsGetBool(NULL,"-malloc_dump",&flg1,NULL);CHKERRQ(ierr); ierr = PetscOptionsGetBool(NULL,"-malloc",&flg2,NULL);CHKERRQ(ierr); if (flg1 || flg2 || flg3) {ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr);} #endif if (flg3) { ierr = PetscMallocSetDumpLogThreshold((PetscLogDouble)logthreshold);CHKERRQ(ierr); } flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-malloc_debug",&flg1,NULL);CHKERRQ(ierr); if (flg1) { ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr); ierr = PetscMallocDebug(PETSC_TRUE);CHKERRQ(ierr); } flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-malloc_test",&flg1,NULL);CHKERRQ(ierr); #if defined(PETSC_USE_DEBUG) if (flg1 && !PETSC_RUNNING_ON_VALGRIND) { ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr); ierr = PetscMallocSetDumpLog();CHKERRQ(ierr); ierr = PetscMallocDebug(PETSC_TRUE);CHKERRQ(ierr); } #endif flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-malloc_info",&flg1,NULL);CHKERRQ(ierr); if (!flg1) { flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-memory_view",&flg1,NULL);CHKERRQ(ierr); } if (flg1) { ierr = PetscMemorySetGetMaximumUsage();CHKERRQ(ierr); } #endif #if defined(PETSC_USE_LOG) ierr = PetscOptionsHasName(NULL,"-objects_dump",&PetscObjectsLog);CHKERRQ(ierr); #endif /* Set the display variable for graphics */ ierr = PetscSetDisplay();CHKERRQ(ierr); /* Print the PETSc version information */ ierr = PetscOptionsHasName(NULL,"-v",&flg1);CHKERRQ(ierr); ierr = PetscOptionsHasName(NULL,"-version",&flg2);CHKERRQ(ierr); ierr = PetscOptionsHasName(NULL,"-help",&flg3);CHKERRQ(ierr); if (flg1 || flg2 || flg3) { /* Print "higher-level" package version message */ if (PetscExternalVersionFunction) { ierr = (*PetscExternalVersionFunction)(comm);CHKERRQ(ierr); } ierr = PetscGetVersion(version,256);CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"--------------------------------------------\ ------------------------------\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"%s\n",version);CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"%s",PETSC_AUTHOR_INFO);CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"See docs/changes/index.html for recent updates.\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"See docs/faq.html for problems.\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"See docs/manualpages/index.html for help. \n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"Libraries linked from %s\n",PETSC_LIB_DIR);CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"--------------------------------------------\ ------------------------------\n");CHKERRQ(ierr); } /* Print "higher-level" package help message */ if (flg3) { if (PetscExternalHelpFunction) { ierr = (*PetscExternalHelpFunction)(comm);CHKERRQ(ierr); } } /* Setup the error handling */ flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-on_error_abort",&flg1,NULL);CHKERRQ(ierr); if (flg1) { ierr = MPI_Comm_set_errhandler(PETSC_COMM_WORLD,MPI_ERRORS_ARE_FATAL);CHKERRQ(ierr); ierr = PetscPushErrorHandler(PetscAbortErrorHandler,0);CHKERRQ(ierr); } flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-on_error_mpiabort",&flg1,NULL);CHKERRQ(ierr); if (flg1) { ierr = PetscPushErrorHandler(PetscMPIAbortErrorHandler,0);CHKERRQ(ierr);} flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-mpi_return_on_error",&flg1,NULL);CHKERRQ(ierr); if (flg1) { ierr = MPI_Comm_set_errhandler(comm,MPI_ERRORS_RETURN);CHKERRQ(ierr); } flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-no_signal_handler",&flg1,NULL);CHKERRQ(ierr); if (!flg1) {ierr = PetscPushSignalHandler(PetscSignalHandlerDefault,(void*)0);CHKERRQ(ierr);} flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-fp_trap",&flg1,NULL);CHKERRQ(ierr); if (flg1) {ierr = PetscSetFPTrap(PETSC_FP_TRAP_ON);CHKERRQ(ierr);} ierr = PetscOptionsGetInt(NULL,"-check_pointer_intensity",&intensity,&flag);CHKERRQ(ierr); if (flag) {ierr = PetscCheckPointerSetIntensity(intensity);CHKERRQ(ierr);} /* Setup debugger information */ ierr = PetscSetDefaultDebugger();CHKERRQ(ierr); ierr = PetscOptionsGetString(NULL,"-on_error_attach_debugger",string,64,&flg1);CHKERRQ(ierr); if (flg1) { MPI_Errhandler err_handler; ierr = PetscSetDebuggerFromString(string);CHKERRQ(ierr); ierr = MPI_Comm_create_errhandler((MPI_Handler_function*)Petsc_MPI_DebuggerOnError,&err_handler);CHKERRQ(ierr); ierr = MPI_Comm_set_errhandler(comm,err_handler);CHKERRQ(ierr); ierr = PetscPushErrorHandler(PetscAttachDebuggerErrorHandler,0);CHKERRQ(ierr); } ierr = PetscOptionsGetString(NULL,"-debug_terminal",string,64,&flg1);CHKERRQ(ierr); if (flg1) { ierr = PetscSetDebugTerminal(string);CHKERRQ(ierr); } ierr = PetscOptionsGetString(NULL,"-start_in_debugger",string,64,&flg1);CHKERRQ(ierr); ierr = PetscOptionsGetString(NULL,"-stop_for_debugger",string,64,&flg2);CHKERRQ(ierr); if (flg1 || flg2) { PetscMPIInt size; PetscInt lsize,*nodes; MPI_Errhandler err_handler; /* we have to make sure that all processors have opened connections to all other processors, otherwise once the debugger has stated it is likely to receive a SIGUSR1 and kill the program. */ ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr); if (size > 2) { PetscMPIInt dummy = 0; MPI_Status status; for (i=0; i<size; i++) { if (rank != i) { ierr = MPI_Send(&dummy,1,MPI_INT,i,109,PETSC_COMM_WORLD);CHKERRQ(ierr); } } for (i=0; i<size; i++) { if (rank != i) { ierr = MPI_Recv(&dummy,1,MPI_INT,i,109,PETSC_COMM_WORLD,&status);CHKERRQ(ierr); } } } /* check if this processor node should be in debugger */ ierr = PetscMalloc1(size,&nodes);CHKERRQ(ierr); lsize = size; ierr = PetscOptionsGetIntArray(NULL,"-debugger_nodes",nodes,&lsize,&flag);CHKERRQ(ierr); if (flag) { for (i=0; i<lsize; i++) { if (nodes[i] == rank) { flag = PETSC_FALSE; break; } } } if (!flag) { ierr = PetscSetDebuggerFromString(string);CHKERRQ(ierr); ierr = PetscPushErrorHandler(PetscAbortErrorHandler,0);CHKERRQ(ierr); if (flg1) { ierr = PetscAttachDebugger();CHKERRQ(ierr); } else { ierr = PetscStopForDebugger();CHKERRQ(ierr); } ierr = MPI_Comm_create_errhandler((MPI_Handler_function*)Petsc_MPI_AbortOnError,&err_handler);CHKERRQ(ierr); ierr = MPI_Comm_set_errhandler(comm,err_handler);CHKERRQ(ierr); } ierr = PetscFree(nodes);CHKERRQ(ierr); } ierr = PetscOptionsGetString(NULL,"-on_error_emacs",emacsmachinename,128,&flg1);CHKERRQ(ierr); if (flg1 && !rank) {ierr = PetscPushErrorHandler(PetscEmacsClientErrorHandler,emacsmachinename);CHKERRQ(ierr);} /* Setup profiling and logging */ #if defined(PETSC_USE_INFO) { char logname[PETSC_MAX_PATH_LEN]; logname[0] = 0; ierr = PetscOptionsGetString(NULL,"-info",logname,250,&flg1);CHKERRQ(ierr); if (flg1 && logname[0]) { ierr = PetscInfoAllow(PETSC_TRUE,logname);CHKERRQ(ierr); } else if (flg1) { ierr = PetscInfoAllow(PETSC_TRUE,NULL);CHKERRQ(ierr); } } #endif #if defined(PETSC_USE_LOG) mname[0] = 0; ierr = PetscOptionsGetString(NULL,"-history",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr); if (flg1) { if (mname[0]) { ierr = PetscOpenHistoryFile(mname,&petsc_history);CHKERRQ(ierr); } else { ierr = PetscOpenHistoryFile(NULL,&petsc_history);CHKERRQ(ierr); } } #if defined(PETSC_HAVE_MPE) flg1 = PETSC_FALSE; ierr = PetscOptionsHasName(NULL,"-log_mpe",&flg1);CHKERRQ(ierr); if (flg1) {ierr = PetscLogMPEBegin();CHKERRQ(ierr);} #endif flg1 = PETSC_FALSE; flg3 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-log_all",&flg1,NULL);CHKERRQ(ierr); ierr = PetscOptionsHasName(NULL,"-log_summary",&flg3);CHKERRQ(ierr); if (flg1) { ierr = PetscLogAllBegin();CHKERRQ(ierr); } else if (flg3) { ierr = PetscLogDefaultBegin();CHKERRQ(ierr);} ierr = PetscOptionsGetString(NULL,"-log_trace",mname,250,&flg1);CHKERRQ(ierr); if (flg1) { char name[PETSC_MAX_PATH_LEN],fname[PETSC_MAX_PATH_LEN]; FILE *file; if (mname[0]) { sprintf(name,"%s.%d",mname,rank); ierr = PetscFixFilename(name,fname);CHKERRQ(ierr); file = fopen(fname,"w"); if (!file) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_FILE_OPEN,"Unable to open trace file: %s",fname); } else file = PETSC_STDOUT; ierr = PetscLogTraceBegin(file);CHKERRQ(ierr); } ierr = PetscOptionsGetViewer(PETSC_COMM_WORLD,NULL,"-log_view",NULL,&format,&flg4);CHKERRQ(ierr); if (flg4) { if (format == PETSC_VIEWER_ASCII_XML){ ierr = PetscLogNestedBegin();CHKERRQ(ierr); } else { ierr = PetscLogDefaultBegin();CHKERRQ(ierr); } } #endif ierr = PetscOptionsGetBool(NULL,"-saws_options",&PetscOptionsPublish,NULL);CHKERRQ(ierr); #if defined(PETSC_HAVE_CUDA) ierr = PetscOptionsHasName(NULL,"-cuda_show_devices",&flg1);CHKERRQ(ierr); if (flg1) { struct cudaDeviceProp prop; int devCount; int device; cudaError_t err = cudaSuccess; err = cudaGetDeviceCount(&devCount); if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaGetDeviceCount %s",cudaGetErrorString(err)); for (device = 0; device < devCount; ++device) { err = cudaGetDeviceProperties(&prop, device); if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaGetDeviceProperties %s",cudaGetErrorString(err)); ierr = PetscPrintf(PETSC_COMM_WORLD, "CUDA device %d: %s\n", device, prop.name);CHKERRQ(ierr); } } { int size; ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr); if (size>1) { int devCount, device, rank; cudaError_t err = cudaSuccess; /* check to see if we force multiple ranks to hit the same GPU */ ierr = PetscOptionsGetInt(NULL,"-cuda_set_device", &device, &flg1);CHKERRQ(ierr); if (flg1) { err = cudaSetDevice(device); if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDevice %s",cudaGetErrorString(err)); } else { /* we're not using the same GPU on multiple MPI threads. So try to allocated different GPUs to different processes */ /* First get the device count */ err = cudaGetDeviceCount(&devCount); if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaGetDeviceCount %s",cudaGetErrorString(err)); /* next determine the rank and then set the device via a mod */ ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr); device = rank % devCount; err = cudaSetDevice(device); if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDevice %s",cudaGetErrorString(err)); } /* set the device flags so that it can map host memory ... do NOT throw exception on err!=cudaSuccess multiple devices may try to set the flags on the same device. So long as one of them succeeds, things are ok. */ err = cudaSetDeviceFlags(cudaDeviceMapHost); if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDeviceFlags %s",cudaGetErrorString(err)); } else { int device; cudaError_t err = cudaSuccess; /* the code below works for serial GPU simulations */ ierr = PetscOptionsGetInt(NULL,"-cuda_set_device", &device, &flg1);CHKERRQ(ierr); if (flg1) { err = cudaSetDevice(device); if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDevice %s",cudaGetErrorString(err)); } /* set the device flags so that it can map host memory ... here, we error check. */ err = cudaSetDeviceFlags(cudaDeviceMapHost); if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDeviceFlags %s",cudaGetErrorString(err)); } } #endif /* Print basic help message */ ierr = PetscOptionsHasName(NULL,"-help",&flg1);CHKERRQ(ierr); if (flg1) { ierr = (*PetscHelpPrintf)(comm,"Options for all PETSc programs:\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -help: prints help method for each option\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -on_error_abort: cause an abort when an error is detected. Useful \n ");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," only when run in the debugger\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -on_error_attach_debugger [gdb,dbx,xxgdb,ups,noxterm]\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," start the debugger in new xterm\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," unless noxterm is given\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -start_in_debugger [gdb,dbx,xxgdb,ups,noxterm]\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," start all processes in the debugger\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -on_error_emacs <machinename>\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," emacs jumps to error file\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -debugger_nodes [n1,n2,..] Nodes to start in debugger\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -debugger_pause [m] : delay (in seconds) to attach debugger\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -stop_for_debugger : prints message on how to attach debugger manually\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," waits the delay for you to attach\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -display display: Location where X window graphics and debuggers are displayed\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -no_signal_handler: do not trap error signals\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -mpi_return_on_error: MPI returns error code, rather than abort on internal error\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -fp_trap: stop on floating point exceptions\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," note on IBM RS6000 this slows run greatly\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -malloc_dump <optional filename>: dump list of unfreed memory at conclusion\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -malloc: use our error checking malloc\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -malloc no: don't use error checking malloc\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -malloc_info: prints total memory usage\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -malloc_log: keeps log of all memory allocations\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -malloc_debug: enables extended checking for memory corruption\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -options_table: dump list of options inputted\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -options_left: dump list of unused options\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -options_left no: don't dump list of unused options\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -tmp tmpdir: alternative /tmp directory\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -shared_tmp: tmp directory is shared by all processors\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -not_shared_tmp: each processor has separate tmp directory\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -memory_view: print memory usage at end of run\n");CHKERRQ(ierr); #if defined(PETSC_USE_LOG) ierr = (*PetscHelpPrintf)(comm," -get_total_flops: total flops over all processors\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -log[_summary _summary_python]: logging objects and events\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -log_trace [filename]: prints trace of all PETSc calls\n");CHKERRQ(ierr); #if defined(PETSC_HAVE_MPE) ierr = (*PetscHelpPrintf)(comm," -log_mpe: Also create logfile viewable through Jumpshot\n");CHKERRQ(ierr); #endif ierr = (*PetscHelpPrintf)(comm," -info <optional filename>: print informative messages about the calculations\n");CHKERRQ(ierr); #endif ierr = (*PetscHelpPrintf)(comm," -v: prints PETSc version number and release date\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -options_file <file>: reads options from file\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -petsc_sleep n: sleeps n seconds before running program\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"-----------------------------------------------\n");CHKERRQ(ierr); } #if defined(PETSC_HAVE_POPEN) { char machine[128]; ierr = PetscOptionsGetString(NULL,"-popen_machine",machine,128,&flg1);CHKERRQ(ierr); if (flg1) { ierr = PetscPOpenSetMachine(machine);CHKERRQ(ierr); } } #endif ierr = PetscOptionsGetReal(NULL,"-petsc_sleep",&si,&flg1);CHKERRQ(ierr); if (flg1) { ierr = PetscSleep(si);CHKERRQ(ierr); } ierr = PetscOptionsGetString(NULL,"-info_exclude",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr); if (flg1) { ierr = PetscStrstr(mname,"null",&f);CHKERRQ(ierr); if (f) { ierr = PetscInfoDeactivateClass(0);CHKERRQ(ierr); } } #if defined(PETSC_HAVE_CUSP) || defined(PETSC_HAVE_VIENNACL) ierr = PetscOptionsHasName(NULL,"-log_summary",&flg3);CHKERRQ(ierr); if (!flg3) { ierr = PetscOptionsHasName(NULL,"-log_view",&flg3);CHKERRQ(ierr); } #endif #if defined(PETSC_HAVE_CUSP) ierr = PetscOptionsGetBool(NULL,"-cusp_synchronize",&flg3,NULL);CHKERRQ(ierr); PetscCUSPSynchronize = flg3; #elif defined(PETSC_HAVE_VIENNACL) ierr = PetscOptionsGetBool(NULL,"-viennacl_synchronize",&flg3,NULL);CHKERRQ(ierr); PetscViennaCLSynchronize = flg3; #endif PetscFunctionReturn(0); }
int main(int argc, char** argv) { bool srcbin = 0; bool invbk = 0; if(argc < 3){ printf("Not enough args!\narg1: target image\narg2: source image\narg3: do source image adaptive threshold or not\narg4: invert back ground or not\n"); getchar(); return 1; } if(argc >= 4){ if(!strcmp(argv[3], "1")) srcbin = 1; } if(argc >= 5){ if(!strcmp(argv[4], "1")) invbk = 1; } IplImage* srcimg= 0, *srcimgb= 0, *srcimgb2 = 0, *bimg = 0, *b2img = 0,*bugimg = 0, *alg2dst = 0; srcimg= cvLoadImage(argv[2], 1); if (!srcimg) { printf("src img %s load failed!\n", argv[2]); getchar(); return 1; } //choosing the parameters for our ccl int bn = 8; //how many partitions int nwidth = 512; if(srcimg->width > 512){ nwidth = 1024; bn = 6; } if(srcimg->width > 1024){ nwidth = 2048; bn = 3; } if(srcimg->width > 2048){ printf("warning, image too wide, max support 2048. image is truncated.\n"); getchar(); return 1; } //start selection gpu devices int devCount; int smCnt = 0; cudaGetDeviceCount(&devCount); // Iterate through devices int devChosen = 0; for (int i = 0; i < devCount; ++i) { cudaDeviceProp devProp; cudaGetDeviceProperties(&devProp, i); if(devProp.major >= 2){//only one device supported smCnt = max(smCnt, devProp.multiProcessorCount); if(devProp.multiProcessorCount == smCnt) devChosen = i; } } if(smCnt == 0){ //our ccl require CUDA cap 2.0 or above, but the Ostava's ccl can be run on any CUDA gpu printf("Error, no device with cap 2.x found. Only cpu alg will be run.\n"); getchar(); return 1; } if(smCnt != 0){ cudaSetDevice(devChosen); bn = bn * smCnt; } int nheight = (cvGetSize(srcimg).height-2) / (2*bn); if((nheight*2*bn+2) < cvGetSize(srcimg).height) nheight++; nheight = nheight*2*bn+2; if(smCnt != 0) printf("gpu ccl for image width 512, 1024, 2048.\nchoosing device %d, width %d, height %d, blocks %d\n", devChosen, nwidth, nheight, bn); srcimgb= cvCreateImage(cvSize(nwidth, cvGetSize(srcimg).height),IPL_DEPTH_8U,1); srcimgb2= cvCreateImage(cvSize(nwidth, cvGetSize(srcimg).height),IPL_DEPTH_8U,1); cvSetImageROI(srcimg, cvRect(0, 0, min(cvGetSize(srcimg).width, nwidth), cvGetSize(srcimg).height)); cvSetImageROI(srcimgb2, cvRect(0, 0, min(cvGetSize(srcimg).width, nwidth), cvGetSize(srcimg).height)); cvSet(srcimgb2, cvScalar(0,0,0)); cvCvtColor(srcimg, srcimgb2, CV_BGRA2GRAY); cvResetImageROI(srcimgb2); cvReleaseImage(&srcimg); if(srcbin) cvAdaptiveThreshold(srcimgb2, srcimgb, 1.0, CV_ADAPTIVE_THRESH_MEAN_C, invbk ? CV_THRESH_BINARY_INV : CV_THRESH_BINARY); else cvThreshold(srcimgb2, srcimgb, 0.0, 1.0, invbk ? CV_THRESH_BINARY_INV : CV_THRESH_BINARY); boundCheck(srcimgb); cvScale(srcimgb, srcimgb2, 255); //the source binary image to be labeled is saved as bsrc.bmp cvSaveImage("bsrc.bmp", srcimgb2); cvSet(srcimgb2, cvScalar(0,0,0)); float elapsedMilliSeconds1; {//begin cpu labeling algorithm, the SBLA proposed by Zhao LABELDATATYPE *data=(LABELDATATYPE *)malloc(srcimgb->width * srcimgb->height * sizeof(LABELDATATYPE)); for(int j = 0; j<srcimgb->height; j++) for(int i = 0; i<srcimgb->width; i++) data[i + j*srcimgb->width] = (srcimgb->imageData[i + j*srcimgb->widthStep]) ? 1 : 0; int iNumLabels; CPerformanceCounter perf; perf.Start(); iNumLabels = LabelSBLA(data, srcimgb->width, srcimgb->height); elapsedMilliSeconds1 = (float)perf.GetElapsedMilliSeconds(); printf("cpu SBLA used %f ms, total labels %u\n", elapsedMilliSeconds1, iNumLabels); free(data); } IplImage *src2(0),*dst2(0); int iNumLabels; float elapsedMilliSeconds2; {//begin cpu labeling algorithm, the BBDT proposed by C. Grana, D. Borghesani, R. Cucchiara CPerformanceCounter perf; src2 = cvCreateImage( cvGetSize(srcimgb), IPL_DEPTH_8U, 1 ); cvCopyImage(srcimgb,src2); dst2 = cvCreateImage( cvGetSize(srcimgb), IPL_DEPTH_32S, 1 ); perf.Start(); cvLabelingImageLab(src2, dst2, 1, &iNumLabels); elapsedMilliSeconds2 = (float)perf.GetElapsedMilliSeconds(); printf("cpu BBDT used %f ms, total labels %u\n", elapsedMilliSeconds2, iNumLabels); cvSaveImage("bbdt.bmp", dst2); // cvReleaseImage(&src2); // cvReleaseImage(&dst2); } if(smCnt != 0){ bugimg = cvCreateImage(cvSize(nwidth, 9*bn),IPL_DEPTH_8U,1); bimg = cvCreateImage(cvSize(nwidth, 2*bn),IPL_DEPTH_8U,1); b2img = cvCreateImage(cvSize(nwidth, 2*bn),IPL_DEPTH_8U,1); // cvNamedWindow("src",CV_WINDOW_AUTOSIZE); // cvShowImage("src",srcimg); //prepare buffers for our gpu algorithm CudaBuffer srcBuf, dstBuf, dstBuf2, bBuf, b2Buf, errBuf, glabel; srcBuf.Create2D(nwidth, nheight); //the binary image to be processed dstBuf.Create2D(nwidth, (nheight-2)/2); //the label result, only about 1/4 the size of source image contains the final labels dstBuf2.Create2D(nwidth,(nheight-2)/2); //a copy of the pass1 temp result, for debug purpose glabel.Create2D(4, 1); //a int size global buffer for unique final label errBuf.Create2D(nwidth, 9*bn); //a buffer for debug info bBuf.Create2D(nwidth, 2 * bn); //the intersection info used by pass2 b2Buf.Create2D(nwidth, 2 * bn); //a copy of bBuf for debug purpose srcBuf.SetZeroData(); srcBuf.CopyFrom(srcimgb->imageData, srcimgb->widthStep, nwidth, cvGetSize(srcimgb).height); float elapsedTimeInMs = 0.0f; //-------------------gpu part---------------------------- cudaEvent_t start, stop; cutilSafeCall ( cudaEventCreate( &start ) ); cutilSafeCall ( cudaEventCreate( &stop ) ); cutilSafeCall( cudaEventRecord( start, 0 ) ); if(nwidth == 512) label_512(&dstBuf, &dstBuf2, &srcBuf, &bBuf, &b2Buf, &glabel, nheight, bn, &errBuf); else if(nwidth == 1024) label_1024(&dstBuf, &dstBuf2, &srcBuf, &bBuf, &b2Buf, &glabel, nheight, bn, &errBuf); else if(nwidth == 2048) label_2048(&dstBuf, &dstBuf2, &srcBuf, &bBuf, &b2Buf, &glabel, nheight, bn, &errBuf); cutilSafeCall( cudaEventRecord( stop, 0 ) ); // cutilCheckMsg("kernel launch failure"); cudaEventSynchronize(stop); cutilSafeCall( cudaEventElapsedTime( &elapsedTimeInMs, start, stop ) ); uint tlabel = 0; cudaMemcpy(&tlabel, glabel.GetData(), 4, cudaMemcpyDeviceToHost); printf("gpu alg 1 used %f ms, total labels %u\n", elapsedTimeInMs, tlabel); dstBuf.CopyToHost(srcimgb->imageData, srcimgb->widthStep, nwidth, (nheight-2)/2); dstBuf2.CopyToHost(srcimgb2->imageData, srcimgb->widthStep, nwidth, (nheight-2)/2); errBuf.CopyToHost(bugimg->imageData, bugimg->widthStep, nwidth, 9*bn); bBuf.CopyToHost(bimg->imageData, bimg->widthStep, nwidth, 2*bn); b2Buf.CopyToHost(b2img->imageData, bimg->widthStep, nwidth, 2*bn); // cvNamedWindow("gpu",CV_WINDOW_AUTOSIZE); // cvShowImage("gpu",srcimgb); cvSaveImage(argv[1], srcimgb); cvSaveImage("gpu2.bmp", srcimgb2); //the final labels of our algorithm cvSaveImage("bug.bmp", bugimg); cvSaveImage("b.bmp", bimg); cvSaveImage("b2.bmp", b2img); //now start the gpu ccl implemented by Ostava alg2dst= cvCreateImage(cvSize(nwidth*4, cvGetSize(srcimgb).height),IPL_DEPTH_8U,1); CCLBase* m_ccl; m_ccl = new CCL(); m_ccl->FindRegions(nwidth, cvGetSize(srcimgb).height, &srcBuf); m_ccl->GetConnectedRegionsBuffer()->CopyToHost(alg2dst->imageData, alg2dst->widthStep, nwidth*4, cvGetSize(srcimgb).height); delete m_ccl; cvSaveImage("alg2.bmp", alg2dst); cvReleaseImage(&bugimg); cvReleaseImage(&bimg); cvReleaseImage(&b2img); cvReleaseImage(&alg2dst); // } //cvWaitKey(0); //now start cross compare label results of our ccl and the BBDT, to check the correctness // if(smCnt != 0){ ushort *gpures, *cpures; uint sz = nwidth * (cvGetSize(srcimgb).height/2); gpures = (ushort*)malloc(sz); cpures = (ushort*)malloc(sz); dstBuf.CopyToHost(gpures, nwidth, nwidth, (cvGetSize(srcimgb).height/2)); //first, reduce cpu labels from one label for each pixel to one label for a 2x2 block, assuming 8-connectivity for(int j = 0; j < (cvGetSize(srcimgb).height/2); j++) for(int i = 0; i < (nwidth/2); i++){ uint* cpup; ushort res = LBMAX; uint y = j*2, x = i*2; cpup = (uint*)(dst2->imageData + y*dst2->widthStep); // if(y < cvGetSize(srcimgb).height){ if(cpup[x] != 0) res = cpup[x]-1; if(cpup[x+1] != 0) res = cpup[x+1]-1; // } y++; cpup = (uint*)(dst2->imageData + y*dst2->widthStep); // if(y < cvGetSize(srcimgb).height){ if(cpup[x] != 0) res = cpup[x]-1; if(cpup[x+1] != 0) res = cpup[x+1]-1; // } cpures[i + j*(nwidth/2)] = res; } //our algo use unsigned short to represent a label, the first label starts a 0, and maximun labels is LBMAX if(iNumLabels > LBMAX) printf("too much cc, compare abort.\n"); else{ //create a error //cpures[5] = 12; //cpures[15] = 18; printf("Checking correctness of gpu alg1\nChecking gpu ref by cpu.\n"); checkLabels(cpures, gpures, nwidth/2, cvGetSize(srcimgb).height/2, iNumLabels); printf("Checking cpu ref by gpu.\n"); checkLabels(gpures, cpures, nwidth/2, cvGetSize(srcimgb).height/2, tlabel); } free(gpures); free(cpures); printf("speedup is %f, %f, %f\n", gpu2time/elapsedTimeInMs, elapsedMilliSeconds1/elapsedTimeInMs, elapsedMilliSeconds2/elapsedTimeInMs); } cvReleaseImage(&srcimgb); cvReleaseImage(&srcimgb2); cvReleaseImage(&dst2); cvReleaseImage(&src2); cutilSafeCall( cudaThreadExit() ); return 0; }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { pArgc = &argc; pArgv = argv; /* shrQAStart(argc, argv); shrSetLogFileName ("deviceQuery.txt"); */ shrLog("%s Starting...\n\n", argv[0]); shrLog(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n"); int deviceCount = 0; cudaError_t error_id = cudaGetDeviceCount(&deviceCount); if (error_id != cudaSuccess) { shrLog( "cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id) ); return -1; } // This function call returns 0 if there are no CUDA capable devices. if (deviceCount == 0) shrLog("There is no device supporting CUDA\n"); else shrLog("Found %d CUDA Capable device(s)\n", deviceCount); int dev, driverVersion = 0, runtimeVersion = 0; for (dev = 0; dev < deviceCount; ++dev) { cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); shrLog("\nDevice %d: \"%s\"\n", dev, deviceProp.name); #if CUDART_VERSION >= 2020 // Console log cudaDriverGetVersion(&driverVersion); cudaRuntimeGetVersion(&runtimeVersion); shrLog(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10); #endif shrLog(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor); char msg[256]; sprintf(msg, " Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem); shrLog(msg); #if CUDART_VERSION >= 2000 shrLog(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", deviceProp.multiProcessorCount, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount); #endif shrLog(" GPU Clock Speed: %.2f GHz\n", deviceProp.clockRate * 1e-6f); #if CUDART_VERSION >= 4000 // This is not available in the CUDA Runtime API, so we make the necessary calls the driver API to support this for output int memoryClock; getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev ); shrLog(" Memory Clock rate: %.2f Mhz\n", memoryClock * 1e-3f); int memBusWidth; getCudaAttribute<int>( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev ); shrLog(" Memory Bus Width: %d-bit\n", memBusWidth); int L2CacheSize; getCudaAttribute<int>( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev ); if (L2CacheSize) { shrLog(" L2 Cache Size: %d bytes\n", L2CacheSize); } shrLog(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", deviceProp.maxTexture1D, deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1], deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]); shrLog(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1], deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]); #endif shrLog(" Total amount of constant memory: %u bytes\n", deviceProp.totalConstMem); shrLog(" Total amount of shared memory per block: %u bytes\n", deviceProp.sharedMemPerBlock); shrLog(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock); shrLog(" Warp size: %d\n", deviceProp.warpSize); shrLog(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock); shrLog(" Maximum sizes of each dimension of a block: %d x %d x %d\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]); shrLog(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]); shrLog(" Maximum memory pitch: %u bytes\n", deviceProp.memPitch); shrLog(" Texture alignment: %u bytes\n", deviceProp.textureAlignment); #if CUDART_VERSION >= 4000 shrLog(" Concurrent copy and execution: %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount); #else shrLog(" Concurrent copy and execution: %s\n", deviceProp.deviceOverlap ? "Yes" : "No"); #endif #if CUDART_VERSION >= 2020 shrLog(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No"); shrLog(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No"); shrLog(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No"); #endif #if CUDART_VERSION >= 3000 shrLog(" Concurrent kernel execution: %s\n", deviceProp.concurrentKernels ? "Yes" : "No"); shrLog(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No"); #endif #if CUDART_VERSION >= 3010 shrLog(" Device has ECC support enabled: %s\n", deviceProp.ECCEnabled ? "Yes" : "No"); #endif #if CUDART_VERSION >= 3020 shrLog(" Device is using TCC driver mode: %s\n", deviceProp.tccDriver ? "Yes" : "No"); #endif #if CUDART_VERSION >= 4000 shrLog(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No"); shrLog(" Device PCI Bus ID / PCI location ID: %d / %d\n", deviceProp.pciBusID, deviceProp.pciDeviceID ); #endif #if CUDART_VERSION >= 2020 const char *sComputeMode[] = { "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", "Prohibited (no host thread can use ::cudaSetDevice() with this device)", "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", "Unknown", NULL }; shrLog(" Compute Mode:\n"); shrLog(" < %s >\n", sComputeMode[deviceProp.computeMode]); #endif } // csv masterlog info // ***************************** // exe and CUDA driver name shrLog("\n"); std::string sProfileString = "deviceQuery, CUDA Driver = CUDART"; char cTemp[10]; // driver version sProfileString += ", CUDA Driver Version = "; #ifdef WIN32 sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10); #else sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10); #endif sProfileString += cTemp; // Runtime version sProfileString += ", CUDA Runtime Version = "; #ifdef WIN32 sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10); #else sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10); #endif sProfileString += cTemp; // Device count sProfileString += ", NumDevs = "; #ifdef WIN32 sprintf_s(cTemp, 10, "%d", deviceCount); #else sprintf(cTemp, "%d", deviceCount); #endif sProfileString += cTemp; // First 2 device names, if any for (dev = 0; dev < ((deviceCount > 2) ? 2 : deviceCount); ++dev) { cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); sProfileString += ", Device = "; sProfileString += deviceProp.name; } sProfileString += "\n"; //shrLogEx(LOGBOTH | MASTER, 0, sProfileString.c_str()); std::cout << sProfileString.c_str() << std::endl; std::cout << "Press <ENTER>" << std::endl; // getchar(); runtimeTest(); getchar(); // finish return 0; }
int getDeviceProps (int *deviceCount, char **deviceProps) { // Cuda Runtime interface void *cudaRT = NULL; cudaGetDeviceCount_f cudaGetDeviceCount = NULL; cudaGetDeviceProperties_f cudaGetDeviceProperties = NULL; cudaError_t cuErr; int ndevices; // Number of devices reported by Cuda runtime int undevices = 0; // Number of devices that are unusable by simEngine unsigned int deviceid; unsigned int sort; simCudaDevice *devices; cudaRT = dlopen(CUDART_LIBRARY_NAME, RTLD_NOW); if(!cudaRT) { char full_library_name[PATH_MAX]; sprintf(full_library_name, "/usr/local/cuda/lib64/%s", CUDART_LIBRARY_NAME); cudaRT = dlopen(full_library_name, RTLD_NOW); if(!cudaRT) { sprintf(full_library_name, "/usr/local/cuda/lib/%s", CUDART_LIBRARY_NAME); cudaRT = dlopen(full_library_name, RTLD_NOW); if(!cudaRT) { snprintf(error_message, BUFFER_LENGTH, "Failed to load CUDA runtime environment from %s.\n" "\tIs the CUDA runtime environment installed in the default location\n" "\tOR is LD_LIBRARY_PATH environment variable set to include CUDA libraries?", CUDART_LIBRARY_NAME); error_message[BUFFER_LENGTH - 1] = '\0'; return DeviceProps_NoCudaRuntime; } } } cudaGetDeviceCount = (cudaGetDeviceCount_f)dlsym(cudaRT, "cudaGetDeviceCount"); cudaGetDeviceProperties = (cudaGetDeviceProperties_f)dlsym(cudaRT, "cudaGetDeviceProperties"); if(!cudaGetDeviceCount || !cudaGetDeviceProperties) { snprintf(error_message, BUFFER_LENGTH, "Failed to load CUDA functions from %s.\n" "\tThe CUDA library found is incompatible with simEngine.", CUDART_LIBRARY_NAME); error_message[BUFFER_LENGTH - 1] = '\0'; return DeviceProps_NoCudaRuntime; } if (cudaSuccess != cudaGetDeviceCount(&ndevices)) { snprintf(error_message, BUFFER_LENGTH, "Error obtaining device count.\n" "\tIs there a CUDA capable GPU available on this computer?"); error_message[BUFFER_LENGTH - 1] = '\0'; return DeviceProps_UnknownError; } if (0 == ndevices) { snprintf(error_message, BUFFER_LENGTH, "No suitable devices found.\n" "\tIs your CUDA driver installed, and have you rebooted since installation?"); error_message[BUFFER_LENGTH - 1] = '\0'; return DeviceProps_NoDevices; } devices = (simCudaDevice *)malloc(sizeof(simCudaDevice) * ndevices); // Retrieve the properties for all Cuda devices for (deviceid = 0; deviceid < ndevices; ++deviceid) { if (cudaSuccess != cudaGetDeviceProperties(&devices[deviceid-undevices].props, deviceid)) { snprintf(error_message, BUFFER_LENGTH, "Error obtaining properties for device %d.\n" "\tThe CUDA library found is incompatible with simEngine.", deviceid); error_message[BUFFER_LENGTH - 1] = '\0'; free(devices); return DeviceProps_UnknownError; } // Filter out emulation devices if(9999 == devices[deviceid-undevices].props.major) { undevices += 1; } // Track GFLOPs of real devices else { devices[deviceid-undevices].gflops = devices[deviceid-undevices].props.multiProcessorCount * devices[deviceid-undevices].props.clockRate; devices[deviceid-undevices].unsorted = 1; } } // Subtract emulation devices from device count *deviceCount = ndevices - undevices; if (0 == *deviceCount) { snprintf(error_message, BUFFER_LENGTH, "Only emulation device found.\n" "\tDo you have a CUDA device?\n" "\tIs the CUDA driver installed?\n" "\tHave you rebooted after installing the driver?\n" "\tDo you have device permissions set to allow CUDA computation?"); error_message[BUFFER_LENGTH - 1] = '\0'; free(devices); return DeviceProps_EmulationOnly; } // Sort the useable devices by max GFLOPs char *write = props_buffer; for(sort = 0; sort<(ndevices - undevices) && sort<MAX_DEVICES; ++sort) { int max_gflops = 0; int max_gflops_dev = 0; int written = 0; for(deviceid = 0; deviceid<(ndevices - undevices); ++deviceid) { if(devices[deviceid].unsorted && devices[deviceid].gflops > max_gflops) { max_gflops = devices[deviceid].gflops; max_gflops_dev = deviceid; } } // Print one device per line with properties colon separated written = snprintf(write, BUFFER_LENGTH, "%d:%s:%zd:%zd:%d:%d:%zd:%d:%d,%d,%d:%d,%d,%d:%zd:%d:%d:%d:%zd:%d:%d:%d:%d:%d:%d", max_gflops_dev, devices[max_gflops_dev].props.name, // Switch to kB to not overflow an int devices[max_gflops_dev].props.totalGlobalMem>>10, devices[max_gflops_dev].props.sharedMemPerBlock, devices[max_gflops_dev].props.regsPerBlock, devices[max_gflops_dev].props.warpSize, devices[max_gflops_dev].props.memPitch, devices[max_gflops_dev].props.maxThreadsPerBlock, devices[max_gflops_dev].props.maxThreadsDim[0], devices[max_gflops_dev].props.maxThreadsDim[1], devices[max_gflops_dev].props.maxThreadsDim[2], devices[max_gflops_dev].props.maxGridSize[0], devices[max_gflops_dev].props.maxGridSize[1], devices[max_gflops_dev].props.maxGridSize[2], devices[max_gflops_dev].props.totalConstMem, devices[max_gflops_dev].props.major, devices[max_gflops_dev].props.minor, devices[max_gflops_dev].props.clockRate, devices[max_gflops_dev].props.textureAlignment, devices[max_gflops_dev].props.deviceOverlap, devices[max_gflops_dev].props.multiProcessorCount, devices[max_gflops_dev].props.kernelExecTimeoutEnabled, devices[max_gflops_dev].props.integrated, devices[max_gflops_dev].props.canMapHostMemory, devices[max_gflops_dev].props.computeMode ); write += 1 + written; devices[max_gflops_dev].unsorted = 0; } *deviceProps = props_buffer; free(devices); error_message[0] = '\0'; return DeviceProps_Success; }
int main(int argc, char* atgv[]) { int ndevices = 0; cudaError_t cuda_status = cudaGetDeviceCount(&ndevices); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot get the cuda device count, status = %d: %s\n", cuda_status, cudaGetErrorString(cuda_status)); return cuda_status; } // Return if no cuda devices present. printf("%d CUDA device(s) found\n", ndevices); if (!ndevices) return 0; // Create input data. Each device will have an equal // piece of data. size_t np = nx * ny, size = np * sizeof(float); float* data = (float*)malloc(size * 2); float *input = data, *output = data + np; float invdrandmax = 1.0 / RAND_MAX; for (size_t i = 0; i < np; i++) input[i] = rand() * invdrandmax; struct time_t start, finish; get_time(&start); // Get control result on CPU (to compare with results on devices). pattern2d_cpu(1, nx, 1, 1, ny, 1, input, output, ndevices); get_time(&finish); printf("CPU time = %f sec\n", get_time_diff(&start, &finish)); // Create config structures to store device-specific // values. config_t* configs = (config_t*)malloc( sizeof(config_t) * ndevices); // Initialize CUDA devices. for (int idevice = 0; idevice < ndevices; idevice++) { config_t* config = configs + idevice; // TODO: Set curent CUDA device to idevice. cudaError_t cuda_status = cudaSetDevice(idevice); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot get the cuda device count, status = %d: %s\n", cuda_status, cudaGetErrorString(cuda_status)); return cuda_status; } // Create device arrays for input and output data. cuda_status = cudaMalloc((void**)&config->in_dev, size); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot allocate CUDA input buffer on device %d, status = %d: %s\n", idevice, cuda_status, cudaGetErrorString(cuda_status)); return cuda_status; } cuda_status = cudaMalloc((void**)&config->out_dev, size); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot allocate CUDA output buffer on device %d, status = %d: %s\n", idevice, cuda_status, cudaGetErrorString(cuda_status)); return cuda_status; } // Copy input data to device buffer. cuda_status = cudaMemcpy(config->in_dev, input, size, cudaMemcpyHostToDevice); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot copy input data to CUDA buffer on device %d, status = %d: %s\n", idevice, cuda_status, cudaGetErrorString(cuda_status)); return cuda_status; } printf("Device %d initialized\n", idevice); } // Start execution of kernels. One kernel // is executed on each device in parallel. for (int idevice = 0; idevice < ndevices; idevice++) { config_t* config = configs + idevice; // TODO: Set curent CUDA device to idevice. cudaError_t cuda_status = cudaSetDevice(idevice); get_time(&config->start); // Run test kernel on the current device. int status = pattern2d_gpu(1, nx, 1, 1, ny, 1, config->in_dev, config->out_dev, idevice); if (status) { fprintf(stderr, "Cannot execute pattern 2d on device %d, status = %d: %s\n", idevice, status, cudaGetErrorString(status)); return status; } } // Synchronize kernels execution. for (int idevice = 0; idevice < ndevices; idevice++) { config_t* config = configs + idevice; // TODO: Set curent CUDA device to idevice. cudaError_t cuda_status = cudaSetDevice(idevice); // Wait for current device to finish processing // the kernels. cuda_status = cudaThreadSynchronize(); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot synchronize thread by device %d, status = %d: %s\n", idevice, cuda_status, cudaGetErrorString(cuda_status)); return cuda_status; } get_time(&finish); printf("GPU %d time = %f sec\n", idevice, get_time_diff(&config->start, &finish)); } // Check results and dispose resources used by devices. for (int idevice = 0; idevice < ndevices; idevice++) { config_t* config = configs + idevice; // TODO: Set curent CUDA device to idevice. // Offload results back to host memory. cuda_status = cudaMemcpy(input, config->out_dev, size, cudaMemcpyDeviceToHost); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot copy output data from CUDA buffer on device %d, status = %d: %s\n", idevice, cuda_status, cudaGetErrorString(cuda_status)); return cuda_status; } // Free device arrays. cuda_status = cudaFree(config->in_dev); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot release input buffer on device %d, status = %d: %s\n", idevice, cuda_status, cudaGetErrorString(cuda_status)); return cuda_status; } cuda_status = cudaFree(config->out_dev); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot release output buffer on device %d, status = %d: %s\n", idevice, cuda_status, cudaGetErrorString(cuda_status)); return cuda_status; } printf("Device %d deinitialized\n", idevice); // Compare each GPU result to CPU result. // Find the maximum abs difference. int maxi = 0, maxj = 0; float maxdiff = fabs(input[0] - output[0]); for (int j = 0; j < ny; j++) { for (int i = 0; i < nx; i++) { float diff = fabs( input[i + j * nx] - output[i + j * nx]); if (diff > maxdiff) { maxdiff = diff; maxi = i; maxj = j; } } } printf("Device %d result abs max diff = %f @ (%d,%d)\n", idevice, maxdiff, maxi, maxj); } // Measure time between first GPU launch and last GPU // finish. This will show how much time is spent on GPU // kernels in total. // XXX If this time is comparabe to the time of // individual GPU, then we likely reached our goal: // kernels are executed in parallel. printf("Total time of %d GPUs = %f\n", ndevices, get_time_diff(&configs[0].start, &finish)); free(configs); free(data); return 0; }
int main(int argc, char **argv) { char *rawfilename = NULL; int numiter = 250; int use_apc = 1; int use_normalization = 0; conjugrad_float_t lambda_single = F001; // 0.01 conjugrad_float_t lambda_pair = FInf; conjugrad_float_t lambda_pair_factor = F02; // 0.2 int conjugrad_k = 5; conjugrad_float_t conjugrad_eps = 0.01; parse_option *optList, *thisOpt; char *optstr; char *old_optstr = malloc(1); old_optstr[0] = 0; optstr = concat("r:i:n:w:k:e:l:ARh?", old_optstr); free(old_optstr); #ifdef OPENMP int numthreads = 1; old_optstr = optstr; optstr = concat("t:", optstr); free(old_optstr); #endif #ifdef CUDA int use_def_gpu = 0; old_optstr = optstr; optstr = concat("d:", optstr); free(old_optstr); #endif #ifdef MSGPACK char* msgpackfilename = NULL; old_optstr = optstr; optstr = concat("b:", optstr); free(old_optstr); #endif optList = parseopt(argc, argv, optstr); free(optstr); char* msafilename = NULL; char* matfilename = NULL; char* initfilename = NULL; conjugrad_float_t reweighting_threshold = F08; // 0.8 while(optList != NULL) { thisOpt = optList; optList = optList->next; switch(thisOpt->option) { #ifdef OPENMP case 't': numthreads = atoi(thisOpt->argument); #ifdef CUDA use_def_gpu = -1; // automatically disable GPU if number of threads specified #endif break; #endif #ifdef CUDA case 'd': use_def_gpu = atoi(thisOpt->argument); break; #endif #ifdef MSGPACK case 'b': msgpackfilename = thisOpt->argument; break; #endif case 'r': rawfilename = thisOpt->argument; break; case 'i': initfilename = thisOpt->argument; break; case 'n': numiter = atoi(thisOpt->argument); break; case 'w': reweighting_threshold = (conjugrad_float_t)atof(thisOpt->argument); break; case 'l': lambda_pair_factor = (conjugrad_float_t)atof(thisOpt->argument); break; case 'k': conjugrad_k = (int)atoi(thisOpt->argument); break; case 'e': conjugrad_eps = (conjugrad_float_t)atof(thisOpt->argument); break; case 'A': use_apc = 0; break; case 'R': use_normalization = 1; break; case 'h': case '?': usage(argv[0], 1); break; case 0: if(msafilename == NULL) { msafilename = thisOpt->argument; } else if(matfilename == NULL) { matfilename = thisOpt->argument; } else { usage(argv[0], 0); } break; default: die("Unknown argument"); } free(thisOpt); } if(msafilename == NULL || matfilename == NULL) { usage(argv[0], 0); } FILE *msafile = fopen(msafilename, "r"); if( msafile == NULL) { printf("Cannot open %s!\n\n", msafilename); return 2; } #ifdef JANSSON char* metafilename = malloc(2048); snprintf(metafilename, 2048, "%s.meta.json", msafilename); FILE *metafile = fopen(metafilename, "r"); json_t *meta; if(metafile == NULL) { // Cannot find .meta.json file - create new empty metadata meta = meta_create(); } else { // Load metadata from matfile.meta.json meta = meta_read_json(metafile); fclose(metafile); } json_object_set(meta, "method", json_string("ccmpred")); json_t *meta_step = meta_add_step(meta, "ccmpred"); json_object_set(meta_step, "version", json_string(__VERSION)); json_t *meta_parameters = json_object(); json_object_set(meta_step, "parameters", meta_parameters); json_t *meta_steps = json_array(); json_object_set(meta_step, "iterations", meta_steps); json_t *meta_results = json_object(); json_object_set(meta_step, "results", meta_results); #endif int ncol, nrow; unsigned char* msa = read_msa(msafile, &ncol, &nrow); fclose(msafile); int nsingle = ncol * (N_ALPHA - 1); int nvar = nsingle + ncol * ncol * N_ALPHA * N_ALPHA; int nsingle_padded = nsingle + N_ALPHA_PAD - (nsingle % N_ALPHA_PAD); int nvar_padded = nsingle_padded + ncol * ncol * N_ALPHA * N_ALPHA_PAD; #ifdef CURSES bool color = detect_colors(); #else bool color = false; #endif logo(color); #ifdef CUDA int num_devices, dev_ret; struct cudaDeviceProp prop; dev_ret = cudaGetDeviceCount(&num_devices); if(dev_ret != CUDA_SUCCESS) { num_devices = 0; } if(num_devices == 0) { printf("No CUDA devices available, "); use_def_gpu = -1; } else if (use_def_gpu < -1 || use_def_gpu >= num_devices) { printf("Error: %d is not a valid device number. Please choose a number between 0 and %d\n", use_def_gpu, num_devices - 1); exit(1); } else { printf("Found %d CUDA devices, ", num_devices); } if (use_def_gpu != -1) { cudaError_t err = cudaSetDevice(use_def_gpu); if(cudaSuccess != err) { printf("Error setting device: %d\n", err); exit(1); } cudaGetDeviceProperties(&prop, use_def_gpu); printf("using device #%d: %s\n", use_def_gpu, prop.name); size_t mem_free, mem_total; err = cudaMemGetInfo(&mem_free, &mem_total); if(cudaSuccess != err) { printf("Error getting memory info: %d\n", err); exit(1); } size_t mem_needed = nrow * ncol * 2 + // MSAs sizeof(conjugrad_float_t) * nrow * ncol * 2 + // PC, PCS sizeof(conjugrad_float_t) * nrow * ncol * N_ALPHA_PAD + // PCN sizeof(conjugrad_float_t) * nrow + // Weights (sizeof(conjugrad_float_t) * ((N_ALPHA - 1) * ncol + ncol * ncol * N_ALPHA * N_ALPHA_PAD)) * 4; setlocale(LC_NUMERIC, ""); printf("Total GPU RAM: %'17lu\n", mem_total); printf("Free GPU RAM: %'17lu\n", mem_free); printf("Needed GPU RAM: %'17lu ", mem_needed); if(mem_needed <= mem_free) { printf("✓\n"); } else { printf("⚠\n"); } #ifdef JANSSON json_object_set(meta_parameters, "device", json_string("gpu")); json_t* meta_gpu = json_object(); json_object_set(meta_parameters, "gpu_info", meta_gpu); json_object_set(meta_gpu, "name", json_string(prop.name)); json_object_set(meta_gpu, "mem_total", json_integer(mem_total)); json_object_set(meta_gpu, "mem_free", json_integer(mem_free)); json_object_set(meta_gpu, "mem_needed", json_integer(mem_needed)); #endif } else { printf("using CPU"); #ifdef JANSSON json_object_set(meta_parameters, "device", json_string("cpu")); #endif #ifdef OPENMP printf(" (%d thread(s))", numthreads); #ifdef JANSSON json_object_set(meta_parameters, "cpu_threads", json_integer(numthreads)); #endif #endif printf("\n"); } #else // CUDA printf("using CPU"); #ifdef JANSSON json_object_set(meta_parameters, "device", json_string("cpu")); #endif #ifdef OPENMP printf(" (%d thread(s))\n", numthreads); #ifdef JANSSON json_object_set(meta_parameters, "cpu_threads", json_integer(numthreads)); #endif #endif // OPENMP printf("\n"); #endif // CUDA conjugrad_float_t *x = conjugrad_malloc(nvar_padded); if( x == NULL) { die("ERROR: Not enough memory to allocate variables!"); } memset(x, 0, sizeof(conjugrad_float_t) * nvar_padded); // Auto-set lambda_pair if(isnan(lambda_pair)) { lambda_pair = lambda_pair_factor * (ncol - 1); } // fill up user data struct for passing to evaluate userdata *ud = (userdata *)malloc( sizeof(userdata) ); if(ud == 0) { die("Cannot allocate memory for user data!"); } ud->msa = msa; ud->ncol = ncol; ud->nrow = nrow; ud->nsingle = nsingle; ud->nvar = nvar; ud->lambda_single = lambda_single; ud->lambda_pair = lambda_pair; ud->weights = conjugrad_malloc(nrow); ud->reweighting_threshold = reweighting_threshold; if(initfilename == NULL) { // Initialize emissions to pwm init_bias(x, ud); } else { // Load potentials from file read_raw(initfilename, ud, x); } // optimize with default parameters conjugrad_parameter_t *param = conjugrad_init(); param->max_iterations = numiter; param->epsilon = conjugrad_eps; param->k = conjugrad_k; param->max_linesearch = 5; param->alpha_mul = F05; param->ftol = 1e-4; param->wolfe = F02; int (*init)(void *) = init_cpu; int (*destroy)(void *) = destroy_cpu; conjugrad_evaluate_t evaluate = evaluate_cpu; #ifdef OPENMP omp_set_num_threads(numthreads); if(numthreads > 1) { init = init_cpu_omp; destroy = destroy_cpu_omp; evaluate = evaluate_cpu_omp; } #endif #ifdef CUDA if(use_def_gpu != -1) { init = init_cuda; destroy = destroy_cuda; evaluate = evaluate_cuda; } #endif init(ud); #ifdef JANSSON json_object_set(meta_parameters, "reweighting_threshold", json_real(ud->reweighting_threshold)); json_object_set(meta_parameters, "apc", json_boolean(use_apc)); json_object_set(meta_parameters, "normalization", json_boolean(use_normalization)); json_t *meta_regularization = json_object(); json_object_set(meta_parameters, "regularization", meta_regularization); json_object_set(meta_regularization, "type", json_string("l2")); json_object_set(meta_regularization, "lambda_single", json_real(lambda_single)); json_object_set(meta_regularization, "lambda_pair", json_real(lambda_pair)); json_object_set(meta_regularization, "lambda_pair_factor", json_real(lambda_pair_factor)); json_t *meta_opt = json_object(); json_object_set(meta_parameters, "optimization", meta_opt); json_object_set(meta_opt, "method", json_string("libconjugrad")); json_object_set(meta_opt, "float_bits", json_integer((int)sizeof(conjugrad_float_t) * 8)); json_object_set(meta_opt, "max_iterations", json_integer(param->max_iterations)); json_object_set(meta_opt, "max_linesearch", json_integer(param->max_linesearch)); json_object_set(meta_opt, "alpha_mul", json_real(param->alpha_mul)); json_object_set(meta_opt, "ftol", json_real(param->ftol)); json_object_set(meta_opt, "wolfe", json_real(param->wolfe)); json_t *meta_msafile = meta_file_from_path(msafilename); json_object_set(meta_parameters, "msafile", meta_msafile); json_object_set(meta_msafile, "ncol", json_integer(ncol)); json_object_set(meta_msafile, "nrow", json_integer(nrow)); if(initfilename != NULL) { json_t *meta_initfile = meta_file_from_path(initfilename); json_object_set(meta_parameters, "initfile", meta_initfile); json_object_set(meta_initfile, "ncol", json_integer(ncol)); json_object_set(meta_initfile, "nrow", json_integer(nrow)); } double neff = 0; for(int i = 0; i < nrow; i++) { neff += ud->weights[i]; } json_object_set(meta_msafile, "neff", json_real(neff)); ud->meta_steps = meta_steps; #endif printf("\nWill optimize %d %ld-bit variables\n\n", nvar, sizeof(conjugrad_float_t) * 8); if(color) { printf("\x1b[1m"); } printf("iter\teval\tf(x) \t║x║ \t║g║ \tstep\n"); if(color) { printf("\x1b[0m"); } conjugrad_float_t fx; int ret; #ifdef CUDA if(use_def_gpu != -1) { conjugrad_float_t *d_x; cudaError_t err = cudaMalloc((void **) &d_x, sizeof(conjugrad_float_t) * nvar_padded); if (cudaSuccess != err) { printf("CUDA error No. %d while allocation memory for d_x\n", err); exit(1); } err = cudaMemcpy(d_x, x, sizeof(conjugrad_float_t) * nvar_padded, cudaMemcpyHostToDevice); if (cudaSuccess != err) { printf("CUDA error No. %d while copying parameters to GPU\n", err); exit(1); } ret = conjugrad_gpu(nvar_padded, d_x, &fx, evaluate, progress, ud, param); err = cudaMemcpy(x, d_x, sizeof(conjugrad_float_t) * nvar_padded, cudaMemcpyDeviceToHost); if (cudaSuccess != err) { printf("CUDA error No. %d while copying parameters back to CPU\n", err); exit(1); } err = cudaFree(d_x); if (cudaSuccess != err) { printf("CUDA error No. %d while freeing memory for d_x\n", err); exit(1); } } else { ret = conjugrad(nvar_padded, x, &fx, evaluate, progress, ud, param); } #else ret = conjugrad(nvar_padded, x, &fx, evaluate, progress, ud, param); #endif printf("\n"); printf("%s with status code %d - ", (ret < 0 ? "Exit" : "Done"), ret); if(ret == CONJUGRAD_SUCCESS) { printf("Success!\n"); } else if(ret == CONJUGRAD_ALREADY_MINIMIZED) { printf("Already minimized!\n"); } else if(ret == CONJUGRADERR_MAXIMUMITERATION) { printf("Maximum number of iterations reached.\n"); } else { printf("Unknown status code!\n"); } printf("\nFinal fx = %f\n\n", fx); FILE* out = fopen(matfilename, "w"); if(out == NULL) { printf("Cannot open %s for writing!\n\n", matfilename); return 3; } conjugrad_float_t *outmat = conjugrad_malloc(ncol * ncol); FILE *rawfile = NULL; if(rawfilename != NULL) { printf("Writing raw output to %s\n", rawfilename); rawfile = fopen(rawfilename, "w"); if(rawfile == NULL) { printf("Cannot open %s for writing!\n\n", rawfilename); return 4; } write_raw(rawfile, x, ncol); } #ifdef MSGPACK FILE *msgpackfile = NULL; if(msgpackfilename != NULL) { printf("Writing msgpack raw output to %s\n", msgpackfilename); msgpackfile = fopen(msgpackfilename, "w"); if(msgpackfile == NULL) { printf("Cannot open %s for writing!\n\n", msgpackfilename); return 4; } #ifndef JANSSON void *meta = NULL; #endif } #endif sum_submatrices(x, outmat, ncol); if(use_apc) { apc(outmat, ncol); } if(use_normalization) { normalize(outmat, ncol); } write_matrix(out, outmat, ncol, ncol); #ifdef JANSSON json_object_set(meta_results, "fx_final", json_real(fx)); json_object_set(meta_results, "num_iterations", json_integer(json_array_size(meta_steps))); json_object_set(meta_results, "opt_code", json_integer(ret)); json_t *meta_matfile = meta_file_from_path(matfilename); json_object_set(meta_results, "matfile", meta_matfile); if(rawfilename != NULL) { json_object_set(meta_results, "rawfile", meta_file_from_path(rawfilename)); } if(msgpackfilename != NULL) { json_object_set(meta_results, "msgpackfile", meta_file_from_path(msgpackfilename)); } fprintf(out, "#>META> %s", json_dumps(meta, JSON_COMPACT)); if(rawfile != NULL) { fprintf(rawfile, "#>META> %s", json_dumps(meta, JSON_COMPACT)); } #endif if(rawfile != NULL) { fclose(rawfile); } #ifdef MSGPACK if(msgpackfile != NULL) { write_raw_msgpack(msgpackfile, x, ncol, meta); fclose(msgpackfile); } #endif fflush(out); fclose(out); destroy(ud); conjugrad_free(outmat); conjugrad_free(x); conjugrad_free(ud->weights); free(ud); free(msa); free(param); printf("Output can be found in %s\n", matfilename); return 0; }
void thread_task(const int AS, const int num_of_nodes, int ingr_node_counter) { std::cout << std::endl << "------------ PROCESSING AS GRAPH --------------------" << std::endl << std::endl; // declare variables std::stringstream sstm; std::string ID_ = "AS"; Graph * gr = NULL; std::uniform_real_distribution<double> RANDOM_GENERATOR(1, MAX_NUM_OF_INGRESS_); float multipleDeviceElapsedTime = 0, singleDeviceElapsedTime = 0, hostElapsedTime = 0, oldHostElapsedTime = 0; // create the AS id for the file name sstm << ID_ << AS; ID_ = sstm.str(); std::cout << "Generating Topology..." << std::endl; // create brite and generate the topology Brite *b_topology = new Brite(); if(num_of_nodes < 500000) { std::cout << "Generating" << std::endl; gr = b_topology->GenerateTopology(AS, num_of_nodes); } else { std::cout << "From File!" << std::endl; // convert number of nodes to a string std::string node = std::to_string(num_of_nodes); // read in the topology from BRITE file gr = b_topology->Populate_Topology_Result("../data/" + node +".brite"); } std::cout << "Finished Generating" << std::endl; // get the number of vertices int numVertices = gr->numberOfVertex(); // if no ingress nodes then generate a random amount (1 to 10) if (ingr_node_counter == 0) { ingr_node_counter = RANDOM_GENERATOR(generator6); } // create memory for the results int ** gpuCost = new int * [ingr_node_counter]; int ** sequentialCost = new int * [ingr_node_counter]; int ** oldSequentialCost = new int * [ingr_node_counter]; int ** multipleGPUResult = new int * [ingr_node_counter]; for(int i = 0; i < ingr_node_counter; i++) { gpuCost[i] = new int[numVertices]; sequentialCost[i] = new int[numVertices]; oldSequentialCost[i] = new int[numVertices]; multipleGPUResult[i] = new int[numVertices]; } int * source = new int [ingr_node_counter]; std::cout << std::endl << "Running Sequential Algorithm..." << std::endl; // loop for all ingress nodes for (int i = 0; i < ingr_node_counter; i++) { source[i] = random_ingress_node_selector(*gr); hostElapsedTime += processGraphSequential(gr, source[i], sequentialCost[i]); } //totalOldBFSSequential += oldHostElapsedTime; totalNewBFSSequential += hostElapsedTime; std::cout << "Finished Sequential" << std::endl; // print the CPU results to a file and the screen std::cout << ID_ << ", CPU Execution time (sec), " << hostElapsedTime << ", Num Vertices, " << numVertices << ", Num Ingress, " << ingr_node_counter << std::endl; timingFile << ID_ << ", CPU Execution time (sec), " << hostElapsedTime << ", Num Vertices, " << numVertices << ", Num Ingress, " << ingr_node_counter << std::endl; std::cout << std::endl << "Running Single GPU Algorithm..." << std::endl; totalSingleGPU += singleDeviceElapsedTime = processGraphSingleGPU(gr, source, ingr_node_counter, gpuCost); std::cout << "Finished Single GPU" << std::endl; // print the Single GPU results to a file and the screen std::cout << ID_ << ", CUDA Single Device Execution time (sec), " << singleDeviceElapsedTime << ", Num Vertices, " << numVertices << ", Num Ingress, " << ingr_node_counter << std::endl; timingFile << ID_ << ", CUDA Single Device Execution time (sec), " << singleDeviceElapsedTime << ", Num Vertices, " << numVertices << ", Num Ingress, " << ingr_node_counter << std::endl; // CALL MULTI-GPU FUNCTION HERE!!!! std::cout << std::endl << "Running Multiple GPU Algorithm..." << std::endl; int numDevices; cudaGetDeviceCount(&numDevices); for(int i = 2; i <= numDevices; i++) { multipleDeviceElapsedTime = processGraphMultipleGPU(gr, source, ingr_node_counter, multipleGPUResult, i); std::cout << ID_ << ", CUDA Multiple Device Execution time (sec), " << multipleDeviceElapsedTime << ", Num GPUs, " << i << ", Num Vertices, " << numVertices << ", Num Ingress, " << ingr_node_counter << std::endl; timingFile << ID_ << ", CUDA Multiple Device Execution time (sec), " << multipleDeviceElapsedTime << ", Num GPUs, " << i << ", Num Vertices, " << numVertices << ", Num Ingress, " << ingr_node_counter << std::endl; } std::cout << "Finished Multiple GPU" << std::endl << std::endl; std::cout << std::endl << "Verifying Results..." << std::endl; // verify results if(verifyResults(sequentialCost, gpuCost, multipleGPUResult, numVertices, ingr_node_counter)) std::cout << "Results Match!!" << std::endl; else std::cout << "NOT EQUAL!!" << std::endl; std::cout << "Finished Verifying" << std::endl; // clean up memory for(int i = 0; i < ingr_node_counter; i++) { delete [] gpuCost[i]; delete [] sequentialCost[i]; delete [] oldSequentialCost[i]; delete [] multipleGPUResult[i]; } // deallocate memory delete [] gpuCost; delete [] sequentialCost; delete [] oldSequentialCost; delete [] multipleGPUResult; delete [] source; delete b_topology; delete gr; std::cout << std::endl << "------------ FINISHED PROCESSING --------------------" << std::endl << std::endl; }
int CudaDeviceContext::getDeviceCount() const { int count = 0; checkCudaError(cudaGetDeviceCount(&count)); return count; }
extern "C" magma_int_t magma_z_initP2P( magma_int_t *bw_bmark, magma_int_t *num_gpus, magma_queue_t queue ) { // Number of GPUs printf("Checking for multiple GPUs...\n"); int gpu_n; (cudaGetDeviceCount(&gpu_n)); printf("CUDA-capable device count: %i\n", gpu_n); if (gpu_n < 2) { printf("Two or more Tesla(s) with (SM 2.0)" " class GPUs are required for P2P.\n"); } // Query device properties cudaDeviceProp prop[64]; int gpuid_tesla[64]; // find the first two GPU's that can support P2P int gpu_count = 0; // GPUs that meet the criteria for (int i=0; i < gpu_n; i++) { (cudaGetDeviceProperties(&prop[i], i)); // Only Tesla boards based on Fermi can support P2P { // This is an array of P2P capable GPUs gpuid_tesla[gpu_count++] = i; } } *num_gpus=gpu_n; for(int i=0; i<gpu_n; i++) { for(int j=i+1; j<gpu_n; j++) { // Check possibility for peer access printf("\nChecking GPU(s) for support of peer to peer memory access...\n"); int can_access_peer_0_1, can_access_peer_1_0; // In this case we just pick the first two that we can support (cudaDeviceCanAccessPeer(&can_access_peer_0_1, gpuid_tesla[i], gpuid_tesla[j])); (cudaDeviceCanAccessPeer(&can_access_peer_1_0, gpuid_tesla[j], gpuid_tesla[i])); // Output results from P2P capabilities printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid_tesla[i]].name, gpuid_tesla[i], prop[gpuid_tesla[j]].name, gpuid_tesla[j] , can_access_peer_0_1 ? "Yes" : "No"); printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid_tesla[j]].name, gpuid_tesla[j], prop[gpuid_tesla[i]].name, gpuid_tesla[i], can_access_peer_1_0 ? "Yes" : "No"); if (can_access_peer_0_1 == 0 || can_access_peer_1_0 == 0) { printf("Two or more Tesla(s) with class" " GPUs are required for P2P to run.\n"); printf("Support for UVA requires a Tesla with SM 2.0 capabilities.\n"); printf("Peer to Peer access is not available between" " GPU%d <-> GPU%d, waiving test.\n", gpuid_tesla[i], gpuid_tesla[j]); printf("PASSED\n"); //exit(EXIT_SUCCESS); } } } // Enable peer access for(int i=0; i<gpu_n; i++) { for(int j=i+1; j<gpu_n; j++) { printf("Enabling peer access between GPU%d and GPU%d...\n", gpuid_tesla[i], gpuid_tesla[j]); (cudaSetDevice(gpuid_tesla[i])); (cudaDeviceEnablePeerAccess(gpuid_tesla[j], 0)); (cudaSetDevice(gpuid_tesla[j])); (cudaDeviceEnablePeerAccess(gpuid_tesla[i], 0)); magma_zcheckerr("P2P"); } } magma_zcheckerr("P2P successful"); // Enable peer access for(int i=0; i<gpu_n; i++) { for(int j=i+1; j<gpu_n; j++) { // Check that we got UVA on both devices printf("Checking GPU%d and GPU%d for UVA capabilities...\n", gpuid_tesla[i], gpuid_tesla[j]); //const bool has_uva = (prop[gpuid_tesla[i]].unifiedAddressing && // prop[gpuid_tesla[j]].unifiedAddressing); printf("> %s (GPU%d) supports UVA: %s\n", prop[gpuid_tesla[i]].name, gpuid_tesla[i], (prop[gpuid_tesla[i]].unifiedAddressing ? "Yes" : "No") ); printf("> %s (GPU%d) supports UVA: %s\n", prop[gpuid_tesla[j]].name, gpuid_tesla[j], (prop[gpuid_tesla[j]].unifiedAddressing ? "Yes" : "No") ); } } if (*bw_bmark==1) { // P2P memcopy() benchmark for(int i=0; i<gpu_n; i++) { for(int j=i+1; j<gpu_n; j++) { // Allocate buffers const size_t buf_size = 1024 * 1024 * 16 * sizeof(float); printf("Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n", int(buf_size / 1024 / 1024), gpuid_tesla[i], gpuid_tesla[j]); (cudaSetDevice(gpuid_tesla[i])); float* g0; (cudaMalloc(&g0, buf_size)); (cudaSetDevice(gpuid_tesla[j])); float* g1; (cudaMalloc(&g1, buf_size)); float* h0; (cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA // Create CUDA event handles printf("Creating event handles...\n"); cudaEvent_t start_event, stop_event; float time_memcpy; int eventflags = cudaEventBlockingSync; (cudaEventCreateWithFlags(&start_event, eventflags)); (cudaEventCreateWithFlags(&stop_event, eventflags)); (cudaEventRecord(start_event, 0)); for (int k=0; k<100; k++) { // With UVA we don't need to specify source and target devices, the // runtime figures this out by itself from the pointers // Ping-pong copy between GPUs if (k % 2 == 0) (cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault)); else (cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault)); } (cudaEventRecord(stop_event, 0)); (cudaEventSynchronize(stop_event)); (cudaEventElapsedTime(&time_memcpy, start_event, stop_event)); printf("cudaMemcpyPeer / cudaMemcpy between" "GPU%d and GPU%d: %.2fGB/s\n", gpuid_tesla[i], gpuid_tesla[j], (1.0f / (time_memcpy / 1000.0f)) * ((100.0f * buf_size)) / 1024.0f / 1024.0f / 1024.0f); // Cleanup and shutdown printf("Cleanup of P2P benchmark...\n"); (cudaEventDestroy(start_event)); (cudaEventDestroy(stop_event)); (cudaSetDevice(gpuid_tesla[i])); (magma_free( g0) ); (cudaSetDevice(gpuid_tesla[j])); (magma_free( g1) ); (magma_free_cpu( h0) ); } } // host-device memcopy() benchmark for(int j=0; j<gpu_n; j++) { cudaSetDevice(gpuid_tesla[j]); int *h_data_source; int *h_data_sink; int *h_data_in[STREAM_COUNT]; int *d_data_in[STREAM_COUNT]; int *h_data_out[STREAM_COUNT]; int *d_data_out[STREAM_COUNT]; cudaEvent_t cycleDone[STREAM_COUNT]; cudaStream_t stream[STREAM_COUNT]; cudaEvent_t start, stop; // Allocate resources int memsize; memsize = 1000000 * sizeof(int); h_data_source = (int*) malloc(memsize); h_data_sink = (int*) malloc(memsize); for( int i =0; i<STREAM_COUNT; ++i ) { ( cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault) ); ( cudaMalloc(&d_data_in[i], memsize) ); ( cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault) ); ( cudaMalloc(&d_data_out[i], memsize) ); ( cudaStreamCreate(&stream[i]) ); ( cudaEventCreate(&cycleDone[i]) ); cudaEventRecord(cycleDone[i], stream[i]); } cudaEventCreate(&start); cudaEventCreate(&stop); // Time host-device copies cudaEventRecord(start,0); ( cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize, cudaMemcpyHostToDevice,0) ); cudaEventRecord(stop,0); cudaEventSynchronize(stop); float memcpy_h2d_time; cudaEventElapsedTime(&memcpy_h2d_time, start, stop); cudaEventRecord(start,0); ( cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize, cudaMemcpyDeviceToHost, 0) ); cudaEventRecord(stop,0); cudaEventSynchronize(stop); float memcpy_d2h_time; cudaEventElapsedTime(&memcpy_d2h_time, start, stop); cudaEventSynchronize(stop); printf("Measured timings (throughput):\n"); printf(" Memcpy host to device GPU %d \t: %f ms (%f GB/s)\n", j, memcpy_h2d_time, (memsize * 1e-6)/ memcpy_h2d_time ); printf(" Memcpy device GPU %d to host\t: %f ms (%f GB/s)\n", j, memcpy_d2h_time, (memsize * 1e-6)/ memcpy_d2h_time); // Free resources free( h_data_source ); free( h_data_sink ); for( int i =0; i<STREAM_COUNT; ++i ) { magma_free_cpu( h_data_in[i] ); magma_free( d_data_in[i] ); magma_free_cpu( h_data_out[i] ); magma_free( d_data_out[i] ); cudaStreamDestroy(stream[i]); cudaEventDestroy(cycleDone[i]); } cudaEventDestroy(start); cudaEventDestroy(stop); } }//end if-loop bandwidth_benchmark magma_zcheckerr("P2P established"); return MAGMA_SUCCESS; }
bool CudaDeviceContextPrivate::create(int device) { if (isValid()) { return true; } if (device < 0) { std::cout << "Using CPU only" << std::endl; activeDevice = -1; return false; } int count = 0; checkCudaError(cudaGetDeviceCount(&count)); if (count < 1) { std::cout << "No CUDA devices found; using CPU only" << std::endl; activeDevice = -1; return false; } if (device >= 0 && device < count) { checkCudaError(cudaSetDevice(device)); cudaDeviceProp prop = {0}; checkCudaError(cudaGetDeviceProperties(&prop, device)); if (prop.major < 2) { device = -1; } } else { size_t freeMax = 0; for (int i = 0; i < count; i++) { checkCudaError(cudaSetDevice(i)); cudaDeviceProp prop = {0}; checkCudaError(cudaGetDeviceProperties(&prop, i)); if (prop.major < 2) { continue; } size_t free = 0, total = 0; checkCudaError(cudaMemGetInfo(&free, &total)); if (free > freeMax) { freeMax = free; device = i; } } } if (device < 0) { std::cout << "Compute capability 2.0 support required; using CPU only" << std::endl; activeDevice = -1; return false; } checkCudaError(cudaSetDevice(device)); checkCudaError(cudaFree(0)); cudaDeviceProp prop = {0}; checkCudaError(cudaGetDeviceProperties(&prop, device)); size_t free = 0, total = 0; checkCudaError(cudaMemGetInfo(&free, &total)); printf("_________________________________________\n"); printf("%s (%lluMB free of %lluMB total)\n", prop.name, (unsigned long long) (free / bytesInMegaBytes), (unsigned long long) (total / bytesInMegaBytes)); printf("Using %d multiprocessors\n", prop.multiProcessorCount); printf("Max threads per processor: %d\n", prop.maxThreadsPerMultiProcessor); printf("Max threads per block: %d\n", prop.maxThreadsPerBlock); printf("Max threads per dim: (%d, %d, %d)\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); printf("_________________________________________\n"); printf("\n"); activeDevice = device; return isValid(); }
static int hwloc_cuda_discover(struct hwloc_backend *backend) { struct hwloc_topology *topology = backend->topology; enum hwloc_type_filter_e filter; cudaError_t cures; int nb, i; hwloc_topology_get_type_filter(topology, HWLOC_OBJ_OS_DEVICE, &filter); if (filter == HWLOC_TYPE_FILTER_KEEP_NONE) return 0; if (!hwloc_topology_is_thissystem(topology)) { hwloc_debug("%s", "\nno CUDA detection (not thissystem)\n"); return 0; } cures = cudaGetDeviceCount(&nb); if (cures) return -1; for (i = 0; i < nb; i++) { int domain, bus, dev; char cuda_name[32]; char number[32]; struct cudaDeviceProp prop; hwloc_obj_t cuda_device, parent; unsigned cores; cuda_device = hwloc_alloc_setup_object(HWLOC_OBJ_OS_DEVICE, -1); snprintf(cuda_name, sizeof(cuda_name), "cuda%d", i); cuda_device->name = strdup(cuda_name); cuda_device->depth = (unsigned) HWLOC_TYPE_DEPTH_UNKNOWN; cuda_device->attr->osdev.type = HWLOC_OBJ_OSDEV_COPROC; hwloc_obj_add_info(cuda_device, "CoProcType", "CUDA"); hwloc_obj_add_info(cuda_device, "Backend", "CUDA"); hwloc_obj_add_info(cuda_device, "GPUVendor", "NVIDIA Corporation"); cures = cudaGetDeviceProperties(&prop, i); if (!cures) hwloc_obj_add_info(cuda_device, "GPUModel", prop.name); snprintf(number, sizeof(number), "%llu", ((unsigned long long) prop.totalGlobalMem) >> 10); hwloc_obj_add_info(cuda_device, "CUDAGlobalMemorySize", number); snprintf(number, sizeof(number), "%llu", ((unsigned long long) prop.l2CacheSize) >> 10); hwloc_obj_add_info(cuda_device, "CUDAL2CacheSize", number); snprintf(number, sizeof(number), "%d", prop.multiProcessorCount); hwloc_obj_add_info(cuda_device, "CUDAMultiProcessors", number); cores = hwloc_cuda_cores_per_MP(prop.major, prop.minor); if (cores) { snprintf(number, sizeof(number), "%u", cores); hwloc_obj_add_info(cuda_device, "CUDACoresPerMP", number); } snprintf(number, sizeof(number), "%llu", ((unsigned long long) prop.sharedMemPerBlock) >> 10); hwloc_obj_add_info(cuda_device, "CUDASharedMemorySizePerMP", number); parent = NULL; if (hwloc_cudart_get_device_pci_ids(NULL /* topology unused */, i, &domain, &bus, &dev) == 0) { parent = hwloc_pci_belowroot_find_by_busid(topology, domain, bus, dev, 0); if (!parent) parent = hwloc_pci_find_busid_parent(topology, domain, bus, dev, 0); } if (!parent) parent = hwloc_get_root_obj(topology); hwloc_insert_object_by_parent(topology, parent, cuda_device); } return 0; }
// ======================================================= // ======================================================= HydroMpiParameters::HydroMpiParameters(ConfigMap &_configMap) : HydroParameters(_configMap, false), mx(0), my(0), mz(0), myRank(0), nProcs(0), myMpiPos(), nNeighbors(0), neighborsRank(), neighborsBC() { // MPI parameters : mx = configMap.getInteger("mpi", "mx", 1); my = configMap.getInteger("mpi", "my", 1); mz = configMap.getInteger("mpi", "mz", 1); // copy MPI topology sizes into gParams structure (so that it will also // be available as a global constant, usefull for GPU implementation in godunov_unsplit_mhd.cuh). _gParams.mx = mx; _gParams.my = my; _gParams.mz = mz; // check that parameters are consistent bool error = false; error |= (mx < 1); error |= (my < 1); error |= (mz < 1); if (dimType == TWO_D and mz != 1) error = true; TEST_FOR_EXCEPTION_PRINT(error, std::runtime_error, "Inconsistent geometry; check parameter file for nx, ny, nz and mx, my, mz !\n", &std::cerr); // get world communicator size and check it is consistent with mesh grid sizes nProcs = MpiComm::world().getNProc(); TEST_FOR_EXCEPTION_PRINT(nProcs != mx*my*mz, std::runtime_error, "Inconsistent MPI cartesian virtual topology geometry; \n mx*my*mz must match with parameter given to mpirun !!!\n", &std::cerr); // create the MPI communicator for our cartesian mesh if (dimType == TWO_D) { communicator = new MpiCommCart(mx, my, MPI_CART_PERIODIC_TRUE, MPI_REORDER_TRUE); nDim = 2; } else { communicator = new MpiCommCart(mx, my, mz, MPI_CART_PERIODIC_TRUE, MPI_REORDER_TRUE); nDim = 3; } // get my MPI rank inside topology myRank = communicator->getRank(); // get my coordinates inside topology // myMpiPos[0] is between 0 and mx-1 // myMpiPos[1] is between 0 and my-1 // myMpiPos[2] is between 0 and mz-1 myMpiPos.resize(nDim); communicator->getMyCoords(&myMpiPos[0]); // copy coordinate into gParams structure (so that it will also // be available as a global constant, usefull for GPU implementation). _gParams.mpiPosX = myMpiPos[0]; _gParams.mpiPosY = myMpiPos[1]; _gParams.mpiPosZ = myMpiPos[2]; /* * compute MPI ranks of our neighbors and * set default boundary condition types */ if (dimType == TWO_D) { nNeighbors = N_NEIGHBORS_2D; neighborsRank.resize(nNeighbors); neighborsRank[X_MIN] = communicator->getNeighborRank<X_MIN>(); neighborsRank[X_MAX] = communicator->getNeighborRank<X_MAX>(); neighborsRank[Y_MIN] = communicator->getNeighborRank<Y_MIN>(); neighborsRank[Y_MAX] = communicator->getNeighborRank<Y_MAX>(); neighborsBC.resize(nNeighbors); neighborsBC[X_MIN] = BC_COPY; neighborsBC[X_MAX] = BC_COPY; neighborsBC[Y_MIN] = BC_COPY; neighborsBC[Y_MAX] = BC_COPY; } else { nNeighbors = N_NEIGHBORS_3D; neighborsRank.resize(nNeighbors); neighborsRank[X_MIN] = communicator->getNeighborRank<X_MIN>(); neighborsRank[X_MAX] = communicator->getNeighborRank<X_MAX>(); neighborsRank[Y_MIN] = communicator->getNeighborRank<Y_MIN>(); neighborsRank[Y_MAX] = communicator->getNeighborRank<Y_MAX>(); neighborsRank[Z_MIN] = communicator->getNeighborRank<Z_MIN>(); neighborsRank[Z_MAX] = communicator->getNeighborRank<Z_MAX>(); neighborsBC.resize(nNeighbors); neighborsBC[X_MIN] = BC_COPY; neighborsBC[X_MAX] = BC_COPY; neighborsBC[Y_MIN] = BC_COPY; neighborsBC[Y_MAX] = BC_COPY; neighborsBC[Z_MIN] = BC_COPY; neighborsBC[Z_MAX] = BC_COPY; } /* * identify outside boundaries (no actual communication if we are * doing BC_DIRICHLET or BC_NEUMANN) * * Please notice the duality * XMIN -- boundary_xmax * XMAX -- boundary_xmin * */ // X_MIN boundary if (myMpiPos[DIR_X] == 0) neighborsBC[X_MIN] = boundary_xmin; // X_MAX boundary if (myMpiPos[DIR_X] == mx-1) neighborsBC[X_MAX] = boundary_xmax; // Y_MIN boundary if (myMpiPos[DIR_Y] == 0) neighborsBC[Y_MIN] = boundary_ymin; // Y_MAX boundary if (myMpiPos[DIR_Y] == my-1) neighborsBC[Y_MAX] = boundary_ymax; if (dimType == THREE_D) { // Z_MIN boundary if (myMpiPos[DIR_Z] == 0) neighborsBC[Z_MIN] = boundary_zmin; // Y_MAX boundary if (myMpiPos[DIR_Z] == mz-1) neighborsBC[Z_MAX] = boundary_zmax; } // end THREE_D /* * Initialize CUDA device if needed. * When running on a Linux machine with mutiple GPU per node, it might be * very helpfull if admin has set the CUDA device compute mode to exclusive * so that a device is only attached to 1 host thread (i.e. 2 different host * thread can not communicate with the same GPU). * * As a sys-admin, just run for all devices command: * nvidia-smi -g $(DEV_ID) -c 1 * * If compute mode is set to normal mode, we need to use cudaSetDevice, * so that each MPI device is mapped onto a different GPU device. * * At CCRT, on machine Titane, each node (2 quadri-proc) "sees" only * half a Tesla S1070, that means cudaGetDeviceCount should return 2. * If we want the ration 1 MPI process <-> 1 GPU, we need to allocate * N nodes and 2*N tasks (MPI process). */ #ifdef __CUDACC__ // get device count int count; cutilSafeCall( cudaGetDeviceCount(&count) ); int devId = myRank % count; cutilSafeCall( cudaSetDevice(devId) ); cudaDeviceProp deviceProp; int myDevId = -1; cutilSafeCall( cudaGetDevice( &myDevId ) ); cutilSafeCall( cudaGetDeviceProperties( &deviceProp, myDevId ) ); // faire un cudaSetDevice et cudaGetDeviceProp et aficher le nom // ajouter un booleen dans le constructeur pour savoir si on veut faire ca // sachant que sur Titane, probablement que le mode exclusif est active // a verifier demain std::cout << "MPI process " << myRank << " is using GPU device num " << myDevId << std::endl; #endif //__CUDACC__ // fix space resolution : // need to take into account number of MPI process in each direction float xMax = configMap.getFloat("mesh","xmax",1.0); float yMax = configMap.getFloat("mesh","ymax",1.0); float zMax = configMap.getFloat("mesh","zmax",1.0); _gParams.dx = (xMax- _gParams.xMin)/(nx*mx); _gParams.dy = (yMax- _gParams.yMin)/(ny*my); _gParams.dz = (zMax- _gParams.zMin)/(nz*mz); // print information about current setup if (myRank == 0) { std::cout << "We are about to start simulation with the following characteristics\n"; std::cout << "Global resolution : " << nx*mx << " x " << ny*my << " x " << nz*mz << "\n"; std::cout << "Local resolution : " << nx << " x " << ny << " x " << nz << "\n"; std::cout << "MPI Cartesian topology : " << mx << "x" << my << "x" << mz << std::endl; } #ifdef __CUDACC__ char hostname[1024]; gethostname(hostname, 1023); std::cout << "hostname : " << hostname << std::endl; std::cout << hostname << " [MPI] myRank : " << myRank << std::endl; std::cout << hostname << " [GPU] myDevId : " << myDevId << " (" << deviceProp.name << ")" << std::endl; #endif // __CUDACC__ } // HydroMpiParameters::HydroMpiParameters
int getGpuCount() { int deviceCount; checkCudaErrors(cudaGetDeviceCount(&deviceCount)); return deviceCount; }
//==================================== // Setup/Init Stuff //==================================== bool Init(int argc, char **argv){ // Set window title to "Student Name: GPU Name" std::string deviceName; cudaDeviceProp deviceProp; int gpudevice = 0; int device_count = 0; cudaGetDeviceCount(&device_count); if(gpudevice > device_count){ std::cout << "Error: GPU device number is greater than the number of devices!" << "Perhaps a CUDA-capable GPU is not installed?" << std::endl; return false; } cudaGetDeviceProperties(&deviceProp, gpudevice); deviceName = deviceProp.name; deviceName = m_yourName + ": " + deviceProp.name; m_major = deviceProp.major; m_minor = deviceProp.minor; // Window setup stuff #if _WIN32 glutInit(&argc, argv); m_width = 800; m_height = 800; glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA); glutInitWindowSize(m_width, m_height); m_window = glutCreateWindow(deviceName.c_str()); #else glfwSetErrorCallback(ErrorCallback); if (!glfwInit()){ return false; } m_width = 800; m_height = 800; m_window = glfwCreateWindow(m_width, m_height, deviceName.c_str(), NULL, NULL); if (!m_window){ glfwTerminate(); return false; } glfwMakeContextCurrent(m_window); glfwSetKeyCallback(m_window, KeyCallback); #endif glewExperimental = GL_TRUE; if(glewInit()!=GLEW_OK){ return false; } // Init all of the things InitVAO(); InitTextures(); InitCuda(); InitPBO(&m_pbo); GLuint passthroughProgram; passthroughProgram = InitShader(); glUseProgram(passthroughProgram); glActiveTexture(GL_TEXTURE0); return true; }
int main() { int deviceCount, device, sm_major[999], sm_minor[999], compute_major[999], compute_minor[999]; int gpuDeviceCount = 0; struct cudaDeviceProp properties; if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) { printf("Couldn't get device count: %s\n", cudaGetErrorString(cudaGetLastError())); return 1; } /* machines with no GPUs can still report one emulation device */ for (device = 0; device < deviceCount; ++device) { cudaGetDeviceProperties(&properties, device); if (properties.major != 9999) { /* 9999 means emulation only */ ++gpuDeviceCount; } sm_major[device] = properties.major; sm_minor[device] = properties.minor; if (sm_major[device] == 2 && sm_minor[device] == 0) { compute_major[device]=2; compute_minor[device]=0; } if (sm_major[device] == 2 && sm_minor[device] == 1) { compute_major[device]=2; compute_minor[device]=0; } if (sm_major[device] == 3 && sm_minor[device] == 0) { compute_major[device]=3; compute_minor[device]=0; } if (sm_major[device] == 3 && sm_minor[device] == 2) { compute_major[device]=3; compute_minor[device]=0; } if (sm_major[device] == 3 && sm_minor[device] == 5) { compute_major[device]=3; compute_minor[device]=5; } if (sm_major[device] == 5 && sm_minor[device] == 0) { compute_major[device]=5; compute_minor[device]=0; } if (sm_major[device] == 5 && sm_minor[device] == 2) { compute_major[device]=5; compute_minor[device]=2; } } for (device = 0; device < deviceCount; device ++) { printf("arch=compute_%d%d,code=sm_%d%d ",compute_major[device],compute_minor[device],sm_major[device],sm_minor[device]); } return 1; /* failure */ }
/** * Returns the GPU weights for all available GPUs. * * @param proportion vector that will receive the weights. * @param n maximum number of weights to be stored in the vector. * @return the number of weights stored in the vector. */ int getGPUWeights(int* proportion, int n) { /* * When any CUDA runtime function is called, the CUDA context is initialized. * If we call a fork after this initialization, the same context is shared * among the processes, what causes initialization errors and abnormal * execution. The getGPUWeights function are called before the fork procedure. * in the libmasa_entry_point, so, we must obtain all the GPU weights/proportion * using another process. This method fork a process only to obtain these * CUDA dependent values and the child process dies with its own CUDA context. * This context is not shared with the parent process, so we can continue * the Aligner execution without any problem. */ /* Communication using PIPE */ int pipe_fd[2]; if (pipe(pipe_fd)) { fprintf(stderr, "ERROR: GPU weights could not be obtained (1).\n"); exit(-1); } int pid = fork(); if (pid == 0) { /* Child */ close(pipe_fd[0]); int count; cudaGetDeviceCount(&count); cudaDeviceProp devProp; for (int deviceId=0; deviceId<count; deviceId++) { cutilSafeCall(cudaGetDeviceProperties(&devProp, deviceId)); int cores; if (devProp.major <= 1) { cores = 8; } else if (devProp.major == 2 && devProp.minor == 0) { cores = 32; } else if (devProp.major == 2 && devProp.minor == 1) { cores = 48; } else { cores = 192; } if (getCompiledCapability() <= (devProp.major*100+devProp.minor*10)) { int speed = devProp.clockRate*devProp.multiProcessorCount*cores/1000; if (write(pipe_fd[1], &speed, sizeof(speed)) == -1) { fprintf(stderr, "ERROR: GPU weights could not be obtained (2).\n"); exit(1); } } } cudaDeviceReset(); close(pipe_fd[1]); exit(7); } else { /* Parent */ close(pipe_fd[1]); int val; int count = 0; while (read(pipe_fd[0], &val, sizeof(val)) > 0) { if (count < n) { proportion[count++] = val; } } close(pipe_fd[0]); waitpid(pid, NULL, 0); // Join processes return count; } }
void parse_opts( int argc, char** argv, magma_opts *opts ) { // negative flag indicating -m, -n, -k not given int m = -1; int n = -1; int k = -1; // fill in default values opts->device = 0; opts->pad = 32; opts->nb = 0; // auto opts->nrhs = 1; opts->nstream = 1; opts->ngpu = magma_num_gpus(); opts->niter = 1; opts->nthread = 1; opts->itype = 1; opts->svd_work = 0; opts->version = 1; opts->fraction = 1.; opts->tolerance = 30.; opts->panel_nthread = 1; opts->fraction_dcpu = 0.0; opts->check = (getenv("MAGMA_TESTINGS_CHECK") != NULL); opts->lapack = (getenv("MAGMA_RUN_LAPACK") != NULL); opts->warmup = (getenv("MAGMA_WARMUP") != NULL); opts->all = (getenv("MAGMA_RUN_ALL") != NULL); opts->verbose = false; opts->uplo = MagmaLower; // potrf, etc. opts->transA = MagmaNoTrans; // gemm, etc. opts->transB = MagmaNoTrans; // gemm opts->side = MagmaLeft; // trsm, etc. opts->diag = MagmaNonUnit; // trsm, etc. opts->jobu = MagmaNoVec; // gesvd: no left singular vectors opts->jobvt = MagmaNoVec; // gesvd: no right singular vectors opts->jobz = MagmaNoVec; // heev: no eigen vectors opts->jobvr = MagmaNoVec; // geev: no right eigen vectors opts->jobvl = MagmaNoVec; // geev: no left eigen vectors #ifdef USE_FLOCK opts->flock_op = LOCK_SH; // default shared lock #endif printf( usage_short, argv[0] ); int ndevices; cudaGetDeviceCount( &ndevices ); int info; int ntest = 0; for( int i = 1; i < argc; ++i ) { // ----- matrix size // each -N fills in next entry of msize, nsize, ksize and increments ntest if ( strcmp("-N", argv[i]) == 0 && i+1 < argc ) { magma_assert( ntest < MAX_NTEST, "error: -N %s, max number of tests exceeded, ntest=%d.\n", argv[i], ntest ); i++; int m2, n2, k2; info = sscanf( argv[i], "%d,%d,%d", &m2, &n2, &k2 ); if ( info == 3 && m2 >= 0 && n2 >= 0 && k2 >= 0 ) { opts->msize[ ntest ] = m2; opts->nsize[ ntest ] = n2; opts->ksize[ ntest ] = k2; } else if ( info == 2 && m2 >= 0 && n2 >= 0 ) { opts->msize[ ntest ] = m2; opts->nsize[ ntest ] = n2; opts->ksize[ ntest ] = n2; // implicitly } else if ( info == 1 && m2 >= 0 ) { opts->msize[ ntest ] = m2; opts->nsize[ ntest ] = m2; // implicitly opts->ksize[ ntest ] = m2; // implicitly } else { fprintf( stderr, "error: -N %s is invalid; ensure m >= 0, n >= 0, k >= 0.\n", argv[i] ); exit(1); } ntest++; } // --range start:stop:step fills in msize[ntest:], nsize[ntest:], ksize[ntest:] // with given range and updates ntest else if ( strcmp("--range", argv[i]) == 0 && i+1 < argc ) { i++; int start, stop, step; info = sscanf( argv[i], "%d:%d:%d", &start, &stop, &step ); if ( info == 3 && start >= 0 && stop >= 0 && step != 0 ) { for( int n = start; (step > 0 ? n <= stop : n >= stop); n += step ) { if ( ntest >= MAX_NTEST ) { printf( "warning: --range %s, max number of tests reached, ntest=%d.\n", argv[i], ntest ); break; } opts->msize[ ntest ] = n; opts->nsize[ ntest ] = n; opts->ksize[ ntest ] = n; ntest++; } } else { fprintf( stderr, "error: --range %s is invalid; ensure start >= 0, stop >= start, step > 0.\n", argv[i] ); exit(1); } } // save m, n, k if -m, -n, -k is given; applied after loop else if ( strcmp("-m", argv[i]) == 0 && i+1 < argc ) { m = atoi( argv[++i] ); magma_assert( m >= 0, "error: -m %s is invalid; ensure m >= 0.\n", argv[i] ); } else if ( strcmp("-n", argv[i]) == 0 && i+1 < argc ) { n = atoi( argv[++i] ); magma_assert( n >= 0, "error: -n %s is invalid; ensure n >= 0.\n", argv[i] ); } else if ( strcmp("-k", argv[i]) == 0 && i+1 < argc ) { k = atoi( argv[++i] ); magma_assert( k >= 0, "error: -k %s is invalid; ensure k >= 0.\n", argv[i] ); } // ----- scalar arguments else if ( strcmp("--dev", argv[i]) == 0 && i+1 < argc ) { opts->device = atoi( argv[++i] ); magma_assert( opts->device >= 0 && opts->device < ndevices, "error: --dev %s is invalid; ensure dev in [0,%d].\n", argv[i], ndevices-1 ); } else if ( strcmp("--pad", argv[i]) == 0 && i+1 < argc ) { opts->pad = atoi( argv[++i] ); magma_assert( opts->pad >= 1 && opts->pad <= 4096, "error: --pad %s is invalid; ensure pad in [1,4096].\n", argv[i] ); } else if ( strcmp("--nrhs", argv[i]) == 0 && i+1 < argc ) { opts->nrhs = atoi( argv[++i] ); magma_assert( opts->nrhs >= 0, "error: --nrhs %s is invalid; ensure nrhs >= 0.\n", argv[i] ); } else if ( strcmp("--nb", argv[i]) == 0 && i+1 < argc ) { opts->nb = atoi( argv[++i] ); magma_assert( opts->nb > 0, "error: --nb %s is invalid; ensure nb > 0.\n", argv[i] ); } else if ( strcmp("--ngpu", argv[i]) == 0 && i+1 < argc ) { opts->ngpu = atoi( argv[++i] ); magma_assert( opts->ngpu <= MagmaMaxGPUs, "error: --ngpu %s exceeds MagmaMaxGPUs, %d.\n", argv[i], MagmaMaxGPUs ); magma_assert( opts->ngpu <= ndevices, "error: --ngpu %s exceeds number of CUDA devices, %d.\n", argv[i], ndevices ); magma_assert( opts->ngpu > 0, "error: --ngpu %s is invalid; ensure ngpu > 0.\n", argv[i] ); #ifndef _MSC_VER // not Windows // save in environment variable, so magma_num_gpus() picks it up setenv( "MAGMA_NUM_GPUS", argv[i], true ); #endif } else if ( strcmp("--nstream", argv[i]) == 0 && i+1 < argc ) { opts->nstream = atoi( argv[++i] ); magma_assert( opts->nstream > 0, "error: --nstream %s is invalid; ensure nstream > 0.\n", argv[i] ); } else if ( strcmp("--niter", argv[i]) == 0 && i+1 < argc ) { opts->niter = atoi( argv[++i] ); magma_assert( opts->niter > 0, "error: --niter %s is invalid; ensure niter > 0.\n", argv[i] ); } else if ( strcmp("--nthread", argv[i]) == 0 && i+1 < argc ) { opts->nthread = atoi( argv[++i] ); magma_assert( opts->nthread > 0, "error: --nthread %s is invalid; ensure nthread > 0.\n", argv[i] ); } else if ( strcmp("--itype", argv[i]) == 0 && i+1 < argc ) { opts->itype = atoi( argv[++i] ); magma_assert( opts->itype >= 1 && opts->itype <= 3, "error: --itype %s is invalid; ensure itype in [1,2,3].\n", argv[i] ); } else if ( strcmp("--svd_work", argv[i]) == 0 && i+1 < argc ) { opts->svd_work = atoi( argv[++i] ); magma_assert( opts->svd_work >= 0 && opts->svd_work <= 3, "error: --svd_work %s is invalid; ensure svd_work in [0,1,2,3].\n", argv[i] ); } else if ( strcmp("--version", argv[i]) == 0 && i+1 < argc ) { opts->version = atoi( argv[++i] ); magma_assert( opts->version >= 1, "error: --version %s is invalid; ensure version > 0.\n", argv[i] ); } else if ( strcmp("--fraction", argv[i]) == 0 && i+1 < argc ) { opts->fraction = atof( argv[++i] ); magma_assert( opts->fraction >= 0 && opts->fraction <= 1, "error: --fraction %s is invalid; ensure fraction in [0,1].\n", argv[i] ); } else if ( strcmp("--tolerance", argv[i]) == 0 && i+1 < argc ) { opts->tolerance = atof( argv[++i] ); magma_assert( opts->tolerance >= 0 && opts->tolerance <= 1000, "error: --tolerance %s is invalid; ensure tolerance in [0,1000].\n", argv[i] ); } else if ( strcmp("--panel_nthread", argv[i]) == 0 && i+1 < argc ) { opts->panel_nthread = atoi( argv[++i] ); magma_assert( opts->panel_nthread > 0, "error: --panel_nthread %s is invalid; ensure panel_nthread > 0.\n", argv[i] ); } else if ( strcmp("--fraction_dcpu", argv[i]) == 0 && i+1 < argc ) { opts->fraction_dcpu = atof( argv[++i] ); magma_assert( opts->fraction_dcpu > 0 && opts->fraction_dcpu<=1, "error: --fraction_dcpu %s is invalid; ensure fraction_dcpu in [0, 1]\n", argv[i] ); } // ----- boolean arguments // check results else if ( strcmp("-c", argv[i]) == 0 || strcmp("--check", argv[i]) == 0 ) { opts->check = 1; } else if ( strcmp("-c2", argv[i]) == 0 || strcmp("--check2", argv[i]) == 0 ) { opts->check = 2; } else if ( strcmp("--nocheck", argv[i]) == 0 ) { opts->check = 0; } else if ( strcmp("-l", argv[i]) == 0 || strcmp("--lapack", argv[i]) == 0 ) { opts->lapack = true; } else if ( strcmp("--nolapack", argv[i]) == 0 ) { opts->lapack = false; } else if ( strcmp("--warmup", argv[i]) == 0 ) { opts->warmup = true; } else if ( strcmp("--nowarmup", argv[i]) == 0 ) { opts->warmup = false; } else if ( strcmp("--all", argv[i]) == 0 ) { opts->all = true; } else if ( strcmp("--notall", argv[i]) == 0 ) { opts->all = false; } else if ( strcmp("--verbose", argv[i]) == 0 ) { opts->verbose= true; } // ----- lapack flag arguments else if ( strcmp("-L", argv[i]) == 0 ) { opts->uplo = MagmaLower; } else if ( strcmp("-U", argv[i]) == 0 ) { opts->uplo = MagmaUpper; } else if ( strcmp("-F", argv[i]) == 0 ) { opts->uplo = MagmaUpperLower; } else if ( strcmp("-NN", argv[i]) == 0 ) { opts->transA = MagmaNoTrans; opts->transB = MagmaNoTrans; } else if ( strcmp("-NT", argv[i]) == 0 ) { opts->transA = MagmaNoTrans; opts->transB = MagmaTrans; } else if ( strcmp("-NC", argv[i]) == 0 ) { opts->transA = MagmaNoTrans; opts->transB = MagmaConjTrans; } else if ( strcmp("-TN", argv[i]) == 0 ) { opts->transA = MagmaTrans; opts->transB = MagmaNoTrans; } else if ( strcmp("-TT", argv[i]) == 0 ) { opts->transA = MagmaTrans; opts->transB = MagmaTrans; } else if ( strcmp("-TC", argv[i]) == 0 ) { opts->transA = MagmaTrans; opts->transB = MagmaConjTrans; } else if ( strcmp("-CN", argv[i]) == 0 ) { opts->transA = MagmaConjTrans; opts->transB = MagmaNoTrans; } else if ( strcmp("-CT", argv[i]) == 0 ) { opts->transA = MagmaConjTrans; opts->transB = MagmaTrans; } else if ( strcmp("-CC", argv[i]) == 0 ) { opts->transA = MagmaConjTrans; opts->transB = MagmaConjTrans; } else if ( strcmp("-T", argv[i]) == 0 ) { opts->transA = MagmaTrans; } else if ( strcmp("-C", argv[i]) == 0 ) { opts->transA = MagmaConjTrans; } else if ( strcmp("-SL", argv[i]) == 0 ) { opts->side = MagmaLeft; } else if ( strcmp("-SR", argv[i]) == 0 ) { opts->side = MagmaRight; } else if ( strcmp("-DN", argv[i]) == 0 ) { opts->diag = MagmaNonUnit; } else if ( strcmp("-DU", argv[i]) == 0 ) { opts->diag = MagmaUnit; } else if ( strcmp("-UA", argv[i]) == 0 ) { opts->jobu = MagmaAllVec; } else if ( strcmp("-US", argv[i]) == 0 ) { opts->jobu = MagmaSomeVec; } else if ( strcmp("-UO", argv[i]) == 0 ) { opts->jobu = MagmaOverwriteVec; } else if ( strcmp("-UN", argv[i]) == 0 ) { opts->jobu = MagmaNoVec; } else if ( strcmp("-VA", argv[i]) == 0 ) { opts->jobvt = MagmaAllVec; } else if ( strcmp("-VS", argv[i]) == 0 ) { opts->jobvt = MagmaSomeVec; } else if ( strcmp("-VO", argv[i]) == 0 ) { opts->jobvt = MagmaOverwriteVec; } else if ( strcmp("-VN", argv[i]) == 0 ) { opts->jobvt = MagmaNoVec; } else if ( strcmp("-JN", argv[i]) == 0 ) { opts->jobz = MagmaNoVec; } else if ( strcmp("-JV", argv[i]) == 0 ) { opts->jobz = MagmaVec; } else if ( strcmp("-LN", argv[i]) == 0 ) { opts->jobvl = MagmaNoVec; } else if ( strcmp("-LV", argv[i]) == 0 ) { opts->jobvl = MagmaVec; } else if ( strcmp("-RN", argv[i]) == 0 ) { opts->jobvr = MagmaNoVec; } else if ( strcmp("-RV", argv[i]) == 0 ) { opts->jobvr = MagmaVec; } // ----- misc else if ( strcmp("-x", argv[i]) == 0 || strcmp("--exclusive", argv[i]) == 0 ) { #ifdef USE_FLOCK opts->flock_op = LOCK_EX; #else fprintf( stderr, "ignoring %s: USE_FLOCK not defined; flock not supported.\n", argv[i] ); #endif } // ----- usage else if ( strcmp("-h", argv[i]) == 0 || strcmp("--help", argv[i]) == 0 ) { fprintf( stderr, usage, argv[0], MAX_NTEST ); exit(0); } else { fprintf( stderr, "error: unrecognized option %s\n", argv[i] ); exit(1); } } // if -N or --range not given, use default range if ( ntest == 0 ) { int n2 = 1024 + 64; for( int i = 0; i < MAX_NTEST; ++i ) { opts->msize[i] = n2; opts->nsize[i] = n2; opts->ksize[i] = n2; n2 += 1024; } ntest = 10; } assert( ntest <= MAX_NTEST ); opts->ntest = ntest; // fill in msize[:], nsize[:], ksize[:] if -m, -n, -k were given if ( m >= 0 ) { for( int j = 0; j < MAX_NTEST; ++j ) { opts->msize[j] = m; } } if ( n >= 0 ) { for( int j = 0; j < MAX_NTEST; ++j ) { opts->nsize[j] = n; } } if ( k >= 0 ) { for( int j = 0; j < MAX_NTEST; ++j ) { opts->ksize[j] = k; } } // find max dimensions opts->mmax = 0; opts->nmax = 0; opts->kmax = 0; for( int i = 0; i < ntest; ++i ) { opts->mmax = max( opts->mmax, opts->msize[i] ); opts->nmax = max( opts->nmax, opts->nsize[i] ); opts->kmax = max( opts->kmax, opts->ksize[i] ); } // disallow jobu=O, jobvt=O if ( opts->jobu == MagmaOverwriteVec && opts->jobvt == MagmaOverwriteVec ) { printf( "jobu and jobvt cannot both be Overwrite.\n" ); exit(1); } // lock file #ifdef USE_FLOCK opts->flock_fd = open_lockfile( lockfile, opts->flock_op ); #endif // set device magma_setdevice( opts->device ); }
mplugin* mscorefactory_kgpu::create_plugin() { if (first_access) { // first call, from master thread first_access = false; // just once, thanks // initialize GPU int deviceCount = 0; int bestDeviceIndex = 0; double highestDeviceScore = 0.0; // device score = clockRate // * multiProcessorCount cudaDeviceProp deviceProp; cuInit(0); // loop through all devices and choose which one to use based on // highest device score cudaError_t err = cudaGetDeviceCount(&deviceCount); if (err != cudaSuccess) { logevent(error, init, cudaGetErrorString(err) << endl); } logevent(info, init, "GPU devices found: " << deviceCount << endl); for (int curDeviceNumber=0; curDeviceNumber < deviceCount; curDeviceNumber++) { logevent(info, init, "testing device " << curDeviceNumber << endl); memset(&deviceProp, 0, sizeof(deviceProp)); if (cudaSuccess == cudaGetDeviceProperties( &deviceProp, curDeviceNumber)) { double curDeviceScore = deviceProp.multiProcessorCount * deviceProp.clockRate; if (curDeviceScore > highestDeviceScore) { highestDeviceScore = curDeviceScore; bestDeviceIndex = curDeviceNumber; } } else { logevent( error, init, "unable to access GPU properties, using non-GPU k-score" << endl); return new mscore_k(); } } memset(&deviceProp, 0, sizeof(deviceProp)); if (cudaSuccess == cudaGetDeviceProperties(&deviceProp, bestDeviceIndex)) { logevent( info, init, "Using GPU device " << bestDeviceIndex << ": \"" << deviceProp.name << "\", " << prettymem(deviceProp.totalGlobalMem) << ", " << deviceProp.multiProcessorCount << " multiprocessors, " << deviceProp.clockRate/1000 << " MHz" << endl); size_t free_mem, total_mem; cudaMemGetInfo(&free_mem, &total_mem); m_initialFreeMemory = free_mem; logevent( info, init, "device initial free memory: " << prettymem(m_initialFreeMemory) << endl); } else { logevent( error, init, "unable to access device properites, using non-GPU k-score" << endl); return new mscore_k(); } logevent(info, init, "resetting GPU" << endl); if (cudaSuccess != cudaDeviceReset() ) { logevent( error, init, "error resetting GPU, using non-GPU k-score" << endl); return new mscore_k(); } logevent(info, initok, "done resetting GPU" << endl); if (cudaSuccess != cudaSetDevice(bestDeviceIndex)) { logevent( error, init, "Error - cannot set GPU, using non-GPU k-score" << endl); return new mscore_k(); } else { logevent( info, initok, "GPU " << bestDeviceIndex << " initialized." << endl); } size_t free, total; free = total = 0; cudaMemGetInfo(&free, &total); m_initialFreeMemory = free; logevent( info, init, "Device memory: " << prettymem(free) << " free / " << prettymem(total) << " total" << endl); if (m_initialFreeMemory) { mscore_kgpu_thrust_init(m_initialFreeMemory); } } if (!m_initialFreeMemory) { logevent( error, init, "Error - insufficient GPU memory, using non-GPU k-score" << endl); return new mscore_k(); } return new mscore_kgpu(); }
int cuda_devices(void) { int count; CUDA_ERROR(cudaGetDeviceCount(&count)); return count; }
static void DeviceSetup(const int dev, int &ngpu) { MFEM_CUDA_CHECK(cudaGetDeviceCount(&ngpu)); MFEM_VERIFY(ngpu > 0, "No CUDA device found!"); MFEM_CUDA_CHECK(cudaSetDevice(dev)); }