Example #1
0
bool configureGpu(bool use_gpu_acceleration, std::vector<int> &valid_devices, int use_all_gpus, 
  int &numBkgWorkers_gpu) {
#ifdef ION_COMPILE_CUDA
  const unsigned long long gpu_mem = 2.5 * 1024 * 1024 * 1024;

  if (!use_gpu_acceleration)
    return false;

  // Get number of GPUs in system
  int num_gpus = 0;
  cudaError_t err = cudaGetDeviceCount( &num_gpus );

  if (err != cudaSuccess) {
    printf("CUDA: No GPU device available. Defaulting to CPU only computation\n");
    return false;
  }

  if ( use_all_gpus )
  {
    // Add all GPUs to the valid device list
    for ( int dev = 0; dev < num_gpus;  dev++ )
      valid_devices.push_back(dev);
  }
  else
  {
    // Only add the highest compute devices to the compute list
    int version = 0;
    int major = 0;
    int minor = 0;
    cudaDeviceProp dev_props;

    // Iterate over GPUs to find the highest compute device
    for ( int dev = 0; dev < num_gpus;  dev++ )
    {
      cudaGetDeviceProperties( &dev_props, dev );
      if ( (dev_props.major*10) + dev_props.minor > version )
      {
        version = (dev_props.major*10) + dev_props.minor;
        major = dev_props.major;
        minor = dev_props.minor;
      }
    }

    for ( int dev = 0; dev < num_gpus;  dev++ )
    {
      cudaGetDeviceProperties(&dev_props, dev);
      if (dev_props.major == major && dev_props.minor == minor) {
        if (dev_props.totalGlobalMem > gpu_mem) {
    valid_devices.push_back(dev);
        }
      }
    } 
  }

  // Set the number of GPU workers and tell CUDA about our list of valid devices
  if (valid_devices.size() > 0) {
    numBkgWorkers_gpu = int(valid_devices.size());
    cudaSetValidDevices( &valid_devices[0], int( valid_devices.size() ) );
  }
  else {
    printf("CUDA: No GPU device available. Defaulting to CPU only computation\n");
    return false;   
  }

 
  PoissonCDFApproxMemo poiss_cache; 
  poiss_cache.Allocate (MAX_POISSON_TABLE_COL,MAX_POISSON_TABLE_ROW,POISSON_TABLE_STEP);
  poiss_cache.GenerateValues(); // fill out my table


  for(int i=valid_devices.size()-1 ; i >= 0; i--){
    try{
      //cudaSetDevice(valid_devices[i]);
      cout << "CUDA "<< valid_devices[i] << ": Creating Context and Constant memory on device with id: "<<  valid_devices[i]<< endl;
      InitConstantMemoryOnGpu(valid_devices[i],poiss_cache);
    }
    catch(cudaException &e) {
      cout << "CUDA "<< valid_devices[i] << ": Context could not be created. removing device with id: "<<  valid_devices[i] << " from valid device list" << endl;
      valid_devices.erase (valid_devices.begin()+i);
      numBkgWorkers_gpu -= 1;
      if(numBkgWorkers_gpu == 0) cout << "CUDA: no context could be created, defaulting to CPU only execution" << endl; 
    }

  }

  if(numBkgWorkers_gpu == 0) return false;

  return true;

#else
  
  return false;

#endif

}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

    printf("%s Starting...\n\n", argv[0]);
    printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");

    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

    if (error_id != cudaSuccess)
    {
        printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
        exit(EXIT_FAILURE);
    }

    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
    {
        printf("There are no available device(s) that support CUDA\n");
    }
    else
    {
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
    }

    int dev, driverVersion = 0, runtimeVersion = 0;

    for (dev = 0; dev < deviceCount; ++dev)
    {
        cudaSetDevice(dev);
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

        // Console log
        cudaDriverGetVersion(&driverVersion);
        cudaRuntimeGetVersion(&runtimeVersion);
        printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
        printf("  CUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);

        char msg[256];
        sprintf(msg, "  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n",
                (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
        printf("%s", msg);

        printf("  (%2d) Multiprocessors x (%3d) CUDA Cores/MP:    %d CUDA Cores\n",
               deviceProp.multiProcessorCount,
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
        printf("  GPU Clock rate:                                %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);


#if CUDART_VERSION >= 5000
        // This is supported in CUDA 5.0 (runtime API device properties)
        printf("  Memory Clock rate:                             %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
        printf("  Memory Bus Width:                              %d-bit\n",   deviceProp.memoryBusWidth);

        if (deviceProp.l2CacheSize)
        {
            printf("  L2 Cache Size:                                 %d bytes\n", deviceProp.l2CacheSize);
        }
#else
        // This only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API)
        int memoryClock;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        printf("  Memory Clock rate:                             %.0f Mhz\n", memoryClock * 1e-3f);
        int memBusWidth;
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

        if (L2CacheSize)
        {
            printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
        }
#endif

        printf("  Max Texture Dimension Size (x,y,z)             1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n",
               deviceProp.maxTexture1D   , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
               deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
        printf("  Max Layered Texture Size (dim) x layers        1D=(%d) x %d, 2D=(%d,%d) x %d\n",
               deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1],
               deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);

        printf("  Total amount of constant memory:               %lu bytes\n", deviceProp.totalConstMem);
        printf("  Total amount of shared memory per block:       %lu bytes\n", deviceProp.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf("  Warp size:                                     %d\n", deviceProp.warpSize);
        printf("  Maximum number of threads per multiprocessor:  %d\n", deviceProp.maxThreadsPerMultiProcessor);
        printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",
               deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",
               deviceProp.maxGridSize[0],
               deviceProp.maxGridSize[1],
               deviceProp.maxGridSize[2]);
        printf("  Maximum memory pitch:                          %lu bytes\n", deviceProp.memPitch);
        printf("  Texture alignment:                             %lu bytes\n", deviceProp.textureAlignment);
        printf("  Concurrent copy and kernel execution:          %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
        printf("  Run time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf("  Integrated GPU sharing Host Memory:            %s\n", deviceProp.integrated ? "Yes" : "No");
        printf("  Support host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
        printf("  Alignment requirement for Surfaces:            %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
        printf("  Device has ECC support:                        %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
#ifdef WIN32
        printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n", deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
#endif
        printf("  Device supports Unified Addressing (UVA):      %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
        printf("  Device PCI Bus ID / PCI location ID:           %d / %d\n", deviceProp.pciBusID, deviceProp.pciDeviceID);

        const char *sComputeMode[] =
        {
            "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
            "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
            "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
            "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
            "Unknown",
            NULL
        };
        printf("  Compute Mode:\n");
        printf("     < %s >\n", sComputeMode[deviceProp.computeMode]);
    }

    // csv masterlog info
    // *****************************
    // exe and CUDA driver name
    printf("\n");
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    char cTemp[16];

    // driver version
    sProfileString += ", CUDA Driver Version = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#endif
    sProfileString +=  cTemp;

    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#endif
    sProfileString +=  cTemp;

    // Device count
    sProfileString += ", NumDevs = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d", deviceCount);
#else
    sprintf(cTemp, "%d", deviceCount);
#endif
    sProfileString += cTemp;

    // Print Out all device Names
    for (dev = 0; dev < deviceCount; ++dev)
    {
#ifdef _WIN32
    sprintf_s(cTemp, 13, ", Device%d = ", dev);
#else
    sprintf(cTemp, ", Device%d = ", dev);
#endif
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        sProfileString += cTemp;
        sProfileString += deviceProp.name;
    }

    sProfileString += "\n";
    printf("%s", sProfileString.c_str());

    // finish
    exit(EXIT_SUCCESS);
}
Example #3
0
void initQuda(int dev)
{
  static int initialized = 0;
  if (initialized) {
    return;
  }
  initialized = 1;

#if (CUDA_VERSION >= 4000) && defined(MULTI_GPU)
  //check if CUDA_NIC_INTEROP is set to 1 in the enviroment
  char* cni_str = getenv("CUDA_NIC_INTEROP");
  if(cni_str == NULL){
    errorQuda("Environment variable CUDA_NIC_INTEROP is not set\n");
  }
  int cni_int = atoi(cni_str);
  if (cni_int != 1){
    errorQuda("Environment variable CUDA_NIC_INTEROP is not set to 1\n");    
  }
#endif

  int deviceCount;
  cudaGetDeviceCount(&deviceCount);
  if (deviceCount == 0) {
    errorQuda("No devices supporting CUDA");
  }

  for(int i=0; i<deviceCount; i++) {
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, i);
    printfQuda("QUDA: Found device %d: %s\n", i, deviceProp.name);
  }

#ifdef QMP_COMMS
  int ndim;
  const int *dim;

  if ( QMP_is_initialized() != QMP_TRUE ) {
    errorQuda("QMP is not initialized");
  }
  num_QMP=QMP_get_number_of_nodes();
  rank_QMP=QMP_get_node_number();
  
  dev += rank_QMP % deviceCount;
  ndim = QMP_get_logical_number_of_dimensions();
  dim = QMP_get_logical_dimensions();

#elif defined(MPI_COMMS)

  comm_init();
  dev=comm_gpuid();

#else
  if (dev < 0) dev = deviceCount - 1;
#endif
  
  // Used for applying the gauge field boundary condition
  if( commCoords(3) == 0 ) qudaPt0=true;
  else qudaPt0=false;

  if( commCoords(3) == commDim(3)-1 ) qudaPtNm1=true;
  else qudaPtNm1=false;

  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, dev);
  if (deviceProp.major < 1) {
    errorQuda("Device %d does not support CUDA", dev);
  }

  
  printfQuda("QUDA: Using device %d: %s\n", dev, deviceProp.name);

  cudaSetDevice(dev);
#ifdef HAVE_NUMA
  if(numa_config_set){
    if(gpu_affinity[dev] >=0){
      printfQuda("Numa setting to cpu node %d\n", gpu_affinity[dev]);
      if(numa_run_on_node(gpu_affinity[dev]) != 0){
        printfQuda("Warning: Setting numa to cpu node %d failed\n", gpu_affinity[dev]);
      }
    }

  }
#endif

  initCache();
  quda::initBlas();
}
Example #4
0
/** Initialize hardware counters, setup the function vector table
 * and get hardware information, this routine is called when the
 * PAPI process is initialized (IE PAPI_library_init)
 */
		int
_papi_nvml_init_substrate( int cidx )
{
		nvmlReturn_t ret;
		cudaError_t cuerr;

		int cuda_count = 0;
		unsigned int nvml_count = 0;

		ret = nvmlInit();
		if ( NVML_SUCCESS != ret ) {
				strcpy(_nvml_vector.cmp_info.disabled_reason, "The NVIDIA managament library failed to initialize.");
				goto disable;
		}

		cuerr = cuInit( 0 );
		if ( CUDA_SUCCESS != cuerr ) {
				strcpy(_nvml_vector.cmp_info.disabled_reason, "The CUDA library failed to initialize.");
				goto disable;
		}

		/* Figure out the number of CUDA devices in the system */
		ret = nvmlDeviceGetCount( &nvml_count );
		if ( NVML_SUCCESS != ret ) {
				strcpy(_nvml_vector.cmp_info.disabled_reason, "Unable to get a count of devices from the NVIDIA managament library.");
				goto disable;
		}

		cuerr = cudaGetDeviceCount( &cuda_count );
		if ( CUDA_SUCCESS != cuerr ) {
				strcpy(_nvml_vector.cmp_info.disabled_reason, "Unable to get a device count from CUDA.");
				goto disable;
		}

		/* We can probably recover from this, when we're clever */
		if ( nvml_count != cuda_count ) {
				strcpy(_nvml_vector.cmp_info.disabled_reason, "Cuda and the NVIDIA managament library have different device counts.");
				goto disable;
		}

		device_count = cuda_count;

		/* A per device representation of what events are present */
		features = (int*)papi_malloc(sizeof(int) * device_count );

		/* Handles to each device */
		devices = (nvmlDevice_t*)papi_malloc(sizeof(nvmlDevice_t) * device_count);

		/* Figure out what events are supported on each card. */
		if ( (papi_errorcode = detectDevices( ) ) != PAPI_OK ) {
			papi_free(features);
			papi_free(devices);
			sprintf(_nvml_vector.cmp_info.disabled_reason, "An error occured in device feature detection, please check your NVIDIA Management Library and CUDA install." );
			goto disable;
		}

		/* The assumption is that if everything went swimmingly in detectDevices, 
			all nvml calls here should be fine. */
		createNativeEvents( );

		/* Export the total number of events available */
		_nvml_vector.cmp_info.num_native_events = num_events;

		/* Export the component id */
		_nvml_vector.cmp_info.CmpIdx = cidx;

		/* Export the number of 'counters' */
		_nvml_vector.cmp_info.num_cntrs = num_events;

		return PAPI_OK;

disable:
		_nvml_vector.cmp_info.num_cntrs = 0;
		return PAPI_OK;	
}
Example #5
0
/** Documented at declaration */
int
gpujpeg_init_device(int device_id, int flags)
{
    int dev_count;
    cudaGetDeviceCount(&dev_count);
    if ( dev_count == 0 ) {
        fprintf(stderr, "[GPUJPEG] [Error] No CUDA enabled device\n");
        return -1;
    }

    if ( device_id < 0 || device_id >= dev_count ) {
        fprintf(stderr, "[GPUJPEG] [Error] Selected device %d is out of bound. Devices on your system are in range %d - %d\n",
                device_id, 0, dev_count - 1);
        return -1;
    }

    struct cudaDeviceProp devProp;
    if ( cudaSuccess != cudaGetDeviceProperties(&devProp, device_id) ) {
        fprintf(stderr,
                "[GPUJPEG] [Error] Can't get CUDA device properties!\n"
                "[GPUJPEG] [Error] Do you have proper driver for CUDA installed?\n"
               );
        return -1;
    }

    if ( devProp.major < 1 ) {
        fprintf(stderr, "[GPUJPEG] [Error] Device %d does not support CUDA\n", device_id);
        return -1;
    }

    if ( flags & GPUJPEG_OPENGL_INTEROPERABILITY ) {
        cudaGLSetGLDevice(device_id);
        gpujpeg_cuda_check_error("Enabling OpenGL interoperability");
    }

    if ( flags & GPUJPEG_VERBOSE ) {
        int cuda_driver_version = 0;
        cudaDriverGetVersion(&cuda_driver_version);
        printf("CUDA driver version:   %d.%d\n", cuda_driver_version / 1000, (cuda_driver_version % 100) / 10);

        int cuda_runtime_version = 0;
        cudaRuntimeGetVersion(&cuda_runtime_version);
        printf("CUDA runtime version:  %d.%d\n", cuda_runtime_version / 1000, (cuda_runtime_version % 100) / 10);

        printf("Using Device #%d:       %s (c.c. %d.%d)\n", device_id, devProp.name, devProp.major, devProp.minor);
    }

    cudaSetDevice(device_id);
    gpujpeg_cuda_check_error("Set CUDA device");

    // Test by simple copying that the device is ready
    uint8_t data[] = {8};
    uint8_t* d_data = NULL;
    cudaMalloc((void**)&d_data, 1);
    cudaMemcpy(d_data, data, 1, cudaMemcpyHostToDevice);
    cudaFree(d_data);
    cudaError_t error = cudaGetLastError();
    if ( cudaSuccess != error ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed to initialize CUDA device.\n");
        if ( flags & GPUJPEG_OPENGL_INTEROPERABILITY )
            fprintf(stderr, "[GPUJPEG] [Info]  OpenGL interoperability is used, is OpenGL context available?\n");
        return -1;
    }

    return 0;
}
Example #6
0
// This function returns the best GPU (with maximum GFLOPS)
inline int gpuGetMaxGflopsDeviceId()
{
    int current_device     = 0, sm_per_multiproc  = 0;
    int max_perf_device    = 0;
    int device_count       = 0, best_SM_arch      = 0;
    int devices_prohibited = 0;
    
    unsigned long long max_compute_perf = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceCount(&device_count);
    
    checkCudaErrors(cudaGetDeviceCount(&device_count));

    if (device_count == 0)
    {
        fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }

    // Find the best major SM Architecture GPU device
    while (current_device < device_count)
    {
        cudaGetDeviceProperties(&deviceProp, current_device);

        // If this GPU is not running on Compute Mode prohibited, then we can add it to the list
        if (deviceProp.computeMode != cudaComputeModeProhibited)
        {
            if (deviceProp.major > 0 && deviceProp.major < 9999)
            {
                best_SM_arch = MAX(best_SM_arch, deviceProp.major);
            }
        }
        else
        {
            devices_prohibited++;
        }

        current_device++;
    }

    if (devices_prohibited == device_count)
    {
    	fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: all devices have compute mode prohibited.\n");
    	exit(EXIT_FAILURE);
    }

    // Find the best CUDA capable GPU device
    current_device = 0;

    while (current_device < device_count)
    {
        cudaGetDeviceProperties(&deviceProp, current_device);

        // If this GPU is not running on Compute Mode prohibited, then we can add it to the list
        if (deviceProp.computeMode != cudaComputeModeProhibited)
        {
            if (deviceProp.major == 9999 && deviceProp.minor == 9999)
            {
                sm_per_multiproc = 1;
            }
            else
            {
                sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
            }

            unsigned long long compute_perf  = (unsigned long long) deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;

            if (compute_perf  > max_compute_perf)
            {
                // If we find GPU with SM major > 2, search only these
                if (best_SM_arch > 2)
                {
                    // If our device==dest_SM_arch, choose this, or else pass
                    if (deviceProp.major == best_SM_arch)
                    {
                        max_compute_perf  = compute_perf;
                        max_perf_device   = current_device;
                    }
                }
                else
                {
                    max_compute_perf  = compute_perf;
                    max_perf_device   = current_device;
                }
            }
        }

        ++current_device;
    }

    return max_perf_device;
}
Example #7
0
magma_int_t
magma_zparse_opts(
    int argc, 
    char** argv, 
    magma_zopts *opts, 
    int *matrices,
    magma_queue_t queue )
{
    // negative flag indicating -m, -n, -k not given
    int m = -1;
    int n = -1;
    int k = -1;
    
    // fill in default values
    opts->input_format = Magma_CSR;
    opts->blocksize = 8;
    opts->alignment = 8;
    opts->output_format = Magma_CSR;
    opts->input_location = Magma_CPU;
    opts->output_location = Magma_CPU;
    opts->scaling = Magma_NOSCALE;
    opts->solver_par.epsilon = 10e-16;
    opts->solver_par.maxiter = 1000;
    opts->solver_par.verbose = 0;
    opts->solver_par.version = 0;
    opts->solver_par.restart = 30;
    opts->solver_par.num_eigenvalues = 0;
    opts->precond_par.solver = Magma_JACOBI;
    opts->precond_par.epsilon = 0.01;
    opts->precond_par.maxiter = 100;
    opts->precond_par.restart = 10;
    opts->solver_par.solver = Magma_CG;
    
    printf( usage_sparse_short, argv[0] );
    
    int ndevices;
    cudaGetDeviceCount( &ndevices );
    
    int info;
    int ntest = 0;
    

    for( int i = 1; i < argc; ++i ) {
        if ( strcmp("--format", argv[i]) == 0 && i+1 < argc ) {
            info = atoi( argv[++i] );
            switch( info ) {
                case 0: opts->output_format = Magma_CSR; break;
                case 1: opts->output_format = Magma_ELL; break;
                case 2: opts->output_format = Magma_SELLP; break;
                //case 2: opts->output_format = Magma_ELLRT; break;
            }
        } else if ( strcmp("--mscale", argv[i]) == 0 && i+1 < argc ) {
            info = atoi( argv[++i] );
            switch( info ) {
                case 0: opts->scaling = Magma_NOSCALE; break;
                case 1: opts->scaling = Magma_UNITDIAG; break;
                case 2: opts->scaling = Magma_UNITROW; break;
            }

        } else if ( strcmp("--solver", argv[i]) == 0 && i+1 < argc ) {
            info = atoi( argv[++i] );
            switch( info ) {
                case 0: opts->solver_par.solver = Magma_CG; break;
                case 1: opts->solver_par.solver = Magma_CGMERGE; break;
                case 2: opts->solver_par.solver = Magma_PCG; break;
                case 3: opts->solver_par.solver = Magma_BICGSTAB; break;
                case 4: opts->solver_par.solver = Magma_BICGSTABMERGE; break;
                case 5: opts->solver_par.solver = Magma_PBICGSTAB; break;
                case 6: opts->solver_par.solver = Magma_GMRES; break;
                case 7: opts->solver_par.solver = Magma_PGMRES; break;
                case 8: opts->solver_par.solver = Magma_LOBPCG; 
                            opts->solver_par.num_eigenvalues = 16;break;
                case 9: opts->solver_par.solver = Magma_JACOBI; break;
                case 10: opts->solver_par.solver = Magma_BAITER; break;
                case 21: opts->solver_par.solver = Magma_ITERREF; break;
            }
        } else if ( strcmp("--restart", argv[i]) == 0 && i+1 < argc ) {
            opts->solver_par.restart = atoi( argv[++i] );
        } else if ( strcmp("--precond", argv[i]) == 0 && i+1 < argc ) {
            info = atoi( argv[++i] );
            switch( info ) {
                case 0: opts->precond_par.solver = Magma_NONE; break;
                case 1: opts->precond_par.solver = Magma_JACOBI; break;
                case 2: opts->precond_par.solver = Magma_ILU; break;
                case 3: opts->precond_par.solver = Magma_CG; break;
                case 4: opts->precond_par.solver = Magma_BICGSTAB; break;
                case 5: opts->precond_par.solver = Magma_GMRES; break;
                case 6: opts->precond_par.solver = Magma_BAITER; break;

            }
        } else if ( strcmp("--ptol", argv[i]) == 0 && i+1 < argc ) {
            sscanf( argv[++i], "%lf", &opts->precond_par.epsilon );
        } else if ( strcmp("--blocksize", argv[i]) == 0 && i+1 < argc ) {
            opts->blocksize = atoi( argv[++i] );
        } else if ( strcmp("--alignment", argv[i]) == 0 && i+1 < argc ) {
            opts->alignment = atoi( argv[++i] );
        } else if ( strcmp("--verbose", argv[i]) == 0 && i+1 < argc ) {
            opts->solver_par.verbose = atoi( argv[++i] );
        }  else if ( strcmp("--maxiter", argv[i]) == 0 && i+1 < argc ) {
            opts->solver_par.maxiter = atoi( argv[++i] );
        } else if ( strcmp("--tol", argv[i]) == 0 && i+1 < argc ) {
            sscanf( argv[++i], "%lf", &opts->solver_par.epsilon );
        } else if ( strcmp("--ev", argv[i]) == 0 && i+1 < argc ) {
            opts->solver_par.num_eigenvalues = atoi( argv[++i] );
        } else if ( strcmp("--version", argv[i]) == 0 && i+1 < argc ) {
            opts->solver_par.version = atoi( argv[++i] );
        }        
        // ----- usage
        else if ( strcmp("-h",     argv[i]) == 0 ||
                  strcmp("--help", argv[i]) == 0 ) {
            fprintf( stderr, usage_sparse, argv[0] );
            return -1;
        } else {
            *matrices = i;
            break;
        }
    }
    // ensure to take a symmetric preconditioner for the PCG
    if ( opts->solver_par.solver == Magma_PCG 
        && opts->precond_par.solver == Magma_ILU )
            opts->precond_par.solver = Magma_ICC;
    return MAGMA_SUCCESS;
}
Example #8
0
// Parse command line
clp_return_type parse_cmdline( int argc , char ** argv, CMD & cmdline,
                               const Teuchos::Comm<int>& comm,
                               const bool uq )
{
    Teuchos::ParameterList params;
    Teuchos::CommandLineProcessor clp(false);

    const int num_grouping_types = 4;
    const GroupingType grouping_values[] = {
        GROUPING_NATURAL, GROUPING_MAX_ANISOTROPY, GROUPING_MORTAN_Z,
        GROUPING_TASMANIAN_SURROGATE
    };
    const char *grouping_names[] = { "natural", "max-anisotropy", "mortan-z", "tasmanian-surrogate" };

    const int num_sampling_types = 3;
    const SamplingType sampling_values[] = {
        SAMPLING_STOKHOS, SAMPLING_TASMANIAN, SAMPLING_FILE
    };
    const char *sampling_names[] = { "stokhos", "tasmanian", "file" };

    clp.setOption("serial", "no-serial",     &cmdline.USE_SERIAL, "use the serial device");
    clp.setOption("threads",                 &cmdline.USE_THREADS, "number of pthreads threads");
    clp.setOption("openmp",                  &cmdline.USE_OPENMP,  "number of openmp threads");
    clp.setOption("numa",                    &cmdline.USE_NUMA,  "number of numa nodes");
    clp.setOption("cores",                   &cmdline.USE_CORE_PER_NUMA, "cores per numa node");
    clp.setOption("cuda", "no-cuda",         &cmdline.USE_CUDA,  "use the CUDA device");
    clp.setOption("device",                  &cmdline.USE_CUDA_DEV,  "CUDA device ID.  Set to default of -1 to use the default device as determined by the local node MPI rank and --ngpus");
    clp.setOption("ngpus",                   &cmdline.USE_NGPUS, "Number of GPUs per node for multi-GPU runs via MPI");
    std::string fixtureSpec="2x2x2";
    clp.setOption("fixture",                 &fixtureSpec,  "fixture string: \"XxYxZ\"");
    clp.setOption("fixture-x",               &cmdline.USE_FIXTURE_X,  "fixture");
    clp.setOption("fixture-y",               &cmdline.USE_FIXTURE_Y,  "fixture");
    clp.setOption("fixture-z",               &cmdline.USE_FIXTURE_Z,  "fixture");
    clp.setOption("fixture-quadratic", "no-fixture-quadratic", &cmdline.USE_FIXTURE_QUADRATIC,  "quadratic");

    clp.setOption("atomic", "no-atomic",      &cmdline.USE_ATOMIC ,  "atomic");
    clp.setOption("trials",                   &cmdline.USE_TRIALS,  "trials");
    clp.setOption("xml-file",                 &cmdline.USE_FENL_XML_FILE, "XML file containing solver parameters");
    clp.setOption("belos", "no-belos",        &cmdline.USE_BELOS ,  "use Belos solver");
    clp.setOption("muelu", "no-muelu",        &cmdline.USE_MUELU,  "use MueLu preconditioner");
    clp.setOption("mean-based", "no-mean-based", &cmdline.USE_MEANBASED,  "use mean-based preconditioner");
    if(cmdline.USE_MUELU || cmdline.USE_MEANBASED)
        cmdline.USE_BELOS = true;

    clp.setOption("sampling", &cmdline.USE_UQ_SAMPLING, num_sampling_types, sampling_values, sampling_names, "UQ sampling method");
    clp.setOption("uq-fake",                  &cmdline.USE_UQ_FAKE,  "setup a fake UQ problem of this size");
    clp.setOption("uq-dim",                   &cmdline.USE_UQ_DIM,  "UQ dimension");
    clp.setOption("uq-order",                 &cmdline.USE_UQ_ORDER,  "UQ order");
    clp.setOption("uq-init-level",            &cmdline.USE_UQ_INIT_LEVEL,  "Initial adaptive sparse grid level");
    clp.setOption("uq-max-level",             &cmdline.USE_UQ_MAX_LEVEL,  "Max adaptive sparse grid level");
    clp.setOption("uq-max-samples",           &cmdline.USE_UQ_MAX_SAMPLES,  "Max number of samples to run");
    clp.setOption("uq-tol",                   &cmdline.USE_UQ_TOL,  "Adaptive sparse grid tolerance");
    clp.setOption("diff-coeff-linear",        &cmdline.USE_DIFF_COEFF_LINEAR,  "Linear term in diffusion coefficient");
    clp.setOption("diff-coeff-constant",      &cmdline.USE_DIFF_COEFF_CONSTANT,  "Constant term in diffusion coefficient");
    clp.setOption("mean",                     &cmdline.USE_MEAN,  "KL diffusion mean");
    clp.setOption("var",                      &cmdline.USE_VAR,  "KL diffusion variance");
    clp.setOption("cor",                      &cmdline.USE_COR,  "KL diffusion correlation");
    clp.setOption("exponential", "no-exponential", &cmdline.USE_EXPONENTIAL,  "take exponential of KL diffusion coefficient");
    clp.setOption("exp-shift",                &cmdline.USE_EXP_SHIFT,  "Linear shift of exponential of KL diffusion coefficient");
    clp.setOption("exp-scale",                &cmdline.USE_EXP_SCALE,  "Multiplicative scale of exponential of KL diffusion coefficient");
    clp.setOption("discontinuous-exp-scale", "continuous-exp-scale", &cmdline.USE_DISC_EXP_SCALE,  "use discontinuous scale factor on exponential");
    clp.setOption("isotropic", "anisotropic", &cmdline.USE_ISOTROPIC,  "use isotropic or anisotropic diffusion coefficient");
    clp.setOption("coeff-src",                &cmdline.USE_COEFF_SRC,  "Coefficient for source term");
    clp.setOption("coeff-adv",                &cmdline.USE_COEFF_ADV,  "Coefficient for advection term");
    clp.setOption("sparse", "tensor",         &cmdline.USE_SPARSE ,  "use sparse or tensor grid");
    clp.setOption("ensemble",                 &cmdline.USE_UQ_ENSEMBLE,  "UQ ensemble size.  This needs to be a valid choice based on available instantiations.");
    clp.setOption("grouping", &cmdline.USE_GROUPING, num_grouping_types, grouping_values, grouping_names, "Sample grouping method for ensemble propagation");
    clp.setOption("surrogate-grouping-level", &cmdline.TAS_GROUPING_INITIAL_LEVEL,  "Starting level for surrogate-based grouping");

    clp.setOption("vtune", "no-vtune",       &cmdline.VTUNE ,  "connect to vtune");
    clp.setOption("verbose", "no-verbose",   &cmdline.VERBOSE, "print verbose intialization info");
    clp.setOption("print", "no-print",        &cmdline.PRINT,  "print detailed test output");
    clp.setOption("print-its", "no-print-its",&cmdline.PRINT_ITS,  "print solver iterations after each sample");
    clp.setOption("summarize", "no-summarize",&cmdline.SUMMARIZE,  "summarize Teuchos timers at end of run");

    bool doDryRun = false;
    clp.setOption("echo", "no-echo",          &doDryRun,  "dry-run only");

    switch (clp.parse(argc, argv)) {
    case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED:
        return CLP_HELP;
    case Teuchos::CommandLineProcessor::PARSE_ERROR:
    case Teuchos::CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION:
        return CLP_ERROR;
    case Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL:
        break;
    }

#if defined( KOKKOS_HAVE_CUDA )
    // Set CUDA device based on local node rank
    if (cmdline.USE_CUDA && cmdline.USE_CUDA_DEV == -1) {
        int local_rank = 0;
        char *str;
        if ((str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK")))
            local_rank = std::atoi(str);
        else if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK")))
            local_rank = std::atoi(str);
        else if ((str = std::getenv("SLURM_LOCALID")))
            local_rank = std::atoi(str);
        cmdline.USE_CUDA_DEV = local_rank % cmdline.USE_NGPUS;

        // Check device is valid
        int num_device;
        cudaGetDeviceCount(&num_device);
        TEUCHOS_TEST_FOR_EXCEPTION(
            cmdline.USE_CUDA_DEV >= cmdline.USE_NGPUS, std::logic_error,
            "Invalid device ID " << cmdline.USE_CUDA_DEV << ".  You probably are trying" <<
            " to run with too many GPUs per node");
    }
#endif

    sscanf( fixtureSpec.c_str() , "%dx%dx%d" ,
            &cmdline.USE_FIXTURE_X ,
            &cmdline.USE_FIXTURE_Y ,
            &cmdline.USE_FIXTURE_Z );

    cmdline.USE_UQ = uq;

    if (doDryRun) {
        print_cmdline( std::cout , cmdline );
        cmdline.ECHO  = 1;
    } else {
        cmdline.ECHO  = 0;
    }
    cmdline.ERROR  = 0 ;

    return CLP_OK;

}
Example #9
0
int main(int argc, char **argv)
{        
    TESTING_INIT();
    magma_setdevice(0);

    magma_timestr_t  start, end;
    float      flops, magma_perf, cuda_perf, error, work[1];
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
    magma_int_t n_local[4];

    FILE        *fp ; 
    magma_int_t N, m, i, j, lda, LDA, M;
    magma_int_t matsize;
    magma_int_t vecsize;
    magma_int_t istart = 64;
    magma_int_t incx = 1;
    char        uplo = MagmaLower;

    magmaFloatComplex alpha = MAGMA_C_MAKE(1., 0.); // MAGMA_C_MAKE(  1.5, -2.3 );
    magmaFloatComplex beta  = MAGMA_C_MAKE(0., 0.); // MAGMA_C_MAKE( -0.6,  0.8 );
    magmaFloatComplex *A, *X, *Y[4], *Ycublas, *Ymagma;
    magmaFloatComplex *dA, *dX[4], *dY[4], *d_lA[4], *dYcublas ;

    magma_queue_t stream[4][10];
    magmaFloatComplex *C_work;
    magmaFloatComplex *dC_work[4];

    int max_num_gpus;
    magma_int_t num_gpus = 1, nb;
    magma_int_t blocks, lwork;
    magma_int_t offset = 0;

    M = 0;
    N = 0;
    if (argc != 1){
        for(i = 1; i<argc; i++){
            if (strcmp("-N", argv[i])==0)
            {
                N = atoi(argv[++i]);
                istart = N;
            }
            else if (strcmp("-M", argv[i])==0)
                M = atoi(argv[++i]);
            else if (strcmp("-NGPU", argv[i])==0)
              num_gpus = atoi(argv[++i]);
            else if (strcmp("-offset", argv[i])==0)
              offset = atoi(argv[++i]);
        }
        if ( M == 0 ) {
            M = N;
        }
        if ( N == 0 ) {
            N = M;
        }
        if (M>0 && N>0)
        {    printf("  testing_chemv_mgpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus);
            printf("  in %c side \n", uplo);
        }
        else
            {
                printf("\nUsage: \n");
                printf("  testing_chemv_mgpu -M %d -N %d -NGPU %d\n\n", 
                       1024, 1024, 1);
                exit(1);
            }
    }
    else {
#if defined(PRECISION_z)
        M = N = 8000;
#else
        M = N = 12480;
#endif 
        num_gpus = 2;
        offset = 0;
        printf("\nUsage: \n");
        printf("  testing_chemv_mgpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus);
    }
         

    //////////////////////////////////////////////////////////////////////////
    cudaGetDeviceCount(&max_num_gpus);
    if (num_gpus > max_num_gpus){
      printf("More GPUs requested than available. Have to change it.\n");
      num_gpus = max_num_gpus;
    }
    printf("Number of GPUs to be used = %d\n", (int) num_gpus);
    for(int i=0; i< num_gpus; i++)
    {
        magma_queue_create(&stream[i][0]);
    }
    

    LDA = ((N+31)/32)*32;
    matsize = N*LDA;
    vecsize = N*incx;
    nb = 32;
    //nb = 64;

    printf("block size = %d\n", (int) nb);
   
    TESTING_MALLOC_CPU( A,       magmaFloatComplex, matsize );
    TESTING_MALLOC_CPU( X,       magmaFloatComplex, vecsize );
    TESTING_MALLOC_CPU( Ycublas, magmaFloatComplex, vecsize );
    TESTING_MALLOC_CPU( Ymagma,  magmaFloatComplex, vecsize );
    for(i=0; i<num_gpus; i++)
    {     
        TESTING_MALLOC_CPU( Y[i], magmaFloatComplex, vecsize );
    }

    magma_setdevice(0);
    TESTING_MALLOC_DEV( dA,       magmaFloatComplex, matsize );
    TESTING_MALLOC_DEV( dYcublas, magmaFloatComplex, vecsize );

    for(i=0; i<num_gpus; i++)
    {      
        n_local[i] = ((N/nb)/num_gpus)*nb;
        if (i < (N/nb)%num_gpus)
            n_local[i] += nb;
        else if (i == (N/nb)%num_gpus)
            n_local[i] += N%nb;
        
        magma_setdevice(i);
        
        TESTING_MALLOC_DEV( d_lA[i], magmaFloatComplex, LDA*n_local[i] );// potentially bugged 
        TESTING_MALLOC_DEV( dX[i],   magmaFloatComplex, vecsize );
        TESTING_MALLOC_DEV( dY[i],   magmaFloatComplex, vecsize );
        
        printf("device %2d n_local = %4d\n", (int) i, (int) n_local[i]); 
    }
    magma_setdevice(0);

      

    //////////////////////////////////////////////////////////////////////////

    /* Initialize the matrix */
    lapackf77_clarnv( &ione, ISEED, &matsize, A );
    magma_cmake_hermitian( N, A, LDA );

    blocks = N / nb + (N % nb != 0);
    lwork = LDA * (blocks + 1);
    TESTING_MALLOC_CPU( C_work, magmaFloatComplex, lwork );
    for(i=0; i<num_gpus; i++){
           magma_setdevice(i);  
           TESTING_MALLOC_DEV( dC_work[i], magmaFloatComplex, lwork );
           //fillZero(dC_work[i], lwork);
    }
      
     magma_setdevice(0);


    //////////////////////////////////////////////////////////////////////////
   
    fp = fopen ("results_chemv_mgpu.csv", "w") ;
    if( fp == NULL ){ printf("Couldn't open output file\n"); exit(1);}

    printf("CHEMV magmaFloatComplex precision\n\n");

    printf( "   n   CUBLAS,Gflop/s   MAGMABLAS,Gflop/s      \"error\"\n" 
            "==============================================================\n");
    fprintf(fp, "   n   CUBLAS,Gflop/s   MAGMABLAS,Gflop/s      \"error\"\n" 
            "==============================================================\n");


//    for( offset = 0; offset< N; offset ++ )
    
    for(int size = istart ; size <= N ; size += 128)
    {
    //    printf("offset = %d ", offset);
        m = size ;
    //    m = N;
        // lda = ((m+31)/32)*32;// 
        lda = LDA; 
        flops = FLOPS( (float)m ) / 1e6;

        printf(      "N %5d ", (int) m );
        fprintf( fp, "%5d, ", (int) m );

        vecsize = m * incx;
        lapackf77_clarnv( &ione, ISEED, &vecsize, X );
        lapackf77_clarnv( &ione, ISEED, &vecsize, Y[0] );

        /* =====================================================================
           Performs operation using CUDA-BLAS
           =================================================================== */
        magma_setdevice(0);
        magma_csetmatrix_1D_col_bcyclic(m, m, A, LDA, d_lA, lda, num_gpus, nb); 
        magma_setdevice(0);

    
    
    magma_csetmatrix( m, m, A, LDA, dA, lda );
        magma_csetvector( m, Y[0], incx, dYcublas, incx );
        
        for(i=0; i<num_gpus; i++){
            magma_setdevice(i);
            magma_csetvector( m, X, incx, dX[i], incx );
            magma_csetvector( m, Y[0], incx, dY[i], incx );


            blocks    = m / nb + (m % nb != 0);
            magma_csetmatrix( lda, blocks, C_work, LDA, dC_work[i], lda );
        }

        magma_setdevice(0);
        start = get_current_time();
        cublasChemv( uplo, m-offset, alpha, dA + offset + offset * lda, lda, dX[0] + offset, incx, beta, dYcublas + offset, incx );
         
        end = get_current_time();

        magma_cgetvector( m, dYcublas, incx, Ycublas, incx );
                
        
        cuda_perf = flops / GetTimerValue(start,end);
        printf(     "%11.2f", cuda_perf );
        fprintf(fp, "%11.2f,", cuda_perf );
       
        
        magma_setdevice(0);

        
        start = get_current_time();
        

        if(nb == 32)
       { 

        magmablas_chemv2_mgpu_32_offset( uplo, m, alpha, d_lA, lda, dX, incx, beta, dY, incx, 
                dC_work, lwork, num_gpus, nb, offset);
 
        }
        else // nb = 64
       { 

        magmablas_chemv2_mgpu_offset( uplo, m, alpha, d_lA, lda, dX, incx, beta, dY, incx, 
                dC_work, lwork, num_gpus, nb, offset);
 
        }
    
            
        for(i=1; i<num_gpus; i++)
        {
           magma_setdevice(i);
           cudaDeviceSynchronize();
        }
      
        end = get_current_time();
        magma_perf = flops / GetTimerValue(start,end); 
        printf(     "%11.2f", magma_perf );
        fprintf(fp, "%11.2f,", magma_perf );
       

        for(i=0; i<num_gpus; i++)
        {        
            magma_setdevice(i);
            magma_cgetvector( m, dY[i], incx, Y[i], incx );
        }
        magma_setdevice(0);

        
#ifdef validate        

        for( j= offset;j<m;j++)
        {
            for(i=1; i<num_gpus; i++)
            {

//            printf("Y[%d][%d] = %15.14f\n", i, j, Y[i][j].x);
#if defined(PRECISION_z) || defined(PRECISION_c)
            Y[0][j].x = Y[0][j].x + Y[i][j].x;
                        Y[0][j].y = Y[0][j].y + Y[i][j].y;
#else 
            Y[0][j] = Y[0][j] + Y[i][j];
            
#endif 

            }
        }

/*

#if defined(PRECISION_z) || defined(PRECISION_c)
        
        for( j=offset;j<m;j++)
        {
            if(Y[0][j].x != Ycublas[j].x)
            {
                     printf("Y-multi[%d] = %f, %f\n",  j, Y[0][j].x, Y[0][j].y );
                     printf("Ycublas[%d] = %f, %f\n",  j, Ycublas[j].x, Ycublas[j].y);
            }
        }

#else 

        for( j=offset;j<m;j++)
        {
            if(Y[0][j] != Ycublas[j])
            {
                     printf("Y-multi[%d] = %f\n",  j, Y[0][j] );
                     printf("Ycublas[%d] = %f\n",  j, Ycublas[j]);
            }
        }

#endif

*/        
        /* =====================================================================
           Computing the Difference Cublas VS Magma
           =================================================================== */
       
        magma_int_t nw = m - offset ;
        blasf77_caxpy( &nw, &c_neg_one, Y[0] + offset, &incx, Ycublas + offset, &incx);
        error = lapackf77_clange( "M", &nw, &ione, Ycublas + offset, &nw, work );
            
#if  0
        printf(      "\t\t %8.6e", error / m );
        fprintf( fp, "\t\t %8.6e", error / m );

        /*
         * Extra check with cblas vs magma
         */
        cblas_ccopy( m, Y, incx, Ycublas, incx );
        cblas_chemv( CblasColMajor, CblasLower, m, 
                     CBLAS_SADDR(alpha), A, LDA, X, incx, 
                     CBLAS_SADDR(beta), Ycublas, incx );
 
        blasf77_caxpy( &m, &c_neg_one, Ymagma, &incx, Ycublas, &incx);
        error = lapackf77_clange( "M", &m, &ione, Ycublas, &m, work );
#endif

        printf(      "\t\t %8.6e", error / m );
        fprintf( fp, "\t\t %8.6e", error / m );
 
#endif 
        printf("\n");        
        fprintf(fp, "\n");        
    }
    
    fclose( fp ) ; 

    /* Free Memory */
    TESTING_FREE_CPU( A );
    TESTING_FREE_CPU( X );
    TESTING_FREE_CPU( Ycublas );
    TESTING_FREE_CPU( Ymagma  );
    TESTING_FREE_CPU( C_work  );

    magma_setdevice(0);
    TESTING_FREE_DEV( dA );
    TESTING_FREE_DEV( dYcublas );
    
    for(i=0; i<num_gpus; i++)
    { 
        TESTING_FREE_CPU( Y[i] );
        magma_setdevice(i);

        TESTING_FREE_DEV( d_lA[i]    );
        TESTING_FREE_DEV( dX[i]      );
        TESTING_FREE_DEV( dY[i]      );
        TESTING_FREE_DEV( dC_work[i] );
    }

    magma_setdevice(0);
 ///////////////////////////////////////////////////////////   
      

    /* Free device */
    TESTING_FINALIZE();
    return 0;
}        
Example #10
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

    printf("%s Starting...\n\n", argv[0]);
    printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");

    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

    if (error_id != cudaSuccess)
    {
        printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
        printf("Result = FAIL\n");
        exit(EXIT_FAILURE);
    }

    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
    {
        printf("There are no available device(s) that support CUDA\n");
    }
    else
    {
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
    }

    int dev, driverVersion = 0, runtimeVersion = 0;

    for (dev = 0; dev < deviceCount; ++dev)
    {
        cudaSetDevice(dev);
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

        // Console log
        cudaDriverGetVersion(&driverVersion);
        cudaRuntimeGetVersion(&runtimeVersion);
        printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
        printf("  CUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);

        char msg[256];
        SPRINTF(msg, "  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n",
                (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
        printf("%s", msg);

        printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
               deviceProp.multiProcessorCount,
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
        printf("  GPU Max Clock rate:                            %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);


#if CUDART_VERSION >= 5000
        // This is supported in CUDA 5.0 (runtime API device properties)
        printf("  Memory Clock rate:                             %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
        printf("  Memory Bus Width:                              %d-bit\n",   deviceProp.memoryBusWidth);

        if (deviceProp.l2CacheSize)
        {
            printf("  L2 Cache Size:                                 %d bytes\n", deviceProp.l2CacheSize);
        }

#else
        // This only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API)
        int memoryClock;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        printf("  Memory Clock rate:                             %.0f Mhz\n", memoryClock * 1e-3f);
        int memBusWidth;
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

        if (L2CacheSize)
        {
            printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
        }

#endif

        printf("  Maximum Texture Dimension Size (x,y,z)         1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",
               deviceProp.maxTexture1D   , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
               deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
        printf("  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
               deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
        printf("  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d layers\n",
               deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);


        printf("  Total amount of constant memory:               %lu bytes\n", deviceProp.totalConstMem);
        printf("  Total amount of shared memory per block:       %lu bytes\n", deviceProp.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf("  Warp size:                                     %d\n", deviceProp.warpSize);
        printf("  Maximum number of threads per multiprocessor:  %d\n", deviceProp.maxThreadsPerMultiProcessor);
        printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
        printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
               deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        printf("  Max dimension size of a grid size    (x,y,z): (%d, %d, %d)\n",
               deviceProp.maxGridSize[0],
               deviceProp.maxGridSize[1],
               deviceProp.maxGridSize[2]);
        printf("  Maximum memory pitch:                          %lu bytes\n", deviceProp.memPitch);
        printf("  Texture alignment:                             %lu bytes\n", deviceProp.textureAlignment);
        printf("  Concurrent copy and kernel execution:          %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
        printf("  Run time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf("  Integrated GPU sharing Host Memory:            %s\n", deviceProp.integrated ? "Yes" : "No");
        printf("  Support host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
        printf("  Alignment requirement for Surfaces:            %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
        printf("  Device has ECC support:                        %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n", deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
#endif
        printf("  Device supports Unified Addressing (UVA):      %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
        printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n", deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);

        const char *sComputeMode[] =
        {
            "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
            "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
            "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
            "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
            "Unknown",
            NULL
        };
        printf("  Compute Mode:\n");
        printf("     < %s >\n", sComputeMode[deviceProp.computeMode]);
    }

    // If there are 2 or more GPUs, query to determine whether RDMA is supported
    if (deviceCount >= 2)
    {
        cudaDeviceProp prop[64];
        int gpuid[64]; // we want to find the first two GPUs that can support P2P
        int gpu_p2p_count = 0;

        for (int i=0; i < deviceCount; i++)
        {
            checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));

            // Only boards based on Fermi or later can support P2P
            if ((prop[i].major >= 2)
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
                // on Windows (64-bit), the Tesla Compute Cluster driver for windows must be enabled to support this
                && prop[i].tccDriver
#endif
               )
            {
                // This is an array of P2P capable GPUs
                gpuid[gpu_p2p_count++] = i;
            }
        }

        // Show all the combinations of support P2P GPUs
        int can_access_peer;

        if (gpu_p2p_count >= 2)
        {
            for (int i = 0; i < gpu_p2p_count; i++)
            {
                for (int j = 0; j < gpu_p2p_count; j++)
                {
                    if (gpuid[i] == gpuid[j])
                    {
                        continue;
                    }
                    checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
                        printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[i]].name, gpuid[i],
                           prop[gpuid[j]].name, gpuid[j] ,
                           can_access_peer ? "Yes" : "No");
                }
            }
        }
    }

    // csv masterlog info
    // *****************************
    // exe and CUDA driver name
    printf("\n");
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    char cTemp[16];

    // driver version
    sProfileString += ", CUDA Driver Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#endif
    sProfileString +=  cTemp;

    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#endif
    sProfileString +=  cTemp;

    // Device count
    sProfileString += ", NumDevs = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d", deviceCount);
#else
    sprintf(cTemp, "%d", deviceCount);
#endif
    sProfileString += cTemp;

    // Print Out all device Names
    for (dev = 0; dev < deviceCount; ++dev)
    {
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        sprintf_s(cTemp, 13, ", Device%d = ", dev);
#else
        sprintf(cTemp, ", Device%d = ", dev);
#endif
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        sProfileString += cTemp;
        sProfileString += deviceProp.name;
    }

    sProfileString += "\n";
    printf("%s", sProfileString.c_str());

    printf("Result = PASS\n");

    // finish
    exit(EXIT_SUCCESS);
}
Example #11
0
PetscErrorCode  PetscOptionsCheckInitial_Private(void)
{
  char              string[64],mname[PETSC_MAX_PATH_LEN],*f;
  MPI_Comm          comm = PETSC_COMM_WORLD;
  PetscBool         flg1 = PETSC_FALSE,flg2 = PETSC_FALSE,flg3 = PETSC_FALSE,flag;
  PetscErrorCode    ierr;
  PetscReal         si;
  PetscInt          intensity;
  int               i;
  PetscMPIInt       rank;
  char              version[256];
#if !defined(PETSC_HAVE_THREADSAFETY)
  PetscReal         logthreshold;
#endif
#if defined(PETSC_USE_LOG)
  PetscViewerFormat format;
  PetscBool         flg4 = PETSC_FALSE;
#endif
  
  PetscFunctionBegin;
  ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);

#if !defined(PETSC_HAVE_THREADSAFETY)
  /*
      Setup the memory management; support for tracing malloc() usage
  */
  ierr = PetscOptionsHasName(NULL,"-malloc_log",&flg3);CHKERRQ(ierr);
  logthreshold = 0.0;
  ierr = PetscOptionsGetReal(NULL,"-malloc_log_threshold",&logthreshold,&flg1);CHKERRQ(ierr);
  if (flg1) flg3 = PETSC_TRUE;
#if defined(PETSC_USE_DEBUG)
  ierr = PetscOptionsGetBool(NULL,"-malloc",&flg1,&flg2);CHKERRQ(ierr);
  if ((!flg2 || flg1) && !petscsetmallocvisited) {
    if (flg2 || !(PETSC_RUNNING_ON_VALGRIND)) {
      /* turn off default -malloc if valgrind is being used */
      ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr);
    }
  }
#else
  ierr = PetscOptionsGetBool(NULL,"-malloc_dump",&flg1,NULL);CHKERRQ(ierr);
  ierr = PetscOptionsGetBool(NULL,"-malloc",&flg2,NULL);CHKERRQ(ierr);
  if (flg1 || flg2 || flg3) {ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr);}
#endif
  if (flg3) {
    ierr = PetscMallocSetDumpLogThreshold((PetscLogDouble)logthreshold);CHKERRQ(ierr);
  }
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-malloc_debug",&flg1,NULL);CHKERRQ(ierr);
  if (flg1) {
    ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr);
    ierr = PetscMallocDebug(PETSC_TRUE);CHKERRQ(ierr);
  }
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-malloc_test",&flg1,NULL);CHKERRQ(ierr);
#if defined(PETSC_USE_DEBUG)
  if (flg1 && !PETSC_RUNNING_ON_VALGRIND) {
    ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr);
    ierr = PetscMallocSetDumpLog();CHKERRQ(ierr);
    ierr = PetscMallocDebug(PETSC_TRUE);CHKERRQ(ierr);
  }
#endif

  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-malloc_info",&flg1,NULL);CHKERRQ(ierr);
  if (!flg1) {
    flg1 = PETSC_FALSE;
    ierr = PetscOptionsGetBool(NULL,"-memory_view",&flg1,NULL);CHKERRQ(ierr);
  }
  if (flg1) {
    ierr = PetscMemorySetGetMaximumUsage();CHKERRQ(ierr);
  }
#endif

#if defined(PETSC_USE_LOG)
  ierr = PetscOptionsHasName(NULL,"-objects_dump",&PetscObjectsLog);CHKERRQ(ierr);
#endif

  /*
      Set the display variable for graphics
  */
  ierr = PetscSetDisplay();CHKERRQ(ierr);

  /*
      Print the PETSc version information
  */
  ierr = PetscOptionsHasName(NULL,"-v",&flg1);CHKERRQ(ierr);
  ierr = PetscOptionsHasName(NULL,"-version",&flg2);CHKERRQ(ierr);
  ierr = PetscOptionsHasName(NULL,"-help",&flg3);CHKERRQ(ierr);
  if (flg1 || flg2 || flg3) {

    /*
       Print "higher-level" package version message
    */
    if (PetscExternalVersionFunction) {
      ierr = (*PetscExternalVersionFunction)(comm);CHKERRQ(ierr);
    }

    ierr = PetscGetVersion(version,256);CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"--------------------------------------------\
------------------------------\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"%s\n",version);CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"%s",PETSC_AUTHOR_INFO);CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"See docs/changes/index.html for recent updates.\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"See docs/faq.html for problems.\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"See docs/manualpages/index.html for help. \n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"Libraries linked from %s\n",PETSC_LIB_DIR);CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"--------------------------------------------\
------------------------------\n");CHKERRQ(ierr);
  }

  /*
       Print "higher-level" package help message
  */
  if (flg3) {
    if (PetscExternalHelpFunction) {
      ierr = (*PetscExternalHelpFunction)(comm);CHKERRQ(ierr);
    }
  }

  /*
      Setup the error handling
  */
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-on_error_abort",&flg1,NULL);CHKERRQ(ierr);
  if (flg1) {
    ierr = MPI_Comm_set_errhandler(PETSC_COMM_WORLD,MPI_ERRORS_ARE_FATAL);CHKERRQ(ierr);
    ierr = PetscPushErrorHandler(PetscAbortErrorHandler,0);CHKERRQ(ierr);
  }
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-on_error_mpiabort",&flg1,NULL);CHKERRQ(ierr);
  if (flg1) { ierr = PetscPushErrorHandler(PetscMPIAbortErrorHandler,0);CHKERRQ(ierr);}
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-mpi_return_on_error",&flg1,NULL);CHKERRQ(ierr);
  if (flg1) {
    ierr = MPI_Comm_set_errhandler(comm,MPI_ERRORS_RETURN);CHKERRQ(ierr);
  }
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-no_signal_handler",&flg1,NULL);CHKERRQ(ierr);
  if (!flg1) {ierr = PetscPushSignalHandler(PetscSignalHandlerDefault,(void*)0);CHKERRQ(ierr);}
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-fp_trap",&flg1,NULL);CHKERRQ(ierr);
  if (flg1) {ierr = PetscSetFPTrap(PETSC_FP_TRAP_ON);CHKERRQ(ierr);}
  ierr = PetscOptionsGetInt(NULL,"-check_pointer_intensity",&intensity,&flag);CHKERRQ(ierr);
  if (flag) {ierr = PetscCheckPointerSetIntensity(intensity);CHKERRQ(ierr);}

  /*
      Setup debugger information
  */
  ierr = PetscSetDefaultDebugger();CHKERRQ(ierr);
  ierr = PetscOptionsGetString(NULL,"-on_error_attach_debugger",string,64,&flg1);CHKERRQ(ierr);
  if (flg1) {
    MPI_Errhandler err_handler;

    ierr = PetscSetDebuggerFromString(string);CHKERRQ(ierr);
    ierr = MPI_Comm_create_errhandler((MPI_Handler_function*)Petsc_MPI_DebuggerOnError,&err_handler);CHKERRQ(ierr);
    ierr = MPI_Comm_set_errhandler(comm,err_handler);CHKERRQ(ierr);
    ierr = PetscPushErrorHandler(PetscAttachDebuggerErrorHandler,0);CHKERRQ(ierr);
  }
  ierr = PetscOptionsGetString(NULL,"-debug_terminal",string,64,&flg1);CHKERRQ(ierr);
  if (flg1) { ierr = PetscSetDebugTerminal(string);CHKERRQ(ierr); }
  ierr = PetscOptionsGetString(NULL,"-start_in_debugger",string,64,&flg1);CHKERRQ(ierr);
  ierr = PetscOptionsGetString(NULL,"-stop_for_debugger",string,64,&flg2);CHKERRQ(ierr);
  if (flg1 || flg2) {
    PetscMPIInt    size;
    PetscInt       lsize,*nodes;
    MPI_Errhandler err_handler;
    /*
       we have to make sure that all processors have opened
       connections to all other processors, otherwise once the
       debugger has stated it is likely to receive a SIGUSR1
       and kill the program.
    */
    ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
    if (size > 2) {
      PetscMPIInt dummy = 0;
      MPI_Status  status;
      for (i=0; i<size; i++) {
        if (rank != i) {
          ierr = MPI_Send(&dummy,1,MPI_INT,i,109,PETSC_COMM_WORLD);CHKERRQ(ierr);
        }
      }
      for (i=0; i<size; i++) {
        if (rank != i) {
          ierr = MPI_Recv(&dummy,1,MPI_INT,i,109,PETSC_COMM_WORLD,&status);CHKERRQ(ierr);
        }
      }
    }
    /* check if this processor node should be in debugger */
    ierr  = PetscMalloc1(size,&nodes);CHKERRQ(ierr);
    lsize = size;
    ierr  = PetscOptionsGetIntArray(NULL,"-debugger_nodes",nodes,&lsize,&flag);CHKERRQ(ierr);
    if (flag) {
      for (i=0; i<lsize; i++) {
        if (nodes[i] == rank) { flag = PETSC_FALSE; break; }
      }
    }
    if (!flag) {
      ierr = PetscSetDebuggerFromString(string);CHKERRQ(ierr);
      ierr = PetscPushErrorHandler(PetscAbortErrorHandler,0);CHKERRQ(ierr);
      if (flg1) {
        ierr = PetscAttachDebugger();CHKERRQ(ierr);
      } else {
        ierr = PetscStopForDebugger();CHKERRQ(ierr);
      }
      ierr = MPI_Comm_create_errhandler((MPI_Handler_function*)Petsc_MPI_AbortOnError,&err_handler);CHKERRQ(ierr);
      ierr = MPI_Comm_set_errhandler(comm,err_handler);CHKERRQ(ierr);
    }
    ierr = PetscFree(nodes);CHKERRQ(ierr);
  }

  ierr = PetscOptionsGetString(NULL,"-on_error_emacs",emacsmachinename,128,&flg1);CHKERRQ(ierr);
  if (flg1 && !rank) {ierr = PetscPushErrorHandler(PetscEmacsClientErrorHandler,emacsmachinename);CHKERRQ(ierr);}

  /*
        Setup profiling and logging
  */
#if defined(PETSC_USE_INFO)
  {
    char logname[PETSC_MAX_PATH_LEN]; logname[0] = 0;
    ierr = PetscOptionsGetString(NULL,"-info",logname,250,&flg1);CHKERRQ(ierr);
    if (flg1 && logname[0]) {
      ierr = PetscInfoAllow(PETSC_TRUE,logname);CHKERRQ(ierr);
    } else if (flg1) {
      ierr = PetscInfoAllow(PETSC_TRUE,NULL);CHKERRQ(ierr);
    }
  }
#endif
#if defined(PETSC_USE_LOG)
  mname[0] = 0;
  ierr = PetscOptionsGetString(NULL,"-history",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr);
  if (flg1) {
    if (mname[0]) {
      ierr = PetscOpenHistoryFile(mname,&petsc_history);CHKERRQ(ierr);
    } else {
      ierr = PetscOpenHistoryFile(NULL,&petsc_history);CHKERRQ(ierr);
    }
  }
#if defined(PETSC_HAVE_MPE)
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsHasName(NULL,"-log_mpe",&flg1);CHKERRQ(ierr);
  if (flg1) {ierr = PetscLogMPEBegin();CHKERRQ(ierr);}
#endif
  flg1 = PETSC_FALSE;
  flg3 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-log_all",&flg1,NULL);CHKERRQ(ierr);
  ierr = PetscOptionsHasName(NULL,"-log_summary",&flg3);CHKERRQ(ierr);
  if (flg1)                      { ierr = PetscLogAllBegin();CHKERRQ(ierr); }
  else if (flg3)                 { ierr = PetscLogDefaultBegin();CHKERRQ(ierr);}

  ierr = PetscOptionsGetString(NULL,"-log_trace",mname,250,&flg1);CHKERRQ(ierr);
  if (flg1) {
    char name[PETSC_MAX_PATH_LEN],fname[PETSC_MAX_PATH_LEN];
    FILE *file;
    if (mname[0]) {
      sprintf(name,"%s.%d",mname,rank);
      ierr = PetscFixFilename(name,fname);CHKERRQ(ierr);
      file = fopen(fname,"w");
      if (!file) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_FILE_OPEN,"Unable to open trace file: %s",fname);
    } else file = PETSC_STDOUT;
    ierr = PetscLogTraceBegin(file);CHKERRQ(ierr);
  }

  ierr   = PetscOptionsGetViewer(PETSC_COMM_WORLD,NULL,"-log_view",NULL,&format,&flg4);CHKERRQ(ierr);
  if (flg4) {
    if (format == PETSC_VIEWER_ASCII_XML){
      ierr = PetscLogNestedBegin();CHKERRQ(ierr);
    } else {
      ierr = PetscLogDefaultBegin();CHKERRQ(ierr);
    }
  }
#endif

  ierr = PetscOptionsGetBool(NULL,"-saws_options",&PetscOptionsPublish,NULL);CHKERRQ(ierr);

#if defined(PETSC_HAVE_CUDA)
  ierr = PetscOptionsHasName(NULL,"-cuda_show_devices",&flg1);CHKERRQ(ierr);
  if (flg1) {
    struct cudaDeviceProp prop;
    int                   devCount;
    int                   device;
    cudaError_t           err = cudaSuccess;

    err = cudaGetDeviceCount(&devCount);
    if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaGetDeviceCount %s",cudaGetErrorString(err));
    for (device = 0; device < devCount; ++device) {
      err = cudaGetDeviceProperties(&prop, device);
      if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaGetDeviceProperties %s",cudaGetErrorString(err));
      ierr = PetscPrintf(PETSC_COMM_WORLD, "CUDA device %d: %s\n", device, prop.name);CHKERRQ(ierr);
    }
  }
  {
    int size;
    ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
    if (size>1) {
      int         devCount, device, rank;
      cudaError_t err = cudaSuccess;

      /* check to see if we force multiple ranks to hit the same GPU */
      ierr = PetscOptionsGetInt(NULL,"-cuda_set_device", &device, &flg1);CHKERRQ(ierr);
      if (flg1) {
        err = cudaSetDevice(device);
        if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDevice %s",cudaGetErrorString(err));
      } else {
        /* we're not using the same GPU on multiple MPI threads. So try to allocated different   GPUs to different processes */

        /* First get the device count */
        err   = cudaGetDeviceCount(&devCount);
        if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaGetDeviceCount %s",cudaGetErrorString(err));

        /* next determine the rank and then set the device via a mod */
        ierr   = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);
        device = rank % devCount;
        err    = cudaSetDevice(device);
        if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDevice %s",cudaGetErrorString(err));
      }

      /* set the device flags so that it can map host memory ... do NOT throw exception on err!=cudaSuccess
       multiple devices may try to set the flags on the same device. So long as one of them succeeds, things
       are ok. */
      err = cudaSetDeviceFlags(cudaDeviceMapHost);
      if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDeviceFlags %s",cudaGetErrorString(err));
    } else {
      int         device;
      cudaError_t err = cudaSuccess;

      /* the code below works for serial GPU simulations */
      ierr = PetscOptionsGetInt(NULL,"-cuda_set_device", &device, &flg1);CHKERRQ(ierr);
      if (flg1) {
        err = cudaSetDevice(device);
        if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDevice %s",cudaGetErrorString(err));
      }

      /* set the device flags so that it can map host memory ... here, we error check. */
      err = cudaSetDeviceFlags(cudaDeviceMapHost);
      if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDeviceFlags %s",cudaGetErrorString(err));
    }
  }
#endif


  /*
       Print basic help message
  */
  ierr = PetscOptionsHasName(NULL,"-help",&flg1);CHKERRQ(ierr);
  if (flg1) {
    ierr = (*PetscHelpPrintf)(comm,"Options for all PETSc programs:\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -help: prints help method for each option\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -on_error_abort: cause an abort when an error is detected. Useful \n ");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"       only when run in the debugger\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -on_error_attach_debugger [gdb,dbx,xxgdb,ups,noxterm]\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"       start the debugger in new xterm\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"       unless noxterm is given\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -start_in_debugger [gdb,dbx,xxgdb,ups,noxterm]\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"       start all processes in the debugger\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -on_error_emacs <machinename>\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"    emacs jumps to error file\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -debugger_nodes [n1,n2,..] Nodes to start in debugger\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -debugger_pause [m] : delay (in seconds) to attach debugger\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -stop_for_debugger : prints message on how to attach debugger manually\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"                      waits the delay for you to attach\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -display display: Location where X window graphics and debuggers are displayed\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -no_signal_handler: do not trap error signals\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -mpi_return_on_error: MPI returns error code, rather than abort on internal error\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -fp_trap: stop on floating point exceptions\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"           note on IBM RS6000 this slows run greatly\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc_dump <optional filename>: dump list of unfreed memory at conclusion\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc: use our error checking malloc\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc no: don't use error checking malloc\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc_info: prints total memory usage\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc_log: keeps log of all memory allocations\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc_debug: enables extended checking for memory corruption\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -options_table: dump list of options inputted\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -options_left: dump list of unused options\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -options_left no: don't dump list of unused options\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -tmp tmpdir: alternative /tmp directory\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -shared_tmp: tmp directory is shared by all processors\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -not_shared_tmp: each processor has separate tmp directory\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -memory_view: print memory usage at end of run\n");CHKERRQ(ierr);
#if defined(PETSC_USE_LOG)
    ierr = (*PetscHelpPrintf)(comm," -get_total_flops: total flops over all processors\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -log[_summary _summary_python]: logging objects and events\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -log_trace [filename]: prints trace of all PETSc calls\n");CHKERRQ(ierr);
#if defined(PETSC_HAVE_MPE)
    ierr = (*PetscHelpPrintf)(comm," -log_mpe: Also create logfile viewable through Jumpshot\n");CHKERRQ(ierr);
#endif
    ierr = (*PetscHelpPrintf)(comm," -info <optional filename>: print informative messages about the calculations\n");CHKERRQ(ierr);
#endif
    ierr = (*PetscHelpPrintf)(comm," -v: prints PETSc version number and release date\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -options_file <file>: reads options from file\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -petsc_sleep n: sleeps n seconds before running program\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"-----------------------------------------------\n");CHKERRQ(ierr);
  }

#if defined(PETSC_HAVE_POPEN)
  {
  char machine[128];
  ierr = PetscOptionsGetString(NULL,"-popen_machine",machine,128,&flg1);CHKERRQ(ierr);
  if (flg1) {
    ierr = PetscPOpenSetMachine(machine);CHKERRQ(ierr);
  }
  }
#endif

  ierr = PetscOptionsGetReal(NULL,"-petsc_sleep",&si,&flg1);CHKERRQ(ierr);
  if (flg1) {
    ierr = PetscSleep(si);CHKERRQ(ierr);
  }

  ierr = PetscOptionsGetString(NULL,"-info_exclude",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr);
  if (flg1) {
    ierr = PetscStrstr(mname,"null",&f);CHKERRQ(ierr);
    if (f) {
      ierr = PetscInfoDeactivateClass(0);CHKERRQ(ierr);
    }
  }

#if defined(PETSC_HAVE_CUSP) || defined(PETSC_HAVE_VIENNACL)
  ierr = PetscOptionsHasName(NULL,"-log_summary",&flg3);CHKERRQ(ierr);
  if (!flg3) {
  ierr = PetscOptionsHasName(NULL,"-log_view",&flg3);CHKERRQ(ierr);
  }
#endif
#if defined(PETSC_HAVE_CUSP)
  ierr = PetscOptionsGetBool(NULL,"-cusp_synchronize",&flg3,NULL);CHKERRQ(ierr);
  PetscCUSPSynchronize = flg3;
#elif defined(PETSC_HAVE_VIENNACL)
  ierr = PetscOptionsGetBool(NULL,"-viennacl_synchronize",&flg3,NULL);CHKERRQ(ierr);
  PetscViennaCLSynchronize = flg3;
#endif
  PetscFunctionReturn(0);
}
Example #12
0
int main(int argc, char** argv)
{
	bool srcbin = 0;
	bool invbk = 0;
	if(argc < 3){
		printf("Not enough args!\narg1: target image\narg2: source image\narg3: do source image adaptive threshold or not\narg4: invert back ground or not\n");
		getchar();
		return 1;
	}
	if(argc >= 4){
		if(!strcmp(argv[3], "1"))
			srcbin = 1;
	}
	if(argc >= 5){
		if(!strcmp(argv[4], "1"))
			invbk = 1;
	}

	IplImage* srcimg= 0, *srcimgb= 0, *srcimgb2 = 0, *bimg = 0, *b2img = 0,*bugimg = 0, *alg2dst = 0;
	srcimg= cvLoadImage(argv[2], 1);
	if (!srcimg)
	{
		printf("src img %s load failed!\n", argv[2]);
		getchar();
		return 1;
	}
	
	//choosing the parameters for our ccl
	int bn = 8; //how many partitions
	int nwidth = 512;
	if(srcimg->width > 512){
		nwidth = 1024;
		bn = 6;
	}
	if(srcimg->width > 1024){
		nwidth = 2048;
		bn = 3;
	}
	if(srcimg->width > 2048){
		printf("warning, image too wide, max support 2048. image is truncated.\n");
		getchar();
		return 1;
	}
	
	//start selection gpu devices
	int devCount;
	int smCnt = 0;
    cudaGetDeviceCount(&devCount);
 
    // Iterate through devices
	int devChosen = 0;
    for (int i = 0; i < devCount; ++i)
    {
        cudaDeviceProp devProp;
        cudaGetDeviceProperties(&devProp, i);
		if(devProp.major >= 2){//only one device supported
			smCnt = max(smCnt, devProp.multiProcessorCount);
			if(devProp.multiProcessorCount == smCnt)
				devChosen = i;
		}
    }
	
	if(smCnt == 0){
		//our ccl require CUDA cap 2.0 or above, but the Ostava's ccl can be run on any CUDA gpu
		printf("Error, no device with cap 2.x found. Only cpu alg will be run.\n");
		getchar();
		return 1;
	}
	
	if(smCnt != 0){
		cudaSetDevice(devChosen);
		bn = bn * smCnt;
	}

	int nheight = (cvGetSize(srcimg).height-2) / (2*bn);
	if((nheight*2*bn+2) < cvGetSize(srcimg).height)
		nheight++;
	nheight = nheight*2*bn+2;

	if(smCnt != 0)
		printf("gpu ccl for image width 512, 1024, 2048.\nchoosing device %d, width %d, height %d, blocks %d\n", devChosen, nwidth, nheight, bn);

	srcimgb= cvCreateImage(cvSize(nwidth, cvGetSize(srcimg).height),IPL_DEPTH_8U,1);
	srcimgb2= cvCreateImage(cvSize(nwidth, cvGetSize(srcimg).height),IPL_DEPTH_8U,1);
	cvSetImageROI(srcimg, cvRect(0, 0, min(cvGetSize(srcimg).width, nwidth), cvGetSize(srcimg).height));
	cvSetImageROI(srcimgb2, cvRect(0, 0, min(cvGetSize(srcimg).width, nwidth), cvGetSize(srcimg).height));
	cvSet(srcimgb2, cvScalar(0,0,0));
	cvCvtColor(srcimg, srcimgb2, CV_BGRA2GRAY);
	cvResetImageROI(srcimgb2);
	cvReleaseImage(&srcimg);
	if(srcbin)
		cvAdaptiveThreshold(srcimgb2, srcimgb, 1.0, CV_ADAPTIVE_THRESH_MEAN_C, invbk ? CV_THRESH_BINARY_INV :  CV_THRESH_BINARY);
	else
		cvThreshold(srcimgb2, srcimgb, 0.0, 1.0, invbk ? CV_THRESH_BINARY_INV :  CV_THRESH_BINARY);
	boundCheck(srcimgb);

	cvScale(srcimgb, srcimgb2, 255);
	//the source binary image to be labeled is saved as bsrc.bmp
	cvSaveImage("bsrc.bmp", srcimgb2);
	cvSet(srcimgb2, cvScalar(0,0,0));
	
	float elapsedMilliSeconds1;
	{//begin cpu labeling algorithm, the SBLA proposed by Zhao
		LABELDATATYPE *data=(LABELDATATYPE *)malloc(srcimgb->width * srcimgb->height * sizeof(LABELDATATYPE));
	
		for(int j = 0; j<srcimgb->height; j++)
			for(int i = 0; i<srcimgb->width; i++)
				data[i + j*srcimgb->width] = (srcimgb->imageData[i + j*srcimgb->widthStep]) ? 1 : 0;

		int iNumLabels;
		CPerformanceCounter perf;
		perf.Start();
    	iNumLabels = LabelSBLA(data, srcimgb->width, srcimgb->height);
		elapsedMilliSeconds1 = (float)perf.GetElapsedMilliSeconds();
		printf("cpu SBLA used %f ms, total labels %u\n", elapsedMilliSeconds1, iNumLabels);
		free(data);
	}
	
	IplImage *src2(0),*dst2(0);
	int iNumLabels;
	float elapsedMilliSeconds2;
	{//begin cpu labeling algorithm, the BBDT proposed by C. Grana, D. Borghesani, R. Cucchiara
		CPerformanceCounter perf;
		src2 = cvCreateImage( cvGetSize(srcimgb), IPL_DEPTH_8U, 1 );
		cvCopyImage(srcimgb,src2);
		dst2 = cvCreateImage( cvGetSize(srcimgb), IPL_DEPTH_32S, 1 );
		perf.Start();
		cvLabelingImageLab(src2, dst2, 1, &iNumLabels);
		elapsedMilliSeconds2 = (float)perf.GetElapsedMilliSeconds();
		printf("cpu BBDT used %f ms, total labels %u\n", elapsedMilliSeconds2, iNumLabels);
		cvSaveImage("bbdt.bmp", dst2);
//		cvReleaseImage(&src2);
//		cvReleaseImage(&dst2);
	}

	if(smCnt != 0){

	bugimg = cvCreateImage(cvSize(nwidth, 9*bn),IPL_DEPTH_8U,1);
	bimg = cvCreateImage(cvSize(nwidth, 2*bn),IPL_DEPTH_8U,1);
	b2img = cvCreateImage(cvSize(nwidth, 2*bn),IPL_DEPTH_8U,1);

//    cvNamedWindow("src",CV_WINDOW_AUTOSIZE);
//	cvShowImage("src",srcimg);

	//prepare buffers for our gpu algorithm
	CudaBuffer srcBuf, dstBuf, dstBuf2, bBuf, b2Buf, errBuf, glabel;
	srcBuf.Create2D(nwidth, nheight);		//the binary image to be processed
	dstBuf.Create2D(nwidth, (nheight-2)/2); //the label result, only about 1/4 the size of source image contains the final labels
	dstBuf2.Create2D(nwidth,(nheight-2)/2);	//a copy of the pass1 temp result, for debug purpose
	glabel.Create2D(4, 1);					//a int size global buffer for unique final label
	errBuf.Create2D(nwidth, 9*bn);			//a buffer for debug info
	bBuf.Create2D(nwidth, 2 * bn);			//the intersection info used by pass2
	b2Buf.Create2D(nwidth, 2 * bn);			//a copy of bBuf for debug purpose

	srcBuf.SetZeroData();
	srcBuf.CopyFrom(srcimgb->imageData, srcimgb->widthStep, nwidth, cvGetSize(srcimgb).height);

	float elapsedTimeInMs = 0.0f;
    
	//-------------------gpu part----------------------------
    cudaEvent_t start, stop;
    cutilSafeCall  ( cudaEventCreate( &start ) );
    cutilSafeCall  ( cudaEventCreate( &stop ) );
    cutilSafeCall( cudaEventRecord( start, 0 ) );  
	
	if(nwidth == 512)
		label_512(&dstBuf, &dstBuf2, &srcBuf, &bBuf, &b2Buf, &glabel, nheight, bn, &errBuf);
	else if(nwidth == 1024)
		label_1024(&dstBuf, &dstBuf2, &srcBuf, &bBuf, &b2Buf, &glabel, nheight, bn, &errBuf);
	else if(nwidth == 2048)
		label_2048(&dstBuf, &dstBuf2, &srcBuf, &bBuf, &b2Buf, &glabel, nheight, bn, &errBuf);

    cutilSafeCall( cudaEventRecord( stop, 0 ) );
//	cutilCheckMsg("kernel launch failure");
	cudaEventSynchronize(stop);
	cutilSafeCall( cudaEventElapsedTime( &elapsedTimeInMs, start, stop ) );
	uint tlabel = 0;

	cudaMemcpy(&tlabel, glabel.GetData(), 4, cudaMemcpyDeviceToHost);
	printf("gpu alg 1 used %f ms, total labels %u\n", elapsedTimeInMs, tlabel);

	dstBuf.CopyToHost(srcimgb->imageData, srcimgb->widthStep, nwidth, (nheight-2)/2);
	dstBuf2.CopyToHost(srcimgb2->imageData, srcimgb->widthStep, nwidth, (nheight-2)/2);
	errBuf.CopyToHost(bugimg->imageData, bugimg->widthStep, nwidth, 9*bn);
	bBuf.CopyToHost(bimg->imageData, bimg->widthStep, nwidth, 2*bn);
	b2Buf.CopyToHost(b2img->imageData, bimg->widthStep, nwidth, 2*bn);

//	cvNamedWindow("gpu",CV_WINDOW_AUTOSIZE);
//	cvShowImage("gpu",srcimgb);
	cvSaveImage(argv[1], srcimgb);
	cvSaveImage("gpu2.bmp", srcimgb2);	//the final labels of our algorithm
	cvSaveImage("bug.bmp", bugimg);
	cvSaveImage("b.bmp", bimg);
	cvSaveImage("b2.bmp", b2img);
	
	//now start the gpu ccl implemented by Ostava
	alg2dst= cvCreateImage(cvSize(nwidth*4, cvGetSize(srcimgb).height),IPL_DEPTH_8U,1);
	CCLBase* m_ccl;
	m_ccl = new CCL();	

	m_ccl->FindRegions(nwidth, cvGetSize(srcimgb).height, &srcBuf);
	m_ccl->GetConnectedRegionsBuffer()->CopyToHost(alg2dst->imageData, alg2dst->widthStep, nwidth*4, cvGetSize(srcimgb).height);
	delete m_ccl;
	cvSaveImage("alg2.bmp", alg2dst);

	cvReleaseImage(&bugimg);
	cvReleaseImage(&bimg);
	cvReleaseImage(&b2img);
	cvReleaseImage(&alg2dst);
//	}
	//cvWaitKey(0);
	
	//now start cross compare label results of our ccl and the BBDT, to check the correctness
//	if(smCnt != 0){
		ushort *gpures, *cpures;
		uint sz = nwidth * (cvGetSize(srcimgb).height/2);
		gpures = (ushort*)malloc(sz);
		cpures = (ushort*)malloc(sz);
		dstBuf.CopyToHost(gpures, nwidth, nwidth, (cvGetSize(srcimgb).height/2));
		
		//first, reduce cpu labels from one label for each pixel to one label for a 2x2 block, assuming 8-connectivity
		for(int j = 0; j < (cvGetSize(srcimgb).height/2); j++)
			for(int i = 0; i < (nwidth/2); i++){
				uint* cpup;
				ushort res = LBMAX;
				uint y = j*2, x = i*2;
				cpup = (uint*)(dst2->imageData + y*dst2->widthStep);
//				if(y < cvGetSize(srcimgb).height){
					if(cpup[x] != 0)
						res = cpup[x]-1;
					if(cpup[x+1] != 0)
						res = cpup[x+1]-1;
//				}
				y++;
				cpup = (uint*)(dst2->imageData + y*dst2->widthStep);
//				if(y < cvGetSize(srcimgb).height){
					if(cpup[x] != 0)
						res = cpup[x]-1;
					if(cpup[x+1] != 0)
						res = cpup[x+1]-1;
//				}
				cpures[i + j*(nwidth/2)] = res;
			}
		
		//our algo use unsigned short to represent a label, the first label starts a 0, and maximun labels is LBMAX
		if(iNumLabels > LBMAX)
			printf("too much cc, compare abort.\n");
		else{
			//create a error
			//cpures[5] = 12;
			//cpures[15] = 18;
			printf("Checking correctness of gpu alg1\nChecking gpu ref by cpu.\n");
			checkLabels(cpures, gpures, nwidth/2, cvGetSize(srcimgb).height/2, iNumLabels);

			printf("Checking cpu ref by gpu.\n");
			checkLabels(gpures, cpures, nwidth/2, cvGetSize(srcimgb).height/2, tlabel);
		}

		free(gpures);
		free(cpures);
		printf("speedup is %f, %f, %f\n", gpu2time/elapsedTimeInMs, elapsedMilliSeconds1/elapsedTimeInMs, elapsedMilliSeconds2/elapsedTimeInMs);
	}

	cvReleaseImage(&srcimgb);
	cvReleaseImage(&srcimgb2);
	cvReleaseImage(&dst2);
	cvReleaseImage(&src2);

    cutilSafeCall( cudaThreadExit() );
	return 0;

}
Example #13
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv) 
{
    pArgc = &argc;
    pArgv = argv;

    /*   shrQAStart(argc, argv);

    shrSetLogFileName ("deviceQuery.txt");


 */
    shrLog("%s Starting...\n\n", argv[0]);
    shrLog(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");

    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
    if (error_id != cudaSuccess) {
        shrLog( "cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id) );
        return -1;
    }
    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
        shrLog("There is no device supporting CUDA\n");
    else
        shrLog("Found %d CUDA Capable device(s)\n", deviceCount);

    int dev, driverVersion = 0, runtimeVersion = 0;
    for (dev = 0; dev < deviceCount; ++dev) {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        shrLog("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

#if CUDART_VERSION >= 2020
        // Console log
        cudaDriverGetVersion(&driverVersion);
        cudaRuntimeGetVersion(&runtimeVersion);
        shrLog("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
#endif
        shrLog("  CUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);

        char msg[256];
        sprintf(msg, "  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n",
                (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
        shrLog(msg);
#if CUDART_VERSION >= 2000
        shrLog("  (%2d) Multiprocessors x (%2d) CUDA Cores/MP:     %d CUDA Cores\n",
               deviceProp.multiProcessorCount,
               ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
               ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
#endif
        shrLog("  GPU Clock Speed:                               %.2f GHz\n", deviceProp.clockRate * 1e-6f);
#if CUDART_VERSION >= 4000
        // This is not available in the CUDA Runtime API, so we make the necessary calls the driver API to support this for output
        int memoryClock;
        getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev );
        shrLog("  Memory Clock rate:                             %.2f Mhz\n", memoryClock * 1e-3f);
        int memBusWidth;
        getCudaAttribute<int>( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev );
        shrLog("  Memory Bus Width:                              %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev );
        if (L2CacheSize) {
            shrLog("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
        }

        shrLog("  Max Texture Dimension Size (x,y,z)             1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n",
               deviceProp.maxTexture1D, deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
                deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
        shrLog("  Max Layered Texture Size (dim) x layers        1D=(%d) x %d, 2D=(%d,%d) x %d\n",
               deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1],
                deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);
#endif
        shrLog("  Total amount of constant memory:               %u bytes\n", deviceProp.totalConstMem);
        shrLog("  Total amount of shared memory per block:       %u bytes\n", deviceProp.sharedMemPerBlock);
        shrLog("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        shrLog("  Warp size:                                     %d\n", deviceProp.warpSize);
        shrLog("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
        shrLog("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",
               deviceProp.maxThreadsDim[0],
                deviceProp.maxThreadsDim[1],
                deviceProp.maxThreadsDim[2]);
        shrLog("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",
               deviceProp.maxGridSize[0],
                deviceProp.maxGridSize[1],
                deviceProp.maxGridSize[2]);
        shrLog("  Maximum memory pitch:                          %u bytes\n", deviceProp.memPitch);
        shrLog("  Texture alignment:                             %u bytes\n", deviceProp.textureAlignment);

#if CUDART_VERSION >= 4000
        shrLog("  Concurrent copy and execution:                 %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
#else
        shrLog("  Concurrent copy and execution:                 %s\n", deviceProp.deviceOverlap ? "Yes" : "No");
#endif

#if CUDART_VERSION >= 2020
        shrLog("  Run time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        shrLog("  Integrated GPU sharing Host Memory:            %s\n", deviceProp.integrated ? "Yes" : "No");
        shrLog("  Support host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
#endif
#if CUDART_VERSION >= 3000
        shrLog("  Concurrent kernel execution:                   %s\n", deviceProp.concurrentKernels ? "Yes" : "No");
        shrLog("  Alignment requirement for Surfaces:            %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
#endif
#if CUDART_VERSION >= 3010
        shrLog("  Device has ECC support enabled:                %s\n", deviceProp.ECCEnabled ? "Yes" : "No");
#endif
#if CUDART_VERSION >= 3020
        shrLog("  Device is using TCC driver mode:               %s\n", deviceProp.tccDriver ? "Yes" : "No");
#endif
#if CUDART_VERSION >= 4000
        shrLog("  Device supports Unified Addressing (UVA):      %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
        shrLog("  Device PCI Bus ID / PCI location ID:           %d / %d\n", deviceProp.pciBusID, deviceProp.pciDeviceID );
#endif

#if CUDART_VERSION >= 2020
        const char *sComputeMode[] = {
            "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
            "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
            "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
            "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
            "Unknown",
            NULL
        };
        shrLog("  Compute Mode:\n");
        shrLog("     < %s >\n", sComputeMode[deviceProp.computeMode]);
#endif
    }

    // csv masterlog info
    // *****************************
    // exe and CUDA driver name
    shrLog("\n");
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    char cTemp[10];
    
    // driver version
    sProfileString += ", CUDA Driver Version = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#endif
    sProfileString +=  cTemp;
    
    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#endif
    sProfileString +=  cTemp;
    
    // Device count
    sProfileString += ", NumDevs = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d", deviceCount);
#else
    sprintf(cTemp, "%d", deviceCount);
#endif
    sProfileString += cTemp;
    
    // First 2 device names, if any
    for (dev = 0; dev < ((deviceCount > 2) ? 2 : deviceCount); ++dev)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        sProfileString += ", Device = ";
        sProfileString += deviceProp.name;
    }
    sProfileString += "\n";
    //shrLogEx(LOGBOTH | MASTER, 0, sProfileString.c_str());

    std::cout << sProfileString.c_str() << std::endl;
    std::cout << "Press <ENTER>" << std::endl;

    //
    getchar();

    runtimeTest();

    getchar();
    // finish
    return 0;
}
Example #14
0
int getDeviceProps (int *deviceCount, char **deviceProps) {
    // Cuda Runtime interface
    void *cudaRT = NULL;
    cudaGetDeviceCount_f cudaGetDeviceCount = NULL;
    cudaGetDeviceProperties_f cudaGetDeviceProperties = NULL;

    cudaError_t cuErr;
    int ndevices; // Number of devices reported by Cuda runtime
    int undevices = 0; // Number of devices that are unusable by simEngine
    unsigned int deviceid;
    unsigned int sort;
    simCudaDevice *devices;

    cudaRT = dlopen(CUDART_LIBRARY_NAME, RTLD_NOW);
    if(!cudaRT) {
        char full_library_name[PATH_MAX];
        sprintf(full_library_name, "/usr/local/cuda/lib64/%s", CUDART_LIBRARY_NAME);
        cudaRT = dlopen(full_library_name, RTLD_NOW);
        if(!cudaRT) {
            sprintf(full_library_name, "/usr/local/cuda/lib/%s", CUDART_LIBRARY_NAME);
            cudaRT = dlopen(full_library_name, RTLD_NOW);
            if(!cudaRT) {
                snprintf(error_message, BUFFER_LENGTH,
                         "Failed to load CUDA runtime environment from %s.\n"
                         "\tIs the CUDA runtime environment installed in the default location\n"
                         "\tOR is LD_LIBRARY_PATH environment variable set to include CUDA libraries?",
                         CUDART_LIBRARY_NAME);
                error_message[BUFFER_LENGTH - 1] = '\0';
                return DeviceProps_NoCudaRuntime;
            }
        }
    }

    cudaGetDeviceCount = (cudaGetDeviceCount_f)dlsym(cudaRT, "cudaGetDeviceCount");
    cudaGetDeviceProperties = (cudaGetDeviceProperties_f)dlsym(cudaRT, "cudaGetDeviceProperties");

    if(!cudaGetDeviceCount || !cudaGetDeviceProperties) {
        snprintf(error_message, BUFFER_LENGTH,
                 "Failed to load CUDA functions from %s.\n"
                 "\tThe CUDA library found is incompatible with simEngine.",
                 CUDART_LIBRARY_NAME);
        error_message[BUFFER_LENGTH - 1] = '\0';
        return DeviceProps_NoCudaRuntime;
    }

    if (cudaSuccess != cudaGetDeviceCount(&ndevices)) {
        snprintf(error_message, BUFFER_LENGTH,
                 "Error obtaining device count.\n"
                 "\tIs there a CUDA capable GPU available on this computer?");
        error_message[BUFFER_LENGTH - 1] = '\0';
        return DeviceProps_UnknownError;
    }

    if (0 == ndevices) {
        snprintf(error_message, BUFFER_LENGTH,
                 "No suitable devices found.\n"
                 "\tIs your CUDA driver installed, and have you rebooted since installation?");
        error_message[BUFFER_LENGTH - 1] = '\0';
        return DeviceProps_NoDevices;
    }

    devices = (simCudaDevice *)malloc(sizeof(simCudaDevice) * ndevices);

    // Retrieve the properties for all Cuda devices
    for (deviceid = 0; deviceid < ndevices; ++deviceid) {
        if (cudaSuccess != cudaGetDeviceProperties(&devices[deviceid-undevices].props, deviceid)) {
            snprintf(error_message, BUFFER_LENGTH,
                     "Error obtaining properties for device %d.\n"
                     "\tThe CUDA library found is incompatible with simEngine.",
                     deviceid);
            error_message[BUFFER_LENGTH - 1] = '\0';
            free(devices);
            return DeviceProps_UnknownError;
        }
        // Filter out emulation devices
        if(9999 == devices[deviceid-undevices].props.major) {
            undevices += 1;
        }
        // Track GFLOPs of real devices
        else {
            devices[deviceid-undevices].gflops = devices[deviceid-undevices].props.multiProcessorCount * devices[deviceid-undevices].props.clockRate;
            devices[deviceid-undevices].unsorted = 1;
        }
    }

    // Subtract emulation devices from device count
    *deviceCount = ndevices - undevices;
    if (0 == *deviceCount) {
        snprintf(error_message, BUFFER_LENGTH,
                 "Only emulation device found.\n"
                 "\tDo you have a CUDA device?\n"
                 "\tIs the CUDA driver installed?\n"
                 "\tHave you rebooted after installing the driver?\n"
                 "\tDo you have device permissions set to allow CUDA computation?");
        error_message[BUFFER_LENGTH - 1] = '\0';
        free(devices);
        return DeviceProps_EmulationOnly;
    }



    // Sort the useable devices by max GFLOPs
    char *write = props_buffer;
    for(sort = 0; sort<(ndevices - undevices) && sort<MAX_DEVICES; ++sort) {
        int max_gflops = 0;
        int max_gflops_dev = 0;
        int written = 0;
        for(deviceid = 0; deviceid<(ndevices - undevices); ++deviceid) {
            if(devices[deviceid].unsorted && devices[deviceid].gflops > max_gflops) {
                max_gflops = devices[deviceid].gflops;
                max_gflops_dev = deviceid;
            }
        }
        // Print one device per line with properties colon separated
        written = snprintf(write, BUFFER_LENGTH,
                           "%d:%s:%zd:%zd:%d:%d:%zd:%d:%d,%d,%d:%d,%d,%d:%zd:%d:%d:%d:%zd:%d:%d:%d:%d:%d:%d",
                           max_gflops_dev,
                           devices[max_gflops_dev].props.name,
                           // Switch to kB to not overflow an int
                           devices[max_gflops_dev].props.totalGlobalMem>>10,
                           devices[max_gflops_dev].props.sharedMemPerBlock,
                           devices[max_gflops_dev].props.regsPerBlock,
                           devices[max_gflops_dev].props.warpSize,
                           devices[max_gflops_dev].props.memPitch,
                           devices[max_gflops_dev].props.maxThreadsPerBlock,
                           devices[max_gflops_dev].props.maxThreadsDim[0],
                           devices[max_gflops_dev].props.maxThreadsDim[1],
                           devices[max_gflops_dev].props.maxThreadsDim[2],
                           devices[max_gflops_dev].props.maxGridSize[0],
                           devices[max_gflops_dev].props.maxGridSize[1],
                           devices[max_gflops_dev].props.maxGridSize[2],
                           devices[max_gflops_dev].props.totalConstMem,
                           devices[max_gflops_dev].props.major,
                           devices[max_gflops_dev].props.minor,
                           devices[max_gflops_dev].props.clockRate,
                           devices[max_gflops_dev].props.textureAlignment,
                           devices[max_gflops_dev].props.deviceOverlap,
                           devices[max_gflops_dev].props.multiProcessorCount,
                           devices[max_gflops_dev].props.kernelExecTimeoutEnabled,
                           devices[max_gflops_dev].props.integrated,
                           devices[max_gflops_dev].props.canMapHostMemory,
                           devices[max_gflops_dev].props.computeMode
                          );
        write += 1 + written;
        devices[max_gflops_dev].unsorted = 0;
    }

    *deviceProps = props_buffer;

    free(devices);
    error_message[0] = '\0';
    return DeviceProps_Success;
}
Example #15
0
int main(int argc, char* atgv[])
{
    int ndevices = 0;
    cudaError_t cuda_status = cudaGetDeviceCount(&ndevices);
    if (cuda_status != cudaSuccess)
    {
        fprintf(stderr, "Cannot get the cuda device count, status = %d: %s\n",
                cuda_status, cudaGetErrorString(cuda_status));
        return cuda_status;
    }

    // Return if no cuda devices present.
    printf("%d CUDA device(s) found\n", ndevices);
    if (!ndevices) return 0;

    // Create input data. Each device will have an equal
    // piece of data.
    size_t np = nx * ny, size = np * sizeof(float);
    float* data = (float*)malloc(size * 2);
    float *input = data, *output = data + np;
    float invdrandmax = 1.0 / RAND_MAX;
    for (size_t i = 0; i < np; i++)
        input[i] = rand() * invdrandmax;

    struct time_t start, finish;
    get_time(&start);

    // Get control result on CPU (to compare with results on devices).
    pattern2d_cpu(1, nx, 1, 1, ny, 1, input, output, ndevices);

    get_time(&finish);

    printf("CPU time = %f sec\n", get_time_diff(&start, &finish));

    // Create config structures to store device-specific
    // values.
    config_t* configs = (config_t*)malloc(
                            sizeof(config_t) * ndevices);

    // Initialize CUDA devices.
    for (int idevice = 0; idevice < ndevices; idevice++)
    {
        config_t* config = configs + idevice;

        // TODO: Set curent CUDA device to idevice.
        cudaError_t cuda_status = cudaSetDevice(idevice);
        if (cuda_status != cudaSuccess) {
            fprintf(stderr, "Cannot get the cuda device count, status = %d: %s\n",
                    cuda_status, cudaGetErrorString(cuda_status));
            return cuda_status;
        }

        // Create device arrays for input and output data.
        cuda_status = cudaMalloc((void**)&config->in_dev, size);
        if (cuda_status != cudaSuccess)
        {
            fprintf(stderr, "Cannot allocate CUDA input buffer on device %d, status = %d: %s\n",
                    idevice, cuda_status, cudaGetErrorString(cuda_status));
            return cuda_status;
        }
        cuda_status = cudaMalloc((void**)&config->out_dev, size);
        if (cuda_status != cudaSuccess)
        {
            fprintf(stderr, "Cannot allocate CUDA output buffer on device %d, status = %d: %s\n",
                    idevice, cuda_status, cudaGetErrorString(cuda_status));
            return cuda_status;
        }

        // Copy input data to device buffer.
        cuda_status = cudaMemcpy(config->in_dev, input, size,
                                 cudaMemcpyHostToDevice);
        if (cuda_status != cudaSuccess)
        {
            fprintf(stderr, "Cannot copy input data to CUDA buffer on device %d, status = %d: %s\n",
                    idevice, cuda_status, cudaGetErrorString(cuda_status));
            return cuda_status;
        }

        printf("Device %d initialized\n", idevice);
    }

    // Start execution of kernels. One kernel
    // is executed on each device in parallel.
    for (int idevice = 0; idevice < ndevices; idevice++)
    {
        config_t* config = configs + idevice;

        // TODO: Set curent CUDA device to idevice.
        cudaError_t cuda_status = cudaSetDevice(idevice);

        get_time(&config->start);

        // Run test kernel on the current device.
        int status = pattern2d_gpu(1, nx, 1, 1, ny, 1,
                                   config->in_dev, config->out_dev, idevice);
        if (status)
        {
            fprintf(stderr, "Cannot execute pattern 2d on device %d, status = %d: %s\n",
                    idevice, status, cudaGetErrorString(status));
            return status;
        }
    }

    // Synchronize kernels execution.
    for (int idevice = 0; idevice < ndevices; idevice++)
    {
        config_t* config = configs + idevice;

        // TODO: Set curent CUDA device to idevice.
        cudaError_t cuda_status = cudaSetDevice(idevice);

        // Wait for current device to finish processing
        // the kernels.
        cuda_status = cudaThreadSynchronize();
        if (cuda_status != cudaSuccess)
        {
            fprintf(stderr, "Cannot synchronize thread by device %d, status = %d: %s\n",
                    idevice, cuda_status, cudaGetErrorString(cuda_status));
            return cuda_status;
        }

        get_time(&finish);

        printf("GPU %d time = %f sec\n", idevice,
               get_time_diff(&config->start, &finish));
    }

    // Check results and dispose resources used by devices.
    for (int idevice = 0; idevice < ndevices; idevice++)
    {
        config_t* config = configs + idevice;

        // TODO: Set curent CUDA device to idevice.

        // Offload results back to host memory.
        cuda_status = cudaMemcpy(input, config->out_dev, size,
                                 cudaMemcpyDeviceToHost);
        if (cuda_status != cudaSuccess)
        {
            fprintf(stderr, "Cannot copy output data from CUDA buffer on device %d, status = %d: %s\n",
                    idevice, cuda_status, cudaGetErrorString(cuda_status));
            return cuda_status;
        }

        // Free device arrays.
        cuda_status = cudaFree(config->in_dev);
        if (cuda_status != cudaSuccess)
        {
            fprintf(stderr, "Cannot release input buffer on device %d, status = %d: %s\n",
                    idevice, cuda_status, cudaGetErrorString(cuda_status));
            return cuda_status;
        }
        cuda_status = cudaFree(config->out_dev);
        if (cuda_status != cudaSuccess)
        {
            fprintf(stderr, "Cannot release output buffer on device %d, status = %d: %s\n",
                    idevice, cuda_status, cudaGetErrorString(cuda_status));
            return cuda_status;
        }

        printf("Device %d deinitialized\n", idevice);

        // Compare each GPU result to CPU result.
        // Find the maximum abs difference.
        int maxi = 0, maxj = 0;
        float maxdiff = fabs(input[0] - output[0]);
        for (int j = 0; j < ny; j++)
        {
            for (int i = 0; i < nx; i++)
            {
                float diff = fabs(
                                 input[i + j * nx] - output[i + j * nx]);
                if (diff > maxdiff)
                {
                    maxdiff = diff;
                    maxi = i;
                    maxj = j;
                }
            }
        }

        printf("Device %d result abs max diff = %f @ (%d,%d)\n",
               idevice, maxdiff, maxi, maxj);
    }

    // Measure time between first GPU launch and last GPU
    // finish. This will show how much time is spent on GPU
    // kernels in total.
    // XXX If this time is comparabe to the time of
    // individual GPU, then we likely reached our goal:
    // kernels are executed in parallel.
    printf("Total time of %d GPUs = %f\n", ndevices,
           get_time_diff(&configs[0].start, &finish));

    free(configs);
    free(data);

    return 0;
}
Example #16
0
int main(int argc, char **argv)
{
	char *rawfilename = NULL;
	int numiter = 250;
	int use_apc = 1;
	int use_normalization = 0;
	conjugrad_float_t lambda_single = F001; // 0.01
	conjugrad_float_t lambda_pair = FInf;
	conjugrad_float_t lambda_pair_factor = F02; // 0.2
	int conjugrad_k = 5;
	conjugrad_float_t conjugrad_eps = 0.01;

	parse_option *optList, *thisOpt;

	char *optstr;
	char *old_optstr = malloc(1);
	old_optstr[0] = 0;
	optstr = concat("r:i:n:w:k:e:l:ARh?", old_optstr);
	free(old_optstr);

#ifdef OPENMP
	int numthreads = 1;
	old_optstr = optstr;
	optstr = concat("t:", optstr);
	free(old_optstr);
#endif

#ifdef CUDA
	int use_def_gpu = 0;
	old_optstr = optstr;
	optstr = concat("d:", optstr);
	free(old_optstr);
#endif

#ifdef MSGPACK
	char* msgpackfilename = NULL;
	old_optstr = optstr;
	optstr = concat("b:", optstr);
	free(old_optstr);
#endif

	optList = parseopt(argc, argv, optstr);
	free(optstr);

	char* msafilename = NULL;
	char* matfilename = NULL;
	char* initfilename = NULL;

	conjugrad_float_t reweighting_threshold = F08; // 0.8

	while(optList != NULL) {
		thisOpt = optList;
		optList = optList->next;

		switch(thisOpt->option) {
#ifdef OPENMP
			case 't':
				numthreads = atoi(thisOpt->argument);

#ifdef CUDA
				use_def_gpu = -1; // automatically disable GPU if number of threads specified
#endif
				break;
#endif
#ifdef CUDA
			case 'd':
				use_def_gpu = atoi(thisOpt->argument);
				break;
#endif
#ifdef MSGPACK
			case 'b':
				msgpackfilename = thisOpt->argument;
				break;
#endif
			case 'r':
				rawfilename = thisOpt->argument;
				break;
			case 'i':
				initfilename = thisOpt->argument;
				break;
			case 'n':
				numiter = atoi(thisOpt->argument);
				break;
			case 'w':
				reweighting_threshold = (conjugrad_float_t)atof(thisOpt->argument);
				break;
			case 'l':
				lambda_pair_factor = (conjugrad_float_t)atof(thisOpt->argument);
				break;
			case 'k':
				conjugrad_k = (int)atoi(thisOpt->argument);
				break;
			case 'e':
				conjugrad_eps = (conjugrad_float_t)atof(thisOpt->argument);
				break;
			case 'A':
				use_apc = 0;
				break;
			case 'R':
				use_normalization = 1;
				break;
			case 'h':
			case '?':
				usage(argv[0], 1);
				break;

			case 0:
				if(msafilename == NULL) {
					msafilename = thisOpt->argument;
				} else if(matfilename == NULL) {
					matfilename = thisOpt->argument;
				} else {
					usage(argv[0], 0);
				}
				break;
			default:
				die("Unknown argument"); 
		}

		free(thisOpt);
	}

	if(msafilename == NULL || matfilename == NULL) {
		usage(argv[0], 0);
	}


	FILE *msafile = fopen(msafilename, "r");
	if( msafile == NULL) {
		printf("Cannot open %s!\n\n", msafilename);
		return 2;
	}

#ifdef JANSSON
	char* metafilename = malloc(2048);
	snprintf(metafilename, 2048, "%s.meta.json", msafilename);
	
	FILE *metafile = fopen(metafilename, "r");
	json_t *meta;
	if(metafile == NULL) {
		// Cannot find .meta.json file - create new empty metadata
		meta = meta_create();
	} else {
		// Load metadata from matfile.meta.json
		meta = meta_read_json(metafile);
		fclose(metafile);
	}

	json_object_set(meta, "method", json_string("ccmpred"));

	json_t *meta_step = meta_add_step(meta, "ccmpred");
	json_object_set(meta_step, "version", json_string(__VERSION));

	json_t *meta_parameters = json_object();
	json_object_set(meta_step, "parameters", meta_parameters);

	json_t *meta_steps = json_array();
	json_object_set(meta_step, "iterations", meta_steps);

	json_t *meta_results = json_object();
	json_object_set(meta_step, "results", meta_results);

#endif

	int ncol, nrow;
	unsigned char* msa = read_msa(msafile, &ncol, &nrow);
	fclose(msafile);

	int nsingle = ncol * (N_ALPHA - 1);
	int nvar = nsingle + ncol * ncol * N_ALPHA * N_ALPHA;
	int nsingle_padded = nsingle + N_ALPHA_PAD - (nsingle % N_ALPHA_PAD);
	int nvar_padded = nsingle_padded + ncol * ncol * N_ALPHA * N_ALPHA_PAD;

#ifdef CURSES
	bool color = detect_colors();
#else
	bool color = false;
#endif

	logo(color);

#ifdef CUDA
	int num_devices, dev_ret;
	struct cudaDeviceProp prop;
	dev_ret = cudaGetDeviceCount(&num_devices);
	if(dev_ret != CUDA_SUCCESS) {
		num_devices = 0;
	}


	if(num_devices == 0) {
		printf("No CUDA devices available, ");
		use_def_gpu = -1;
	} else if (use_def_gpu < -1 || use_def_gpu >= num_devices) {
		printf("Error: %d is not a valid device number. Please choose a number between 0 and %d\n", use_def_gpu, num_devices - 1);
		exit(1);
	} else {
		printf("Found %d CUDA devices, ", num_devices);
	}

	if (use_def_gpu != -1) {
		cudaError_t err = cudaSetDevice(use_def_gpu);
		if(cudaSuccess != err) {
			printf("Error setting device: %d\n", err);
			exit(1);
		}
		cudaGetDeviceProperties(&prop, use_def_gpu);
		printf("using device #%d: %s\n", use_def_gpu, prop.name);

		size_t mem_free, mem_total;
		err = cudaMemGetInfo(&mem_free, &mem_total);
		if(cudaSuccess != err) {
			printf("Error getting memory info: %d\n", err);
			exit(1);
		}

		size_t mem_needed = nrow * ncol * 2 + // MSAs
		                    sizeof(conjugrad_float_t) * nrow * ncol * 2 + // PC, PCS
		                    sizeof(conjugrad_float_t) * nrow * ncol * N_ALPHA_PAD + // PCN
		                    sizeof(conjugrad_float_t) * nrow + // Weights
		                    (sizeof(conjugrad_float_t) * ((N_ALPHA - 1) * ncol + ncol * ncol * N_ALPHA * N_ALPHA_PAD)) * 4;

		setlocale(LC_NUMERIC, "");
		printf("Total GPU RAM:  %'17lu\n", mem_total);
		printf("Free GPU RAM:   %'17lu\n", mem_free);
		printf("Needed GPU RAM: %'17lu ", mem_needed);

		if(mem_needed <= mem_free) {
			printf("✓\n");
		} else {
			printf("⚠\n");
		}

#ifdef JANSSON
		json_object_set(meta_parameters, "device", json_string("gpu"));
		json_t* meta_gpu = json_object();
		json_object_set(meta_parameters, "gpu_info", meta_gpu);

		json_object_set(meta_gpu, "name", json_string(prop.name));
		json_object_set(meta_gpu, "mem_total", json_integer(mem_total));
		json_object_set(meta_gpu, "mem_free", json_integer(mem_free));
		json_object_set(meta_gpu, "mem_needed", json_integer(mem_needed));
#endif


	} else {
		printf("using CPU");
#ifdef JANSSON
		json_object_set(meta_parameters, "device", json_string("cpu"));
#endif

#ifdef OPENMP
		printf(" (%d thread(s))", numthreads);
#ifdef JANSSON
		json_object_set(meta_parameters, "cpu_threads", json_integer(numthreads));
#endif
#endif
		printf("\n");

	}
#else // CUDA
	printf("using CPU");
#ifdef JANSSON
	json_object_set(meta_parameters, "device", json_string("cpu"));
#endif
#ifdef OPENMP
	printf(" (%d thread(s))\n", numthreads);
#ifdef JANSSON
	json_object_set(meta_parameters, "cpu_threads", json_integer(numthreads));
#endif
#endif // OPENMP
	printf("\n");
#endif // CUDA

	conjugrad_float_t *x = conjugrad_malloc(nvar_padded);
	if( x == NULL) {
		die("ERROR: Not enough memory to allocate variables!");
	}
	memset(x, 0, sizeof(conjugrad_float_t) * nvar_padded);

	// Auto-set lambda_pair
	if(isnan(lambda_pair)) {
		lambda_pair = lambda_pair_factor * (ncol - 1);
	}

	// fill up user data struct for passing to evaluate
	userdata *ud = (userdata *)malloc( sizeof(userdata) );
	if(ud == 0) { die("Cannot allocate memory for user data!"); }
	ud->msa = msa;
	ud->ncol = ncol;
	ud->nrow = nrow;
	ud->nsingle = nsingle;
	ud->nvar = nvar;
	ud->lambda_single = lambda_single;
	ud->lambda_pair = lambda_pair;
	ud->weights = conjugrad_malloc(nrow);
	ud->reweighting_threshold = reweighting_threshold;

	if(initfilename == NULL) {
		// Initialize emissions to pwm
		init_bias(x, ud);
	} else {
		// Load potentials from file
		read_raw(initfilename, ud, x);
	}

	// optimize with default parameters
	conjugrad_parameter_t *param = conjugrad_init();

	param->max_iterations = numiter;
	param->epsilon = conjugrad_eps;
	param->k = conjugrad_k;
	param->max_linesearch = 5;
	param->alpha_mul = F05;
	param->ftol = 1e-4;
	param->wolfe = F02;


	int (*init)(void *) = init_cpu;
	int (*destroy)(void *) = destroy_cpu;
	conjugrad_evaluate_t evaluate = evaluate_cpu;

#ifdef OPENMP
	omp_set_num_threads(numthreads);
	if(numthreads > 1) {
		init = init_cpu_omp;
		destroy = destroy_cpu_omp;
		evaluate = evaluate_cpu_omp;
	}
#endif

#ifdef CUDA
	if(use_def_gpu != -1) {
		init = init_cuda;
		destroy = destroy_cuda;
		evaluate = evaluate_cuda;
	}
#endif

	init(ud);

#ifdef JANSSON


	json_object_set(meta_parameters, "reweighting_threshold", json_real(ud->reweighting_threshold));
	json_object_set(meta_parameters, "apc", json_boolean(use_apc));
	json_object_set(meta_parameters, "normalization", json_boolean(use_normalization));

	json_t *meta_regularization = json_object();
	json_object_set(meta_parameters, "regularization", meta_regularization);

	json_object_set(meta_regularization, "type", json_string("l2")); 
	json_object_set(meta_regularization, "lambda_single", json_real(lambda_single));
	json_object_set(meta_regularization, "lambda_pair", json_real(lambda_pair));
	json_object_set(meta_regularization, "lambda_pair_factor", json_real(lambda_pair_factor));

	json_t *meta_opt = json_object();
	json_object_set(meta_parameters, "optimization", meta_opt);

	json_object_set(meta_opt, "method", json_string("libconjugrad"));
	json_object_set(meta_opt, "float_bits", json_integer((int)sizeof(conjugrad_float_t) * 8));
	json_object_set(meta_opt, "max_iterations", json_integer(param->max_iterations));
	json_object_set(meta_opt, "max_linesearch", json_integer(param->max_linesearch));
	json_object_set(meta_opt, "alpha_mul", json_real(param->alpha_mul));
	json_object_set(meta_opt, "ftol", json_real(param->ftol));
	json_object_set(meta_opt, "wolfe", json_real(param->wolfe));


	json_t *meta_msafile = meta_file_from_path(msafilename);
	json_object_set(meta_parameters, "msafile", meta_msafile);
	json_object_set(meta_msafile, "ncol", json_integer(ncol));
	json_object_set(meta_msafile, "nrow", json_integer(nrow));

	if(initfilename != NULL) {
		json_t *meta_initfile = meta_file_from_path(initfilename);
		json_object_set(meta_parameters, "initfile", meta_initfile);
		json_object_set(meta_initfile, "ncol", json_integer(ncol));
		json_object_set(meta_initfile, "nrow", json_integer(nrow));
	}

	double neff = 0;
	for(int i = 0; i < nrow; i++) {
		neff += ud->weights[i];
	}

	json_object_set(meta_msafile, "neff", json_real(neff));

	ud->meta_steps = meta_steps;

#endif

	printf("\nWill optimize %d %ld-bit variables\n\n", nvar, sizeof(conjugrad_float_t) * 8);

	if(color) { printf("\x1b[1m"); }
	printf("iter\teval\tf(x)    \t║x║     \t║g║     \tstep\n");
	if(color) { printf("\x1b[0m"); }


	conjugrad_float_t fx;
	int ret;
#ifdef CUDA
	if(use_def_gpu != -1) {
		conjugrad_float_t *d_x;
		cudaError_t err = cudaMalloc((void **) &d_x, sizeof(conjugrad_float_t) * nvar_padded);
		if (cudaSuccess != err) {
			printf("CUDA error No. %d while allocation memory for d_x\n", err);
			exit(1);
		}
		err = cudaMemcpy(d_x, x, sizeof(conjugrad_float_t) * nvar_padded, cudaMemcpyHostToDevice);
		if (cudaSuccess != err) {
			printf("CUDA error No. %d while copying parameters to GPU\n", err);
			exit(1);
		}
		ret = conjugrad_gpu(nvar_padded, d_x, &fx, evaluate, progress, ud, param);
		err = cudaMemcpy(x, d_x, sizeof(conjugrad_float_t) * nvar_padded, cudaMemcpyDeviceToHost);
		if (cudaSuccess != err) {
			printf("CUDA error No. %d while copying parameters back to CPU\n", err);
			exit(1);
		}
		err = cudaFree(d_x);
		if (cudaSuccess != err) {
			printf("CUDA error No. %d while freeing memory for d_x\n", err);
			exit(1);
		}
	} else {
	ret = conjugrad(nvar_padded, x, &fx, evaluate, progress, ud, param);
	}
#else
	ret = conjugrad(nvar_padded, x, &fx, evaluate, progress, ud, param);
#endif

	printf("\n");
	printf("%s with status code %d - ", (ret < 0 ? "Exit" : "Done"), ret);

	if(ret == CONJUGRAD_SUCCESS) {
		printf("Success!\n");
	} else if(ret == CONJUGRAD_ALREADY_MINIMIZED) {
		printf("Already minimized!\n");
	} else if(ret == CONJUGRADERR_MAXIMUMITERATION) {
		printf("Maximum number of iterations reached.\n");
	} else {
		printf("Unknown status code!\n");
	}

	printf("\nFinal fx = %f\n\n", fx);

	FILE* out = fopen(matfilename, "w");
	if(out == NULL) {
		printf("Cannot open %s for writing!\n\n", matfilename);
		return 3;
	}

	conjugrad_float_t *outmat = conjugrad_malloc(ncol * ncol);

	FILE *rawfile = NULL;
	if(rawfilename != NULL) {
		printf("Writing raw output to %s\n", rawfilename);
		rawfile = fopen(rawfilename, "w");

		if(rawfile == NULL) {
			printf("Cannot open %s for writing!\n\n", rawfilename);
			return 4;
		}

		write_raw(rawfile, x, ncol);
	}

#ifdef MSGPACK

	FILE *msgpackfile = NULL;
	if(msgpackfilename != NULL) {
		printf("Writing msgpack raw output to %s\n", msgpackfilename);
		msgpackfile = fopen(msgpackfilename, "w");

		if(msgpackfile == NULL) {
			printf("Cannot open %s for writing!\n\n", msgpackfilename);
			return 4;
		}

#ifndef JANSSON
		void *meta = NULL;
#endif

	}
#endif

	sum_submatrices(x, outmat, ncol);

	if(use_apc) {
		apc(outmat, ncol);
	}

	if(use_normalization) {
		normalize(outmat, ncol);
	}

	write_matrix(out, outmat, ncol, ncol);

#ifdef JANSSON

	json_object_set(meta_results, "fx_final", json_real(fx));
	json_object_set(meta_results, "num_iterations", json_integer(json_array_size(meta_steps)));
	json_object_set(meta_results, "opt_code", json_integer(ret));

	json_t *meta_matfile = meta_file_from_path(matfilename);
	json_object_set(meta_results, "matfile", meta_matfile);

	if(rawfilename != NULL) {
		json_object_set(meta_results, "rawfile", meta_file_from_path(rawfilename));
	}

	if(msgpackfilename != NULL) {
		json_object_set(meta_results, "msgpackfile", meta_file_from_path(msgpackfilename));
	}

	fprintf(out, "#>META> %s", json_dumps(meta, JSON_COMPACT));
	if(rawfile != NULL) {
		fprintf(rawfile, "#>META> %s", json_dumps(meta, JSON_COMPACT));
	}
#endif


	if(rawfile != NULL) {
		fclose(rawfile);
	}

#ifdef MSGPACK
	if(msgpackfile != NULL) {
		write_raw_msgpack(msgpackfile, x, ncol, meta);
		fclose(msgpackfile);
	}

#endif

	fflush(out);
	fclose(out);

	destroy(ud);

	conjugrad_free(outmat);
	conjugrad_free(x);
	conjugrad_free(ud->weights);
	free(ud);
	free(msa);
	free(param);
	
	printf("Output can be found in %s\n", matfilename);

	return 0;
}
Example #17
0
void thread_task(const int AS, const int num_of_nodes, int ingr_node_counter) 
{
    std::cout << std::endl << "------------ PROCESSING AS GRAPH --------------------" << std::endl << std::endl;

    // declare variables
    std::stringstream sstm;
    std::string ID_ = "AS";
    Graph * gr = NULL;

    std::uniform_real_distribution<double> RANDOM_GENERATOR(1, MAX_NUM_OF_INGRESS_);
    float multipleDeviceElapsedTime = 0, singleDeviceElapsedTime = 0, hostElapsedTime = 0, oldHostElapsedTime = 0;

    // create the AS id for the file name
    sstm << ID_ << AS;
    ID_ = sstm.str();

    std::cout << "Generating Topology..." << std::endl;

    // create brite and generate the topology
    Brite *b_topology = new Brite();
    
    if(num_of_nodes < 500000)
    {
	std::cout << "Generating" << std::endl;
        gr = b_topology->GenerateTopology(AS, num_of_nodes);
    }
    else
    {
	std::cout << "From File!" << std::endl;
        // convert number of nodes to a string
        std::string node = std::to_string(num_of_nodes);

        // read in the topology from BRITE file
        gr = b_topology->Populate_Topology_Result("../data/" + node +".brite");
    }

    std::cout << "Finished Generating" << std::endl;


    // get the number of vertices
    int numVertices = gr->numberOfVertex();


    // if no ingress nodes then generate a random amount (1 to 10)
    if (ingr_node_counter == 0) 
        {
            ingr_node_counter = RANDOM_GENERATOR(generator6);
        }

    // create memory for the results
    int ** gpuCost = new int * [ingr_node_counter];
    int ** sequentialCost = new int * [ingr_node_counter];
    int ** oldSequentialCost = new int * [ingr_node_counter];
    int ** multipleGPUResult = new int * [ingr_node_counter];
    
    for(int i = 0; i < ingr_node_counter; i++)
    {
        gpuCost[i] = new int[numVertices];
        sequentialCost[i] = new int[numVertices];
        oldSequentialCost[i] = new int[numVertices];
        multipleGPUResult[i] = new int[numVertices];
    }

    int * source = new int [ingr_node_counter];

    std::cout << std::endl << "Running Sequential Algorithm..." << std::endl;

    // loop for all ingress nodes
    for (int i = 0; i < ingr_node_counter; i++) 
    {
        source[i] = random_ingress_node_selector(*gr);
        hostElapsedTime += processGraphSequential(gr, source[i], sequentialCost[i]);
    }

    //totalOldBFSSequential += oldHostElapsedTime;
    totalNewBFSSequential += hostElapsedTime;

    std::cout << "Finished Sequential" << std::endl;

    // print the CPU results to a file and the screen
    std::cout << ID_ << ", CPU Execution time (sec),                   " << hostElapsedTime << ", Num Vertices, " << numVertices << ", Num Ingress, " << ingr_node_counter  << std::endl; 
    timingFile << ID_ << ", CPU Execution time (sec),                  " << hostElapsedTime << ", Num Vertices, " << numVertices << ", Num Ingress, " << ingr_node_counter  << std::endl; 






    std::cout << std::endl << "Running Single GPU Algorithm..." << std::endl;
    totalSingleGPU += singleDeviceElapsedTime = processGraphSingleGPU(gr, source, ingr_node_counter, gpuCost);     
    std::cout << "Finished Single GPU" << std::endl;

    // print the Single GPU results to a file and the screen
    std::cout << ID_ << ", CUDA Single Device Execution time (sec),    " << singleDeviceElapsedTime << ", Num Vertices, " << numVertices << ", Num Ingress, " << ingr_node_counter  << std::endl;
    timingFile << ID_ << ", CUDA Single Device Execution time (sec),   " << singleDeviceElapsedTime << ", Num Vertices, " << numVertices << ", Num Ingress, " << ingr_node_counter  << std::endl; 






    // CALL MULTI-GPU FUNCTION HERE!!!!
    std::cout << std::endl << "Running Multiple GPU Algorithm..." << std::endl;
    int numDevices;
    cudaGetDeviceCount(&numDevices);

    for(int i = 2; i <= numDevices; i++)
    {
        multipleDeviceElapsedTime = processGraphMultipleGPU(gr, source, ingr_node_counter, multipleGPUResult, i);

        std::cout << ID_ << ", CUDA Multiple Device Execution time (sec),  " << multipleDeviceElapsedTime << ", Num GPUs, " << i << ", Num Vertices, " << numVertices << ", Num Ingress, " << ingr_node_counter  << std::endl;
        timingFile << ID_ << ", CUDA Multiple Device Execution time (sec), " << multipleDeviceElapsedTime << ", Num GPUs, " << i << ", Num Vertices, " << numVertices << ", Num Ingress, " << ingr_node_counter  << std::endl;
    }
    std::cout << "Finished Multiple GPU" << std::endl << std::endl;




    std::cout << std::endl << "Verifying Results..." << std::endl;

    // verify results
    if(verifyResults(sequentialCost, gpuCost, multipleGPUResult, numVertices, ingr_node_counter))
        std::cout << "Results Match!!" << std::endl;
    else
        std::cout << "NOT EQUAL!!" << std::endl;

    std::cout << "Finished Verifying" << std::endl;

    // clean up memory   
    for(int i = 0; i < ingr_node_counter; i++)
    {
        delete [] gpuCost[i];
        delete [] sequentialCost[i];
        delete [] oldSequentialCost[i];
        delete [] multipleGPUResult[i];
    }


    // deallocate memory
    delete [] gpuCost;
    delete [] sequentialCost;
    delete [] oldSequentialCost;
    delete [] multipleGPUResult;
    delete [] source;
    delete b_topology;
    delete gr;
        
    std::cout << std::endl << "------------ FINISHED PROCESSING --------------------" << std::endl << std::endl;

}
int CudaDeviceContext::getDeviceCount() const {
    int count = 0;
    checkCudaError(cudaGetDeviceCount(&count));
    return count;
}
Example #19
0
extern "C" magma_int_t
magma_z_initP2P(
    magma_int_t *bw_bmark, magma_int_t *num_gpus,
    magma_queue_t queue )
{
    // Number of GPUs
    printf("Checking for multiple GPUs...\n");
    int gpu_n;
     (cudaGetDeviceCount(&gpu_n));
    printf("CUDA-capable device count: %i\n", gpu_n);
    if (gpu_n < 2)
    {
        printf("Two or more Tesla(s) with (SM 2.0)"
                        " class GPUs are required for P2P.\n");
    }

    // Query device properties
    cudaDeviceProp prop[64];
    int gpuid_tesla[64]; // find the first two GPU's that can support P2P
    int gpu_count = 0;   // GPUs that meet the criteria

    for (int i=0; i < gpu_n; i++) {
         (cudaGetDeviceProperties(&prop[i], i));
        // Only Tesla boards based on Fermi can support P2P
        {
            // This is an array of P2P capable GPUs
            gpuid_tesla[gpu_count++] = i;
        }
    }
    *num_gpus=gpu_n;

     for(int i=0; i<gpu_n; i++)
    {
        for(int j=i+1; j<gpu_n; j++)
        {
  // Check possibility for peer access
    printf("\nChecking GPU(s) for support of peer to peer memory access...\n");
    int can_access_peer_0_1, can_access_peer_1_0;
    // In this case we just pick the first two that we can support
     (cudaDeviceCanAccessPeer(&can_access_peer_0_1, gpuid_tesla[i], 
                                                        gpuid_tesla[j]));
     (cudaDeviceCanAccessPeer(&can_access_peer_1_0, gpuid_tesla[j], 
                                                        gpuid_tesla[i]));


    // Output results from P2P capabilities
    printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", 
                        prop[gpuid_tesla[i]].name, gpuid_tesla[i],                                                                  
                        prop[gpuid_tesla[j]].name, gpuid_tesla[j] ,
                             can_access_peer_0_1 ? "Yes" : "No");
    printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", 
                        prop[gpuid_tesla[j]].name, gpuid_tesla[j],
                        prop[gpuid_tesla[i]].name, gpuid_tesla[i],
                            can_access_peer_1_0 ? "Yes" : "No");

    if (can_access_peer_0_1 == 0 || can_access_peer_1_0 == 0)
    {
        printf("Two or more Tesla(s) with class"
                " GPUs are required for P2P to run.\n");
        printf("Support for UVA requires a Tesla with SM 2.0 capabilities.\n");
        printf("Peer to Peer access is not available between"
        " GPU%d <-> GPU%d, waiving test.\n", gpuid_tesla[i], gpuid_tesla[j]);
        printf("PASSED\n");
        //exit(EXIT_SUCCESS);
    }
     }    
    }

  // Enable peer access
     for(int i=0; i<gpu_n; i++)
     {
         for(int j=i+1; j<gpu_n; j++)
         {
             printf("Enabling peer access between GPU%d and GPU%d...\n",
                gpuid_tesla[i], gpuid_tesla[j]);
              (cudaSetDevice(gpuid_tesla[i]));
              (cudaDeviceEnablePeerAccess(gpuid_tesla[j], 0));
              (cudaSetDevice(gpuid_tesla[j]));
              (cudaDeviceEnablePeerAccess(gpuid_tesla[i], 0));
           magma_zcheckerr("P2P");
         }
     }

   magma_zcheckerr("P2P successful");


    // Enable peer access
    for(int i=0; i<gpu_n; i++)
    {
        for(int j=i+1; j<gpu_n; j++)
        {
    // Check that we got UVA on both devices
    printf("Checking GPU%d and GPU%d for UVA capabilities...\n", 
    gpuid_tesla[i], gpuid_tesla[j]);
    //const bool has_uva = (prop[gpuid_tesla[i]].unifiedAddressing && 
    //                            prop[gpuid_tesla[j]].unifiedAddressing);

    printf("> %s (GPU%d) supports UVA: %s\n", prop[gpuid_tesla[i]].name, 
    gpuid_tesla[i], (prop[gpuid_tesla[i]].unifiedAddressing ? "Yes" : "No") );
    printf("> %s (GPU%d) supports UVA: %s\n", prop[gpuid_tesla[j]].name, 
    gpuid_tesla[j], (prop[gpuid_tesla[j]].unifiedAddressing ? "Yes" : "No") );

        }
    }



  if (*bw_bmark==1) {


    // P2P memcopy() benchmark
   for(int i=0; i<gpu_n; i++)
    {
        for(int j=i+1; j<gpu_n; j++)
        {
    // Allocate buffers
    const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
    printf("Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n", 
                int(buf_size / 1024 / 1024), gpuid_tesla[i], gpuid_tesla[j]);
    (cudaSetDevice(gpuid_tesla[i]));
    float* g0;
    (cudaMalloc(&g0, buf_size));
    (cudaSetDevice(gpuid_tesla[j]));
    float* g1;
    (cudaMalloc(&g1, buf_size));
    float* h0;
    (cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA

    // Create CUDA event handles
    printf("Creating event handles...\n");
    cudaEvent_t start_event, stop_event;
    float time_memcpy;
    int eventflags = cudaEventBlockingSync;
    (cudaEventCreateWithFlags(&start_event, eventflags));
    (cudaEventCreateWithFlags(&stop_event, eventflags));


    (cudaEventRecord(start_event, 0));
    for (int k=0; k<100; k++)
    {
        // With UVA we don't need to specify source and target devices, the
        // runtime figures this out by itself from the pointers
            
        // Ping-pong copy between GPUs
        if (k % 2 == 0)
            (cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault));
        else
            (cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault));
    }
    (cudaEventRecord(stop_event, 0));
    (cudaEventSynchronize(stop_event));
    (cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
    printf("cudaMemcpyPeer / cudaMemcpy between"
            "GPU%d and GPU%d: %.2fGB/s\n", gpuid_tesla[i], gpuid_tesla[j],
        (1.0f / (time_memcpy / 1000.0f)) 
            * ((100.0f * buf_size)) / 1024.0f / 1024.0f / 1024.0f);


     // Cleanup and shutdown
    printf("Cleanup of P2P benchmark...\n");
    (cudaEventDestroy(start_event));
    (cudaEventDestroy(stop_event));
    (cudaSetDevice(gpuid_tesla[i]));
    (magma_free( g0) );
    (cudaSetDevice(gpuid_tesla[j]));
    (magma_free( g1) );
    (magma_free_cpu( h0) );

    }
    }

    // host-device memcopy() benchmark

        for(int j=0; j<gpu_n; j++)
        {
    cudaSetDevice(gpuid_tesla[j]);

    int *h_data_source;
    int *h_data_sink;

    int *h_data_in[STREAM_COUNT];
    int *d_data_in[STREAM_COUNT];

    int *h_data_out[STREAM_COUNT];
    int *d_data_out[STREAM_COUNT];


    cudaEvent_t cycleDone[STREAM_COUNT];
    cudaStream_t stream[STREAM_COUNT];

    cudaEvent_t start, stop;

    // Allocate resources
    int memsize;
    memsize = 1000000 * sizeof(int);

    h_data_source = (int*) malloc(memsize);
    h_data_sink = (int*) malloc(memsize);    

    for( int i =0; i<STREAM_COUNT; ++i ) {
        
        ( cudaHostAlloc(&h_data_in[i], memsize, 
            cudaHostAllocDefault) );
        ( cudaMalloc(&d_data_in[i], memsize) );
        
        ( cudaHostAlloc(&h_data_out[i], memsize, 
            cudaHostAllocDefault) );
        ( cudaMalloc(&d_data_out[i], memsize) );

        
        ( cudaStreamCreate(&stream[i]) );
        ( cudaEventCreate(&cycleDone[i]) ); 
        
        cudaEventRecord(cycleDone[i], stream[i]);
    }

    cudaEventCreate(&start); cudaEventCreate(&stop);

    

    // Time host-device copies
    cudaEventRecord(start,0);
    ( cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize, 
        cudaMemcpyHostToDevice,0) );
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    
    float memcpy_h2d_time;    
    cudaEventElapsedTime(&memcpy_h2d_time, start, stop);

    
    cudaEventRecord(start,0);
    ( cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize, 
        cudaMemcpyDeviceToHost, 0) );        
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    
    float memcpy_d2h_time;    
    cudaEventElapsedTime(&memcpy_d2h_time, start, stop);
    
    cudaEventSynchronize(stop);
    printf("Measured timings (throughput):\n");
    printf(" Memcpy host to device GPU %d \t: %f ms (%f GB/s)\n", j,
        memcpy_h2d_time, (memsize * 1e-6)/ memcpy_h2d_time );
    printf(" Memcpy device GPU %d to host\t: %f ms (%f GB/s)\n", j,
        memcpy_d2h_time, (memsize * 1e-6)/ memcpy_d2h_time);

    // Free resources

    free( h_data_source );
    free( h_data_sink );

    for( int i =0; i<STREAM_COUNT; ++i ) {
        
        magma_free_cpu( h_data_in[i] );
        magma_free( d_data_in[i] );

        magma_free_cpu( h_data_out[i] );
        magma_free( d_data_out[i] );
        
        cudaStreamDestroy(stream[i]);
        cudaEventDestroy(cycleDone[i]);        
    }
    
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    
   }

  }//end if-loop bandwidth_benchmark

    magma_zcheckerr("P2P established");

    return MAGMA_SUCCESS;
}
bool CudaDeviceContextPrivate::create(int device) {
    if (isValid()) {
        return true;
    }

    if (device < 0) {
        std::cout << "Using CPU only" << std::endl;
        activeDevice = -1;
        return false;
    }

    int count = 0;
    checkCudaError(cudaGetDeviceCount(&count));
    if (count < 1) {
        std::cout << "No CUDA devices found; using CPU only" << std::endl;
        activeDevice = -1;
        return false;
    }

    if (device >= 0 && device < count) {
        checkCudaError(cudaSetDevice(device));
        cudaDeviceProp prop = {0};
        checkCudaError(cudaGetDeviceProperties(&prop, device));
        if (prop.major < 2) {
            device = -1;
        }
    } else {
        size_t freeMax = 0;
        for (int i = 0; i < count; i++) {
            checkCudaError(cudaSetDevice(i));
            cudaDeviceProp prop = {0};
            checkCudaError(cudaGetDeviceProperties(&prop, i));
            if (prop.major < 2) {
                continue;
            }
            size_t free = 0, total = 0;
            checkCudaError(cudaMemGetInfo(&free, &total));
            if (free > freeMax) {
                freeMax = free;
                device = i;
            }
        }
    }

    if (device < 0) {
        std::cout << "Compute capability 2.0 support required; using CPU only" << std::endl;
        activeDevice = -1;
        return false;
    }

    checkCudaError(cudaSetDevice(device));
    checkCudaError(cudaFree(0));

    cudaDeviceProp prop = {0};
    checkCudaError(cudaGetDeviceProperties(&prop, device));
    size_t free = 0, total = 0;
    checkCudaError(cudaMemGetInfo(&free, &total));

    printf("_________________________________________\n");
    printf("%s (%lluMB free of %lluMB total)\n", prop.name, (unsigned long long) (free / bytesInMegaBytes), (unsigned long long) (total / bytesInMegaBytes));
    printf("Using %d multiprocessors\n", prop.multiProcessorCount);
    printf("Max threads per processor: %d\n", prop.maxThreadsPerMultiProcessor);
    printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
    printf("Max threads per dim: (%d, %d, %d)\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
    printf("_________________________________________\n");
    printf("\n");

    activeDevice = device;
    return isValid();
}
Example #21
0
static int
hwloc_cuda_discover(struct hwloc_backend *backend)
{
  struct hwloc_topology *topology = backend->topology;
  enum hwloc_type_filter_e filter;
  cudaError_t cures;
  int nb, i;

  hwloc_topology_get_type_filter(topology, HWLOC_OBJ_OS_DEVICE, &filter);
  if (filter == HWLOC_TYPE_FILTER_KEEP_NONE)
    return 0;

  if (!hwloc_topology_is_thissystem(topology)) {
    hwloc_debug("%s", "\nno CUDA detection (not thissystem)\n");
    return 0;
  }

  cures = cudaGetDeviceCount(&nb);
  if (cures)
    return -1;

  for (i = 0; i < nb; i++) {
    int domain, bus, dev;
    char cuda_name[32];
    char number[32];
    struct cudaDeviceProp prop;
    hwloc_obj_t cuda_device, parent;
    unsigned cores;

    cuda_device = hwloc_alloc_setup_object(HWLOC_OBJ_OS_DEVICE, -1);
    snprintf(cuda_name, sizeof(cuda_name), "cuda%d", i);
    cuda_device->name = strdup(cuda_name);
    cuda_device->depth = (unsigned) HWLOC_TYPE_DEPTH_UNKNOWN;
    cuda_device->attr->osdev.type = HWLOC_OBJ_OSDEV_COPROC;

    hwloc_obj_add_info(cuda_device, "CoProcType", "CUDA");
    hwloc_obj_add_info(cuda_device, "Backend", "CUDA");
    hwloc_obj_add_info(cuda_device, "GPUVendor", "NVIDIA Corporation");

    cures = cudaGetDeviceProperties(&prop, i);
    if (!cures)
      hwloc_obj_add_info(cuda_device, "GPUModel", prop.name);

    snprintf(number, sizeof(number), "%llu", ((unsigned long long) prop.totalGlobalMem) >> 10);
    hwloc_obj_add_info(cuda_device, "CUDAGlobalMemorySize", number);

    snprintf(number, sizeof(number), "%llu", ((unsigned long long) prop.l2CacheSize) >> 10);
    hwloc_obj_add_info(cuda_device, "CUDAL2CacheSize", number);

    snprintf(number, sizeof(number), "%d", prop.multiProcessorCount);
    hwloc_obj_add_info(cuda_device, "CUDAMultiProcessors", number);

    cores = hwloc_cuda_cores_per_MP(prop.major, prop.minor);
    if (cores) {
      snprintf(number, sizeof(number), "%u", cores);
      hwloc_obj_add_info(cuda_device, "CUDACoresPerMP", number);
    }

    snprintf(number, sizeof(number), "%llu", ((unsigned long long) prop.sharedMemPerBlock) >> 10);
    hwloc_obj_add_info(cuda_device, "CUDASharedMemorySizePerMP", number);

    parent = NULL;
    if (hwloc_cudart_get_device_pci_ids(NULL /* topology unused */, i, &domain, &bus, &dev) == 0) {
      parent = hwloc_pci_belowroot_find_by_busid(topology, domain, bus, dev, 0);
      if (!parent)
	parent = hwloc_pci_find_busid_parent(topology, domain, bus, dev, 0);
    }
    if (!parent)
      parent = hwloc_get_root_obj(topology);

    hwloc_insert_object_by_parent(topology, parent, cuda_device);
  }

  return 0;
}
Example #22
0
  // =======================================================
  // =======================================================
  HydroMpiParameters::HydroMpiParameters(ConfigMap &_configMap) :
    HydroParameters(_configMap, false), mx(0), my(0), mz(0), 
    myRank(0), nProcs(0), myMpiPos(),
    nNeighbors(0), neighborsRank(), neighborsBC()
  {

    // MPI parameters :
    mx = configMap.getInteger("mpi", "mx", 1);
    my = configMap.getInteger("mpi", "my", 1);
    mz = configMap.getInteger("mpi", "mz", 1);

    // copy MPI topology sizes into gParams structure (so that it will also
    // be available as a global constant, usefull for GPU implementation in godunov_unsplit_mhd.cuh).
    _gParams.mx = mx;
    _gParams.my = my;
    _gParams.mz = mz;

    // check that parameters are consistent
    bool error = false;
    error |= (mx < 1);
    error |= (my < 1);
    error |= (mz < 1);
    if (dimType == TWO_D and mz != 1)
      error = true;
    TEST_FOR_EXCEPTION_PRINT(error,
			     std::runtime_error,
			     "Inconsistent geometry; check parameter file for nx, ny, nz and mx, my, mz !\n",
			     &std::cerr);

    // get world communicator size and check it is consistent with mesh grid sizes
    nProcs = MpiComm::world().getNProc();
    TEST_FOR_EXCEPTION_PRINT(nProcs != mx*my*mz,
			     std::runtime_error,
			     "Inconsistent MPI cartesian virtual topology geometry; \n mx*my*mz must match with parameter given to mpirun !!!\n",
			     &std::cerr);

    // create the MPI communicator for our cartesian mesh
    if (dimType == TWO_D) {
      communicator = new MpiCommCart(mx, my, MPI_CART_PERIODIC_TRUE, MPI_REORDER_TRUE);
      nDim = 2;
    } else {
      communicator = new MpiCommCart(mx, my, mz, MPI_CART_PERIODIC_TRUE, MPI_REORDER_TRUE);
      nDim = 3;
    }

    // get my MPI rank inside topology
    myRank = communicator->getRank();
    
    // get my coordinates inside topology
    // myMpiPos[0] is between 0 and mx-1
    // myMpiPos[1] is between 0 and my-1
    // myMpiPos[2] is between 0 and mz-1
    myMpiPos.resize(nDim);
    communicator->getMyCoords(&myMpiPos[0]);

    // copy coordinate into gParams structure (so that it will also
    // be available as a global constant, usefull for GPU implementation).
    _gParams.mpiPosX = myMpiPos[0];
    _gParams.mpiPosY = myMpiPos[1];
    _gParams.mpiPosZ = myMpiPos[2];

    /*
     * compute MPI ranks of our neighbors and 
     * set default boundary condition types
     */
    if (dimType == TWO_D) {
      nNeighbors = N_NEIGHBORS_2D;
      neighborsRank.resize(nNeighbors);
      neighborsRank[X_MIN] = communicator->getNeighborRank<X_MIN>();
      neighborsRank[X_MAX] = communicator->getNeighborRank<X_MAX>();
      neighborsRank[Y_MIN] = communicator->getNeighborRank<Y_MIN>();
      neighborsRank[Y_MAX] = communicator->getNeighborRank<Y_MAX>();
      
      neighborsBC.resize(nNeighbors);
      neighborsBC[X_MIN] = BC_COPY;
      neighborsBC[X_MAX] = BC_COPY;
      neighborsBC[Y_MIN] = BC_COPY;
      neighborsBC[Y_MAX] = BC_COPY;
    } else {
      nNeighbors = N_NEIGHBORS_3D;
      neighborsRank.resize(nNeighbors);
      neighborsRank[X_MIN] = communicator->getNeighborRank<X_MIN>();
      neighborsRank[X_MAX] = communicator->getNeighborRank<X_MAX>();
      neighborsRank[Y_MIN] = communicator->getNeighborRank<Y_MIN>();
      neighborsRank[Y_MAX] = communicator->getNeighborRank<Y_MAX>();
      neighborsRank[Z_MIN] = communicator->getNeighborRank<Z_MIN>();
      neighborsRank[Z_MAX] = communicator->getNeighborRank<Z_MAX>();

      neighborsBC.resize(nNeighbors);
      neighborsBC[X_MIN] = BC_COPY;
      neighborsBC[X_MAX] = BC_COPY;
      neighborsBC[Y_MIN] = BC_COPY;
      neighborsBC[Y_MAX] = BC_COPY;
      neighborsBC[Z_MIN] = BC_COPY;
      neighborsBC[Z_MAX] = BC_COPY;
    }

    /*
     * identify outside boundaries (no actual communication if we are
     * doing BC_DIRICHLET or BC_NEUMANN)
     *
     * Please notice the duality 
     * XMIN -- boundary_xmax
     * XMAX -- boundary_xmin
     *
     */

    // X_MIN boundary
    if (myMpiPos[DIR_X] == 0)
      neighborsBC[X_MIN] = boundary_xmin;

    // X_MAX boundary
    if (myMpiPos[DIR_X] == mx-1)
      neighborsBC[X_MAX] = boundary_xmax;

    // Y_MIN boundary
    if (myMpiPos[DIR_Y] == 0)
      neighborsBC[Y_MIN] = boundary_ymin;

    // Y_MAX boundary
    if (myMpiPos[DIR_Y] == my-1)
      neighborsBC[Y_MAX] = boundary_ymax;

    if (dimType == THREE_D) {

      // Z_MIN boundary
      if (myMpiPos[DIR_Z] == 0)
	neighborsBC[Z_MIN] = boundary_zmin;
      
      // Y_MAX boundary
      if (myMpiPos[DIR_Z] == mz-1)
	neighborsBC[Z_MAX] = boundary_zmax;
      
    } // end THREE_D

    /*
     * Initialize CUDA device if needed.
     * When running on a Linux machine with mutiple GPU per node, it might be
     * very helpfull if admin has set the CUDA device compute mode to exclusive
     * so that a device is only attached to 1 host thread (i.e. 2 different host
     * thread can not communicate with the same GPU).
     *
     * As a sys-admin, just run for all devices command:
     *   nvidia-smi -g $(DEV_ID) -c 1
     *
     * If compute mode is set to normal mode, we need to use cudaSetDevice, 
     * so that each MPI device is mapped onto a different GPU device.
     * 
     * At CCRT, on machine Titane, each node (2 quadri-proc) "sees" only 
     * half a Tesla S1070, that means cudaGetDeviceCount should return 2.
     * If we want the ration 1 MPI process <-> 1 GPU, we need to allocate
     * N nodes and 2*N tasks (MPI process). 
     */
#ifdef __CUDACC__
    // get device count
    int count;
    cutilSafeCall( cudaGetDeviceCount(&count) );
    
    int devId = myRank % count;
    cutilSafeCall( cudaSetDevice(devId) );
    
    cudaDeviceProp deviceProp;
    int myDevId = -1;
    cutilSafeCall( cudaGetDevice( &myDevId ) );
    cutilSafeCall( cudaGetDeviceProperties( &deviceProp, myDevId ) );
    // faire un cudaSetDevice et cudaGetDeviceProp et aficher le nom
    // ajouter un booleen dans le constructeur pour savoir si on veut faire ca
    // sachant que sur Titane, probablement que le mode exclusif est active
    // a verifier demain

    std::cout << "MPI process " << myRank << " is using GPU device num " << myDevId << std::endl;

#endif //__CUDACC__

    // fix space resolution :
    // need to take into account number of MPI process in each direction
    float    xMax = configMap.getFloat("mesh","xmax",1.0);
    float    yMax = configMap.getFloat("mesh","ymax",1.0);
    float    zMax = configMap.getFloat("mesh","zmax",1.0);
    _gParams.dx = (xMax- _gParams.xMin)/(nx*mx);
    _gParams.dy = (yMax- _gParams.yMin)/(ny*my);
    _gParams.dz = (zMax- _gParams.zMin)/(nz*mz);

    // print information about current setup
    if (myRank == 0) {
      std::cout << "We are about to start simulation with the following characteristics\n";

      std::cout << "Global resolution : " << 
	nx*mx << " x " << ny*my << " x " << nz*mz << "\n";
      std::cout << "Local  resolution : " << 
	nx << " x " << ny << " x " << nz << "\n";
      std::cout << "MPI Cartesian topology : " << mx << "x" << my << "x" << mz << std::endl;
    }

#ifdef __CUDACC__
    char hostname[1024];
    gethostname(hostname, 1023);
    std::cout << "hostname      : " << hostname << std::endl;
    std::cout << hostname << " [MPI] myRank  : " << myRank << std::endl;
    std::cout << hostname << " [GPU] myDevId : " << myDevId << " (" << deviceProp.name << ")" << std::endl;
#endif // __CUDACC__
  } // HydroMpiParameters::HydroMpiParameters
Example #23
0
int getGpuCount() {
    int deviceCount;
    checkCudaErrors(cudaGetDeviceCount(&deviceCount));
    return deviceCount;
}
Example #24
0
//====================================
// Setup/Init Stuff
//====================================
bool Init(int argc, char **argv){
    // Set window title to "Student Name: GPU Name"
    std::string deviceName;
    cudaDeviceProp deviceProp; 
    int gpudevice = 0;
    int device_count = 0;
    cudaGetDeviceCount(&device_count); 
    if(gpudevice > device_count){ 
        std::cout << "Error: GPU device number is greater than the number of devices!" << 
                     "Perhaps a CUDA-capable GPU is not installed?" << std::endl; 
        return false;
    } 
    cudaGetDeviceProperties(&deviceProp, gpudevice); 
    deviceName = deviceProp.name;
    deviceName = m_yourName + ": " + deviceProp.name;
    m_major = deviceProp.major;
    m_minor = deviceProp.minor;

    // Window setup stuff
#if _WIN32
	glutInit(&argc, argv);
	
	m_width = 800;
	m_height = 800;

	glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA);
	glutInitWindowSize(m_width, m_height);
	m_window = glutCreateWindow(deviceName.c_str());
#else
    glfwSetErrorCallback(ErrorCallback);

    if (!glfwInit()){
        return false;
    }
    m_width = 800;
    m_height = 800;
    m_window = glfwCreateWindow(m_width, m_height, deviceName.c_str(), NULL, NULL);
    if (!m_window){
        glfwTerminate();
        return false;
    }
    glfwMakeContextCurrent(m_window);
    glfwSetKeyCallback(m_window, KeyCallback);
#endif

    glewExperimental = GL_TRUE;
    if(glewInit()!=GLEW_OK){
        return false;
    }

    // Init all of the things
    InitVAO();
    InitTextures();
    InitCuda();
    InitPBO(&m_pbo);

    GLuint passthroughProgram;
    passthroughProgram = InitShader();
    glUseProgram(passthroughProgram);
    glActiveTexture(GL_TEXTURE0);

    return true;
}
int main()
{
    int deviceCount, device, sm_major[999], sm_minor[999], compute_major[999], compute_minor[999];
    int gpuDeviceCount = 0;
    struct cudaDeviceProp properties;

    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess)
    {
        printf("Couldn't get device count: %s\n", cudaGetErrorString(cudaGetLastError()));
        return 1;
    }
    /* machines with no GPUs can still report one emulation device */
    for (device = 0; device < deviceCount; ++device)
    {
        cudaGetDeviceProperties(&properties, device);
        if (properties.major != 9999)
        {   /* 9999 means emulation only */
            ++gpuDeviceCount;
        }
        sm_major[device] = properties.major;
        sm_minor[device] = properties.minor;


        if (sm_major[device] == 2 && sm_minor[device] == 0)
        {
            compute_major[device]=2;
            compute_minor[device]=0;
        }

        if (sm_major[device] == 2 && sm_minor[device] == 1)
        {
            compute_major[device]=2;
            compute_minor[device]=0;
        }

        if (sm_major[device] == 3 && sm_minor[device] == 0)
        {
            compute_major[device]=3;
            compute_minor[device]=0;
        }

        if (sm_major[device] == 3 && sm_minor[device] == 2)
        {
            compute_major[device]=3;
            compute_minor[device]=0;
        }


        if (sm_major[device] == 3 && sm_minor[device] == 5)
        {
            compute_major[device]=3;
            compute_minor[device]=5;
        }


        if (sm_major[device] == 5 && sm_minor[device] == 0)
        {
            compute_major[device]=5;
            compute_minor[device]=0;
        }


        if (sm_major[device] == 5 && sm_minor[device] == 2)
        {
            compute_major[device]=5;
            compute_minor[device]=2;
        }


    }

    for (device = 0; device < deviceCount; device ++)
    {
        printf("arch=compute_%d%d,code=sm_%d%d ",compute_major[device],compute_minor[device],sm_major[device],sm_minor[device]);
    }



    return 1; /* failure */
}
Example #26
0
/**
 * Returns the GPU weights for all available GPUs.
 *
 * @param proportion vector that will receive the weights.
 * @param n maximum number of weights to be stored in the vector.
 * @return the number of weights stored in the vector.
 */
int getGPUWeights(int* proportion, int n) {
	/*
	 * When any CUDA runtime function is called, the CUDA context is initialized.
	 * If we call a fork after this initialization, the same context is shared
	 * among the processes, what causes initialization errors and abnormal
	 * execution. The getGPUWeights function are called before the fork procedure.
	 * in the libmasa_entry_point, so, we must obtain all the GPU weights/proportion
	 * using another process. This method fork a process only to obtain these
	 * CUDA dependent values and the child process dies with its own CUDA context.
	 * This context is not shared with the parent process, so we can continue
	 * the Aligner execution without any problem.
	 */

	/* Communication using PIPE */
	int pipe_fd[2];
	if (pipe(pipe_fd)) {
		fprintf(stderr, "ERROR: GPU weights could not be obtained (1).\n");
		exit(-1);
	}

	int pid = fork();
	if (pid == 0) {
		/* Child */
		close(pipe_fd[0]);
		int count;
		cudaGetDeviceCount(&count);

		cudaDeviceProp devProp;
		for (int deviceId=0; deviceId<count; deviceId++) {
			cutilSafeCall(cudaGetDeviceProperties(&devProp, deviceId));
	        int cores;
	        if (devProp.major <= 1) {
	        	cores = 8;
	        } else if (devProp.major == 2 && devProp.minor == 0) {
	        	cores = 32;
	        } else if (devProp.major == 2 && devProp.minor == 1) {
	        	cores = 48;
	        } else {
	        	cores = 192;
	        }
	        if (getCompiledCapability() <= (devProp.major*100+devProp.minor*10)) {
	        	int speed = devProp.clockRate*devProp.multiProcessorCount*cores/1000;
	        	if (write(pipe_fd[1], &speed, sizeof(speed)) == -1) {
	    			fprintf(stderr, "ERROR: GPU weights could not be obtained (2).\n");
	    			exit(1);
	        	}
	        }
		}
		cudaDeviceReset();
		close(pipe_fd[1]);
		exit(7);
	} else {
		/* Parent */
		close(pipe_fd[1]);
		int val;
		int count = 0;
		while (read(pipe_fd[0], &val, sizeof(val)) > 0) {
			if (count < n) {
				proportion[count++] = val;
			}
		}
		close(pipe_fd[0]);
		waitpid(pid, NULL, 0); // Join processes
		return count;
	}

}
Example #27
0
void parse_opts( int argc, char** argv, magma_opts *opts )
{
    // negative flag indicating -m, -n, -k not given
    int m = -1;
    int n = -1;
    int k = -1;
    
    // fill in default values
    opts->device   = 0;
    opts->pad      = 32;
    opts->nb       = 0;  // auto
    opts->nrhs     = 1;
    opts->nstream  = 1;
    opts->ngpu     = magma_num_gpus();
    opts->niter    = 1;
    opts->nthread  = 1;
    opts->itype    = 1;
    opts->svd_work = 0;
    opts->version  = 1;
    opts->fraction = 1.;
    opts->tolerance = 30.;
    opts->panel_nthread = 1;
    opts->fraction_dcpu = 0.0;
    opts->check     = (getenv("MAGMA_TESTINGS_CHECK") != NULL);
    opts->lapack    = (getenv("MAGMA_RUN_LAPACK")     != NULL);
    opts->warmup    = (getenv("MAGMA_WARMUP")         != NULL);
    opts->all       = (getenv("MAGMA_RUN_ALL")        != NULL);
    opts->verbose   = false;
    
    opts->uplo      = MagmaLower;      // potrf, etc.
    opts->transA    = MagmaNoTrans;    // gemm, etc.
    opts->transB    = MagmaNoTrans;    // gemm
    opts->side      = MagmaLeft;       // trsm, etc.
    opts->diag      = MagmaNonUnit;    // trsm, etc.
    opts->jobu      = MagmaNoVec;  // gesvd: no left  singular vectors
    opts->jobvt     = MagmaNoVec;  // gesvd: no right singular vectors
    opts->jobz      = MagmaNoVec;  // heev:  no eigen vectors
    opts->jobvr     = MagmaNoVec;  // geev:  no right eigen vectors
    opts->jobvl     = MagmaNoVec;  // geev:  no left  eigen vectors
    
    #ifdef USE_FLOCK
    opts->flock_op = LOCK_SH;  // default shared lock
    #endif
    
    printf( usage_short, argv[0] );
    
    int ndevices;
    cudaGetDeviceCount( &ndevices );
    
    int info;
    int ntest = 0;
    for( int i = 1; i < argc; ++i ) {
        // ----- matrix size
        // each -N fills in next entry of msize, nsize, ksize and increments ntest
        if ( strcmp("-N", argv[i]) == 0 && i+1 < argc ) {
            magma_assert( ntest < MAX_NTEST, "error: -N %s, max number of tests exceeded, ntest=%d.\n",
                          argv[i], ntest );
            i++;
            int m2, n2, k2;
            info = sscanf( argv[i], "%d,%d,%d", &m2, &n2, &k2 );
            if ( info == 3 && m2 >= 0 && n2 >= 0 && k2 >= 0 ) {
                opts->msize[ ntest ] = m2;
                opts->nsize[ ntest ] = n2;
                opts->ksize[ ntest ] = k2;
            }
            else if ( info == 2 && m2 >= 0 && n2 >= 0 ) {
                opts->msize[ ntest ] = m2;
                opts->nsize[ ntest ] = n2;
                opts->ksize[ ntest ] = n2;  // implicitly
            }
            else if ( info == 1 && m2 >= 0 ) {
                opts->msize[ ntest ] = m2;
                opts->nsize[ ntest ] = m2;  // implicitly
                opts->ksize[ ntest ] = m2;  // implicitly
            }
            else {
                fprintf( stderr, "error: -N %s is invalid; ensure m >= 0, n >= 0, k >= 0.\n",
                         argv[i] );
                exit(1);
            }
            ntest++;
        }
        // --range start:stop:step fills in msize[ntest:], nsize[ntest:], ksize[ntest:]
        // with given range and updates ntest
        else if ( strcmp("--range", argv[i]) == 0 && i+1 < argc ) {
            i++;
            int start, stop, step;
            info = sscanf( argv[i], "%d:%d:%d", &start, &stop, &step );
            if ( info == 3 && start >= 0 && stop >= 0 && step != 0 ) {
                for( int n = start; (step > 0 ? n <= stop : n >= stop); n += step ) {
                    if ( ntest >= MAX_NTEST ) {
                        printf( "warning: --range %s, max number of tests reached, ntest=%d.\n",
                                argv[i], ntest );
                        break;
                    }
                    opts->msize[ ntest ] = n;
                    opts->nsize[ ntest ] = n;
                    opts->ksize[ ntest ] = n;
                    ntest++;
                }
            }
            else {
                fprintf( stderr, "error: --range %s is invalid; ensure start >= 0, stop >= start, step > 0.\n",
                         argv[i] );
                exit(1);
            }
        }
        // save m, n, k if -m, -n, -k is given; applied after loop
        else if ( strcmp("-m", argv[i]) == 0 && i+1 < argc ) {
            m = atoi( argv[++i] );
            magma_assert( m >= 0, "error: -m %s is invalid; ensure m >= 0.\n", argv[i] );
        }
        else if ( strcmp("-n", argv[i]) == 0 && i+1 < argc ) {
            n = atoi( argv[++i] );
            magma_assert( n >= 0, "error: -n %s is invalid; ensure n >= 0.\n", argv[i] );
        }
        else if ( strcmp("-k", argv[i]) == 0 && i+1 < argc ) {
            k = atoi( argv[++i] );
            magma_assert( k >= 0, "error: -k %s is invalid; ensure k >= 0.\n", argv[i] );
        }
        
        // ----- scalar arguments
        else if ( strcmp("--dev", argv[i]) == 0 && i+1 < argc ) {
            opts->device = atoi( argv[++i] );
            magma_assert( opts->device >= 0 && opts->device < ndevices,
                          "error: --dev %s is invalid; ensure dev in [0,%d].\n", argv[i], ndevices-1 );
        }
        else if ( strcmp("--pad", argv[i]) == 0 && i+1 < argc ) {
            opts->pad = atoi( argv[++i] );
            magma_assert( opts->pad >= 1 && opts->pad <= 4096,
                          "error: --pad %s is invalid; ensure pad in [1,4096].\n", argv[i] );
        }
        else if ( strcmp("--nrhs",    argv[i]) == 0 && i+1 < argc ) {
            opts->nrhs = atoi( argv[++i] );
            magma_assert( opts->nrhs >= 0,
                          "error: --nrhs %s is invalid; ensure nrhs >= 0.\n", argv[i] );
        }
        else if ( strcmp("--nb",      argv[i]) == 0 && i+1 < argc ) {
            opts->nb = atoi( argv[++i] );
            magma_assert( opts->nb > 0,
                          "error: --nb %s is invalid; ensure nb > 0.\n", argv[i] );
        }
        else if ( strcmp("--ngpu",    argv[i]) == 0 && i+1 < argc ) {
            opts->ngpu = atoi( argv[++i] );
            magma_assert( opts->ngpu <= MagmaMaxGPUs,
                          "error: --ngpu %s exceeds MagmaMaxGPUs, %d.\n", argv[i], MagmaMaxGPUs );
            magma_assert( opts->ngpu <= ndevices,
                          "error: --ngpu %s exceeds number of CUDA devices, %d.\n", argv[i], ndevices );
            magma_assert( opts->ngpu > 0,
                          "error: --ngpu %s is invalid; ensure ngpu > 0.\n", argv[i] );
            #ifndef _MSC_VER  // not Windows
            // save in environment variable, so magma_num_gpus() picks it up
            setenv( "MAGMA_NUM_GPUS", argv[i], true );
            #endif
        }
        else if ( strcmp("--nstream", argv[i]) == 0 && i+1 < argc ) {
            opts->nstream = atoi( argv[++i] );
            magma_assert( opts->nstream > 0,
                          "error: --nstream %s is invalid; ensure nstream > 0.\n", argv[i] );
        }
        else if ( strcmp("--niter",   argv[i]) == 0 && i+1 < argc ) {
            opts->niter = atoi( argv[++i] );
            magma_assert( opts->niter > 0,
                          "error: --niter %s is invalid; ensure niter > 0.\n", argv[i] );
        }
        else if ( strcmp("--nthread", argv[i]) == 0 && i+1 < argc ) {
            opts->nthread = atoi( argv[++i] );
            magma_assert( opts->nthread > 0,
                          "error: --nthread %s is invalid; ensure nthread > 0.\n", argv[i] );
        }
        else if ( strcmp("--itype",   argv[i]) == 0 && i+1 < argc ) {
            opts->itype = atoi( argv[++i] );
            magma_assert( opts->itype >= 1 && opts->itype <= 3,
                          "error: --itype %s is invalid; ensure itype in [1,2,3].\n", argv[i] );
        }
        else if ( strcmp("--svd_work", argv[i]) == 0 && i+1 < argc ) {
            opts->svd_work = atoi( argv[++i] );
            magma_assert( opts->svd_work >= 0 && opts->svd_work <= 3,
                          "error: --svd_work %s is invalid; ensure svd_work in [0,1,2,3].\n", argv[i] );
        }
        else if ( strcmp("--version", argv[i]) == 0 && i+1 < argc ) {
            opts->version = atoi( argv[++i] );
            magma_assert( opts->version >= 1,
                          "error: --version %s is invalid; ensure version > 0.\n", argv[i] );
        }
        else if ( strcmp("--fraction", argv[i]) == 0 && i+1 < argc ) {
            opts->fraction = atof( argv[++i] );
            magma_assert( opts->fraction >= 0 && opts->fraction <= 1,
                          "error: --fraction %s is invalid; ensure fraction in [0,1].\n", argv[i] );
        }
        else if ( strcmp("--tolerance", argv[i]) == 0 && i+1 < argc ) {
            opts->tolerance = atof( argv[++i] );
            magma_assert( opts->tolerance >= 0 && opts->tolerance <= 1000,
                          "error: --tolerance %s is invalid; ensure tolerance in [0,1000].\n", argv[i] );
        }
        else if ( strcmp("--panel_nthread", argv[i]) == 0 && i+1 < argc ) {
            opts->panel_nthread = atoi( argv[++i] );
            magma_assert( opts->panel_nthread > 0,
                          "error: --panel_nthread %s is invalid; ensure panel_nthread > 0.\n", argv[i] );
        }
        else if ( strcmp("--fraction_dcpu", argv[i]) == 0 && i+1 < argc ) {
            opts->fraction_dcpu = atof( argv[++i] );
            magma_assert( opts->fraction_dcpu > 0 && opts->fraction_dcpu<=1,
                          "error: --fraction_dcpu %s is invalid; ensure fraction_dcpu in [0, 1]\n", argv[i] );
        }
        // ----- boolean arguments
        // check results
        else if ( strcmp("-c",         argv[i]) == 0 ||
                  strcmp("--check",    argv[i]) == 0 ) { opts->check  = 1; }
        else if ( strcmp("-c2",        argv[i]) == 0 ||
                  strcmp("--check2",   argv[i]) == 0 ) { opts->check  = 2; }
        else if ( strcmp("--nocheck",  argv[i]) == 0 ) { opts->check  = 0; }
        else if ( strcmp("-l",         argv[i]) == 0 ||
                  strcmp("--lapack",   argv[i]) == 0 ) { opts->lapack = true;  }
        else if ( strcmp("--nolapack", argv[i]) == 0 ) { opts->lapack = false; }
        else if ( strcmp("--warmup",   argv[i]) == 0 ) { opts->warmup = true;  }
        else if ( strcmp("--nowarmup", argv[i]) == 0 ) { opts->warmup = false; }
        else if ( strcmp("--all",      argv[i]) == 0 ) { opts->all    = true;  }
        else if ( strcmp("--notall",   argv[i]) == 0 ) { opts->all    = false; }
        else if ( strcmp("--verbose",  argv[i]) == 0 ) { opts->verbose= true;  }
        
        // ----- lapack flag arguments
        else if ( strcmp("-L",  argv[i]) == 0 ) { opts->uplo = MagmaLower; }
        else if ( strcmp("-U",  argv[i]) == 0 ) { opts->uplo = MagmaUpper; }
        else if ( strcmp("-F",  argv[i]) == 0 ) { opts->uplo = MagmaUpperLower; }
        
        else if ( strcmp("-NN", argv[i]) == 0 ) { opts->transA = MagmaNoTrans;   opts->transB = MagmaNoTrans;   }
        else if ( strcmp("-NT", argv[i]) == 0 ) { opts->transA = MagmaNoTrans;   opts->transB = MagmaTrans;     }
        else if ( strcmp("-NC", argv[i]) == 0 ) { opts->transA = MagmaNoTrans;   opts->transB = MagmaConjTrans; }
        else if ( strcmp("-TN", argv[i]) == 0 ) { opts->transA = MagmaTrans;     opts->transB = MagmaNoTrans;   }
        else if ( strcmp("-TT", argv[i]) == 0 ) { opts->transA = MagmaTrans;     opts->transB = MagmaTrans;     }
        else if ( strcmp("-TC", argv[i]) == 0 ) { opts->transA = MagmaTrans;     opts->transB = MagmaConjTrans; }
        else if ( strcmp("-CN", argv[i]) == 0 ) { opts->transA = MagmaConjTrans; opts->transB = MagmaNoTrans;   }
        else if ( strcmp("-CT", argv[i]) == 0 ) { opts->transA = MagmaConjTrans; opts->transB = MagmaTrans;     }
        else if ( strcmp("-CC", argv[i]) == 0 ) { opts->transA = MagmaConjTrans; opts->transB = MagmaConjTrans; }
        else if ( strcmp("-T",  argv[i]) == 0 ) { opts->transA = MagmaTrans;     }
        else if ( strcmp("-C",  argv[i]) == 0 ) { opts->transA = MagmaConjTrans; }
        
        else if ( strcmp("-SL", argv[i]) == 0 ) { opts->side  = MagmaLeft;  }
        else if ( strcmp("-SR", argv[i]) == 0 ) { opts->side  = MagmaRight; }
        
        else if ( strcmp("-DN", argv[i]) == 0 ) { opts->diag  = MagmaNonUnit; }
        else if ( strcmp("-DU", argv[i]) == 0 ) { opts->diag  = MagmaUnit;    }
        
        else if ( strcmp("-UA", argv[i]) == 0 ) { opts->jobu  = MagmaAllVec;       }
        else if ( strcmp("-US", argv[i]) == 0 ) { opts->jobu  = MagmaSomeVec;      }
        else if ( strcmp("-UO", argv[i]) == 0 ) { opts->jobu  = MagmaOverwriteVec; }
        else if ( strcmp("-UN", argv[i]) == 0 ) { opts->jobu  = MagmaNoVec;        }
        
        else if ( strcmp("-VA", argv[i]) == 0 ) { opts->jobvt = MagmaAllVec;       }
        else if ( strcmp("-VS", argv[i]) == 0 ) { opts->jobvt = MagmaSomeVec;      }
        else if ( strcmp("-VO", argv[i]) == 0 ) { opts->jobvt = MagmaOverwriteVec; }
        else if ( strcmp("-VN", argv[i]) == 0 ) { opts->jobvt = MagmaNoVec;        }
        
        else if ( strcmp("-JN", argv[i]) == 0 ) { opts->jobz  = MagmaNoVec; }
        else if ( strcmp("-JV", argv[i]) == 0 ) { opts->jobz  = MagmaVec;   }
        
        else if ( strcmp("-LN", argv[i]) == 0 ) { opts->jobvl = MagmaNoVec; }
        else if ( strcmp("-LV", argv[i]) == 0 ) { opts->jobvl = MagmaVec;   }
        
        else if ( strcmp("-RN", argv[i]) == 0 ) { opts->jobvr = MagmaNoVec; }
        else if ( strcmp("-RV", argv[i]) == 0 ) { opts->jobvr = MagmaVec;   }
        
        // ----- misc
        else if ( strcmp("-x",          argv[i]) == 0 ||
                  strcmp("--exclusive", argv[i]) == 0 ) {
            #ifdef USE_FLOCK
            opts->flock_op = LOCK_EX;
            #else
            fprintf( stderr, "ignoring %s: USE_FLOCK not defined; flock not supported.\n", argv[i] );
            #endif
        }
        
        // ----- usage
        else if ( strcmp("-h",     argv[i]) == 0 ||
                  strcmp("--help", argv[i]) == 0 ) {
            fprintf( stderr, usage, argv[0], MAX_NTEST );
            exit(0);
        }
        else {
            fprintf( stderr, "error: unrecognized option %s\n", argv[i] );
            exit(1);
        }
    }
    
    // if -N or --range not given, use default range
    if ( ntest == 0 ) {
        int n2 = 1024 + 64;
        for( int i = 0; i < MAX_NTEST; ++i ) {
            opts->msize[i] = n2;
            opts->nsize[i] = n2;
            opts->ksize[i] = n2;
            n2 += 1024;
        }
        ntest = 10;
    }
    assert( ntest <= MAX_NTEST );
    opts->ntest = ntest;
    
    // fill in msize[:], nsize[:], ksize[:] if -m, -n, -k were given
    if ( m >= 0 ) {
        for( int j = 0; j < MAX_NTEST; ++j ) {
            opts->msize[j] = m;
        }
    }
    if ( n >= 0 ) {
        for( int j = 0; j < MAX_NTEST; ++j ) {
            opts->nsize[j] = n;
        }
    }
    if ( k >= 0 ) {
        for( int j = 0; j < MAX_NTEST; ++j ) {
            opts->ksize[j] = k;
        }
    }
    
    // find max dimensions
    opts->mmax = 0;
    opts->nmax = 0;
    opts->kmax = 0;
    for( int i = 0; i < ntest; ++i ) {
        opts->mmax = max( opts->mmax, opts->msize[i] );
        opts->nmax = max( opts->nmax, opts->nsize[i] );
        opts->kmax = max( opts->kmax, opts->ksize[i] );
    }

    // disallow jobu=O, jobvt=O
    if ( opts->jobu == MagmaOverwriteVec && opts->jobvt == MagmaOverwriteVec ) {
        printf( "jobu and jobvt cannot both be Overwrite.\n" );
        exit(1);
    }
    
    // lock file
    #ifdef USE_FLOCK
    opts->flock_fd = open_lockfile( lockfile, opts->flock_op );
    #endif
    
    // set device
    magma_setdevice( opts->device );
}
Example #28
0
mplugin* mscorefactory_kgpu::create_plugin()
{
    if (first_access) { // first call, from master thread
        first_access = false; // just once, thanks
        // initialize GPU
        int deviceCount = 0;
        int bestDeviceIndex = 0;
        double highestDeviceScore = 0.0;  // device score = clockRate
					  // * multiProcessorCount
        cudaDeviceProp deviceProp;

        cuInit(0);

        // loop through all devices and choose which one to use based on
        // highest device score

        cudaError_t err = cudaGetDeviceCount(&deviceCount);
        if (err != cudaSuccess) {
             logevent(error, init, cudaGetErrorString(err) << endl);
        }
        logevent(info, init, "GPU devices found: " << deviceCount << endl);

        for (int curDeviceNumber=0; 
	     curDeviceNumber < deviceCount; 
	     curDeviceNumber++) {
            logevent(info, init, "testing device " << curDeviceNumber << endl);
            memset(&deviceProp, 0, sizeof(deviceProp));
            if (cudaSuccess == cudaGetDeviceProperties(
		  &deviceProp, curDeviceNumber)) {
	      double curDeviceScore = 
		deviceProp.multiProcessorCount * deviceProp.clockRate;
                if (curDeviceScore > highestDeviceScore) {
                    highestDeviceScore = curDeviceScore;
                    bestDeviceIndex = curDeviceNumber;
                }
            }
            else {
                logevent(
		  error, 
		  init, 
		  "unable to access GPU properties, using non-GPU k-score" 
		    << endl);

                return new mscore_k();
            }
        }

        memset(&deviceProp, 0, sizeof(deviceProp));
        if (cudaSuccess == 
	      cudaGetDeviceProperties(&deviceProp, bestDeviceIndex)) {
            logevent(
	      info,
	      init, 
	      "Using GPU device " << bestDeviceIndex << ": \"" 
	        << deviceProp.name << "\", "
	        << prettymem(deviceProp.totalGlobalMem) << ", " 
	        << deviceProp.multiProcessorCount << " multiprocessors, " 
	        << deviceProp.clockRate/1000 << " MHz" << endl);
            size_t free_mem, total_mem;
            cudaMemGetInfo(&free_mem, &total_mem);
            m_initialFreeMemory = free_mem;
            logevent(
	      info,
	      init, 
	      "device initial free memory: " << prettymem(m_initialFreeMemory) 
	        << endl);
        }
        else {
            logevent(
	      error, 
	      init, 
	      "unable to access device properites, using non-GPU k-score" 
	        << endl);
            return new mscore_k();
        }      

        logevent(info, init, "resetting GPU" << endl);
        if (cudaSuccess != cudaDeviceReset() ) {
            logevent(
	      error,
	      init, 
	      "error resetting GPU, using non-GPU k-score" << endl);
            return new mscore_k();
        }
        logevent(info, initok, "done resetting GPU" << endl);

        if (cudaSuccess != cudaSetDevice(bestDeviceIndex)) {
            logevent(
	      error, 
	      init, 
	      "Error - cannot set GPU, using non-GPU k-score"  << endl);
            return new mscore_k();
        }
        else {
            logevent(
	      info, 
	      initok, 
	      "GPU " << bestDeviceIndex << " initialized." << endl);
        }
        size_t free, total;
        free = total = 0;
        cudaMemGetInfo(&free, &total);
        m_initialFreeMemory = free;
        logevent(
	  info, 
	  init, 
	  "Device memory: " << prettymem(free) << " free / " 
	    << prettymem(total) << " total" << endl);
        if (m_initialFreeMemory) {
            mscore_kgpu_thrust_init(m_initialFreeMemory);
        }

    }
    if (!m_initialFreeMemory) {
        logevent(
	  error, 
	  init, 
	  "Error - insufficient GPU memory, using non-GPU k-score" << endl);
        return new mscore_k();
    }
    return new mscore_kgpu();
}
Example #29
0
int cuda_devices(void)
{
	int count;
	CUDA_ERROR(cudaGetDeviceCount(&count));
	return count;
}
Example #30
0
File: device.cpp Project: LLNL/mfem
static void DeviceSetup(const int dev, int &ngpu)
{
   MFEM_CUDA_CHECK(cudaGetDeviceCount(&ngpu));
   MFEM_VERIFY(ngpu > 0, "No CUDA device found!");
   MFEM_CUDA_CHECK(cudaSetDevice(dev));
}