예제 #1
0
파일: error.hpp 프로젝트: rabauke/cumath
 const char * what() const throw() {
   return cudaGetErrorString(err_num);
 }
예제 #2
0
파일: dnn_fwd.c 프로젝트: aalmah/Theano
int
APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
                         PyGpuArrayObject *om,
                         cudnnConvolutionDescriptor_t desc,
                         double alpha, double beta,
                         PyGpuArrayObject **output,
                         PyGpuContextObject *c) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError,
		    "images and kernel must have the same stack size");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

  switch (input->ga.typecode) {
  case GA_DOUBLE:
    alpha_p = (void *)α
    beta_p = (void *)β
    break;
  case GA_FLOAT:
  case GA_HALF:
    alpha_p = (void *)⁡
    beta_p = (void *)&bf;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
    return 1;
  }

#ifdef CONV_INPLACE
  Py_XDECREF(*output);
  *output = om;
  Py_INCREF(*output);
#else
  if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
                         om->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (beta != 0.0 && pygpu_move(*output, om))
    return 1;
#endif

  if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
    return 1;

  cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;

  cuda_enter(c->ctx);
#ifdef CHOOSE_ALGO
  /* Static variables are only initialized once so this will not
   * reset the previous algo every time */
  static int reuse_algo = 0;
  static cudnnConvolutionFwdAlgo_t prev_algo = CONV_ALGO;

#ifndef CHOOSE_ONCE
  static size_t prev_img_dims[5] = {0};
  static size_t prev_kern_dims[5] = {0};

  reuse_algo = 1;
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    reuse_algo = (reuse_algo &&
                  PyGpuArray_DIM(input, i) == prev_img_dims[i]);
    reuse_algo = (reuse_algo &&
                  PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
  }
#endif

  if (!reuse_algo) {
#ifdef CHOOSE_TIME
    int count;
    cudnnConvolutionFwdAlgoPerf_t choice;
    err = cudnnFindConvolutionForwardAlgorithm(
      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
      desc, APPLY_SPECIFIC(output), 1, &count, &choice);

    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
    algo = choice.algo;
#else
    size_t free = 0, total = 0;
    cudaError_t err2 = cudaMemGetInfo(&free, &total);
    if (err2 != cudaSuccess) {
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                   "memory information on the GPU: %s\n",
                   cudaGetErrorString(err2));
      cuda_exit(c->ctx);
      return 1;
    }

    err = cudnnGetConvolutionForwardAlgorithm(
      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
      desc, APPLY_SPECIFIC(output),
      CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
#endif
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }

#ifdef CHOOSE_ONCE
  reuse_algo = 1;
#else
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    prev_img_dims[i] = PyGpuArray_DIM(input, i);
    prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
  }
#endif

#endif

  /* These two algos are not supported for 3d conv */
  if (PyGpuArray_NDIM(input) == 5 &&
      (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
       algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;

#if CUDNN_VERSION > 3000
  if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                          upscale, &mode);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (stride[0] != 1 || stride[1] != 1 ||
        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
        (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
    }
  }
#endif

#if CUDNN_VERSION < 3000
  /* cuDNN before v3 does not support kernels larger than input even
   * if appropriate padding is selected. */
  for (unsigned int i = 2; i < PyGpuArray_NDIM(input); i++) {
    if (PyGpuArray_DIM(kerns, i) > PyGpuArray_DIM(input, i)) {
      PyErr_SetString(PyExc_RuntimeError, "the current version "
                      "of CuDNN does not support kernels larger than the "
                      "inputs in any spatial dimension, even if the inputs "
                      "are padded such that the padded inputs are larger "
                      "than the kernels. Update your installation of CuDNN "
                      "to V3 or more recent to solve the issue.");
      cuda_exit(c->ctx);
      return 1;
    }
  }
#endif

  {
    size_t worksize;
    gpudata *workspace;
    err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
                                                  APPLY_SPECIFIC(input),
                                                  APPLY_SPECIFIC(kerns),
                                                  desc,
                                                  APPLY_SPECIFIC(output),
                                                  algo,
                                                  &worksize);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting worksize: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    /*
     * This is less than ideal since we need to free it after (which
     * introduces a synchronization point. But we don't have a module
     * to place a nice get_work_mem() function in.
     */
    if (worksize != 0) {
      workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
      if (workspace == NULL) {
        PyErr_SetString(PyExc_RuntimeError,
                        "Could not allocate working memory");
        cuda_exit(c->ctx);
        return 1;
      }
    }

    err = cudnnConvolutionForward(
      APPLY_SPECIFIC(_handle),
      alpha_p,
      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
      APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
      desc, algo,
      worksize == 0 ? NULL : *(void **)workspace, worksize,
      beta_p,
      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));

    if (worksize != 0)
      c->ops->buffer_release(workspace);
  }
  cuda_exit(c->ctx);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
		 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
예제 #3
0
void oskar_cuda_device_info_scan(oskar_CudaDeviceInfo* device, int id)
{
    int arch, device_count = 0;
    cudaError_t error;
    struct cudaDeviceProp device_prop;
    size_t total_memory = 0, free_memory = 0;

    /* Set CUDA device. */
    cudaSetDevice(id);

    /* Set default values in case of errors. */
    device->name[0] = 0;
    device->compute.capability.major = 0;
    device->compute.capability.minor = 0;
    device->supports_double = 0;
    device->global_memory_size = 0;
    device->free_memory = 0;
    device->num_multiprocessors = 0;
    device->num_cores = 0;
    device->gpu_clock = 0;
    device->memory_clock = 0;
    device->memory_bus_width = 0;
    device->level_2_cache_size = 0;
    device->shared_memory_size = 0;
    device->num_registers = 0;
    device->warp_size = 0;
    device->max_threads_per_block = 0;
    device->max_threads_dim[0] = 0;
    device->max_threads_dim[1] = 0;
    device->max_threads_dim[2] = 0;
    device->max_grid_size[0] = 0;
    device->max_grid_size[1] = 0;
    device->max_grid_size[2] = 0;

    /* Get device count. */
    error = cudaGetDeviceCount(&device_count);
    if (error != cudaSuccess || device_count == 0)
    {
        fprintf(stderr, "Unable to determine number of CUDA devices: %s\n",
                cudaGetErrorString(error));
        return;
    }

    /* Check device ID is within range. */
    if (id > device_count - 1)
    {
        fprintf(stderr, "Error: Device ID out of range.\n");
        return;
    }

    /* Get device properties. */
    cudaGetDeviceProperties(&device_prop, id);
    strcpy(device->name, device_prop.name);
    device->compute.capability.major = device_prop.major;
    device->compute.capability.minor = device_prop.minor;
    device->supports_double = 0;
    if (device_prop.major >= 2 || device_prop.minor >= 3)
        device->supports_double = 1;
    total_memory = device_prop.totalGlobalMem / 1024;
    device->global_memory_size = total_memory;
    device->num_multiprocessors = device_prop.multiProcessorCount;
    arch = (device_prop.major << 4) + device_prop.minor;
    switch (arch)
    {
    case 0x10:
    case 0x11:
    case 0x12:
    case 0x13:
        device->num_cores = 8;
        break;
    case 0x20:
        device->num_cores = 32;
        break;
    case 0x21:
        device->num_cores = 48;
        break;
    case 0x30:
    case 0x32:
    case 0x35:
    case 0x37:
        device->num_cores = 192;
        break;
    case 0x50:
    case 0x52:
    case 0x53:
        device->num_cores = 128;
        break;
    case 0x60:
        device->num_cores = 64;
        break;
    case 0x61:
    case 0x62:
        device->num_cores = 128;
        break;
    default:
        device->num_cores = -1;
        break;
    }
    if (device->num_cores > 0)
        device->num_cores *= device->num_multiprocessors;
    device->gpu_clock = device_prop.clockRate;
#if CUDART_VERSION >= 4000
    device->memory_clock = device_prop.memoryClockRate;
    device->memory_bus_width = device_prop.memoryBusWidth;
    device->level_2_cache_size = device_prop.l2CacheSize;
#else
    device->memory_clock = -1;
    device->memory_bus_width = -1;
    device->level_2_cache_size = -1;
#endif

    /* Get free memory size. */
    cudaMemGetInfo(&free_memory, &total_memory);
    free_memory /= 1024;
    device->free_memory = free_memory;

    /* Get block properties. */
    device->shared_memory_size = device_prop.sharedMemPerBlock;
    device->num_registers = device_prop.regsPerBlock;
    device->warp_size = device_prop.warpSize;
    device->max_threads_per_block = device_prop.maxThreadsPerBlock;
    device->max_threads_dim[0] = device_prop.maxThreadsDim[0];
    device->max_threads_dim[1] = device_prop.maxThreadsDim[1];
    device->max_threads_dim[2] = device_prop.maxThreadsDim[2];
    device->max_grid_size[0] = device_prop.maxGridSize[0];
    device->max_grid_size[1] = device_prop.maxGridSize[1];
    device->max_grid_size[2] = device_prop.maxGridSize[2];
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv) 
{
    pArgc = &argc;
    pArgv = argv;

    shrQAStart(argc, argv);

    shrSetLogFileName ("deviceQuery.txt");
    shrLog("%s Starting...\n\n", argv[0]); 
    shrLog(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");
        
    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
    if (error_id != cudaSuccess) {
        shrLog( "cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id) );
        shrQAFinishExit(*pArgc, (const char **)pArgv, QA_FAILED);
    }
    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
        shrLog("There is no device supporting CUDA\n");
    else
        shrLog("Found %d CUDA Capable device(s)\n", deviceCount);

    int dev, driverVersion = 0, runtimeVersion = 0;     
    for (dev = 0; dev < deviceCount; ++dev) {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        shrLog("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

    #if CUDART_VERSION >= 2020
        // Console log
        cudaDriverGetVersion(&driverVersion);
        cudaRuntimeGetVersion(&runtimeVersion);
        shrLog("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
    #endif
        shrLog("  CUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);

        char msg[256];
        sprintf(msg, "  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n", 
                      (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
        shrLog(msg);
    #if CUDART_VERSION >= 2000
        shrLog("  (%2d) Multiprocessors x (%3d) CUDA Cores/MP:    %d CUDA Cores\n", 
			deviceProp.multiProcessorCount,
			ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
			ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
    #endif
        shrLog("  GPU Clock rate:                                %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
    #if CUDART_VERSION >= 4000
	// This is not available in the CUDA Runtime API, so we make the necessary calls the driver API to support this for output
        int memoryClock;
        getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev );
        shrLog("  Memory Clock rate:                             %.0f Mhz\n", memoryClock * 1e-3f);
        int memBusWidth;
        getCudaAttribute<int>( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev );
        shrLog("  Memory Bus Width:                              %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev );
        if (L2CacheSize) {
            shrLog("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
        }

        shrLog("  Max Texture Dimension Size (x,y,z)             1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n",
                                                        deviceProp.maxTexture1D, deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
                                                        deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
        shrLog("  Max Layered Texture Size (dim) x layers        1D=(%d) x %d, 2D=(%d,%d) x %d\n",
                                                        deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1],
                                                        deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);
    #endif
        shrLog("  Total amount of constant memory:               %u bytes\n", deviceProp.totalConstMem); 
        shrLog("  Total amount of shared memory per block:       %u bytes\n", deviceProp.sharedMemPerBlock);
        shrLog("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        shrLog("  Warp size:                                     %d\n", deviceProp.warpSize);
        shrLog("  Maximum number of threads per multiprocessor:  %d\n", deviceProp.maxThreadsPerMultiProcessor);
        shrLog("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
        shrLog("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",
               deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        shrLog("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",
               deviceProp.maxGridSize[0],
               deviceProp.maxGridSize[1],
               deviceProp.maxGridSize[2]);
        shrLog("  Maximum memory pitch:                          %u bytes\n", deviceProp.memPitch);
		shrLog("  Texture alignment:                             %u bytes\n", deviceProp.textureAlignment);

    #if CUDART_VERSION >= 4000
        shrLog("  Concurrent copy and execution:                 %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
    #else
        shrLog("  Concurrent copy and execution:                 %s\n", deviceProp.deviceOverlap ? "Yes" : "No");
    #endif

    #if CUDART_VERSION >= 2020
        shrLog("  Run time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        shrLog("  Integrated GPU sharing Host Memory:            %s\n", deviceProp.integrated ? "Yes" : "No");
        shrLog("  Support host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
    #endif
    #if CUDART_VERSION >= 3000
        shrLog("  Concurrent kernel execution:                   %s\n", deviceProp.concurrentKernels ? "Yes" : "No");
        shrLog("  Alignment requirement for Surfaces:            %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
    #endif
    #if CUDART_VERSION >= 3010
        shrLog("  Device has ECC support enabled:                %s\n", deviceProp.ECCEnabled ? "Yes" : "No");
    #endif
    #if CUDART_VERSION >= 3020
        shrLog("  Device is using TCC driver mode:               %s\n", deviceProp.tccDriver ? "Yes" : "No");
    #endif
    #if CUDART_VERSION >= 4000
        shrLog("  Device supports Unified Addressing (UVA):      %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
        shrLog("  Device PCI Bus ID / PCI location ID:           %d / %d\n", deviceProp.pciBusID, deviceProp.pciDeviceID );
    #endif

    #if CUDART_VERSION >= 2020
        const char *sComputeMode[] = {
            "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
            "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
            "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
            "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
            "Unknown",
            NULL
        };
        shrLog("  Compute Mode:\n");
        shrLog("     < %s >\n", sComputeMode[deviceProp.computeMode]);
    #endif
    }

    // csv masterlog info
    // *****************************
    // exe and CUDA driver name 
    shrLog("\n");    
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";        
    char cTemp[10];
    
    // driver version
    sProfileString += ", CUDA Driver Version = ";
    #ifdef WIN32
	    sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
    #else
	    sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
    #endif
    sProfileString +=  cTemp;
    
    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
    #ifdef WIN32
	    sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
    #else
	    sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
    #endif
    sProfileString +=  cTemp;  
    
    // Device count      
    sProfileString += ", NumDevs = ";
    #ifdef WIN32
        sprintf_s(cTemp, 10, "%d", deviceCount);
    #else
        sprintf(cTemp, "%d", deviceCount);
    #endif
    sProfileString += cTemp;
    
    // First 2 device names, if any
    for (dev = 0; dev < ((deviceCount > 2) ? 2 : deviceCount); ++dev) 
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        sProfileString += ", Device = ";
        sProfileString += deviceProp.name;
    }
    sProfileString += "\n";
    shrLogEx(LOGBOTH | MASTER, 0, sProfileString.c_str());

    // finish
    shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
}
예제 #5
0
파일: NativeOp.cpp 프로젝트: atuxhe/returnn
static void _cudaHandleError(cudaError_t err, const char *file, int line) {
	if (err != cudaSuccess) {
		printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
		exit(EXIT_FAILURE);
	}
}
예제 #6
0
 static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
 {
     if (cudaSuccess != err)
         error(cudaGetErrorString(err), file, line, func);
 }        
예제 #7
0
int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
                        PyGpuArrayObject *km,
                        cudnnConvolutionDescriptor_t desc,
                        double alpha, double beta, PyGpuArrayObject **kerns,
                        PyGpuContextObject *c) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
    PyErr_SetString(PyExc_ValueError,
		    "GpuDnnConv images and kernel must have the same stack size");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;

  switch (input->ga.typecode) {
  case GA_DOUBLE:
    alpha_p = (void *)&alpha;
    beta_p = (void *)&beta;
    break;
  case GA_FLOAT:
  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
    return 1;
  }

#ifdef CONV_INPLACE
  Py_XDECREF(*kerns);
  *kerns = km;
  Py_INCREF(*kerns);
#else
  if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
                         km->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (beta != 0.0 && pygpu_move(*kerns, km))
    return 1;
#endif

  if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

  cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO;

  cuda_enter(c->ctx);

#ifdef CHOOSE_ALGO
  static int reuse_algo = 0;
  static cudnnConvolutionBwdFilterAlgo_t prev_algo = CONV_ALGO;

#ifndef CHOOSE_ONCE
  static size_t prev_img_dims[5] = {0};
  static size_t prev_top_dims[5] = {0};

  reuse_algo = 1;
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    reuse_algo = (reuse_algo &&
                  PyGpuArray_DIM(input, i) == prev_img_dims[i]);
    reuse_algo = (reuse_algo &&
                  PyGpuArray_DIM(output, i) == prev_top_dims[i]);
  }
#endif

  if (!reuse_algo) {
#ifdef CHOOSE_TIME
    int count;
    cudnnConvolutionBwdFilterAlgoPerf_t choice;

    err = cudnnFindConvolutionBackwardFilterAlgorithm(
      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
      APPLY_SPECIFIC(kerns), 1, &count, &choice);

    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    algo = choice.algo;
#else
    size_t free = 0, total = 0;
    cudaError_t err2 = cudaMemGetInfo(&free, &total);
    if (err2 != cudaSuccess){
      cudaGetLastError();
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
                   "information on the GPU: %s\n", cudaGetErrorString(err2));
      cuda_exit(c->ctx);
      return 1;
    }

    err = cudnnGetConvolutionBackwardFilterAlgorithm(
      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
      desc, APPLY_SPECIFIC(kerns),
      CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
#endif
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }

#ifdef CHOOSE_ONCE
  reuse_algo = 1;
#else
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    prev_img_dims[i] = PyGpuArray_DIM(input, i);
    prev_top_dims[i] = PyGpuArray_DIM(output, i);
  }
#endif

#endif

#if CUDNN_VERSION > 3000
  if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT) {
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                          upscale, &mode);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (stride[0] != 1 || stride[1] != 1 ||
        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
        (PyGpuArray_DIM(*kerns, 2) == 1 && PyGpuArray_DIM(*kerns, 3) == 1)) {
      algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
    }
  }
#endif

  size_t worksize;
  gpudata *workspace;

  err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
    APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
    APPLY_SPECIFIC(kerns), algo, &worksize);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
                 cudnnGetErrorString(err));
      cuda_exit(c->ctx);
    return 1;
  }

  if (worksize != 0) {
    workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
    if (workspace == NULL) {
      PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
      cuda_exit(c->ctx);
      return 1;
    }
  }

  cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

  err = cudnnConvolutionBackwardFilter_v3(
    APPLY_SPECIFIC(_handle),
    alpha_p,
    APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
    APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
    desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
    beta_p,
    APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns));

  if (worksize != 0)
    c->ops->buffer_release(workspace);

  cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

  cuda_exit(c->ctx);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
 void CudaSynchronizedMemory<T>::pullFromDevice(size_t nElements)
 {
   ASRL_ASSERT_GT_DBG(m_size,0, "The array is empty");
   if(nElements > m_size)
     nElements = m_size;
   cudaError_t err = (cudaMemcpy((void*) m_host, (void *)m_device, nElements*sizeof(T), cudaMemcpyDeviceToHost));
   ASRL_ASSERT_EQ(err, cudaSuccess, "Unable to copy " << typeid(T).name() << " array of size " << m_size << " from device (" << err << "): " << cudaGetErrorString(err));
 }
 void CudaSynchronizedMemory<T>::pullFromDeviceAsync(cudaStream_t stream, size_t nElements)
 {
   ASRL_ASSERT_GT(m_size,0, "The array is empty");
   ASRL_ASSERT(m_pageLocked, "Asynchronous transfer is only valid for page-locked host memory");
   if(nElements > m_size)
     nElements = m_size;
   cudaError_t err = (cudaMemcpyAsync((void*) m_host, (void *)m_device, nElements*sizeof(T), cudaMemcpyDeviceToHost, stream));
   ASRL_ASSERT_EQ(err,cudaSuccess, "Unable to copy " << typeid(T).name() << " array of size " << m_size << " from device. Stream " << stream << ": (" << err << "): " << cudaGetErrorString(err));
 }
예제 #10
0
void compute_process(int agents_total, int nreps, int world_width, int world_height)
{
	int np, pid;
	MPI_Comm_rank(MPI_COMM_WORLD, &pid);
	MPI_Comm_size(MPI_COMM_WORLD, &np);
	int server_process = np - 1;
	MPI_Status status;

	/* create a type for struct agent */
	const int nitems=5;
   	int blocklengths[5] = {1,1,1,1,1};
   	MPI_Datatype types[5] = {MPI_INT, MPI_INT, MPI_INT, MPI_FLOAT, MPI_FLOAT};
	MPI_Datatype mpi_agent_type;
	MPI_Aint offsets[5];

	offsets[0] = offsetof(agent, id);
    	offsets[1] = offsetof(agent, x);
    	offsets[2] = offsetof(agent, y);
    	offsets[3] = offsetof(agent, z);
    	offsets[4] = offsetof(agent, w);

	MPI_Type_create_struct(nitems, blocklengths, offsets, types, &mpi_agent_type);
	MPI_Type_commit(&mpi_agent_type);

	unsigned int num_bytes = agents_total * sizeof(float4);
	unsigned int num_halo_points = RADIO * world_width;
	unsigned int num_halo_bytes = num_halo_points * sizeof(short int);

	//unsigned int world_node_height = (world_height / (np-1)) + (RADIO * 2);
	//if(pid == 0 or pid == np - 2)
	//	world_node_height -= RADIO;
 
	size_t size_world = world_width * world_height * sizeof(short int);
	short int *h_world = (short int *)malloc(size_world);
	*h_world = 0;
	short int *d_world;

	for(int j = 0; j < world_width * world_height; j++)
	{	
		h_world[j] = 0;
	}

	/* alloc host memory */
	agent *h_agents_in = (agent *)malloc(num_bytes);
	//agent *d_agents_in;
	float4 *h_agents_pos;
	float4 *d_agents_pos;
	
	
	//MPI_Recv(rcv_address, num_points, MPI_FLOAT, server_process, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
	MPI_Recv(h_agents_in, agents_total, mpi_agent_type, server_process, 0, MPI_COMM_WORLD, &status);

	//Iniatialize world
	for( int i = 0; i < agents_total; i++)
	{
		h_world[(world_width * (h_agents_in[i].y - 1) ) + h_agents_in[i].x] = (h_agents_in[i].x!=0?1:0);
		//if(h_world[(world_width * (h_agents_in[i].y - 1) ) + h_agents_in[i].x] == 1)
			//printf("world x: %d, y: %d\n", h_agents_in[i].x, h_agents_in[i].y);	
		h_agents_pos[i].x = h_agents_in[i].x;
		h_agents_pos[i].y = h_agents_in[i].y;
		h_agents_pos[i].z = h_agents_in[i].z;
		h_agents_pos[i].w = h_agents_in[i].w;
	}

/***
	if(pid ==1)
{
	int k=0;
	for(int j = 0; j < world_width * world_height; j++)
	{	
		if ( j%96 == 0 and j>0)
		{
			k++;
			printf("%d row: %d\n", h_world[j], k);
		}
		else
			printf("%d ", h_world[j]);
	}
}
***/

	// Error code to check return values for CUDA calls
        cudaError_t err = cudaSuccess;

	// Allocate the device pointer
    	err = cudaMalloc((void **)&d_world, size_world);

	if (err != cudaSuccess)
	{
        	fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
        }

	err = cudaMemcpy(d_world, h_world, size_world, cudaMemcpyHostToDevice);

	if (err != cudaSuccess)
    	{
        	fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
    	}


	//http://cuda-programming.blogspot.com.es/2013/02/cuda-array-in-cuda-how-to-use-cuda.html
	//http://stackoverflow.com/questions/17924705/structure-of-arrays-vs-array-of-structures-in-cuda
	// Allocate the device pointer

    	err = cudaMalloc((void **)&d_agents_pos, num_bytes);

	if (err != cudaSuccess)
	{
        	fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
        }

	err = cudaMemcpy(d_agents_pos, h_agents_pos, num_bytes, cudaMemcpyHostToDevice);

	if (err != cudaSuccess)
    	{
        	fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
    	}


	launch_kernel(d_agents_pos, d_world, world_width, world_height );

	MPI_Barrier( MPI_COMM_WORLD);

#ifdef DEBUG
//	printf("pid: %d\n", pid);
//	display_data(h_agents_in, agents_total );
#endif

	MPI_Send(h_agents_in, agents_total, mpi_agent_type, server_process, DATA_COLLECT, MPI_COMM_WORLD);


	/* Release resources */
	free(h_agents_in); 
/*	
	free(h_output);
	cudaFreeHost(h_left_boundary); cudaFreeHost(h_right_boundary);
	cudaFreeHost(h_left_halo); cudaFreeHost(h_right_halo);
	cudaFree(d_input); cudaFree(d_output);
*/
}
예제 #11
0
void GridGpu::gridding()
{
    cudaError_t status = kernelCall();
    if (status != cudaSuccess)
        qWarning() << cudaGetErrorString(status);
}
예제 #12
0
bool
CudaGLVertexBuffer::allocate() {

    int size = _numElements * _numVertices * sizeof(float);

    glGenBuffers(1, &_vbo);

#if defined(GL_EXT_direct_state_access)
    if (glNamedBufferDataEXT) {
        glNamedBufferDataEXT(_vbo, size, 0, GL_DYNAMIC_DRAW);
    } else {
#else
    {
#endif
        glBindBuffer(GL_ARRAY_BUFFER, _vbo);
        glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);
        glBindBuffer(GL_ARRAY_BUFFER, 0);
    }

    // register vbo as cuda resource
    cudaError_t err = cudaGraphicsGLRegisterBuffer(
        &_cudaResource, _vbo, cudaGraphicsMapFlagsWriteDiscard);

    if (err != cudaSuccess) return false;
    return true;
}

void
CudaGLVertexBuffer::map() {

    if (_devicePtr) return;
    size_t num_bytes;
    void *ptr;

    cudaError_t err = cudaGraphicsMapResources(1, &_cudaResource, 0);
    if (err != cudaSuccess)
        Far::Error(Far::FAR_RUNTIME_ERROR,
                   "CudaGLVertexBuffer::map failed.\n%s\n",
                   cudaGetErrorString(err));
    err = cudaGraphicsResourceGetMappedPointer(&ptr, &num_bytes, _cudaResource);
    if (err != cudaSuccess)
        Far::Error(Far::FAR_RUNTIME_ERROR,
                   "CudaGLVertexBuffer::map failed.\n%s\n",
                   cudaGetErrorString(err));
    _devicePtr = ptr;
}

void
CudaGLVertexBuffer::unmap() {

    if (_devicePtr == NULL) return;
    cudaError_t err = cudaGraphicsUnmapResources(1, &_cudaResource, 0);
    if (err != cudaSuccess)
       Far::Error(Far::FAR_RUNTIME_ERROR,
                  "CudaGLVertexBuffer::unmap failed.\n%s\n",
                  cudaGetErrorString(err));
    _devicePtr = NULL;
}


}  // end namespace Osd
예제 #13
0
void testCuda(int m, int n, int nnz, std::vector<int>& rows, std::vector<int>& cols,
		std::vector<double>& values, double* matB){

    double tol=1e-9;
    double start, stop, time_to_build, time_to_solve;

    int cudaDevice = 0;

    checkCudaErrors(cudaSetDevice(cudaDevice));

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, cudaDevice);
    printf("Device Number: %d\n", cudaDevice);
    printf("  Device name: %s\n", prop.name);
    checkCudaErrors(cudaDeviceReset());

	 size_t mem_tot = 0;
	 size_t mem_free = 0;

	 cudaMemGetInfo(&mem_free, & mem_tot);
	 printf("\nFree memory: %d", mem_free);

	MatSparse matA;
    matA.setSize(m, n);

    std::vector<int> I, J;
    std::vector<double> V;

    for (int k = 0; k < nnz; k++){
    	double _val = values[k];
    	int i = rows[k];
    	int j = cols[k];

    	if (fabs(_val) > tol){
        	I.push_back(i-1);
        	J.push_back(j-1);
        	V.push_back(_val);
        }
    }

    start = second();
    matA.fromTruples(I, J, V);
    stop = second();
    time_to_build = stop - start;
    std::cerr << "Time to Build in GPU (second): " << time_to_build << std::endl;


    // ******************************** GPU SOLVER ******************************** //

    // --- Initialize cuSPARSE
     	cusolverSpHandle_t cusolver_handle = NULL; checkCudaErrors(cusolverSpCreate(&cusolver_handle));
     	cusparseHandle_t cusparse_handle = NULL; checkCudaErrors(cusparseCreate(&cusparse_handle));
     	cudaStream_t cudaStream = NULL; checkCudaErrors(cudaStreamCreate(&cudaStream));
     	checkCudaErrors(cusolverSpSetStream(cusolver_handle, cudaStream));
     	checkCudaErrors(cusparseSetStream(cusparse_handle, cudaStream));


        cusparseMatDescr_t descrA;      checkCudaErrors(cusparseCreateMatDescr(&descrA));
        checkCudaErrors(cusparseSetMatType     (descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
        checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO));


        printf("\nAlloc GPU memory...\n");
        double *d_A;            checkCudaErrors(cudaMalloc(&d_A, nnz * sizeof(double)));
        int *d_A_RowIndices;    checkCudaErrors(cudaMalloc(&d_A_RowIndices, (m + 1) * sizeof(int)));
        int *d_A_ColIndices;    checkCudaErrors(cudaMalloc(&d_A_ColIndices, nnz * sizeof(int)));
        double *d_x;        checkCudaErrors(cudaMalloc(&d_x, m * sizeof(double)));
        double *d_b; checkCudaErrors(cudaMalloc(&d_b, m * sizeof(double)));
        printf("\nError: %s", cudaGetErrorString(cudaGetLastError()));

        printf("\nCopying data...\n");
        checkCudaErrors(cudaMemcpy(d_A, matA.valuesPtr(), nnz * sizeof(double), cudaMemcpyHostToDevice));
        checkCudaErrors(cudaMemcpy(d_A_RowIndices, matA.RowPtr(), (m + 1) * sizeof(int), cudaMemcpyHostToDevice));
        checkCudaErrors(cudaMemcpy(d_A_ColIndices, matA.ColIdxPtr(), nnz * sizeof(int), cudaMemcpyHostToDevice));
        checkCudaErrors(cudaMemcpy(d_b, matB, m * sizeof(double), cudaMemcpyHostToDevice));

        double *h_x = (double *)malloc(m * sizeof(double));

        printf("\nError: %s", cudaGetErrorString(cudaGetLastError()));
        cudaMemGetInfo(&mem_free, &mem_tot);
        printf("\nFree memory: %d", mem_free);

        int reorder = 0;
        int singularity = 0;
        start = second();
        //checkCudaErrors(cusolverSpDcsrlsvluHost(cusolver_handle, Nrows, nnz, descrA, sparse.Values(),
        	//	sparse.RowPtr(), sparse.ColIdx(), mB.values, tol, reorder, h_x, &singularity));
        checkCudaErrors(cusolverSpDcsrlsvqr(cusolver_handle, m, nnz, descrA, d_A, d_A_RowIndices,
               		d_A_ColIndices, d_b, tol, reorder, d_x, &singularity));
        checkCudaErrors(cudaDeviceSynchronize());
        stop = second();
        time_to_solve = stop - start;


        checkCudaErrors(cudaMemcpy(h_x, d_x, m * sizeof(double), cudaMemcpyDeviceToHost));

//        for (int k=0; k<mA.getNumRows(); k++) solution[k] = h_x[k];


        checkCudaErrors(cusparseDestroy(cusparse_handle));
        checkCudaErrors(cusolverSpDestroy(cusolver_handle));
        checkCudaErrors(cudaStreamDestroy(cudaStream));
        checkCudaErrors(cudaFree(d_b));
        checkCudaErrors(cudaFree(d_x));

        checkCudaErrors(cudaFree(d_A));
        checkCudaErrors(cudaFree(d_A_RowIndices));
        checkCudaErrors(cudaFree(d_A_ColIndices));

        free(h_x);

        std::cerr << "Time to Build in GPU (second): " << time_to_build << std::endl;
        std::cerr << "Time to Solve in GPU (second): " << time_to_solve << std::endl;
        std::cerr << "done!";

    // ****************************************************************************** //
}
예제 #14
0
const char* WINAPI wine_cudaGetErrorString(cudaError_t error) {
    WINE_TRACE("\n");
    return cudaGetErrorString(error);
}
예제 #15
0
파일: gpuops.c 프로젝트: cbasasif/bart
static void cuda_error(int line, cudaError_t code)
{
	const char *err_str = cudaGetErrorString(code);
	error("cuda error: %d %s \n", line, err_str);
}
예제 #16
0
////////////////////////////////////////////////////////////////////////////////
//! Run the Cuda part of the computation
////////////////////////////////////////////////////////////////////////////////
void refreshData(struct cudaGraphicsResource **vbo_resource)
{
	int np; 
	MPI_Comm_size(MPI_COMM_WORLD, &np);
	MPI_Status status;

	int num_comp_nodes = np -1;
	unsigned int num_bytes = sizeof(sAgents);
	sAgents h_agents_in, h_agents_out[num_comp_nodes];
        size_t size_agents = agents_total * sizeof(float4);

        // map OpenGL buffer object for writing from CUDA
        float4 *d_agents;
        // Error code to check return values for CUDA calls
        cudaError_t err = cudaSuccess;

        checkCudaErrors(cudaGraphicsMapResources(1, vbo_resource, 0));
        size_t num_agents_bytes;
        checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_agents, &num_agents_bytes,
                                                         *vbo_resource));
		
	//for(int process = 0; process < num_comp_nodes; process++)
	//while(1)
	//{
		/* Wait for nodes to compute */
		MPI_Barrier(MPI_COMM_WORLD);

		for(int process = 0; process < num_comp_nodes; process++)
		{

			MPI_Recv(&h_agents_out[process], num_bytes, MPI_BYTE, process, DATA_COLLECT, MPI_COMM_WORLD, &status); 

			for( int i = 0; i < agents_total; i++)
			{
				if( h_agents_out[process].ids[i].y == 1 )
				{
					h_agents_in.pos[i] = h_agents_out[process].pos[i]; 
					h_agents_in.ids[i] = h_agents_out[process].ids[i];
					h_agents_in.pos[i].z = 0;
					h_agents_in.pos[i].w = 1;
				}
			}
	
		}
		#ifdef DEBUG
	        printf("Final Data:\n");	
		display_data(h_agents_in);
		#endif
		
		/***
		//pasar los valores al vbo
	        glBindBuffer(GL_ARRAY_BUFFER, vbo);
   	        unsigned int size = agent_width * agent_height * 4 * sizeof(float);    
	        glBufferSubData(GL_ARRAY_BUFFER, 0, size, &h_agents_in.pos);

	        glBindBuffer(GL_ARRAY_BUFFER, 0);
		***/

		// Copy the host pointer memory to the device memory
    		printf("Copy pointer from the host memory to the CUDA device\n");
    		err = cudaMemcpy(d_agents, h_agents_in.pos, size_agents, cudaMemcpyHostToDevice);

    		if (err != cudaSuccess)
    		{
    		    fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err));
    		    exit(EXIT_FAILURE);
    		}
		//glutPostRedisplay();
	        printf("sale de refreshData:\n");	

	//}

    // unmap buffer object
    checkCudaErrors(cudaGraphicsUnmapResources(1, vbo_resource, 0));
	
	/* release resources */
	//free(&h_agents_in);
	//free(&h_agents_out); 
}
예제 #17
0
int main(int argc, char **argv) {
	printf("Computing Game Of Life On %d x %d Board.\n", DIM_X, DIM_Y);
	
	int *host_current, *host_future, *host_future_naive, *host_future_cached;
	int *gpu_current, *gpu_future;
	clock_t start, stop;
	
	cudaMallocHost((void**) &host_current, DIM_X * DIM_Y * sizeof(int));
	cudaMallocHost((void**) &host_future, DIM_X * DIM_Y * sizeof(int));	
	cudaMallocHost((void**) &host_future_naive, DIM_X * DIM_Y * sizeof(int));
	cudaMallocHost((void**) &host_future_cached, DIM_X * DIM_Y * sizeof(int));
	assert(cudaGetLastError() == cudaSuccess);
	
	cudaMalloc((void**) &gpu_current, DIM_X * DIM_Y * sizeof(int));
	cudaMalloc((void**) &gpu_future, DIM_X * DIM_Y * sizeof(int));
	printf("%s\n", cudaGetErrorString(cudaGetLastError()));
	assert(cudaGetLastError() == cudaSuccess);
	
	fill_board(host_current, 40); 
	add_glider(host_current);
	
	cudaMemcpy(gpu_current, host_current, DIM_X * DIM_Y * sizeof(int), cudaMemcpyHostToDevice);

//	print_board(host_current);
	
	float time_naive, time_cached, time_cpu;
	for(int i = 1; i < STEPS; i++) {
		printf("=========\n");
		
		start = clock();
		naive_game_of_life_wrapper(gpu_current, gpu_future);
		cudaMemcpy(host_future_naive, gpu_future, DIM_X * DIM_Y * sizeof(int), cudaMemcpyDeviceToHost);
		stop = clock();
		time_naive = (float)(stop - start)/CLOCKS_PER_SEC;
		printf("Time for Naive GPU To Compute Next Phase: %.5f s\n", time_naive);
		
		start = clock();
		cached_game_of_life_wrapper(gpu_current, gpu_future);
		cudaMemcpy(host_future_cached, gpu_future, DIM_X * DIM_Y * sizeof(int), cudaMemcpyDeviceToHost);
		stop = clock();
		time_cached = (float)(stop - start)/CLOCKS_PER_SEC;
		printf("Time for Cached GPU To Compute Next Phase: %.5f s\n", time_cached);
		
		start = clock();
		update_board(host_current, host_future);
		stop = clock();
		time_cpu = (float)(stop - start)/CLOCKS_PER_SEC;
		printf("Time for CPU To Compute Next Phase: %.5f s\n", time_cpu);
		
		printf("speedup for naive = %.2f; speedup for cached = %.2f; speedup for cached over naive = %.2f\n", time_cpu/time_naive, time_cpu/time_cached, time_naive/time_cached);

		check_boards(host_future, host_future_naive);
		check_boards(host_future, host_future_cached);
				
		int *temp;
		
		temp = host_current;
		host_current = host_future;
		host_future = temp;
		
		temp = gpu_current;
		gpu_current = gpu_future;
		gpu_future = temp;
	}
	
	cudaFree(host_future);
	cudaFree(host_future_naive);
	cudaFree(host_future_cached);
	cudaFree(host_current);
	cudaFree(gpu_current);
	cudaFree(gpu_future);
	
	return 0;
}
예제 #18
0
void compute_process()
{
	int np, pid;
	MPI_Comm_rank(MPI_COMM_WORLD, &pid);
	MPI_Comm_size(MPI_COMM_WORLD, &np);
	int server_process = np - 1;
	MPI_Status status;

	int num_comp_nodes = np -1;
	unsigned int num_bytes = sizeof(sAgents);
	unsigned int num_halo_points = RADIO * world_width;
	unsigned int num_halo_bytes = num_halo_points * sizeof(int);
 
	size_t size_world = world_width * world_height * sizeof(int);
	int *h_world = (int *)malloc(size_world);
	int *d_world;

	int left_neighbor = (pid > 0) ? (pid - 1) : MPI_PROC_NULL;
	int right_neighbor = (pid < np -2) ? (pid + 1) : MPI_PROC_NULL;


	for(int j = 0; j < world_width * world_height; j++)
	{	
		h_world[j] = 0;
	}


	sAgents h_agents_in, h_agents_left_node, h_agents_right_node;
	float4 h_agents_pos[agents_total], h_agents_ids[agents_total];
	float4 *d_agents_pos, *d_agents_ids;
	unsigned int num_bytes_agents = agents_total * sizeof(float4);

	int world_height_node = world_height / num_comp_nodes;

	// Error code to check return values for CUDA calls
        cudaError_t err = cudaSuccess;

	// Allocate the device pointer
    	err = cudaMalloc((void **)&d_world, size_world);

	if (err != cudaSuccess)
	{
        	fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
        }

    	err = cudaMalloc((void **)&d_agents_pos, num_bytes_agents);

	if (err != cudaSuccess)
	{
        	fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
        }

    	err = cudaMalloc((void **)&d_agents_ids, num_bytes_agents);

	if (err != cudaSuccess)
	{
        	fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
        }
		
	MPI_Recv(&h_agents_in, num_bytes, MPI_BYTE, server_process, 0, MPI_COMM_WORLD, &status);

	for(int i = 0; i < agents_total; i++)
	{
		//identify the active agents according to the y coordinate and set the busy cells in the world
		if(  ( round(h_agents_in.pos[i].y) >= (pid * world_height_node) ) and ( round(h_agents_in.pos[i].y) < ( (pid + 1) * world_height_node ) )  )
		{	
			h_agents_in.ids[i].y = 1;
			h_world[(int)round( (world_width * (h_agents_in.pos[i].y - 1) ) + h_agents_in.pos[i].x )] = h_agents_in.ids[i].x;
		}
		//Copy the data to a local arrays
		h_agents_pos[i] = h_agents_in.pos[i];
		h_agents_ids[i] = h_agents_in.ids[i];
	}


	err = cudaMemcpy(d_world, h_world, size_world, cudaMemcpyHostToDevice);

	if (err != cudaSuccess)
    	{
        	fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
    	}


	//for(int it = 0; it < nreps ; it++)
	while(1)
	{
		int it=4;
		err = cudaMemcpy(d_agents_pos, h_agents_pos, num_bytes_agents, cudaMemcpyHostToDevice);

		if (err != cudaSuccess)
	    	{
	        	fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err));
	        	exit(EXIT_FAILURE);
	    	}
	
		err = cudaMemcpy(d_agents_ids, h_agents_ids, num_bytes_agents, cudaMemcpyHostToDevice);

		if (err != cudaSuccess)
	    	{
	        	fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err));
	        	exit(EXIT_FAILURE);
	    	}


		launch_kernel(d_agents_pos, d_agents_ids, d_world, world_width, world_height, agent_width, agent_height, world_height_node, pid );

		cudaMemcpy(h_agents_pos, d_agents_pos, num_bytes_agents, cudaMemcpyDeviceToHost);
		cudaMemcpy(h_agents_ids, d_agents_ids, num_bytes_agents, cudaMemcpyDeviceToHost);

		//copy the data to the struct
		for( int i = 0; i < agents_total; i++)
		{
			h_agents_in.pos[i] = h_agents_pos[i];
			h_agents_in.ids[i] = h_agents_ids[i];
		}


		MPI_Barrier(MPI_COMM_WORLD);
		MPI_Send(&h_agents_in, num_bytes, MPI_BYTE, server_process, DATA_COLLECT, MPI_COMM_WORLD);

		#ifdef DEBUG
		//printf("pid: %d\n", pid);
		//display_data(h_agents_in);
		#endif

		// send data to left, get data from right 
		MPI_Sendrecv(&h_agents_in, num_bytes, MPI_BYTE, left_neighbor, it, &h_agents_right_node, num_bytes, MPI_BYTE, right_neighbor, it, MPI_COMM_WORLD, &status);

		// send data to right, get data from left 
		MPI_Sendrecv(&h_agents_in, num_bytes, MPI_BYTE, right_neighbor, it, &h_agents_left_node, num_bytes, MPI_BYTE, left_neighbor, it, MPI_COMM_WORLD, &status);

		for( int i = 0; i < agents_total; i++)
		{
			if(pid != np-2)
			{
				if(h_agents_right_node.ids[i].y == 2)
				{
					h_agents_in.pos[i] = h_agents_right_node.pos[i];
					h_agents_pos[i] = h_agents_right_node.pos[i];
					h_agents_in.ids[i].y = 1;			
					h_agents_ids[i].y = 1;	
				}
			}
			if(pid != 0)				
			{
				if(h_agents_left_node.ids[i].y == 3)
				{
					h_agents_in.pos[i] = h_agents_left_node.pos[i];
					h_agents_pos[i] = h_agents_left_node.pos[i];
					h_agents_in.ids[i].y = 1;			
					h_agents_ids[i].y = 1;	
				}
			}
		}

/***
		if(pid == 1)
		{	
			printf("pid: %d\n", pid);
			display_data(h_agents_in);
			display_data(h_agents_right_node);
			display_data(h_agents_left_node);
		}
***/

	}


	/* Release resources */
//	free(h_agents_in); 
/*	
	free(h_output);
	cudaFreeHost(h_left_boundary); cudaFreeHost(h_right_boundary);
	cudaFreeHost(h_left_halo); cudaFreeHost(h_right_halo);
	cudaFree(d_input); cudaFree(d_output);
*/
}
예제 #19
0
int main(int argc, char *argv[])
{
    // needed to work correctly with piped benchmarkrunner
    setlinebuf(stdout);
    setlinebuf(stdin);

    int n_indices = 1;
    int n_dimensions = 1;
    char inBuf[200]; // ridiculously large input buffer.
    
    bool isFirst = true;

  do {

    // Allocate memory for the arrays
    int *h_indices = 0;
    double        *h_outputGPU  = 0;

    try
    {
        h_indices = new int [n_indices * n_dimensions];
        h_outputGPU  = new double [n_indices * n_dimensions];
    }
    catch (std::exception e)
    {
        std::cerr << "Caught exception: " << e.what() << std::endl;
        std::cerr << "Unable to allocate CPU memory (try running with fewer vectors/dimensions)" << std::endl;
        return -1;
    }

    int *d_indices;
    double        *d_output;

    try
    {
        cudaError_t cudaResult;
        cudaResult = cudaMalloc((void **)&d_indices, n_dimensions * n_indices * sizeof(int));

        if (cudaResult != cudaSuccess)
        {
            throw std::runtime_error(cudaGetErrorString(cudaResult));
        }
    }
    catch (std::runtime_error e)
    {
        std::cerr << "Caught exception: " << e.what() << std::endl;
        std::cerr << "Unable to allocate GPU memory (try running with fewer vectors/dimensions)" << std::endl;
        return -1;
    }

    // Initialize the indices (done on the host)
    for(int i = 0; i < n_indices; i++) {
      h_indices[i] = i;
    }

    // Copy the indices to the device
    cudaMemcpy(d_indices, h_indices, n_dimensions * n_indices * sizeof(int), cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();

    // Execute the QRNG on the device
    int n_vec;
    sobol_nikola_unsimplified(n_indices, d_indices, n_indices, &d_output, &n_vec);

    cudaDeviceSynchronize();

    cudaMemcpy(h_outputGPU, d_output, n_indices * n_dimensions * sizeof(double), cudaMemcpyDeviceToHost);

    // Cleanup and terminate
    delete h_indices;
    cudaFree(d_indices);
    cudaFree(d_output);

    if(!isFirst) {
      printf("RESULT ");

      for(int i = 0; i < std::min(n_indices,10); i++)
        printf("%f ", h_outputGPU[i]);

      printf("\n");
    }
    else {
      printf("OK\n");
      isFirst = false;
    }

    delete h_outputGPU;

      fgets(inBuf, 200, stdin);

      if (sscanf(inBuf, "%u", &n_indices) == 0)
      {
        // if input is not a number, it has to be "EXIT"
        if (strncmp("EXIT",inBuf,4)==0)
        {
          printf("OK\n");
          break;
        }
        else
        {
          printf("ERROR. Bad input: %s\n", inBuf);
          break;
        }
      }

    } while (true);

    cudaDeviceReset();
    return -1;
}
예제 #20
0
	void check_error(cudaError_t ret) const {
		if( ret != cudaSuccess ) {
			throw std::runtime_error(cudaGetErrorString(ret));
		}
	}
예제 #21
0
//--------------------------------------------------------------------------
// CUDA init
//--------------------------------------------------------------------------
bool CUDAContext::configInit( )
{
#ifdef EQUALIZER_USE_CUDA
    cudaDeviceProp props;
    uint32_t device = getPipe()->getDevice();

    // Setup the CUDA device
    if( device == LB_UNDEFINED_UINT32 )
    {
        device = _getFastestDeviceID();
        LBWARN << "No CUDA device, using the fastest device: " << device
               << std::endl;
    }

    int device_count = 0;
    cudaGetDeviceCount( &device_count );
    LBINFO << "CUDA devices found: " << device_count << std::endl;
    LBASSERT( static_cast< uint32_t >( device_count ) > device );
    if( static_cast< uint32_t >( device_count ) <= device )
    {
        sendError( ERROR_CUDACONTEXT_DEVICE_NOTFOUND )
            << lexical_cast< std::string >( device );
        return false;
    }

    // We assume GL interop here, otherwise use cudaSetDevice( device );
    // Attention: this call requires a valid GL context!
    cudaGLSetGLDevice( device );

    int usedDevice = static_cast< int >( device );
#ifdef _WIN32

    HGPUNV handle = 0;

    if( !WGLEW_NV_gpu_affinity )
    {
        LBWARN <<"WGL_NV_gpu_affinity unsupported, ignoring device setting"
               << std::endl;
        return true;
    }

    if( !wglEnumGpusNV( device, &handle ))
    {
        LBWARN << "wglEnumGpusNV failed : " << lunchbox::sysError << std::endl;
        return false;
    }

    cudaWGLGetDevice( &usedDevice, handle );
#else
    cudaGetDevice( &usedDevice );
#endif
    LBASSERT( device == static_cast< uint32_t >( device ));
    cudaGetDeviceProperties( &props, usedDevice );

    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err)
    {
        sendError( ERROR_CUDACONTEXT_INIT_FAILED ) <<
            std::string( cudaGetErrorString( err ));
        return false;
    }

    LBINFO << "Using CUDA device: " << device << std::endl;
    return true;
#else
    sendError( ERROR_CUDACONTEXT_MISSING_SUPPORT );
    return false;
#endif
}
예제 #22
0
파일: common.hpp 프로젝트: Achraf33/opencv
 static inline void checkCudaError(cudaError_t err, const char* file, const int line, const char* func)
 {
     if (cudaSuccess != err)
         cv::error(cv::Error::GpuApiCallError, cudaGetErrorString(err), func, file, line);
 }
void ControlCubeCache::_reSizeCache()
{
	_nLevels = _nextnLevels;
	_levelCube = _nextLevelCube;
	_offset	= _nextOffset;
	_nextnLevels = 0;
	_nextLevelCube = 0;

	_dimCube = exp2(_nLevels - _levelCube) + 2 * CUBE_INC;

	_sizeElement = pow(_dimCube, 3); 

	int dimV = exp2(_nLevels);
	_minValue = coordinateToIndex(vmml::vector<3,int>(0,0,0), _levelCube, _nLevels);
	_maxValue = coordinateToIndex(vmml::vector<3,int>(dimV-1,dimV-1,dimV-1), _levelCube, _nLevels);

	int dc = exp2(_nLevels - _levelCube);
	vmml::vector<3,int> mn = _cpuCache->getMinCoord();
	vmml::vector<3,int> mx = _cpuCache->getMaxCoord();
	_maxC = mx - mn;
	if ((mx.x() - mn.x()) % dc != 0)
		_maxC[0] += dc;
	if ((mx.y() - mn.y()) % dc != 0)
		_maxC[1] += dc;
	if ((mx.z() - mn.z()) % dc != 0)
		_maxC[2] += dc;

	if (cudaSuccess != cudaSetDevice(_device))
	{
		std::cerr<<"Control Cube Cache, error setting device: "<<cudaGetErrorString(cudaGetLastError())<<std::endl;
		throw;
	}
	if (_memory != 0)
		if (cudaSuccess != cudaFree((void*)_memory))
		{                                                                                               
			std::cerr<<"Control Cube Cache, error resizing cache: "<<cudaGetErrorString(cudaGetLastError())<<std::endl;
			throw;
		}
	size_t total = 0;
	size_t free = 0;

	if (cudaSuccess != cudaMemGetInfo(&free, &total))
	{
		std::cerr<<"Control Cube Cache, error resizing cache: "<<cudaGetErrorString(cudaGetLastError())<<std::endl;
		throw;
	}

	float memorySize = (0.80f*free); // Get 80% of free memory

	_maxNumCubes = memorySize/ (_sizeElement*sizeof(float));
	if (_maxNumCubes == 0)
	{
		std::cerr<<"Control Cube Cache: Memory aviable is not enough "<<memorySize/1024/1024<<" MB"<<std::endl;
		throw;
	}

	if (cudaSuccess != cudaMalloc((void**)&_memory, _maxNumCubes*_sizeElement*sizeof(float)))
	{
		std::cerr<<"Control Cube Cache, error resizing cache: "<<cudaGetErrorString(cudaGetLastError())<<std::endl;
		throw;
	}

	_freeSlots = _maxNumCubes;

	ControlElementCache::_reSizeCache();
}
예제 #24
0
static void HandleError(const char *file, int line, cudaError_t err)
{
    printf("ERROR in %s:%d: %s (%d)\n", file, line,
           cudaGetErrorString(err), err);
    exit(1);
}
예제 #25
0
void cudaAssert(cudaError code, const char *file, int line){
   if (code != cudaSuccess){
      fprintf(stderr,"cudaErrChk: %s %s %d\n", cudaGetErrorString(code), file, line);
      //exit(code);
   }
}
예제 #26
0
void MFNHashTypePlainCUDA::freeThreadAndDeviceMemory() {
    trace_printf("MFNHashTypePlainCUDA::freeThreadAndDeviceMemory()\n");

    cudaError_t err;

    // Free all the memory, then look for errors.
    cudaFree((void *)this->DeviceHashlistAddress);
    cudaFreeHost((void *)this->HostSuccessAddress);

    delete[] this->HostSuccessReportedAddress;

    // Only cudaFree if zeroCopy is in use.
    if (!this->useZeroCopy) {
        cudaFree((void *)this->DeviceSuccessAddress);
        cudaFree((void *)this->DeviceFoundPasswordsAddress);

    }
    
    cudaFreeHost((void *)this->HostFoundPasswordsAddress);

    cudaFreeHost((void*)this->HostStartPointAddress);
    cudaFree((void *)this->DeviceStartPointAddress);
    cudaFree((void *)this->DeviceStartPasswords32Address);

    // Free salted hashes if in use.
    if (this->hashAttributes.hashUsesWordlist) {
        cudaFree((void *)this->DeviceWordlistBlocks);
        cudaFree((void *)this->DeviceWordlistLengths);
    }

    if (this->hashAttributes.hashUsesSalt) {
        cudaFree((void *)this->DeviceSaltLengthsAddress);
        cudaFree((void *)this->DeviceSaltValuesAddress);
    }
    
    // Only free the bitmap memory if it has been allocated.
    if (this->DeviceBitmap256kb_Address) {
        cudaFree((void *)this->DeviceBitmap256kb_Address);
        this->DeviceBitmap256kb_Address = 0;
    }
    if (this->DeviceBitmap128mb_a_Address) {
        cudaFree((void *)this->DeviceBitmap128mb_a_Address);
        this->DeviceBitmap128mb_a_Address = 0;
    }
    if (this->DeviceBitmap128mb_b_Address) {
        cudaFree((void *)this->DeviceBitmap128mb_b_Address);
        this->DeviceBitmap128mb_b_Address = 0;
    }
    if (this->DeviceBitmap128mb_c_Address) {
        cudaFree((void *)this->DeviceBitmap128mb_c_Address);
        this->DeviceBitmap128mb_c_Address = 0;
    }
    if (this->DeviceBitmap128mb_d_Address) {
        cudaFree((void *)this->DeviceBitmap128mb_d_Address);
        this->DeviceBitmap128mb_d_Address = 0;
    }

    // Get any error that occurred above and report it.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("Thread %d: CUDA error freeing memory: %s. Exiting.\n",
                this->threadId, cudaGetErrorString( err));
        exit(1);
    }
}
예제 #27
0
파일: gpu_data.cpp 프로젝트: davisking/dlib
    void gpu_data::
    set_size(
        size_t new_size
    )
    {
        if (new_size == 0)
        {
            if (device_in_use)
            {
                // Wait for any possible CUDA kernels that might be using our memory block to
                // complete before we free the memory.
                synchronize_stream(0);
                device_in_use = false;
            }
            wait_for_transfer_to_finish();
            data_size = 0;
            host_current = true;
            device_current = true;
            device_in_use = false;
            data_host.reset();
            data_device.reset();
        }
        else if (new_size != data_size)
        {
            if (device_in_use)
            {
                // Wait for any possible CUDA kernels that might be using our memory block to
                // complete before we free the memory.
                synchronize_stream(0);
                device_in_use = false;
            }
            wait_for_transfer_to_finish();
            data_size = new_size;
            host_current = true;
            device_current = true;
            device_in_use = false;

            try
            {
                CHECK_CUDA(cudaGetDevice(&the_device_id));

                // free memory blocks before we allocate new ones.
                data_host.reset();
                data_device.reset();

                void* data;
                CHECK_CUDA(cudaMallocHost(&data, new_size*sizeof(float)));
                // Note that we don't throw exceptions since the free calls are invariably
                // called in destructors.  They also shouldn't fail anyway unless someone
                // is resetting the GPU card in the middle of their program.
                data_host.reset((float*)data, [](float* ptr){
                    auto err = cudaFreeHost(ptr);
                    if(err!=cudaSuccess)
                        std::cerr << "cudaFreeHost() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                });

                CHECK_CUDA(cudaMalloc(&data, new_size*sizeof(float)));
                data_device.reset((float*)data, [](float* ptr){
                    auto err = cudaFree(ptr);
                    if(err!=cudaSuccess)
                        std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                });

                if (!cuda_stream)
                {
                    cudaStream_t cstream;
                    CHECK_CUDA(cudaStreamCreateWithFlags(&cstream, cudaStreamNonBlocking));
                    cuda_stream.reset(cstream, [](void* ptr){
                        auto err = cudaStreamDestroy((cudaStream_t)ptr);
                        if(err!=cudaSuccess)
                            std::cerr << "cudaStreamDestroy() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                    });
                }

            }
            catch(...)
            {
                set_size(0);
                throw;
            }
        }
    }
예제 #28
0
extern int scanhash_groestlcoin(int thr_id, uint32_t *pdata, uint32_t *ptarget,
    uint32_t max_nonce, uint32_t *hashes_done)
{
	static THREAD uint32_t *foundNounce = nullptr;

    uint32_t start_nonce = pdata[19];
	unsigned int intensity = (device_sm[device_map[thr_id]] > 500) ? 24 : 23;
	uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1U << intensity);
	uint32_t throughput = min(throughputmax, max_nonce - start_nonce) & 0xfffffc00;

    if (opt_benchmark)
        ptarget[7] = 0x0000000f;

    // init
	static THREAD volatile bool init = false;
	if(!init)
    {
		CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
		cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
		cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
		CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id]));

		groestlcoin_cpu_init(thr_id, throughputmax);
		CUDA_SAFE_CALL(cudaMallocHost(&foundNounce, 2 * 4));
		init = true;
    }

    // Endian Drehung ist notwendig
    uint32_t endiandata[32];
    for (int kk=0; kk < 32; kk++)
        be32enc(&endiandata[kk], pdata[kk]);

    // Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt)
    groestlcoin_cpu_setBlock(thr_id, endiandata);

	do
	{
		// GPU
		const uint32_t Htarg = ptarget[7];

		groestlcoin_cpu_hash(thr_id, throughput, pdata[19], foundNounce, ptarget[7]);

		if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);}
		if(foundNounce[0] < 0xffffffff)
		{
			uint32_t tmpHash[8];
			endiandata[19] = SWAP32(foundNounce[0]);
			groestlhash(tmpHash, endiandata);

			if(tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget))
			{
				int res = 1;
				if(opt_benchmark)
					applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], foundNounce[0]);
				*hashes_done = pdata[19] - start_nonce + throughput;
				if(foundNounce[1] != 0xffffffff)
				{
					endiandata[19] = SWAP32(foundNounce[1]);
					groestlhash(tmpHash, endiandata);
					if(tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget))
					{
						pdata[21] = foundNounce[1];
						res++;
						if(opt_benchmark)
							applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], foundNounce[1]);
					}
					else
					{
						if(tmpHash[7] != Htarg)
						{
							applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNounce[1]);
						}
					}
				}
				pdata[19] = foundNounce[0];
				return res;
			}
			else
			{
				if(tmpHash[7] != Htarg)
				{
					applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNounce[0]);
				}
			}
		}

		pdata[19] += throughput;
		cudaError_t err = cudaGetLastError();
		if(err != cudaSuccess)
		{
			applog(LOG_ERR, "GPU #%d: %s", device_map[thr_id], cudaGetErrorString(err));
			exit(EXIT_FAILURE);
		}
	} while(!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput)));

    *hashes_done = pdata[19] - start_nonce;
    return 0;
}
예제 #29
0
파일: init.c 프로젝트: mchandra/petsc
PetscErrorCode  PetscOptionsCheckInitial_Private(void)
{
  char           string[64],mname[PETSC_MAX_PATH_LEN],*f;
  MPI_Comm       comm = PETSC_COMM_WORLD;
  PetscBool      flg1 = PETSC_FALSE,flg2 = PETSC_FALSE,flg3 = PETSC_FALSE,flag;
  PetscErrorCode ierr;
  PetscReal      si;
  PetscInt       intensity;
  int            i;
  PetscMPIInt    rank;
  char           version[256];
#if !defined(PETSC_HAVE_THREADSAFETY)
  PetscReal      logthreshold;
  PetscBool      flg4 = PETSC_FALSE;
#endif

  PetscFunctionBegin;
  ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);

#if !defined(PETSC_HAVE_THREADSAFETY)
  /*
      Setup the memory management; support for tracing malloc() usage
  */
  ierr = PetscOptionsHasName(NULL,"-malloc_log",&flg3);CHKERRQ(ierr);
  logthreshold = 0.0;
  ierr = PetscOptionsGetReal(NULL,"-malloc_log_threshold",&logthreshold,&flg1);CHKERRQ(ierr);
  if (flg1) flg3 = PETSC_TRUE;
#if defined(PETSC_USE_DEBUG)
  ierr = PetscOptionsGetBool(NULL,"-malloc",&flg1,&flg2);CHKERRQ(ierr);
  if ((!flg2 || flg1) && !petscsetmallocvisited) {
    if (flg2 || !(PETSC_RUNNING_ON_VALGRIND)) {
      /* turn off default -malloc if valgrind is being used */
      ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr);
    }
  }
#else
  ierr = PetscOptionsGetBool(NULL,"-malloc_dump",&flg1,NULL);CHKERRQ(ierr);
  ierr = PetscOptionsGetBool(NULL,"-malloc",&flg2,NULL);CHKERRQ(ierr);
  if (flg1 || flg2 || flg3) {ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr);}
#endif
  if (flg3) {
    ierr = PetscMallocSetDumpLogThreshold((PetscLogDouble)logthreshold);CHKERRQ(ierr);
  }
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-malloc_debug",&flg1,NULL);CHKERRQ(ierr);
  if (flg1) {
    ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr);
    ierr = PetscMallocDebug(PETSC_TRUE);CHKERRQ(ierr);
  }
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-malloc_test",&flg1,NULL);CHKERRQ(ierr);
#if defined(PETSC_USE_DEBUG)
  if (flg1 && !PETSC_RUNNING_ON_VALGRIND) {
    ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr);
    ierr = PetscMallocSetDumpLog();CHKERRQ(ierr);
    ierr = PetscMallocDebug(PETSC_TRUE);CHKERRQ(ierr);
  }
#endif

  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-malloc_info",&flg1,NULL);CHKERRQ(ierr);
  if (!flg1) {
    flg1 = PETSC_FALSE;
    ierr = PetscOptionsGetBool(NULL,"-memory_view",&flg1,NULL);CHKERRQ(ierr);
  }
  if (flg1) {
    ierr = PetscMemorySetGetMaximumUsage();CHKERRQ(ierr);
  }
#endif

#if defined(PETSC_USE_LOG)
  ierr = PetscOptionsHasName(NULL,"-objects_dump",&PetscObjectsLog);CHKERRQ(ierr);
#endif

  /*
      Set the display variable for graphics
  */
  ierr = PetscSetDisplay();CHKERRQ(ierr);

  /*
      Print the PETSc version information
  */
  ierr = PetscOptionsHasName(NULL,"-v",&flg1);CHKERRQ(ierr);
  ierr = PetscOptionsHasName(NULL,"-version",&flg2);CHKERRQ(ierr);
  ierr = PetscOptionsHasName(NULL,"-help",&flg3);CHKERRQ(ierr);
  if (flg1 || flg2 || flg3) {

    /*
       Print "higher-level" package version message
    */
    if (PetscExternalVersionFunction) {
      ierr = (*PetscExternalVersionFunction)(comm);CHKERRQ(ierr);
    }

    ierr = PetscGetVersion(version,256);CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"--------------------------------------------\
------------------------------\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"%s\n",version);CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"%s",PETSC_AUTHOR_INFO);CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"See docs/changes/index.html for recent updates.\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"See docs/faq.html for problems.\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"See docs/manualpages/index.html for help. \n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"Libraries linked from %s\n",PETSC_LIB_DIR);CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"--------------------------------------------\
------------------------------\n");CHKERRQ(ierr);
  }

  /*
       Print "higher-level" package help message
  */
  if (flg3) {
    if (PetscExternalHelpFunction) {
      ierr = (*PetscExternalHelpFunction)(comm);CHKERRQ(ierr);
    }
  }

  /*
      Setup the error handling
  */
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-on_error_abort",&flg1,NULL);CHKERRQ(ierr);
  if (flg1) {
    ierr = MPI_Comm_set_errhandler(PETSC_COMM_WORLD,MPI_ERRORS_ARE_FATAL);CHKERRQ(ierr);
    ierr = PetscPushErrorHandler(PetscAbortErrorHandler,0);CHKERRQ(ierr);
  }
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-on_error_mpiabort",&flg1,NULL);CHKERRQ(ierr);
  if (flg1) { ierr = PetscPushErrorHandler(PetscMPIAbortErrorHandler,0);CHKERRQ(ierr);}
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-mpi_return_on_error",&flg1,NULL);CHKERRQ(ierr);
  if (flg1) {
    ierr = MPI_Comm_set_errhandler(comm,MPI_ERRORS_RETURN);CHKERRQ(ierr);
  }
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-no_signal_handler",&flg1,NULL);CHKERRQ(ierr);
  if (!flg1) {ierr = PetscPushSignalHandler(PetscSignalHandlerDefault,(void*)0);CHKERRQ(ierr);}
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-fp_trap",&flg1,NULL);CHKERRQ(ierr);
  if (flg1) {ierr = PetscSetFPTrap(PETSC_FP_TRAP_ON);CHKERRQ(ierr);}
  ierr = PetscOptionsGetInt(NULL,"-check_pointer_intensity",&intensity,&flag);CHKERRQ(ierr);
  if (flag) {ierr = PetscCheckPointerSetIntensity(intensity);CHKERRQ(ierr);}

  /*
      Setup debugger information
  */
  ierr = PetscSetDefaultDebugger();CHKERRQ(ierr);
  ierr = PetscOptionsGetString(NULL,"-on_error_attach_debugger",string,64,&flg1);CHKERRQ(ierr);
  if (flg1) {
    MPI_Errhandler err_handler;

    ierr = PetscSetDebuggerFromString(string);CHKERRQ(ierr);
    ierr = MPI_Comm_create_errhandler((MPI_Handler_function*)Petsc_MPI_DebuggerOnError,&err_handler);CHKERRQ(ierr);
    ierr = MPI_Comm_set_errhandler(comm,err_handler);CHKERRQ(ierr);
    ierr = PetscPushErrorHandler(PetscAttachDebuggerErrorHandler,0);CHKERRQ(ierr);
  }
  ierr = PetscOptionsGetString(NULL,"-debug_terminal",string,64,&flg1);CHKERRQ(ierr);
  if (flg1) { ierr = PetscSetDebugTerminal(string);CHKERRQ(ierr); }
  ierr = PetscOptionsGetString(NULL,"-start_in_debugger",string,64,&flg1);CHKERRQ(ierr);
  ierr = PetscOptionsGetString(NULL,"-stop_for_debugger",string,64,&flg2);CHKERRQ(ierr);
  if (flg1 || flg2) {
    PetscMPIInt    size;
    PetscInt       lsize,*nodes;
    MPI_Errhandler err_handler;
    /*
       we have to make sure that all processors have opened
       connections to all other processors, otherwise once the
       debugger has stated it is likely to receive a SIGUSR1
       and kill the program.
    */
    ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
    if (size > 2) {
      PetscMPIInt dummy = 0;
      MPI_Status  status;
      for (i=0; i<size; i++) {
        if (rank != i) {
          ierr = MPI_Send(&dummy,1,MPI_INT,i,109,PETSC_COMM_WORLD);CHKERRQ(ierr);
        }
      }
      for (i=0; i<size; i++) {
        if (rank != i) {
          ierr = MPI_Recv(&dummy,1,MPI_INT,i,109,PETSC_COMM_WORLD,&status);CHKERRQ(ierr);
        }
      }
    }
    /* check if this processor node should be in debugger */
    ierr  = PetscMalloc1(size,&nodes);CHKERRQ(ierr);
    lsize = size;
    ierr  = PetscOptionsGetIntArray(NULL,"-debugger_nodes",nodes,&lsize,&flag);CHKERRQ(ierr);
    if (flag) {
      for (i=0; i<lsize; i++) {
        if (nodes[i] == rank) { flag = PETSC_FALSE; break; }
      }
    }
    if (!flag) {
      ierr = PetscSetDebuggerFromString(string);CHKERRQ(ierr);
      ierr = PetscPushErrorHandler(PetscAbortErrorHandler,0);CHKERRQ(ierr);
      if (flg1) {
        ierr = PetscAttachDebugger();CHKERRQ(ierr);
      } else {
        ierr = PetscStopForDebugger();CHKERRQ(ierr);
      }
      ierr = MPI_Comm_create_errhandler((MPI_Handler_function*)Petsc_MPI_AbortOnError,&err_handler);CHKERRQ(ierr);
      ierr = MPI_Comm_set_errhandler(comm,err_handler);CHKERRQ(ierr);
    }
    ierr = PetscFree(nodes);CHKERRQ(ierr);
  }

  ierr = PetscOptionsGetString(NULL,"-on_error_emacs",emacsmachinename,128,&flg1);CHKERRQ(ierr);
  if (flg1 && !rank) {ierr = PetscPushErrorHandler(PetscEmacsClientErrorHandler,emacsmachinename);CHKERRQ(ierr);}

  /*
        Setup profiling and logging
  */
#if defined(PETSC_USE_INFO)
  {
    char logname[PETSC_MAX_PATH_LEN]; logname[0] = 0;
    ierr = PetscOptionsGetString(NULL,"-info",logname,250,&flg1);CHKERRQ(ierr);
    if (flg1 && logname[0]) {
      ierr = PetscInfoAllow(PETSC_TRUE,logname);CHKERRQ(ierr);
    } else if (flg1) {
      ierr = PetscInfoAllow(PETSC_TRUE,NULL);CHKERRQ(ierr);
    }
  }
#endif
#if defined(PETSC_USE_LOG)
  mname[0] = 0;
  ierr = PetscOptionsGetString(NULL,"-history",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr);
  if (flg1) {
    if (mname[0]) {
      ierr = PetscOpenHistoryFile(mname,&petsc_history);CHKERRQ(ierr);
    } else {
      ierr = PetscOpenHistoryFile(NULL,&petsc_history);CHKERRQ(ierr);
    }
  }
#if defined(PETSC_HAVE_MPE)
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsHasName(NULL,"-log_mpe",&flg1);CHKERRQ(ierr);
  if (flg1) {ierr = PetscLogMPEBegin();CHKERRQ(ierr);}
#endif
  flg1 = PETSC_FALSE;
  flg2 = PETSC_FALSE;
  flg3 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-log_all",&flg1,NULL);CHKERRQ(ierr);
  ierr = PetscOptionsGetBool(NULL,"-log",&flg2,NULL);CHKERRQ(ierr);
  ierr = PetscOptionsHasName(NULL,"-log_summary",&flg3);CHKERRQ(ierr);
  ierr = PetscOptionsHasName(NULL,"-log_view",&flg4);CHKERRQ(ierr);
  if (flg1)                      { ierr = PetscLogAllBegin();CHKERRQ(ierr); }
  else if (flg2 || flg3 || flg4) { ierr = PetscLogBegin();CHKERRQ(ierr);}

  ierr = PetscOptionsGetString(NULL,"-log_trace",mname,250,&flg1);CHKERRQ(ierr);
  if (flg1) {
    char name[PETSC_MAX_PATH_LEN],fname[PETSC_MAX_PATH_LEN];
    FILE *file;
    if (mname[0]) {
      sprintf(name,"%s.%d",mname,rank);
      ierr = PetscFixFilename(name,fname);CHKERRQ(ierr);
      file = fopen(fname,"w");
      if (!file) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_FILE_OPEN,"Unable to open trace file: %s",fname);
    } else file = PETSC_STDOUT;

    ierr = PetscLogTraceBegin(file);CHKERRQ(ierr);
  }
#endif

  ierr = PetscOptionsGetBool(NULL,"-saws_options",&PetscOptionsPublish,NULL);CHKERRQ(ierr);

#if defined(PETSC_HAVE_CUDA)
  ierr = PetscOptionsHasName(NULL,"-cuda_show_devices",&flg1);CHKERRQ(ierr);
  if (flg1) {
    struct cudaDeviceProp prop;
    int                   devCount;
    int                   device;
    cudaError_t           err = cudaSuccess;

    err = cudaGetDeviceCount(&devCount);
    if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaGetDeviceCount %s",cudaGetErrorString(err));
    for (device = 0; device < devCount; ++device) {
      err = cudaGetDeviceProperties(&prop, device);
      if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaGetDeviceProperties %s",cudaGetErrorString(err));
      ierr = PetscPrintf(PETSC_COMM_WORLD, "CUDA device %d: %s\n", device, prop.name);CHKERRQ(ierr);
    }
  }
  {
    int size;
    ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
    if (size>1) {
      int         devCount, device, rank;
      cudaError_t err = cudaSuccess;

      /* check to see if we force multiple ranks to hit the same GPU */
      ierr = PetscOptionsGetInt(NULL,"-cuda_set_device", &device, &flg1);CHKERRQ(ierr);
      if (flg1) {
        err = cudaSetDevice(device);
        if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDevice %s",cudaGetErrorString(err));
      } else {
        /* we're not using the same GPU on multiple MPI threads. So try to allocated different   GPUs to different processes */

        /* First get the device count */
        err   = cudaGetDeviceCount(&devCount);
        if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaGetDeviceCount %s",cudaGetErrorString(err));

        /* next determine the rank and then set the device via a mod */
        ierr   = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);
        device = rank % devCount;
        err    = cudaSetDevice(device);
        if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDevice %s",cudaGetErrorString(err));
      }

      /* set the device flags so that it can map host memory ... do NOT throw exception on err!=cudaSuccess
       multiple devices may try to set the flags on the same device. So long as one of them succeeds, things
       are ok. */
      err = cudaSetDeviceFlags(cudaDeviceMapHost);
      if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDeviceFlags %s",cudaGetErrorString(err));
    } else {
      int         device;
      cudaError_t err = cudaSuccess;

      /* the code below works for serial GPU simulations */
      ierr = PetscOptionsGetInt(NULL,"-cuda_set_device", &device, &flg1);CHKERRQ(ierr);
      if (flg1) {
        err = cudaSetDevice(device);
        if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDevice %s",cudaGetErrorString(err));
      }

      /* set the device flags so that it can map host memory ... here, we error check. */
      err = cudaSetDeviceFlags(cudaDeviceMapHost);
      if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDeviceFlags %s",cudaGetErrorString(err));
    }
  }
#endif


  /*
       Print basic help message
  */
  ierr = PetscOptionsHasName(NULL,"-help",&flg1);CHKERRQ(ierr);
  if (flg1) {
    ierr = (*PetscHelpPrintf)(comm,"Options for all PETSc programs:\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -help: prints help method for each option\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -on_error_abort: cause an abort when an error is detected. Useful \n ");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"       only when run in the debugger\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -on_error_attach_debugger [gdb,dbx,xxgdb,ups,noxterm]\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"       start the debugger in new xterm\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"       unless noxterm is given\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -start_in_debugger [gdb,dbx,xxgdb,ups,noxterm]\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"       start all processes in the debugger\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -on_error_emacs <machinename>\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"    emacs jumps to error file\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -debugger_nodes [n1,n2,..] Nodes to start in debugger\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -debugger_pause [m] : delay (in seconds) to attach debugger\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -stop_for_debugger : prints message on how to attach debugger manually\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"                      waits the delay for you to attach\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -display display: Location where X window graphics and debuggers are displayed\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -no_signal_handler: do not trap error signals\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -mpi_return_on_error: MPI returns error code, rather than abort on internal error\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -fp_trap: stop on floating point exceptions\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"           note on IBM RS6000 this slows run greatly\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc_dump <optional filename>: dump list of unfreed memory at conclusion\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc: use our error checking malloc\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc no: don't use error checking malloc\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc_info: prints total memory usage\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc_log: keeps log of all memory allocations\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -malloc_debug: enables extended checking for memory corruption\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -options_table: dump list of options inputted\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -options_left: dump list of unused options\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -options_left no: don't dump list of unused options\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -tmp tmpdir: alternative /tmp directory\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -shared_tmp: tmp directory is shared by all processors\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -not_shared_tmp: each processor has separate tmp directory\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -memory_view: print memory usage at end of run\n");CHKERRQ(ierr);
#if defined(PETSC_USE_LOG)
    ierr = (*PetscHelpPrintf)(comm," -get_total_flops: total flops over all processors\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -log[_summary _summary_python]: logging objects and events\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -log_trace [filename]: prints trace of all PETSc calls\n");CHKERRQ(ierr);
#if defined(PETSC_HAVE_MPE)
    ierr = (*PetscHelpPrintf)(comm," -log_mpe: Also create logfile viewable through Jumpshot\n");CHKERRQ(ierr);
#endif
    ierr = (*PetscHelpPrintf)(comm," -info <optional filename>: print informative messages about the calculations\n");CHKERRQ(ierr);
#endif
    ierr = (*PetscHelpPrintf)(comm," -v: prints PETSc version number and release date\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -options_file <file>: reads options from file\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm," -petsc_sleep n: sleeps n seconds before running program\n");CHKERRQ(ierr);
    ierr = (*PetscHelpPrintf)(comm,"-----------------------------------------------\n");CHKERRQ(ierr);
  }

#if defined(PETSC_HAVE_POPEN)
  {
  char machine[128];
  ierr = PetscOptionsGetString(NULL,"-popen_machine",machine,128,&flg1);CHKERRQ(ierr);
  if (flg1) {
    ierr = PetscPOpenSetMachine(machine);CHKERRQ(ierr);
  }
  }
#endif

  ierr = PetscOptionsGetReal(NULL,"-petsc_sleep",&si,&flg1);CHKERRQ(ierr);
  if (flg1) {
    ierr = PetscSleep(si);CHKERRQ(ierr);
  }

  ierr = PetscOptionsGetString(NULL,"-info_exclude",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr);
  if (flg1) {
    ierr = PetscStrstr(mname,"null",&f);CHKERRQ(ierr);
    if (f) {
      ierr = PetscInfoDeactivateClass(0);CHKERRQ(ierr);
    }
  }

#if defined(PETSC_HAVE_CUSP) || defined(PETSC_HAVE_VIENNACL)
  ierr = PetscOptionsHasName(NULL,"-log_summary",&flg3);CHKERRQ(ierr);
  if (!flg3) {
  ierr = PetscOptionsHasName(NULL,"-log_view",&flg3);CHKERRQ(ierr);
  }
#endif
#if defined(PETSC_HAVE_CUSP)
  ierr = PetscOptionsGetBool(NULL,"-cusp_synchronize",&flg3,NULL);CHKERRQ(ierr);
  PetscCUSPSynchronize = flg3;
#elif defined(PETSC_HAVE_VIENNACL)
  ierr = PetscOptionsGetBool(NULL,"-viennacl_synchronize",&flg3,NULL);CHKERRQ(ierr);
  PetscViennaCLSynchronize = flg3;
#endif
  PetscFunctionReturn(0);
}
예제 #30
0
파일: main.cpp 프로젝트: vls/vlssrc.thesis
int
main( int argc,char** argv)
{
	printf("hello world\n");

	if (!InitCUDA())
	{
		return 0;
	}



	int iter = 1000;
	int trainnum = 20;
	bool isProfiler = false;
	int intProfiler = 0;
	int testnum = -1;
	float maxtime = 0.0f;
	cutGetCmdLineArgumenti(argc, (const char**) argv, "train", &trainnum);
	cutGetCmdLineArgumenti(argc, (const char**) argv, "iter", &iter);
	cutGetCmdLineArgumenti(argc, (const char**) argv, "profiler", &intProfiler);
	cutGetCmdLineArgumenti(argc, (const char**) argv, "test", &testnum);
	cutGetCmdLineArgumentf(argc, (const char**) argv, "maxtime", &maxtime);
	printf("%d\n", intProfiler);
	if(intProfiler)
	{
		isProfiler = true;
	}
	if(testnum == -1) testnum = trainnum /2;
	printf("Iter = %d\n", iter);
	printf("TrainNum = %d\n", trainnum);
	printf("TestNum = %d\n", testnum);

	CUT_DEVICE_INIT(argc, argv);


	cublasStatus status;
	status = cublasInit();
	if(status != CUBLAS_STATUS_SUCCESS)
	{
		printf("Can't init cublas\n");
		printf("%s\n", cudaGetErrorString(cudaGetLastError()));
		return -1;
	}


	Image* imageList = new Image[trainnum+testnum];
	read64("my_optdigits.tra", imageList, trainnum + testnum);

	const int warmUpTime = 3;
	if(!isProfiler)
	{
		freopen("verbose.txt", "w", stdout);
		for(int i=0;i< warmUpTime;i++)
		{
			runImage(argc, argv, imageList, trainnum < warmUpTime ? trainnum : warmUpTime, 0, 10, false, 0.0f);
		}
		freopen("CON", "w", stdout);
		printf("Warm-up complete.\n\n\n");
	}
#ifdef _DEBUG
	freopen("out.txt", "w", stdout);
#endif // _DEBUG
	runImage(argc, argv, imageList, trainnum, testnum, iter, true, maxtime);
	freopen("CON", "w", stdout);
	delete[] imageList;
	//TestReduce();
	
	cublasShutdown();
	if(!isProfiler)
	{
		CUT_EXIT(argc, argv);
	}
	//getchar();
	return 0;
}