int main(int argc,char **argv){

// Print GPU properties
//print_properties();

// Files to print the result after the last time step
FILE *rho_file;
FILE *E_file;
rho_file = fopen("rho_final.txt", "w");
E_file = fopen("E_final.txt", "w");

// Construct initial condition for problem
ICsinus Config(-1.0, 1.0, -1.0, 1.0); 
//ICsquare Config(0.5,0.5,gasGam);

// Set initial values for Configuration 1
/*
Config.set_rho(rhoConfig19);
Config.set_pressure(pressureConfig19);
Config.set_u(uConfig19);
Config.set_v(vConfig19);
*/

// Determine the global border based on leftover tiles (a little hack)
int globalPadding;
globalPadding = (nx+2*border+16)/16;
globalPadding = 16*globalPadding - (nx+2*border);
//printf("Globalpad: %i\n", globalPadding);

// Change border to add padding
//border = border + globalPadding/2;

// Initialize the matrices for the unknowns in the Euler equations
cpu_ptr_2D rho(nx, ny, border,1);
cpu_ptr_2D E(nx, ny, border,1);
cpu_ptr_2D rho_u(nx, ny, border,1);
cpu_ptr_2D rho_v(nx, ny, border,1);
cpu_ptr_2D zeros(nx, ny, border,1);

// Set initial condition
Config.setIC(rho, rho_u, rho_v, E);

double timeStart = get_wall_time();

// Dummy arrays for the computed solution and the exact solution
cpu_ptr_2D rho_dummy(nx, ny, border);
cpu_ptr_2D E_dummy(nx, ny, border);

/*
rho_dummy.xmin = -1.0;
rho_dummy.ymin = -1.0;
E_dummy.xmin = -1.0;
E_dummy.ymin = -1.0;
*/

// Set block and grid sizes
dim3 gridBC = dim3(1, 1, 1);
dim3 blockBC = dim3(BLOCKDIM_BC,1,1);

dim3 gridBlockFlux;
dim3 threadBlockFlux;

dim3 gridBlockRK;
dim3 threadBlockRK;

computeGridBlock(gridBlockFlux, threadBlockFlux, nx + 2*border, ny + 2*border, INNERTILEDIM_X, INNERTILEDIM_Y, BLOCKDIM_X, BLOCKDIM_Y);

computeGridBlock(gridBlockRK, threadBlockRK, nx + 2*border, ny + 2*border, BLOCKDIM_X_RK, BLOCKDIM_Y_RK, BLOCKDIM_X_RK, BLOCKDIM_Y_RK);

int nElements = gridBlockFlux.x*gridBlockFlux.y;

// Allocate memory for the GPU pointers
gpu_ptr_1D L_device(nElements);
gpu_ptr_1D dt_device(1);

gpu_ptr_2D rho_device(nx, ny, border);
gpu_ptr_2D E_device(nx, ny, border);
gpu_ptr_2D rho_u_device(nx, ny, border);
gpu_ptr_2D rho_v_device(nx, ny, border); 

gpu_ptr_2D R0(nx, ny, border);
gpu_ptr_2D R1(nx, ny, border);
gpu_ptr_2D R2(nx, ny, border);
gpu_ptr_2D R3(nx, ny, border);

gpu_ptr_2D Q0(nx, ny, border);
gpu_ptr_2D Q1(nx, ny, border);
gpu_ptr_2D Q2(nx, ny, border);
gpu_ptr_2D Q3(nx, ny, border);

// Allocate pinned memory on host
init_allocate();

// Set BC arguments
set_bc_args(BCArgs[0], rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), nx+2*border, ny+2*border, border);
set_bc_args(BCArgs[1], Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), nx+2*border, ny+2*border, border);
set_bc_args(BCArgs[2], rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), nx+2*border, ny+2*border, border);

// Set FLUX arguments
set_flux_args(fluxArgs[0], L_device.getRawPtr(), rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), R0.getRawPtr(),R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), nx, ny, border, rho.get_dx(), rho.get_dy(), theta, gasGam, INNERTILEDIM_X, INNERTILEDIM_Y);
set_flux_args(fluxArgs[1], L_device.getRawPtr(), Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), R0.getRawPtr(),R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), nx, ny, border, rho.get_dx(), rho.get_dy(), theta, gasGam, INNERTILEDIM_X, INNERTILEDIM_Y);

// Set TIME argument
set_dt_args(dtArgs, L_device.getRawPtr(), dt_device.getRawPtr(), nElements, rho.get_dx(), rho.get_dy(), cfl_number);

// Set RK arguments
set_rk_args(RKArgs[0], dt_device.getRawPtr(), rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), R0.getRawPtr(), R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), nx, ny, border); 
set_rk_args(RKArgs[1], dt_device.getRawPtr(), Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), R0.getRawPtr(), R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), nx, ny, border); 


L_device.set(FLT_MAX);

/*
R0.upload(zeros.get_ptr()); 
R1.upload(zeros.get_ptr()); 
R2.upload(zeros.get_ptr()); 
R3.upload(zeros.get_ptr()); 

Q0.upload(zeros.get_ptr()); 
Q1.upload(zeros.get_ptr()); 
Q2.upload(zeros.get_ptr()); 
Q3.upload(zeros.get_ptr()); 
*/

R0.set(0,0,0,nx,ny,border); 
R1.set(0,0,0,nx,ny,border); 
R2.set(0,0,0,nx,ny,border); 
R3.set(0,0,0,nx,ny,border); 

Q0.set(0,0,0,nx,ny,border); 
Q1.set(0,0,0,nx,ny,border); 
Q2.set(0,0,0,nx,ny,border); 
Q3.set(0,0,0,nx,ny,border); 


rho_device.upload(rho.get_ptr());
rho_u_device.upload(rho_u.get_ptr());
rho_v_device.upload(rho_v.get_ptr());
E_device.upload(E.get_ptr());

// Update boundaries
callCollectiveSetBCPeriodic(gridBC, blockBC, BCArgs[0]);

//Create cuda stream
cudaStream_t stream1;
cudaStreamCreate(&stream1);
cudaEvent_t dt_complete;
cudaEventCreate(&dt_complete);


while (currentTime < timeLength && step < maxStep){	
	
	//RK1	
	//Compute flux
	callFluxKernel(gridBlockFlux, threadBlockFlux, 0, fluxArgs[0]);	
	
	// Compute timestep (based on CFL condition)
	callDtKernel(TIMETHREADS, dtArgs);
	
	cudaMemcpyAsync(dt_host, dt_device.getRawPtr(), sizeof(float), cudaMemcpyDeviceToHost, stream1);
	cudaEventRecord(dt_complete, stream1);

	// Perform RK1 step
	callRKKernel(gridBlockRK, threadBlockRK, 0, RKArgs[0]);
	
	// Update boundaries
	callCollectiveSetBCPeriodic(gridBC, blockBC, BCArgs[1]);		

	//RK2
	// Compute flux
	callFluxKernel(gridBlockFlux, threadBlockFlux, 1, fluxArgs[1]);

	//Perform RK2 step
	callRKKernel(gridBlockRK, threadBlockRK, 1, RKArgs[1]);	

	//cudaEventRecord(srteam_sync, srteam1);

	callCollectiveSetBCPeriodic(gridBC, blockBC, BCArgs[2]);

	cudaEventSynchronize(dt_complete);

	step++;	
	currentTime += *dt_host;	
//	printf("Step: %i, current time: %.6f dt:%.6f\n" , step,currentTime, dt_host[0]);

}


//cuProfilerStop();
//cudaProfilerStop();

printf("Elapsed time %.5f", get_wall_time() - timeStart);

E_device.download(E.get_ptr());
rho_u_device.download(rho_u.get_ptr());
rho_v_device.download(rho_v.get_ptr());
rho_device.download(rho_dummy.get_ptr());

rho_dummy.printToFile(rho_file, true, false);


Config.exactSolution(E_dummy, currentTime);
E_dummy.printToFile(E_file, true, false);


float LinfError = Linf(E_dummy, rho_dummy);
float L1Error = L1(E_dummy, rho_dummy); 
float L1Error2 = L1test(E_dummy, rho_dummy);

printf("nx: %i\t Linf error %.9f\t L1 error %.7f L1test erro %.7f", nx, LinfError, L1Error, L1Error2);


printf("nx: %i step: %i, current time: %.6f dt:%.6f\n" , nx, step,currentTime, dt_host[0]); 


/*
cudaMemcpy(L_host, L_device, sizeof(float)*(nElements), cudaMemcpyDeviceToHost);
for (int i =0; i < nElements; i++)
	printf(" %.7f ", L_host[i]); 
*/


printf("%s\n", cudaGetErrorString(cudaGetLastError()));

return(0);
}
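
A minimal standalone sketch of the time-step read-back pattern used in the loop above: the reduced dt is copied asynchronously into pinned host memory on a separate stream, an event is recorded, and the host synchronizes on that event only right before the value is needed, so the RK and boundary kernels can be launched in the meantime. The reduce_dt kernel and all names below are placeholders (not the solver's kernels), and the ordering relies on the legacy default stream implicitly fencing the copy behind the preceding kernel.

// Sketch: asynchronous dt read-back overlapped with further kernel launches.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void reduce_dt(float *dt_dev) { *dt_dev = 0.001f; } // stand-in for the real reduction

int main() {
    float *dt_dev = 0, *dt_host = 0;
    cudaMalloc((void **)&dt_dev, sizeof(float));
    cudaHostAlloc((void **)&dt_host, sizeof(float), cudaHostAllocDefault); // pinned, needed for async copy

    cudaStream_t copyStream;
    cudaEvent_t dt_complete;
    cudaStreamCreate(&copyStream);
    cudaEventCreate(&dt_complete);

    reduce_dt<<<1, 1>>>(dt_dev);                       // dt reduction on the default stream
    cudaMemcpyAsync(dt_host, dt_dev, sizeof(float),
                    cudaMemcpyDeviceToHost, copyStream);
    cudaEventRecord(dt_complete, copyStream);

    // ... further kernels (RK step, boundary conditions) would be launched here ...

    cudaEventSynchronize(dt_complete);                 // block only when dt is actually needed
    printf("dt = %f\n", *dt_host);

    cudaEventDestroy(dt_complete);
    cudaStreamDestroy(copyStream);
    cudaFreeHost(dt_host);
    cudaFree(dt_dev);
    return 0;
}
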
void ControlCubeCache::_reSizeCache()
{
	_nLevels = _nextnLevels;
	_levelCube = _nextLevelCube;
	_offset	= _nextOffset;
	_nextnLevels = 0;
	_nextLevelCube = 0;

	_dimCube = exp2(_nLevels - _levelCube) + 2 * CUBE_INC;

	_sizeElement = pow(_dimCube, 3); 

	int dimV = exp2(_nLevels);
	_minValue = coordinateToIndex(vmml::vector<3,int>(0,0,0), _levelCube, _nLevels);
	_maxValue = coordinateToIndex(vmml::vector<3,int>(dimV-1,dimV-1,dimV-1), _levelCube, _nLevels);

	int dc = exp2(_nLevels - _levelCube);
	vmml::vector<3,int> mn = _cpuCache->getMinCoord();
	vmml::vector<3,int> mx = _cpuCache->getMaxCoord();
	_maxC = mx - mn;
	if ((mx.x() - mn.x()) % dc != 0)
		_maxC[0] += dc;
	if ((mx.y() - mn.y()) % dc != 0)
		_maxC[1] += dc;
	if ((mx.z() - mn.z()) % dc != 0)
		_maxC[2] += dc;

	if (cudaSuccess != cudaSetDevice(_device))
	{
		std::cerr<<"Control Cube Cache, error setting device: "<<cudaGetErrorString(cudaGetLastError())<<std::endl;
		throw;
	}
	if (_memory != 0)
		if (cudaSuccess != cudaFree((void*)_memory))
		{                                                                                               
			std::cerr<<"Control Cube Cache, error resizing cache: "<<cudaGetErrorString(cudaGetLastError())<<std::endl;
			throw;
		}
	size_t total = 0;
	size_t free = 0;

	if (cudaSuccess != cudaMemGetInfo(&free, &total))
	{
		std::cerr<<"Control Cube Cache, error resizing cache: "<<cudaGetErrorString(cudaGetLastError())<<std::endl;
		throw;
	}

	float memorySize = (0.80f*free); // Get 80% of free memory

	_maxNumCubes = memorySize/ (_sizeElement*sizeof(float));
	if (_maxNumCubes == 0)
	{
		std::cerr<<"Control Cube Cache: Memory aviable is not enough "<<memorySize/1024/1024<<" MB"<<std::endl;
		throw;
	}

	if (cudaSuccess != cudaMalloc((void**)&_memory, _maxNumCubes*_sizeElement*sizeof(float)))
	{
		std::cerr<<"Control Cube Cache, error resizing cache: "<<cudaGetErrorString(cudaGetLastError())<<std::endl;
		throw;
	}

	_freeSlots = _maxNumCubes;

	ControlElementCache::_reSizeCache();
}
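
The resize above budgets the cache from free device memory: it queries cudaMemGetInfo, takes 80% of the free bytes, and divides by the per-cube footprint to get the number of cube slots. A self-contained sketch of that sizing logic, with an illustrative cube size rather than the class's fields:

// Sketch: derive how many fixed-size elements fit into 80% of free device memory.
#include <iostream>
#include <cuda_runtime.h>

int main() {
    size_t freeBytes = 0, totalBytes = 0;
    if (cudaMemGetInfo(&freeBytes, &totalBytes) != cudaSuccess) {
        std::cerr << "cudaMemGetInfo failed: "
                  << cudaGetErrorString(cudaGetLastError()) << std::endl;
        return 1;
    }

    const size_t elementSize = 66 * 66 * 66 * sizeof(float);  // e.g. one cached cube incl. padding
    const size_t budget = static_cast<size_t>(0.80 * freeBytes);
    const size_t maxElements = budget / elementSize;

    std::cout << "free: " << freeBytes / (1024 * 1024) << " MB, "
              << "cube slots that fit: " << maxElements << std::endl;
    return maxElements == 0 ? 1 : 0;  // mirror the "not enough memory" failure above
}
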
Example #3
int
APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
                        CudaNdarray *km, cudnnConvolutionDescriptor_t desc,
                        float alpha, float beta, CudaNdarray **kerns) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(km)[1]) {
    PyErr_SetString(PyExc_ValueError,
                   "GpuDnnConv images and kernel must have the same stack size\n");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;

  int nb_dim = CudaNdarray_NDIM(output);

#ifdef CONV_INPLACE
  Py_XDECREF(*kerns);
  *kerns = km;
  Py_INCREF(*kerns);
#else
  if (CudaNdarray_prep_output(kerns, nb_dim, CudaNdarray_HOST_DIMS(km)) != 0)
    return 1;
  if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*kerns, km))
    return 1;
#endif

  if (c_set_filterNd(*kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

  {
    size_t worksize;
    void *workspace;
    cudnnConvolutionBwdFilterAlgo_t chosen_algo;

    if (CHOOSE_ALGO)
    {

      // A new convolution implementation should be selected, based either on
      // timing or heuristics, in either of the two following cases:
      // - The implementation should only be chosen during the first execution
      //   of an apply node and this is the first execution of the apply node.
      // - The implementation should be chosen as often as necessary and the
      //   shapes of the inputs differ from the last time an implementation
      //   was chosen.
      bool reuse_previous_algo;
      if (CHOOSE_ALGO_ONCE)
      {
        // Only choose a new implementation if none has been chosen before.
        reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set);
      }
      else
      {
        // Reuse the previous implementation if the inputs and the outputs
        // have the same shapes as they had when the previous implementation
        // was selected.
        bool same_shapes = true;
        for (int i = 0; (i < nb_dim) && same_shapes; i++)
        {
            same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] ==
                            APPLY_SPECIFIC(previous_input_shape)[i]);
            same_shapes &= (CudaNdarray_HOST_DIMS(output)[i] ==
                            APPLY_SPECIFIC(previous_output_shape)[i]);
        }
        reuse_previous_algo = same_shapes;
      }

      // If the previously chosen implementation can't be reused, select a
      // new one based on the shapes of the current inputs
      if (!reuse_previous_algo)
      {
        // Obtain a convolution algorithm appropriate for the input and output
        // shapes, either by choosing one according to heuristics or by having
        // cuDNN time every implementation and choosing the best one.
        if (CHOOSE_ALGO_TIME)
        {
          // Time the different implementations to choose the best one
          int requestedCount = 1;
          int count;
          cudnnConvolutionBwdFilterAlgoPerf_t choosen_algo_perf;
          err = cudnnFindConvolutionBackwardFilterAlgorithm(_handle,
                                                            APPLY_SPECIFIC(input),
                                                            APPLY_SPECIFIC(output),
                                                            desc,
                                                            APPLY_SPECIFIC(kerns),
                                                            requestedCount,
                                                            &count,
                                                            &choosen_algo_perf);
          if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuDnnConvGradW: error selecting convolution algo: "
                         "%s", cudnnGetErrorString(err));
            return 1;
          }

          chosen_algo = choosen_algo_perf.algo;
        }
        else
        {
          // Choose the convolution implementation using heuristics based on the
          // shapes of the inputs and the amount of memory available.

          // Get the amount of available memory
          size_t free = 0, total = 0;
          cudaError_t err2 = cudaMemGetInfo(&free, &total);
          if (err2 != cudaSuccess){
            cudaGetLastError();
            fprintf(stderr,
                    "Error when trying to find the memory information"
                    " on the GPU: %s\n", cudaGetErrorString(err2));
            return 1;
          }

          // Use heuristics to choose the implementation
          err = cudnnGetConvolutionBackwardFilterAlgorithm(_handle,
                                                           APPLY_SPECIFIC(input),
                                                           APPLY_SPECIFIC(output),
                                                           desc,
                                                           APPLY_SPECIFIC(kerns),
                                                           CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
                                                           free,
                                                           &chosen_algo);

          if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuDnnConvGradW: error selecting convolution algo: %s",
                         cudnnGetErrorString(err));
            return 1;
          }
        }

        // Store the shapes of the inputs and kernels as well as the chosen
        // algorithm for future use.
        APPLY_SPECIFIC(previous_bwd_f_algo) = chosen_algo;
        APPLY_SPECIFIC(previous_algo_set) = true;
        for (int i = 0; i < nb_dim; i++)
        {
            APPLY_SPECIFIC(previous_input_shape)[i] =
                                            CudaNdarray_HOST_DIMS(input)[i];
            APPLY_SPECIFIC(previous_output_shape)[i] =
                                            CudaNdarray_HOST_DIMS(output)[i];
        }

      }
      else
      {
        // Reuse the previously chosen convolution implementation
        chosen_algo = APPLY_SPECIFIC(previous_bwd_f_algo);
      }
    }
    else
    {
        chosen_algo = CONV_ALGO;
    }

    // The FFT implementation (only in v3 and onward) does not support strides,
    // 1x1 filters or inputs with a spatial dimension larger than 1024.
    // If the chosen implementation is FFT, validate that it can be used
    // on the current data and fall back to a safe implementation if it
    // can't.
    if (chosen_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT && nb_dim == 4)
    {

      // Extract the properties of the convolution descriptor
      int nd;
      int pad[2];
      int stride[2];
      int upscale[2];
      cudnnConvolutionMode_t mode;
      cudnnDataType_t data_type;
      err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                            upscale, &mode, &data_type);

      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuDnnConvGradW: error getting convolution properties: %s",
                     cudnnGetErrorString(err));
        return 1;
      }

      // Extract the spatial size of the filters
      int filter_h = CudaNdarray_HOST_DIMS(*kerns)[2];
      int filter_w = CudaNdarray_HOST_DIMS(*kerns)[3];

      // Extract the spatial size of the input
      int input_h = CudaNdarray_HOST_DIMS(input)[2];
      int input_w = CudaNdarray_HOST_DIMS(input)[3];

      // Ensure that the selected implementation supports the requested
      // convolution. Fall back to a safe implementation otherwise.
      if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 ||
          input_w > 1024 || (filter_h == 1 && filter_w == 1))
      {
        chosen_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
      }
    }

    // Infer required workspace size from the chosen implementation
    err = cudnnGetConvolutionBackwardFilterWorkspaceSize(_handle,
                                                         APPLY_SPECIFIC(input),
                                                         APPLY_SPECIFIC(output),
                                                         desc,
                                                         APPLY_SPECIFIC(kerns),
                                                         chosen_algo,
                                                         &worksize);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConvGradW: error getting worksize: %s",
                   cudnnGetErrorString(err));
      return 1;
    }

    // Allocate workspace for the convolution
    workspace = get_work_mem(worksize);
    if (workspace == NULL && worksize != 0)
      return 1;

    // Perform the convolution
    err = cudnnConvolutionBackwardFilter(
      _handle,
      (void *)&alpha,
      APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
      APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
      desc,
      chosen_algo,
      workspace, worksize,
      (void *)&beta,
      APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns));

  }

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
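
The example above checks every cuDNN call inline and formats cudnnGetErrorString into a Python exception. When the calls are not tied to the Python error machinery, the same checking is often folded into a small helper; the sketch below is illustrative (the CHECK_CUDNN name is not part of the code above) and assumes it is used inside a function returning int.

// Sketch: a minimal cuDNN status check helper.
#include <stdio.h>
#include <cudnn.h>

#define CHECK_CUDNN(call)                                          \
  do {                                                             \
    cudnnStatus_t s_ = (call);                                     \
    if (s_ != CUDNN_STATUS_SUCCESS) {                              \
      fprintf(stderr, "cuDNN error at %s:%d: %s\n",                \
              __FILE__, __LINE__, cudnnGetErrorString(s_));        \
      return 1;                                                    \
    }                                                              \
  } while (0)
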
void MFNHashTypePlainCUDA::allocateThreadAndDeviceMemory() {
    trace_printf("MFNHashTypePlainCUDA::allocateThreadAndDeviceMemory()\n");
    /**
     * Error variable - stores the result of the various mallocs & such.
     */
    cudaError_t err, err2;

    /**
     * Flags for calling cudaHostMalloc - will be set to cudaHostAllocMapped
     * if we are mapping memory to the host with zero copy.
     */
    unsigned int cudaHostMallocFlags = 0;

    if (this->useZeroCopy) {
        cudaHostMallocFlags |= cudaHostAllocMapped;
    }


    /*
     * Malloc the device hashlist space.  This is the number of available hashes
     * times the hash length in bytes.  The data will be copied later.
     */
    err = cudaMalloc((void **)&this->DeviceHashlistAddress,
        this->activeHashesProcessed.size() * this->hashLengthBytes);
    if (err != cudaSuccess) {
        printf("Unable to allocate %d bytes for device hashlist!  Exiting!\n",
                this->activeHashesProcessed.size() * this->hashLengthBytes);
        printf("return code: %d\n", err);
        exit(1);
    }

    /*
     * Allocate the host/device space for the success list (flags for found passwords).
     * This is a byte per password.  To avoid atomic write issues, each password
     * gets a full addressible byte, and the GPU handles the dependencies between
     * multiple threads trying to set a flag in the same segment of memory.
     *
     * On the host, it will be allocated as mapped memory if we are using zerocopy.
     *
     * As this region of memory is frequently copied back to the host, mapping it
     * improves performance.  In theory.
     */
    err = cudaHostAlloc((void **)&this->HostSuccessAddress,
        this->activeHashesProcessed.size(), cudaHostMallocFlags);
    if (err != cudaSuccess) {
        printf("Unable to allocate %d bytes for success flags!  Exiting!\n",
                this->activeHashesProcessed.size());
        printf("return code: %d\n", err);
        exit(1);
    }

    // Clear host success flags region - if we are mapping the memory, the GPU
    // will directly write this.
    memset(this->HostSuccessAddress, 0, this->activeHashesProcessed.size());

    // Allocate memory for the reported flags.
    this->HostSuccessReportedAddress = new uint8_t [this->activeHashesProcessed.size()];
    memset(this->HostSuccessReportedAddress, 0, this->activeHashesProcessed.size());

    // If zero copy is in use, get the device pointer for the success data, else
    // malloc a region of memory on the device.
    if (this->useZeroCopy) {
        err = cudaHostGetDevicePointer((void **)&this->DeviceSuccessAddress,
            this->HostSuccessAddress, 0);
        err2 = cudaSuccess;
    } else {
        err = cudaMalloc((void **)&this->DeviceSuccessAddress,
            this->activeHashesProcessed.size());
        err2 = cudaMemset(this->DeviceSuccessAddress, 0,
            this->activeHashesProcessed.size());
    }
    if ((err != cudaSuccess) || (err2 != cudaSuccess)) {
        printf("Unable to allocate %d bytes for device success list!  Exiting!\n",
                this->activeHashesProcessed.size());
        printf("return code: %d\n", err);
        printf("return code: %d\n", err2);
        exit(1);
    }

    /*
     * Allocate memory for the found passwords.  As this is commonly copied
     * back and forth, it will be made zero copy if requested.
     *
     * This requires (number hashes * passwordLength) bytes of data.
     */

    err = cudaHostAlloc((void **)&this->HostFoundPasswordsAddress,
        this->passwordLength * this->activeHashesProcessed.size() , cudaHostMallocFlags);
    if (err != cudaSuccess) {
        printf("Unable to allocate %d bytes for host password list!  Exiting!\n",
                this->passwordLength * this->activeHashesProcessed.size());
        printf("return code: %d\n", err);
        exit(1);
    }
    // Clear the host found password space.
    memset(this->HostFoundPasswordsAddress, 0,
            this->passwordLength * this->activeHashesProcessed.size());

    if (this->useZeroCopy) {
        err = cudaHostGetDevicePointer((void **)&this->DeviceFoundPasswordsAddress,
            this->HostFoundPasswordsAddress, 0);
        err2 = cudaSuccess;
    } else {
        err = cudaMalloc((void **)&this->DeviceFoundPasswordsAddress,
            this->passwordLength * this->activeHashesProcessed.size());
        err2 = cudaMemset(this->DeviceFoundPasswordsAddress, 0,
            this->passwordLength * this->activeHashesProcessed.size());
    }
    if ((err != cudaSuccess) || (err2 != cudaSuccess)) {
        printf("Unable to allocate %d bytes for device password list!  Exiting!\n",
                this->passwordLength * this->activeHashesProcessed.size());
        printf("return code: %d\n", err);
        printf("return code: %d\n", err2);
        exit(1);
    }

    /**
     * Allocate space for host and device start positions.  To improve performance,
     * this space is now aligned for improved coalescing performance.  All the
     * position 0 elements are together, followed by all the position 1 elements,
     * etc.
     *
     * This memory can be allocated as write combined, as it is not read by
     * the host ever - only written.  Since it is regularly transferred to the
     * GPU, this should help improve performance.
     */
    err = cudaHostAlloc((void**)&this->HostStartPointAddress,
        this->TotalKernelWidth * this->passwordLength,
        cudaHostAllocWriteCombined | cudaHostMallocFlags);

    err2 = cudaMalloc((void **)&this->DeviceStartPointAddress,
        this->TotalKernelWidth * this->passwordLength);

    if ((err != cudaSuccess) || (err2 != cudaSuccess)) {
        printf("Unable to allocate %d bytes for host/device startpos list!  Exiting!\n",
                this->TotalKernelWidth * this->passwordLength);
        printf("return code: %d\n", err);
        printf("return code: %d\n", err2);
        exit(1);
    }
    
    /**
     * Allocate space for the device start password values.  This is a copy of
     * the MFNHashTypePlain::HostStartPasswords32 vector for the GPU.
     */
    err = cudaMalloc((void **)&this->DeviceStartPasswords32Address,
        this->TotalKernelWidth * this->passwordLengthWords);
    
    if ((err != cudaSuccess)) {
        printf("Unable to allocate %d bytes for host/device startpos list!  Exiting!\n",
                this->TotalKernelWidth * this->passwordLengthWords);
        printf("return code: %d\n", err);
        exit(1);
    }


    /**
     * Finally, attempt to allocate space for the giant device bitmaps.  There
     * are 4x128MB bitmaps, and any number can be allocated.  If they are not
     * fully allocated, their address is set to null as an indicator to the device
     * that there is no data present.  Attempt to allocate as many as possible.
     *
     * This will be accessed regularly, so should probably not be zero copy.
     * Also, I'm not sure how mapping host memory into multiple threads would
     * work.  Typically, if the GPU doesn't have enough RAM for the full
     * set of bitmaps, it's a laptop, and therefore may be short on host RAM
     * for the pinned access.
     *
     * If there is an error in allocation, call cudaGetLastError() to clear it -
     * we know there has been an error, and do not want it to persist.
     */
    err = cudaMalloc((void **)&this->DeviceBitmap128mb_a_Address,
        128 * 1024 * 1024);
    if (err == cudaSuccess) {
        memalloc_printf("Successfully allocated Bitmap A\n");
    } else {
        memalloc_printf("Unable to allocate 128MB bitmap A\n");
        this->DeviceBitmap128mb_a_Address = 0;
        cudaGetLastError();
    }

    err = cudaMalloc((void **)&this->DeviceBitmap128mb_b_Address,
        128 * 1024 * 1024);
    if (err == cudaSuccess) {
        memalloc_printf("Successfully allocated Bitmap B\n");
    } else {
        memalloc_printf("Unable to allocate 128MB bitmap B\n");
        this->DeviceBitmap128mb_b_Address = 0;
        cudaGetLastError();
    }

    err = cudaMalloc((void **)&this->DeviceBitmap128mb_c_Address,
        128 * 1024 * 1024);
    if (err == cudaSuccess) {
        memalloc_printf("Successfully allocated Bitmap C\n");
    } else {
        memalloc_printf("Unable to allocate 128MB bitmap C\n");
        this->DeviceBitmap128mb_c_Address = 0;
        cudaGetLastError();
    }

    err = cudaMalloc((void **)&this->DeviceBitmap128mb_d_Address,
        128 * 1024 * 1024);
    if (err == cudaSuccess) {
        memalloc_printf("Successfully allocated Bitmap D\n");
    } else {
        memalloc_printf("Unable to allocate 128MB bitmap D\n");
        this->DeviceBitmap128mb_d_Address = 0;
        cudaGetLastError();
    }
    //printf("Thread %d memory allocated successfully\n", this->threadId);
}
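
The comments above describe the zero-copy option: the success flags live in mapped pinned host memory, and the GPU writes them through a device pointer derived from the host pointer, so no explicit copy back is needed. A standalone sketch of that pattern follows; the sizes and names are illustrative, the device must support mapped memory, and cudaSetDeviceFlags(cudaDeviceMapHost) generally has to be called before the CUDA context is created.

// Sketch: mapped (zero-copy) pinned allocation with a device-side alias pointer.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <cuda_runtime.h>

int main() {
    cudaSetDeviceFlags(cudaDeviceMapHost);     // enable mapped pinned allocations

    const size_t nFlags = 1024;                // e.g. one success byte per hash
    uint8_t *hostFlags = 0, *deviceFlags = 0;

    cudaError_t err = cudaHostAlloc((void **)&hostFlags, nFlags, cudaHostAllocMapped);
    if (err != cudaSuccess) {
        printf("cudaHostAlloc failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    memset(hostFlags, 0, nFlags);

    err = cudaHostGetDevicePointer((void **)&deviceFlags, hostFlags, 0);
    if (err != cudaSuccess) {
        printf("cudaHostGetDevicePointer failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Kernels would take deviceFlags as an argument and write flags that the host
    // can then read directly from hostFlags, without a cudaMemcpy.

    cudaFreeHost(hostFlags);
    return 0;
}
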
Example #5
int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
                        PyGpuArrayObject *km,
                        cudnnConvolutionDescriptor_t desc,
                        double alpha, double beta, PyGpuArrayObject **kerns,
                        PyGpuContextObject *c) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
    PyErr_SetString(PyExc_ValueError,
		    "GpuDnnConv images and kernel must have the same stack size");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;

  switch (input->ga.typecode) {
  case GA_DOUBLE:
    alpha_p = (void *)&alpha;
    beta_p = (void *)&beta;
    break;
  case GA_FLOAT:
  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
    return 1;
  }

#ifdef CONV_INPLACE
  Py_XDECREF(*kerns);
  *kerns = km;
  Py_INCREF(*kerns);
#else
  if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
                         km->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (beta != 0.0 && pygpu_move(*kerns, km))
    return 1;
#endif

  if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

  cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO;

  cuda_enter(c->ctx);

#ifdef CHOOSE_ALGO
  static int reuse_algo = 0;
  static cudnnConvolutionBwdFilterAlgo_t prev_algo = CONV_ALGO;

#ifndef CHOOSE_ONCE
  static size_t prev_img_dims[5] = {0};
  static size_t prev_top_dims[5] = {0};

  reuse_algo = 1;
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    reuse_algo = (reuse_algo &&
                  PyGpuArray_DIM(input, i) == prev_img_dims[i]);
    reuse_algo = (reuse_algo &&
                  PyGpuArray_DIM(output, i) == prev_top_dims[i]);
  }
#endif

  if (!reuse_algo) {
#ifdef CHOOSE_TIME
    int count;
    cudnnConvolutionBwdFilterAlgoPerf_t choice;

    err = cudnnFindConvolutionBackwardFilterAlgorithm(
      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
      APPLY_SPECIFIC(kerns), 1, &count, &choice);

    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    algo = choice.algo;
#else
    size_t free = 0, total = 0;
    cudaError_t err2 = cudaMemGetInfo(&free, &total);
    if (err2 != cudaSuccess){
      cudaGetLastError();
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
                   "information on the GPU: %s\n", cudaGetErrorString(err2));
      cuda_exit(c->ctx);
      return 1;
    }

    err = cudnnGetConvolutionBackwardFilterAlgorithm(
      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
      desc, APPLY_SPECIFIC(kerns),
      CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
#endif
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }

#ifdef CHOOSE_ONCE
  reuse_algo = 1;
#else
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    prev_img_dims[i] = PyGpuArray_DIM(input, i);
    prev_top_dims[i] = PyGpuArray_DIM(output, i);
  }
#endif

#endif

#if CUDNN_VERSION > 3000
  if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT) {
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                          upscale, &mode);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (stride[0] != 1 || stride[1] != 1 ||
        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
        (PyGpuArray_DIM(*kerns, 2) == 1 && PyGpuArray_DIM(*kerns, 3) == 1)) {
      algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
    }
  }
#endif

  size_t worksize;
  gpudata *workspace;

  err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
    APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
    APPLY_SPECIFIC(kerns), algo, &worksize);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
                 cudnnGetErrorString(err));
      cuda_exit(c->ctx);
    return 1;
  }

  if (worksize != 0) {
    workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
    if (workspace == NULL) {
      PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
      cuda_exit(c->ctx);
      return 1;
    }
  }

  cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

  err = cudnnConvolutionBackwardFilter_v3(
    APPLY_SPECIFIC(_handle),
    alpha_p,
    APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
    APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
    desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
    beta_p,
    APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns));

  if (worksize != 0)
    c->ops->buffer_release(workspace);

  cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

  cuda_exit(c->ctx);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
Example #6
cudaError_t GridGpu::retrieveData(complexVector &gData)
{
    gData.resize(m_gridSize * m_gridSize);
    cudaMemcpy(gData.data(), m_d_gData, gData.size() * sizeof(complexGpu), cudaMemcpyDeviceToHost);
    return cudaGetLastError();
}
void MFNHashTypePlainCUDA::freeThreadAndDeviceMemory() {
    trace_printf("MFNHashTypePlainCUDA::freeThreadAndDeviceMemory()\n");

    cudaError_t err;

    // Free all the memory, then look for errors.
    cudaFree((void *)this->DeviceHashlistAddress);
    cudaFreeHost((void *)this->HostSuccessAddress);

    delete[] this->HostSuccessReportedAddress;

    // Only cudaFree the device copies if zero copy is not in use.
    if (!this->useZeroCopy) {
        cudaFree((void *)this->DeviceSuccessAddress);
        cudaFree((void *)this->DeviceFoundPasswordsAddress);

    }
    
    cudaFreeHost((void *)this->HostFoundPasswordsAddress);

    cudaFreeHost((void*)this->HostStartPointAddress);
    cudaFree((void *)this->DeviceStartPointAddress);
    cudaFree((void *)this->DeviceStartPasswords32Address);

    // Free wordlist data if in use.
    if (this->hashAttributes.hashUsesWordlist) {
        cudaFree((void *)this->DeviceWordlistBlocks);
        cudaFree((void *)this->DeviceWordlistLengths);
    }

    if (this->hashAttributes.hashUsesSalt) {
        cudaFree((void *)this->DeviceSaltLengthsAddress);
        cudaFree((void *)this->DeviceSaltValuesAddress);
    }
    
    // Only free the bitmap memory if it has been allocated.
    if (this->DeviceBitmap256kb_Address) {
        cudaFree((void *)this->DeviceBitmap256kb_Address);
        this->DeviceBitmap256kb_Address = 0;
    }
    if (this->DeviceBitmap128mb_a_Address) {
        cudaFree((void *)this->DeviceBitmap128mb_a_Address);
        this->DeviceBitmap128mb_a_Address = 0;
    }
    if (this->DeviceBitmap128mb_b_Address) {
        cudaFree((void *)this->DeviceBitmap128mb_b_Address);
        this->DeviceBitmap128mb_b_Address = 0;
    }
    if (this->DeviceBitmap128mb_c_Address) {
        cudaFree((void *)this->DeviceBitmap128mb_c_Address);
        this->DeviceBitmap128mb_c_Address = 0;
    }
    if (this->DeviceBitmap128mb_d_Address) {
        cudaFree((void *)this->DeviceBitmap128mb_d_Address);
        this->DeviceBitmap128mb_d_Address = 0;
    }

    // Get any error that occurred above and report it.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("Thread %d: CUDA error freeing memory: %s. Exiting.\n",
                this->threadId, cudaGetErrorString( err));
        exit(1);
    }
}
Example #8
/** check whether cuda thinks there was an error and fail with msg, if this is the case
 * @ingroup tools
 */
static inline void checkCudaError(const char *msg) {
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        throw std::runtime_error(std::string(msg) + ": " + cudaGetErrorString(err));
    }
}
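
Typical use of such a helper right after a kernel launch; the kernel below is a placeholder, the helper above is assumed to be in scope, and the cudaDeviceSynchronize() is included so that asynchronous execution errors, not just launch-configuration errors, are surfaced by the check.

#include <stdexcept>
#include <cuda_runtime.h>

__global__ void scaleKernel(float *data) { data[threadIdx.x] *= 2.0f; } // placeholder kernel

static void launchAndCheck(float *d_data) {
    scaleKernel<<<1, 32>>>(d_data);
    cudaDeviceSynchronize();            // make asynchronous execution errors visible
    checkCudaError("scaleKernel");      // helper defined above; throws std::runtime_error
}
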
Example #9
int
main( int argc,char** argv)
{
	printf("hello world\n");

	if (!InitCUDA())
	{
		return 0;
	}



	int iter = 1000;
	int trainnum = 20;
	bool isProfiler = false;
	int intProfiler = 0;
	int testnum = -1;
	float maxtime = 0.0f;
	cutGetCmdLineArgumenti(argc, (const char**) argv, "train", &trainnum);
	cutGetCmdLineArgumenti(argc, (const char**) argv, "iter", &iter);
	cutGetCmdLineArgumenti(argc, (const char**) argv, "profiler", &intProfiler);
	cutGetCmdLineArgumenti(argc, (const char**) argv, "test", &testnum);
	cutGetCmdLineArgumentf(argc, (const char**) argv, "maxtime", &maxtime);
	printf("%d\n", intProfiler);
	if(intProfiler)
	{
		isProfiler = true;
	}
	if(testnum == -1) testnum = trainnum /2;
	printf("Iter = %d\n", iter);
	printf("TrainNum = %d\n", trainnum);
	printf("TestNum = %d\n", testnum);

	CUT_DEVICE_INIT(argc, argv);


	cublasStatus status;
	status = cublasInit();
	if(status != CUBLAS_STATUS_SUCCESS)
	{
		printf("Can't init cublas\n");
		printf("%s\n", cudaGetErrorString(cudaGetLastError()));
		return -1;
	}


	Image* imageList = new Image[trainnum+testnum];
	read64("my_optdigits.tra", imageList, trainnum + testnum);

	const int warmUpTime = 3;
	if(!isProfiler)
	{
		freopen("verbose.txt", "w", stdout);
		for(int i=0;i< warmUpTime;i++)
		{
			runImage(argc, argv, imageList, trainnum < warmUpTime ? trainnum : warmUpTime, 0, 10, false, 0.0f);
		}
		freopen("CON", "w", stdout);
		printf("Warm-up complete.\n\n\n");
	}
#ifdef _DEBUG
	freopen("out.txt", "w", stdout);
#endif // _DEBUG
	runImage(argc, argv, imageList, trainnum, testnum, iter, true, maxtime);
	freopen("CON", "w", stdout);
	delete[] imageList;
	//TestReduce();
	
	cublasShutdown();
	if(!isProfiler)
	{
		CUT_EXIT(argc, argv);
	}
	//getchar();
	return 0;
}
Example #10
void describe_cuda_error(const char *whence, CUresult e)
{
//	CUresult e2;

	switch(e){
		case CUDA_SUCCESS:
			sprintf(DEFAULT_ERROR_STRING,"%s:  No errors.",whence);
			NADVISE(DEFAULT_ERROR_STRING);
			break;
#if CUDA_VERSION >= 6050
RUNTIME_ERROR_CASE(CUDA_ERROR_INVALID_GRAPHICS_CONTEXT,"Invalid graphics context")
#endif

#if CUDA_VERSION > 4000
RUNTIME_ERROR_CASE(CUDA_ERROR_PEER_ACCESS_UNSUPPORTED,"Peer access unsupported")
RUNTIME_ERROR_CASE(CUDA_ERROR_INVALID_PTX,"Invalid PTX")
RUNTIME_ERROR_CASE(CUDA_ERROR_ILLEGAL_ADDRESS,"Illegal address")
RUNTIME_ERROR_CASE(CUDA_ERROR_ASSERT,"Assertion error")
RUNTIME_ERROR_CASE(CUDA_ERROR_TOO_MANY_PEERS,"Too many peers")
RUNTIME_ERROR_CASE(CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,"Host mem already registered")
RUNTIME_ERROR_CASE(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED,"Host mem not registered")
RUNTIME_ERROR_CASE(CUDA_ERROR_HARDWARE_STACK_ERROR,"H/W stack error")

RUNTIME_ERROR_CASE(CUDA_ERROR_ILLEGAL_INSTRUCTION,"Illegal instruction");
RUNTIME_ERROR_CASE(CUDA_ERROR_MISALIGNED_ADDRESS,"Misaligned address")
RUNTIME_ERROR_CASE(CUDA_ERROR_INVALID_ADDRESS_SPACE,"Invalid address space")
RUNTIME_ERROR_CASE(CUDA_ERROR_INVALID_PC,"Invalid PC")
RUNTIME_ERROR_CASE(CUDA_ERROR_NOT_PERMITTED,"Not permitted")
RUNTIME_ERROR_CASE(CUDA_ERROR_NOT_SUPPORTED,"Not supported")
#endif // CUDA_VERSION > 4000
RUNTIME_ERROR_CASE(CUDA_ERROR_LAUNCH_FAILED,"Launch failed")
RUNTIME_ERROR_CASE(CUDA_ERROR_UNKNOWN,"Unknown error")
RUNTIME_ERROR_CASE( CUDA_ERROR_INVALID_DEVICE , "Invalid device." )
RUNTIME_ERROR_CASE( CUDA_ERROR_NO_DEVICE , "No device" )
RUNTIME_ERROR_CASE( CUDA_ERROR_INVALID_VALUE , "Invalid value." )
RUNTIME_ERROR_CASE(CUDA_ERROR_INVALID_IMAGE,"Invalid Image")
RUNTIME_ERROR_CASE(CUDA_ERROR_INVALID_CONTEXT,"Invalid context")
#ifdef CUDA_ERROR_NVLINK_UNCORRECTABLE
RUNTIME_ERROR_CASE(CUDA_ERROR_NVLINK_UNCORRECTABLE,"uncorrectable NVLink error")
#endif // CUDA_ERROR_NVLINK_UNCORRECTABLE
//RUNTIME_ERROR_CASE( CUDA_ERROR_INVALID_PITCH_VALUE , "Invalid pitch value." )
//RUNTIME_ERROR_CASE( CUDA_ERROR_INVALID_SYMBOL , "Invalid symbol." )
//RUNTIME_ERROR_CASE( CUDA_ERROR_MAP_OBJECT_FAILED , "Map buffer object failed." )
//RUNTIME_ERROR_CASE( CUDA_ERROR_UNMAP_OBJECT_FAILED , "Unmap buffer object failed." )
//RUNTIME_ERROR_CASE( CUDA_ERROR_INVALID_HOST_POINTER , "Invalid host pointer." )
//RUNTIME_ERROR_CASE( CUDA_ERROR_INVALID_DEVICE_POINTER , "Invalid device pointer." )
//RUNTIME_ERROR_CASE( CUDA_ERROR_INVALID_TEXTURE , "Invalid texture." )
//RUNTIME_ERROR_CASE( CUDA_ERROR_INVALID_TEXTURE_BINDING , "Invalid texture binding." )
RUNTIME_ERROR_CASE( CUDA_ERROR_OUT_OF_MEMORY , "out of memory." )
RUNTIME_ERROR_CASE( CUDA_ERROR_NOT_INITIALIZED , "not initialized." )
RUNTIME_ERROR_CASE( CUDA_ERROR_DEINITIALIZED , "de-initialized." )
RUNTIME_ERROR_CASE( CUDA_ERROR_PROFILER_DISABLED , "profiler is disabled." )
RUNTIME_ERROR_CASE( CUDA_ERROR_PROFILER_NOT_INITIALIZED , "profiler not initialized." )
RUNTIME_ERROR_CASE( CUDA_ERROR_PROFILER_ALREADY_STARTED , "profiler already started." )
RUNTIME_ERROR_CASE( CUDA_ERROR_PROFILER_ALREADY_STOPPED , "profiler already stopped." )
RUNTIME_ERROR_CASE( CUDA_ERROR_CONTEXT_ALREADY_CURRENT , "context already current." )
RUNTIME_ERROR_CASE( CUDA_ERROR_MAP_FAILED , "mapping failure." )
RUNTIME_ERROR_CASE( CUDA_ERROR_UNMAP_FAILED , "unmapping failure." )
RUNTIME_ERROR_CASE( CUDA_ERROR_ARRAY_IS_MAPPED , "array is mapped and cannot be destroyed." )
RUNTIME_ERROR_CASE( CUDA_ERROR_ALREADY_MAPPED , "already mapped." )
RUNTIME_ERROR_CASE( CUDA_ERROR_NO_BINARY_FOR_GPU , "no binary for GPU." )
RUNTIME_ERROR_CASE( CUDA_ERROR_ALREADY_ACQUIRED , "resource already acquired." )
RUNTIME_ERROR_CASE( CUDA_ERROR_NOT_MAPPED , "resource not mapped." )
RUNTIME_ERROR_CASE( CUDA_ERROR_NOT_MAPPED_AS_ARRAY , "not mapped as array." )
RUNTIME_ERROR_CASE( CUDA_ERROR_NOT_MAPPED_AS_POINTER , "not mapped as pointer." )
RUNTIME_ERROR_CASE( CUDA_ERROR_ECC_UNCORRECTABLE , "uncorrectable ECC error." )
RUNTIME_ERROR_CASE( CUDA_ERROR_UNSUPPORTED_LIMIT , "unsupported limit." )
RUNTIME_ERROR_CASE( CUDA_ERROR_CONTEXT_ALREADY_IN_USE , "context already in use." )
RUNTIME_ERROR_CASE( CUDA_ERROR_INVALID_SOURCE , "invalid device kernel source." )
RUNTIME_ERROR_CASE( CUDA_ERROR_FILE_NOT_FOUND , "file not found." )
RUNTIME_ERROR_CASE( CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND , "shared object symbol not found." )
RUNTIME_ERROR_CASE( CUDA_ERROR_SHARED_OBJECT_INIT_FAILED , "shared object init failed." )
RUNTIME_ERROR_CASE( CUDA_ERROR_OPERATING_SYSTEM , "OS call failed." )
RUNTIME_ERROR_CASE( CUDA_ERROR_INVALID_HANDLE , "invalid handle." )
RUNTIME_ERROR_CASE( CUDA_ERROR_NOT_FOUND , "named symbol not found." )
RUNTIME_ERROR_CASE( CUDA_ERROR_NOT_READY , "async operation not completed (not an error)." )
RUNTIME_ERROR_CASE( CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES , "launch out of resources." )
RUNTIME_ERROR_CASE( CUDA_ERROR_LAUNCH_TIMEOUT , "launch timeout." )
RUNTIME_ERROR_CASE( CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING , "incompatible texturing." )
RUNTIME_ERROR_CASE( CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED , "peer access already enabled." )
RUNTIME_ERROR_CASE( CUDA_ERROR_PEER_ACCESS_NOT_ENABLED , "peer access not enabled." )
RUNTIME_ERROR_CASE( CUDA_ERROR_CONTEXT_IS_DESTROYED , "current context has been destroyed." )
RUNTIME_ERROR_CASE( CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE , "primary context already initialized." )

#ifdef FOOBAR
		//CUDA_RUNTIME_ERROR( cudaErrorECCUncorrectable , "Uncorrectable ECC error detected." )
		CUDA_RUNTIME_ERROR( cudaErrorStartupFailure ,
					"Startup failure." )
#endif // FOOBAR
		default:
			sprintf(DEFAULT_ERROR_STRING,
		"%s:  unrecognized cuda error code %d",whence,e);
			NWARN(DEFAULT_ERROR_STRING);
			break;
	}
#ifdef FOOBAR
	e2 = cudaGetLastError();		// clear error
#ifdef CAUTIOUS
	if( e2 != e ){
		NERROR1("CAUTIOUS:  describe_cuda_error:  errors do not match!?");
	}
#endif /* CAUTIOUS */
#endif // FOOBAR
}
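
RUNTIME_ERROR_CASE is not defined in this listing. Judging from the surrounding code, it presumably expands to a case label that formats the message into DEFAULT_ERROR_STRING and reports it; the reconstruction below is an assumption, not the original macro (whether it reports via NWARN or NADVISE is not visible here), and DRIVER_ERROR_CASE in the later example would be analogous with cudaError_t codes.

// Assumed reconstruction of RUNTIME_ERROR_CASE -- not the original definition.
#define RUNTIME_ERROR_CASE(code, msg)                                       \
        case code:                                                          \
            sprintf(DEFAULT_ERROR_STRING, "%s:  %s", whence, msg);          \
            NWARN(DEFAULT_ERROR_STRING); /* reporting call is a guess */    \
            break;
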
void CUDARunner::FindBestConfiguration()
{
	unsigned long lowb=16;
	unsigned long highb=128;
	unsigned long lowt=16;
	unsigned long hight=256;
	unsigned long bestb=16;
	unsigned long bestt=16;
	int64 besttime=std::numeric_limits<int64>::max();

	if(m_requestedgrid>0 && m_requestedgrid<=65536)
	{
		lowb=m_requestedgrid;
		highb=m_requestedgrid;
	}

	if(m_requestedthreads>0 && m_requestedthreads<=65536)
	{
		lowt=m_requestedthreads;
		hight=m_requestedthreads;
	}

	for(int numb=lowb; numb<=highb; numb*=2)
	{
		for(int numt=lowt; numt<=hight; numt*=2)
		{
			AllocateResources(numb,numt);
			// clear out any existing error
			cudaError_t err=cudaGetLastError();
			err=cudaSuccess;

			int64 st=GetTimeMillis();

			for(int it=0; it<128*256*2 && err==0; it+=(numb*numt))
			{
				cutilSafeCall(cudaMemcpy(m_devin,m_in,sizeof(cuda_in),cudaMemcpyHostToDevice));

				cuda_process_helper(m_devin,m_devout,64,6,numb,numt);

				cutilSafeCall(cudaMemcpy(m_out,m_devout,numb*numt*sizeof(cuda_out),cudaMemcpyDeviceToHost));

				err=cudaGetLastError();
				if(err!=cudaSuccess)
				{
					printf("CUDA error %d\n",err);
				}
			}

			int64 et=GetTimeMillis();

			printf("Finding best configuration step end (%d,%d) %"PRI64d"ms  prev best=%"PRI64d"ms\n",numb,numt,et-st,besttime);

			if((et-st)<besttime && err==cudaSuccess)
			{
				bestb=numb;
				bestt=numt;
				besttime=et-st;
			}
		}
	}

	m_numb=bestb;
	m_numt=bestt;

	AllocateResources(m_numb,m_numt);

}
Example #12
void describe_cuda_driver_error(const char *whence, cudaError_t e)
{
	cudaError_t e2;

	switch(e){
DRIVER_ERROR_CASE(cudaSuccess,"No driver errors")
#if CUDA_VERSION >= 6050
DRIVER_ERROR_CASE(cudaErrorInvalidGraphicsContext,"Invalid graphics context")
DRIVER_ERROR_CASE( cudaErrorInvalidPtx , "Invalid PTX." )
#endif
DRIVER_ERROR_CASE( cudaErrorInvalidDevice , "Invalid device." )
DRIVER_ERROR_CASE( cudaErrorInvalidValue , "Invalid value." )
DRIVER_ERROR_CASE( cudaErrorInvalidPitchValue , "Invalid pitch value." )
DRIVER_ERROR_CASE( cudaErrorInvalidSymbol , "Invalid symbol." )
DRIVER_ERROR_CASE( cudaErrorMapBufferObjectFailed , "Map buffer object failed." )
DRIVER_ERROR_CASE( cudaErrorUnmapBufferObjectFailed , "Unmap buffer object failed." )
DRIVER_ERROR_CASE( cudaErrorInvalidHostPointer , "Invalid host pointer." )
DRIVER_ERROR_CASE( cudaErrorInvalidDevicePointer , "Invalid device pointer." )
DRIVER_ERROR_CASE( cudaErrorInvalidTexture , "Invalid texture." )
DRIVER_ERROR_CASE( cudaErrorInvalidTextureBinding , "Invalid texture binding." )
DRIVER_ERROR_CASE( cudaErrorInvalidChannelDescriptor , "Invalid channel descriptor." )
DRIVER_ERROR_CASE( cudaErrorInvalidMemcpyDirection , "Invalid memcpy direction." )
DRIVER_ERROR_CASE( cudaErrorAddressOfConstant , "Address of constant error." )
DRIVER_ERROR_CASE( cudaErrorTextureFetchFailed , "Texture fetch failed." )
DRIVER_ERROR_CASE( cudaErrorTextureNotBound , "Texture not bound error." )
DRIVER_ERROR_CASE( cudaErrorSynchronizationError , "Synchronization error." )
DRIVER_ERROR_CASE( cudaErrorInvalidResourceHandle , "Invalid resource handle." )
DRIVER_ERROR_CASE( cudaErrorNotReady , "Not ready error." )
DRIVER_ERROR_CASE( cudaErrorInsufficientDriver , "CUDA runtime is newer than driver." )
DRIVER_ERROR_CASE( cudaErrorSetOnActiveProcess , "Set on active process error." )
DRIVER_ERROR_CASE( cudaErrorNoDevice , "No available CUDA device." )
DRIVER_ERROR_CASE( cudaErrorMissingConfiguration , "Missing configuration error." )
DRIVER_ERROR_CASE( cudaErrorMemoryAllocation, "Memory allocation error." )
DRIVER_ERROR_CASE( cudaErrorInitializationError , "Initialization error." )
DRIVER_ERROR_CASE( cudaErrorLaunchFailure , "Launch failure." )
DRIVER_ERROR_CASE( cudaErrorPriorLaunchFailure , "Prior launch failure." )
DRIVER_ERROR_CASE( cudaErrorLaunchTimeout , "Launch timeout error." )
DRIVER_ERROR_CASE( cudaErrorLaunchOutOfResources , "Launch out of resources error." )
DRIVER_ERROR_CASE( cudaErrorInvalidDeviceFunction , "Invalid device function." )
DRIVER_ERROR_CASE( cudaErrorInvalidConfiguration , "Invalid configuration." )
DRIVER_ERROR_CASE( cudaErrorInvalidFilterSetting , "Invalid filter setting." )
DRIVER_ERROR_CASE( cudaErrorInvalidNormSetting , "Invalid norm setting." )
DRIVER_ERROR_CASE(cudaErrorMixedDeviceExecution,"Mixed device execution")
DRIVER_ERROR_CASE(cudaErrorCudartUnloading,"CUDA runtime unloading")
DRIVER_ERROR_CASE(cudaErrorUnknown,"Unknown error condition")
DRIVER_ERROR_CASE(cudaErrorNotYetImplemented,"Function not yet implemented")
DRIVER_ERROR_CASE(cudaErrorMemoryValueTooLarge,"Memory value too large")
DRIVER_ERROR_CASE(cudaErrorInvalidSurface,"Invalid surface")
DRIVER_ERROR_CASE(cudaErrorECCUncorrectable,"ECC uncorrectable")
DRIVER_ERROR_CASE(cudaErrorSharedObjectSymbolNotFound,"Shared object symbol not found")
DRIVER_ERROR_CASE(cudaErrorSharedObjectInitFailed,"Shared object init failed")

DRIVER_ERROR_CASE(cudaErrorUnsupportedLimit,"Unsupported limit")
DRIVER_ERROR_CASE(cudaErrorDuplicateVariableName,"Duplicate variable name")
DRIVER_ERROR_CASE(cudaErrorDuplicateTextureName,"Duplicate texture name")
DRIVER_ERROR_CASE(cudaErrorDuplicateSurfaceName,"Duplicate surface name")
DRIVER_ERROR_CASE(cudaErrorDevicesUnavailable,"Devices unavailable")
DRIVER_ERROR_CASE(cudaErrorInvalidKernelImage,"Invalid kernel image")
DRIVER_ERROR_CASE(cudaErrorNoKernelImageForDevice,"No kernel image for device")
DRIVER_ERROR_CASE(cudaErrorIncompatibleDriverContext,"Incompatible driver context")
DRIVER_ERROR_CASE(cudaErrorPeerAccessAlreadyEnabled,"Peer access already enabled")

DRIVER_ERROR_CASE(cudaErrorPeerAccessNotEnabled,"Peer access not enabled")
DRIVER_ERROR_CASE(cudaErrorDeviceAlreadyInUse,"Device already in use")
DRIVER_ERROR_CASE(cudaErrorProfilerDisabled,"Profiler disabled")
DRIVER_ERROR_CASE(cudaErrorProfilerNotInitialized,"Profiler not initialized")
DRIVER_ERROR_CASE(cudaErrorProfilerAlreadyStarted,"Profiler already started")
DRIVER_ERROR_CASE(cudaErrorProfilerAlreadyStopped,"Profiler already stopped")
#if CUDA_VERSION > 4000
DRIVER_ERROR_CASE(cudaErrorAssert,"Assertion error")
DRIVER_ERROR_CASE(cudaErrorTooManyPeers,"Too many peers")
DRIVER_ERROR_CASE(cudaErrorHostMemoryAlreadyRegistered,"Host mem already registered")

DRIVER_ERROR_CASE(cudaErrorHostMemoryNotRegistered,"Host memory not registered")
DRIVER_ERROR_CASE(cudaErrorOperatingSystem,"OS error")
DRIVER_ERROR_CASE(cudaErrorPeerAccessUnsupported,"Peer access unsupported")
DRIVER_ERROR_CASE(cudaErrorLaunchMaxDepthExceeded,"Launch max depth exceeded")
DRIVER_ERROR_CASE(cudaErrorLaunchFileScopedTex,"Launch file scoped tex")
DRIVER_ERROR_CASE(cudaErrorLaunchFileScopedSurf,"Launch file scoped surf")
DRIVER_ERROR_CASE(cudaErrorSyncDepthExceeded,"Sync depth exceeded")
DRIVER_ERROR_CASE(cudaErrorLaunchPendingCountExceeded,"Launch pending count exceeded")
DRIVER_ERROR_CASE(cudaErrorNotPermitted,"Not permitted")
DRIVER_ERROR_CASE(cudaErrorNotSupported,"Not supported")
DRIVER_ERROR_CASE(cudaErrorHardwareStackError,"H/W Stack Error")
DRIVER_ERROR_CASE(cudaErrorIllegalInstruction,"Illegal instruction")
DRIVER_ERROR_CASE(cudaErrorMisalignedAddress,"Mis-aligned address")
DRIVER_ERROR_CASE(cudaErrorInvalidAddressSpace,"Invalid address space")
DRIVER_ERROR_CASE(cudaErrorInvalidPc,"Invalid PC")
DRIVER_ERROR_CASE(cudaErrorIllegalAddress,"Illegal address")
#endif // CUDA_VERSION > 4000
DRIVER_ERROR_CASE(cudaErrorStartupFailure,"Startup failure")
DRIVER_ERROR_CASE(cudaErrorApiFailureBase,"Unexpected driver error")

#ifdef WHAT_CUDA_VERSION
		// need to fix for cuda 6?
		// not in cuda 6?
		CUDA_DRIVER_ERROR( CUDA_ERROR_LAUNCH_FAILED ,
				"launch failed)." )
		// not in cuda 6?
		CUDA_DRIVER_ERROR( CUDA_ERROR_UNKNOWN ,
				"unknown error)." )
#endif // WHAT_CUDA_VERSION

		default:
			sprintf(DEFAULT_ERROR_STRING,
		"%s:  unrecognized cuda error code %d",whence,e);
			NWARN(DEFAULT_ERROR_STRING);
			break;
	}
	e2 = cudaGetLastError();		// clear error
#ifdef CAUTIOUS
	if( e2 != e ){
		sprintf(DEFAULT_ERROR_STRING,
	"e = %d (0x%x), cudaGetLastError() = %d (0x%x)",e,e,e2,e2);
		NADVISE(DEFAULT_ERROR_STRING);
		NERROR1("CAUTIOUS:  describe_cuda_driver_error:  errors do not match!?");
	}
#endif /* CAUTIOUS */
}
      <<<volume_gridSIZE, volume_blockSIZE>>>
      ( order, input, output ); 
#endif

      flux_term6a<FLOAT_TYPE>
      <<<flux_gridSIZE, flux_blockSIZE>>>
      ( order, mesh.device_info, input, output, pen );

      flux_term6b<FLOAT_TYPE>
      <<<flux_gridSIZE, flux_blockSIZE>>>
      ( order, mesh.device_info, input, output, pen );


    #if 0

      cudaError_t error = cudaGetLastError();
      std::string lastError = cudaGetErrorString(error); 
      std::cout<<lastError<<std::endl;

    #endif

      return 0;

    }



    int _prec_mvm ( mode_vector<FLOAT_TYPE,int> input,
                    mode_vector<FLOAT_TYPE,int> output ) const
    {
#ifdef USE_PRECONDITIONER
Example #14
		ParallelCUDA()
{
	SetName("Hask");
}

void
	ConvolverObs::
	ConvInit() throw (...)
{
	// Get the data we want.
	m_data = (float *)GetData().ToCUDAArray();
	// Set up the filter.
	//CopyFilter((float *)GetFilter().GetData(), (int)GetRadius());
	// Get the temporary arrays.
	cudaError
		ret = cudaGetLastError();
	if (ret != cudaSuccess)
	{
		throw cudaGetErrorString(ret);
	}
	m_smoothX = (float *)GetSmoothX().ToCUDAArray(),
	m_smoothY = (float *)GetSmoothY().ToCUDAArray();
}

void
	ConvolverObs::
	Execute() throw (...)
{
	//DoTraceMessage(TIMING, "%s%s%s", "Start (", GetName(), "): Set");
	Log("Set");
	//cudaError
Example #15
int main(int argc, char **argv) {
  uchar4 *h_inputImageRGBA,  *d_inputImageRGBA;
  uchar4 *h_outputImageRGBA, *d_outputImageRGBA;
  unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred;

  float *h_filter;
  int    filterWidth;

  std::string input_file;
  std::string output_file;
  std::string reference_file;
  double perPixelError = 0.0;
  double globalError   = 0.0;
  bool useEpsCheck = false;
  switch (argc)
  {
	case 2:
	  input_file = std::string(argv[1]);
	  output_file = "HW2_output.png";
	  reference_file = "HW2_reference.png";
	  break;
	case 3:
	  input_file  = std::string(argv[1]);
      output_file = std::string(argv[2]);
	  reference_file = "HW2_reference.png";
	  break;
	case 4:
	  input_file  = std::string(argv[1]);
      output_file = std::string(argv[2]);
	  reference_file = std::string(argv[3]);
	  break;
	case 6:
	  useEpsCheck=true;
	  input_file  = std::string(argv[1]);
	  output_file = std::string(argv[2]);
	  reference_file = std::string(argv[3]);
	  perPixelError = atof(argv[4]);
      globalError   = atof(argv[5]);
	  break;
	default:
      std::cerr << "Usage: ./HW2 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
      exit(1);
  }
  //load the image and give us our input and output pointers
  preProcess(&h_inputImageRGBA, &h_outputImageRGBA, &d_inputImageRGBA, &d_outputImageRGBA,
             &d_redBlurred, &d_greenBlurred, &d_blueBlurred,
             &h_filter, &filterWidth, input_file);

  allocateMemoryAndCopyToGPU(numRows(), numCols(), h_filter, filterWidth);
  GpuTimer timer;
  timer.Start();
  //call the students' code
  your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA, numRows(), numCols(),
                     d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth);
  timer.Stop();
  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
  int err = printf("Your GPU code ran in: %f msecs.\n", timer.Elapsed());

  if (err < 0) {
    //Couldn't print! Probably the student closed stdout - bad news
    std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
    exit(1);
  }

  //check results and output the blurred image

  size_t numPixels = numRows()*numCols();
  //copy the output back to the host
  checkCudaErrors(cudaMemcpy(h_outputImageRGBA, d_outputImageRGBA__, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost));

  postProcess(output_file, h_outputImageRGBA);

  timer.Start();
  referenceCalculation(h_inputImageRGBA, h_outputImageRGBA,
                       numRows(), numCols(),
                       h_filter, filterWidth);
  timer.Stop();
  printf("Your CPU code ran in: %f msecs.\n", timer.Elapsed());

  postProcess(reference_file, h_outputImageRGBA);

    //  Cheater easy way with OpenCV
    //generateReferenceImage(input_file, reference_file, filterWidth);

  compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);

  checkCudaErrors(cudaFree(d_redBlurred));
  checkCudaErrors(cudaFree(d_greenBlurred));
  checkCudaErrors(cudaFree(d_blueBlurred));

  cleanUp();

  return 0;
}
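
checkCudaErrors is used throughout this example but not defined in the listing; a common definition, along the lines of the CUDA samples' helper, is sketched below. Treat it as an assumption about the helper, not the exact original.

// Sketch: a checkCudaErrors-style macro that aborts on any CUDA API failure.
#include <cstdlib>
#include <iostream>
#include <cuda_runtime.h>

#define checkCudaErrors(val) checkCudaCall((val), #val, __FILE__, __LINE__)

inline void checkCudaCall(cudaError_t result, const char *expr,
                          const char *file, int line) {
    if (result != cudaSuccess) {
        std::cerr << "CUDA error at " << file << ":" << line << ": "
                  << cudaGetErrorString(result) << " (" << expr << ")" << std::endl;
        exit(1);
    }
}
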
Example #16
TEST_F(CublasWrapperTest, matrixTransMatrixMultiply) {
  const int numberOfRows = 5;
  const int numberOfColumns = 3;
  const int numberOfRows2 = 5;
  const int numberOfColumns2 = 4;
  PinnedHostMatrix matrixT(numberOfRows, numberOfColumns);
  PinnedHostMatrix matrix2(numberOfRows2, numberOfColumns2);

  matrixT(0, 0) = 1;
  matrixT(1, 0) = 2;
  matrixT(2, 0) = 3;
  matrixT(3, 0) = 4;
  matrixT(4, 0) = 5;

  matrixT(0, 1) = 10;
  matrixT(1, 1) = 20;
  matrixT(2, 1) = 30;
  matrixT(3, 1) = 40;
  matrixT(4, 1) = 50;

  matrixT(0, 2) = 1.1;
  matrixT(1, 2) = 2.2;
  matrixT(2, 2) = 3.3;
  matrixT(3, 2) = 4.4;
  matrixT(4, 2) = 5.5;

  for(int i = 0; i < numberOfRows2; ++i){
    matrix2(i, 0) = 6;
  }

  for(int i = 0; i < numberOfRows2; ++i){
    matrix2(i, 1) = 7;
  }

  for(int i = 0; i < numberOfRows2; ++i){
    matrix2(i, 2) = 8;
  }

  for(int i = 0; i < numberOfRows2; ++i){
    matrix2(i, 3) = 9;
  }

  DeviceMatrix* matrixTDevice = hostToDeviceStream1.transferMatrix(matrixT);
  DeviceMatrix* matrix2Device = hostToDeviceStream1.transferMatrix(matrix2);
  DeviceMatrix* resultDevice = new DeviceMatrix(numberOfColumns, numberOfColumns2);

  cublasWrapper.matrixTransMatrixMultiply(*matrixTDevice, *matrix2Device, *resultDevice);
  cublasWrapper.syncStream();
  handleCudaStatus(cudaGetLastError(), "Error with matrixTransMatrixMultiply in matrixTransMatrixMultiply: ");

  HostMatrix* resultHost = deviceToHostStream1.transferMatrix(*resultDevice);
  cublasWrapper.syncStream();
  handleCudaStatus(cudaGetLastError(), "Error with transfer in matrixTransMatrixMultiply: ");

  EXPECT_EQ(90, (*resultHost)(0, 0));
  EXPECT_EQ(105, (*resultHost)(0, 1));
  EXPECT_EQ(120, (*resultHost)(0, 2));
  EXPECT_EQ(135, (*resultHost)(0, 3));

  EXPECT_EQ(900, (*resultHost)(1, 0));
  EXPECT_EQ(1050, (*resultHost)(1, 1));
  EXPECT_EQ(1200, (*resultHost)(1, 2));
  EXPECT_EQ(1350, (*resultHost)(1, 3));

  EXPECT_EQ(99, (*resultHost)(2, 0));
  EXPECT_EQ(115.5, (*resultHost)(2, 1));
  EXPECT_EQ(132, (*resultHost)(2, 2));
  EXPECT_EQ(148.5, (*resultHost)(2, 3));

  delete matrixTDevice;
  delete matrix2Device;
  delete resultDevice;
  delete resultHost;
}
Example #17
cudaError_t GridGpu::transferData(complexVector &trajData)
{
    cudaMemcpy(m_d_trajData, trajData.data(), trajData.size() * sizeof(complexGpu), cudaMemcpyHostToDevice);
    return cudaGetLastError();
}
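Since cudaMemcpy itself returns a cudaError_t, the same status could be taken from the call's return value instead of querying the sticky error afterwards; the two only differ if an earlier, unrelated error is still pending. A minimal sketch of that variant, written as a free function rather than the class method above:

#include <cuda_runtime.h>

// Sketch: report the status of the copy directly from its return value.
static cudaError_t copy_to_device(void *dst, const void *src, size_t bytes)
{
    return cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
}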
int _tmain(int argc, _TCHAR* argv[]) 
{
	uchar4 *h_inputImageRGBA,  *d_inputImageRGBA;
	uchar4 *h_outputImageRGBA, *d_outputImageRGBA;
	unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred;

	float *h_filter;
	int    filterWidth;

	//PreProcess
	const std::string *filename = new std::string("./cinque_terre_small.jpg");
	cv::Mat imageInputRGBA;
	cv::Mat imageOutputRGBA;

	//make sure the context initializes ok
	checkCudaErrors(cudaFree(0));

	cv::Mat image = cv::imread(filename->c_str(), CV_LOAD_IMAGE_COLOR);
  
	if (image.empty()) 
	{
		// Dereference the string pointer; streaming the pointer itself would print an address
		std::cerr << "Couldn't open file: " << *filename << std::endl;
		cv::waitKey(0);
		exit(1);
	}

	cv::cvtColor(image, imageInputRGBA, CV_BGR2RGBA);

	//allocate memory for the output
	imageOutputRGBA.create(image.rows, image.cols, CV_8UC4);

	//This shouldn't ever happen given the way the images are created
	//at least based upon my limited understanding of OpenCV, but better to check
	if (!imageInputRGBA.isContinuous() || !imageOutputRGBA.isContinuous()) {
		std::cerr << "Images aren't continuous!! Exiting." << std::endl;
		exit(1);
	}

	h_inputImageRGBA  = (uchar4 *)imageInputRGBA.ptr<unsigned char>(0);
	h_outputImageRGBA = (uchar4 *)imageOutputRGBA.ptr<unsigned char>(0);

	const size_t numPixels = image.rows * image.cols;
	//allocate memory on the device for both input and output
	checkCudaErrors(cudaMalloc(&d_inputImageRGBA, sizeof(uchar4) * numPixels));
	checkCudaErrors(cudaMalloc(&d_outputImageRGBA, sizeof(uchar4) * numPixels));
	checkCudaErrors(cudaMemset(d_outputImageRGBA, 0, numPixels * sizeof(uchar4))); //make sure no memory is left lying around

	//copy input array to the GPU
	checkCudaErrors(cudaMemcpy(d_inputImageRGBA, h_inputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice));

	//now create the filter that they will use
	const int blurKernelWidth = 9;
	const float blurKernelSigma = 2.;

	filterWidth = blurKernelWidth;

	//create and fill the filter we will convolve with
	h_filter = new float[blurKernelWidth * blurKernelWidth];

	float filterSum = 0.f; //for normalization

	for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) 
	{
		for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c)
		{
			float filterValue = expf( -(float)(c * c + r * r) / (2.f * blurKernelSigma * blurKernelSigma));
			h_filter[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] = filterValue;
			filterSum += filterValue;
		}
	}

	float normalizationFactor = 1.f / filterSum;

	for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r)
	{
		for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c)
		{
			h_filter[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] *= normalizationFactor;
		}
	}

	//blurred
	checkCudaErrors(cudaMalloc(&d_redBlurred,    sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMalloc(&d_greenBlurred,  sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMalloc(&d_blueBlurred,   sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMemset(d_redBlurred,   0, sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMemset(d_greenBlurred, 0, sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMemset(d_blueBlurred,  0, sizeof(unsigned char) * numPixels));


	allocateMemoryAndCopyToGPU(image.rows, image.cols, h_filter, filterWidth);
	GpuTimer timer;
	timer.Start();
	//call the students' code
	your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA, image.rows, image.cols,
						d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth);
	timer.Stop();
	cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
	int err = printf("%f msecs.\n", timer.Elapsed());

	if (err < 0) {
		//Couldn't print! Probably the student closed stdout - bad news
		std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
		exit(1);
	}

	cleanup();

	//check results and output the blurred image
	//PostProcess

	//copy the output back to the host
	checkCudaErrors(cudaMemcpy(imageOutputRGBA.ptr<unsigned char>(0), d_outputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost));

	cv::Mat imageOutputBGR;
	cv::cvtColor(imageOutputRGBA, imageOutputBGR, CV_RGBA2BGR);
	//output the image
	cv::imwrite("./blurredResult.jpg", imageOutputBGR);

	cv::namedWindow( "Display window", CV_WINDOW_NORMAL);
	cv::imshow("Display window", imageOutputBGR);
	
	cv::waitKey(0);


	checkCudaErrors(cudaFree(d_redBlurred));
	checkCudaErrors(cudaFree(d_greenBlurred));
	checkCudaErrors(cudaFree(d_blueBlurred));

	return 0;
}
Example #19
void SimCudaHelper::CheckError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    CheckError(err, msg);
}
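Note that cudaGetLastError only reports errors the runtime has already recorded, so for asynchronous kernel launches a helper like this is typically preceded by a cudaDeviceSynchronize. The two-argument CheckError(err, msg) overload called here is not shown in this example; the free function below is only an illustrative sketch of what such a reporting helper might do.

#include <cuda_runtime.h>
#include <cstdio>

// Illustrative reporting helper; not the project's CheckError overload.
static void report_cuda_error(cudaError_t err, const char *msg)
{
    if (err != cudaSuccess)
        fprintf(stderr, "%s: %s\n", msg ? msg : "CUDA error",
                cudaGetErrorString(err));
}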
Example #20
void testCuda(int m, int n, int nnz, std::vector<int>& rows, std::vector<int>& cols,
		std::vector<double>& values, double* matB){

    double tol=1e-9;
    double start, stop, time_to_build, time_to_solve;

    int cudaDevice = 0;

    checkCudaErrors(cudaSetDevice(cudaDevice));

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, cudaDevice);
    printf("Device Number: %d\n", cudaDevice);
    printf("  Device name: %s\n", prop.name);
    checkCudaErrors(cudaDeviceReset());

    size_t mem_tot = 0;
    size_t mem_free = 0;

    cudaMemGetInfo(&mem_free, &mem_tot);
    printf("\nFree memory: %zu", mem_free);

	MatSparse matA;
    matA.setSize(m, n);

    std::vector<int> I, J;
    std::vector<double> V;

    for (int k = 0; k < nnz; k++){
    	double _val = values[k];
    	int i = rows[k];
    	int j = cols[k];

    	if (fabs(_val) > tol){
        	I.push_back(i-1);
        	J.push_back(j-1);
        	V.push_back(_val);
        }
    }

    start = second();
    matA.fromTruples(I, J, V);
    stop = second();
    time_to_build = stop - start;
    std::cerr << "Time to Build in GPU (second): " << time_to_build << std::endl;


    // ******************************** GPU SOLVER ******************************** //

    // --- Initialize cuSOLVER and cuSPARSE
     	cusolverSpHandle_t cusolver_handle = NULL; checkCudaErrors(cusolverSpCreate(&cusolver_handle));
     	cusparseHandle_t cusparse_handle = NULL; checkCudaErrors(cusparseCreate(&cusparse_handle));
     	cudaStream_t cudaStream = NULL; checkCudaErrors(cudaStreamCreate(&cudaStream));
     	checkCudaErrors(cusolverSpSetStream(cusolver_handle, cudaStream));
     	checkCudaErrors(cusparseSetStream(cusparse_handle, cudaStream));


        cusparseMatDescr_t descrA;      checkCudaErrors(cusparseCreateMatDescr(&descrA));
        checkCudaErrors(cusparseSetMatType     (descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
        checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO));


        printf("\nAlloc GPU memory...\n");
        double *d_A;            checkCudaErrors(cudaMalloc(&d_A, nnz * sizeof(double)));
        int *d_A_RowIndices;    checkCudaErrors(cudaMalloc(&d_A_RowIndices, (m + 1) * sizeof(int)));
        int *d_A_ColIndices;    checkCudaErrors(cudaMalloc(&d_A_ColIndices, nnz * sizeof(int)));
        double *d_x;        checkCudaErrors(cudaMalloc(&d_x, m * sizeof(double)));
        double *d_b; checkCudaErrors(cudaMalloc(&d_b, m * sizeof(double)));
        printf("\nError: %s", cudaGetErrorString(cudaGetLastError()));

        printf("\nCopying data...\n");
        checkCudaErrors(cudaMemcpy(d_A, matA.valuesPtr(), nnz * sizeof(double), cudaMemcpyHostToDevice));
        checkCudaErrors(cudaMemcpy(d_A_RowIndices, matA.RowPtr(), (m + 1) * sizeof(int), cudaMemcpyHostToDevice));
        checkCudaErrors(cudaMemcpy(d_A_ColIndices, matA.ColIdxPtr(), nnz * sizeof(int), cudaMemcpyHostToDevice));
        checkCudaErrors(cudaMemcpy(d_b, matB, m * sizeof(double), cudaMemcpyHostToDevice));

        double *h_x = (double *)malloc(m * sizeof(double));

        printf("\nError: %s", cudaGetErrorString(cudaGetLastError()));
        cudaMemGetInfo(&mem_free, &mem_tot);
        printf("\nFree memory: %d", mem_free);

        int reorder = 0;
        int singularity = 0;
        start = second();
        //checkCudaErrors(cusolverSpDcsrlsvluHost(cusolver_handle, Nrows, nnz, descrA, sparse.Values(),
        	//	sparse.RowPtr(), sparse.ColIdx(), mB.values, tol, reorder, h_x, &singularity));
        checkCudaErrors(cusolverSpDcsrlsvqr(cusolver_handle, m, nnz, descrA, d_A, d_A_RowIndices,
               		d_A_ColIndices, d_b, tol, reorder, d_x, &singularity));
        checkCudaErrors(cudaDeviceSynchronize());
        stop = second();
        time_to_solve = stop - start;


        checkCudaErrors(cudaMemcpy(h_x, d_x, m * sizeof(double), cudaMemcpyDeviceToHost));

//        for (int k=0; k<mA.getNumRows(); k++) solution[k] = h_x[k];


        checkCudaErrors(cusparseDestroy(cusparse_handle));
        checkCudaErrors(cusolverSpDestroy(cusolver_handle));
        checkCudaErrors(cudaStreamDestroy(cudaStream));
        checkCudaErrors(cudaFree(d_b));
        checkCudaErrors(cudaFree(d_x));

        checkCudaErrors(cudaFree(d_A));
        checkCudaErrors(cudaFree(d_A_RowIndices));
        checkCudaErrors(cudaFree(d_A_ColIndices));

        free(h_x);

        std::cerr << "Time to Build in GPU (second): " << time_to_build << std::endl;
        std::cerr << "Time to Solve in GPU (second): " << time_to_solve << std::endl;
        std::cerr << "done!";

    // ****************************************************************************** //
}
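One detail worth checking after the solve above: cusolverSpDcsrlsvqr reports rank deficiency through its last parameter. As documented for the cuSOLVER sparse solvers, the value is -1 when the matrix is non-singular under the given tolerance, otherwise it is the index where singularity was detected. A few lines that could follow the call (a sketch, assuming that convention):

        // Sketch: sanity-check the singularity flag set by cusolverSpDcsrlsvqr.
        if (singularity >= 0) {
            std::cerr << "WARNING: matrix is singular at row " << singularity
                      << " under tol = " << tol << std::endl;
        }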
Example #21
extern int scanhash_groestlcoin(int thr_id, uint32_t *pdata, uint32_t *ptarget,
    uint32_t max_nonce, uint32_t *hashes_done)
{
	static THREAD uint32_t *foundNounce = nullptr;

    uint32_t start_nonce = pdata[19];
	unsigned int intensity = (device_sm[device_map[thr_id]] > 500) ? 24 : 23;
	uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1U << intensity);
	uint32_t throughput = min(throughputmax, max_nonce - start_nonce) & 0xfffffc00;

    if (opt_benchmark)
        ptarget[7] = 0x0000000f;

    // init
	static THREAD volatile bool init = false;
	if(!init)
    {
		CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
		cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
		cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
		CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id]));

		groestlcoin_cpu_init(thr_id, throughputmax);
		CUDA_SAFE_CALL(cudaMallocHost(&foundNounce, 2 * 4));
		init = true;
    }

    // Endian swap is necessary
    uint32_t endiandata[32];
    for (int kk=0; kk < 32; kk++)
        be32enc(&endiandata[kk], pdata[kk]);

    // Prepare the context with the endian-swapped block header (the nonce is replaced later)
    groestlcoin_cpu_setBlock(thr_id, endiandata);

	do
	{
		// GPU
		const uint32_t Htarg = ptarget[7];

		groestlcoin_cpu_hash(thr_id, throughput, pdata[19], foundNounce, ptarget[7]);

		if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);}
		if(foundNounce[0] < 0xffffffff)
		{
			uint32_t tmpHash[8];
			endiandata[19] = SWAP32(foundNounce[0]);
			groestlhash(tmpHash, endiandata);

			if(tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget))
			{
				int res = 1;
				if(opt_benchmark)
					applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], foundNounce[0]);
				*hashes_done = pdata[19] - start_nonce + throughput;
				if(foundNounce[1] != 0xffffffff)
				{
					endiandata[19] = SWAP32(foundNounce[1]);
					groestlhash(tmpHash, endiandata);
					if(tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget))
					{
						pdata[21] = foundNounce[1];
						res++;
						if(opt_benchmark)
							applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], foundNounce[1]);
					}
					else
					{
						if(tmpHash[7] != Htarg)
						{
							applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNounce[1]);
						}
					}
				}
				pdata[19] = foundNounce[0];
				return res;
			}
			else
			{
				if(tmpHash[7] != Htarg)
				{
					applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNounce[0]);
				}
			}
		}

		pdata[19] += throughput;
		cudaError_t err = cudaGetLastError();
		if(err != cudaSuccess)
		{
			applog(LOG_ERR, "GPU #%d: %s", device_map[thr_id], cudaGetErrorString(err));
			exit(EXIT_FAILURE);
		}
	} while(!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput)));

    *hashes_done = pdata[19] - start_nonce;
    return 0;
}
Example #22
int main(int argc, char **argv) {
  unsigned int *inputVals;
  unsigned int *inputPos;
  unsigned int *outputVals;
  unsigned int *outputPos;

  size_t numElems;

  std::string input_file;
  std::string template_file;
  std::string output_file;
  std::string reference_file = "red_eye_effect.gold";
  double perPixelError = 0.0;
  double globalError   = 0.0;
  bool useEpsCheck = false;

  switch (argc)
  {
	case 3:
	  input_file  = std::string(argv[1]);
      template_file = std::string(argv[2]);
	  output_file = "HW4_output.png";
	  break;
	case 4:
	  input_file  = std::string(argv[1]);
      template_file = std::string(argv[2]);
	  output_file = std::string(argv[3]);
	  break;
	default:
          std::cerr << "Usage: ./HW4 input_file template_file [output_filename]" << std::endl;
          exit(1);
  }
  //load the image and give us our input and output pointers
  preProcess(&inputVals, &inputPos, &outputVals, &outputPos, numElems, input_file, template_file);
  
  GpuTimer timer;
  timer.Start();

  //call the students' code
  your_sort(inputVals, inputPos, outputVals, outputPos, numElems);

  timer.Stop();
  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
  printf("\n");
  int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());

  if (err < 0) {
    //Couldn't print! Probably the student closed stdout - bad news
    std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
    exit(1);
  }

  //check results and output the red-eye corrected image
  postProcess(outputVals, outputPos, numElems, output_file);


  // check code moved from HW4.cu
  /****************************************************************************
  * You can use the code below to help with debugging, but make sure to       *
  * comment it out again before submitting your assignment for grading,       *
  * otherwise this code will take too much time and make it seem like your    *
  * GPU implementation isn't fast enough.                                     *
  *                                                                           *
  * This code MUST RUN BEFORE YOUR CODE in case you accidentally change       *
  * the input values when implementing your radix sort.                       *
  *                                                                           *
  * This code performs the reference radix sort on the host and compares your *
  * sorted values to the reference.                                           *
  *                                                                           *
  * Thrust containers are used for copying memory from the GPU                *
  * ************************************************************************* */
  thrust::device_ptr<unsigned int> d_inputVals(inputVals);
  thrust::device_ptr<unsigned int> d_inputPos(inputPos);

  thrust::host_vector<unsigned int> h_inputVals(d_inputVals,
                                                d_inputVals+numElems);
  thrust::host_vector<unsigned int> h_inputPos(d_inputPos,
                                               d_inputPos + numElems);

  thrust::host_vector<unsigned int> h_outputVals(numElems);
  thrust::host_vector<unsigned int> h_outputPos(numElems);
  
  reference_calculation(&h_inputVals[0], &h_inputPos[0],
						&h_outputVals[0], &h_outputPos[0],
						numElems);

  //postProcess(valsPtr, posPtr, numElems, reference_file);

  compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);

  thrust::device_ptr<unsigned int> d_outputVals(outputVals);
  thrust::device_ptr<unsigned int> d_outputPos(outputPos);

  thrust::host_vector<unsigned int> h_yourOutputVals(d_outputVals,
                                                     d_outputVals + numElems);
  thrust::host_vector<unsigned int> h_yourOutputPos(d_outputPos,
                                                    d_outputPos + numElems);

  checkResultsExact(&h_outputVals[0], &h_yourOutputVals[0], numElems);
  //checkResultsExact(&h_outputPos[0], &h_yourOutputPos[0], numElems);

  checkCudaErrors(cudaFree(inputVals));
  checkCudaErrors(cudaFree(inputPos));
  checkCudaErrors(cudaFree(outputVals));
  checkCudaErrors(cudaFree(outputPos));

  return 0;
}
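The thrust::device_ptr wrappers above are what allow the thrust::host_vector range constructors to copy device memory back to the host. A self-contained sketch of just that pattern (buffer name and size are illustrative):

#include <thrust/device_ptr.h>
#include <thrust/host_vector.h>
#include <cuda_runtime.h>

void copy_back_example()
{
    unsigned int *d_vals = nullptr;
    const size_t n = 1024;
    cudaMalloc(&d_vals, n * sizeof(unsigned int));
    cudaMemset(d_vals, 0, n * sizeof(unsigned int));

    // Wrapping the raw pointer tells Thrust it lives in device memory, so the
    // host_vector range constructor performs a device-to-host copy.
    thrust::device_ptr<unsigned int> d_ptr(d_vals);
    thrust::host_vector<unsigned int> h_vals(d_ptr, d_ptr + n);

    cudaFree(d_vals);
}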
Example #23
int main(int argc, char **argv) {
	printf("Computing Game Of Life On %d x %d Board.\n", DIM_X, DIM_Y);
	
	int *host_current, *host_future, *host_future_naive, *host_future_cached;
	int *gpu_current, *gpu_future;
	clock_t start, stop;
	
	cudaMallocHost((void**) &host_current, DIM_X * DIM_Y * sizeof(int));
	cudaMallocHost((void**) &host_future, DIM_X * DIM_Y * sizeof(int));	
	cudaMallocHost((void**) &host_future_naive, DIM_X * DIM_Y * sizeof(int));
	cudaMallocHost((void**) &host_future_cached, DIM_X * DIM_Y * sizeof(int));
	assert(cudaGetLastError() == cudaSuccess);
	
	cudaMalloc((void**) &gpu_current, DIM_X * DIM_Y * sizeof(int));
	cudaMalloc((void**) &gpu_future, DIM_X * DIM_Y * sizeof(int));
	printf("%s\n", cudaGetErrorString(cudaGetLastError()));
	assert(cudaGetLastError() == cudaSuccess);
	
	fill_board(host_current, 40); 
	add_glider(host_current);
	
	cudaMemcpy(gpu_current, host_current, DIM_X * DIM_Y * sizeof(int), cudaMemcpyHostToDevice);

//	print_board(host_current);
	
	float time_naive, time_cached, time_cpu;
	for(int i = 1; i < STEPS; i++) {
		printf("=========\n");
		
		start = clock();
		naive_game_of_life_wrapper(gpu_current, gpu_future);
		cudaMemcpy(host_future_naive, gpu_future, DIM_X * DIM_Y * sizeof(int), cudaMemcpyDeviceToHost);
		stop = clock();
		time_naive = (float)(stop - start)/CLOCKS_PER_SEC;
		printf("Time for Naive GPU To Compute Next Phase: %.5f s\n", time_naive);
		
		start = clock();
		cached_game_of_life_wrapper(gpu_current, gpu_future);
		cudaMemcpy(host_future_cached, gpu_future, DIM_X * DIM_Y * sizeof(int), cudaMemcpyDeviceToHost);
		stop = clock();
		time_cached = (float)(stop - start)/CLOCKS_PER_SEC;
		printf("Time for Cached GPU To Compute Next Phase: %.5f s\n", time_cached);
		
		start = clock();
		update_board(host_current, host_future);
		stop = clock();
		time_cpu = (float)(stop - start)/CLOCKS_PER_SEC;
		printf("Time for CPU To Compute Next Phase: %.5f s\n", time_cpu);
		
		printf("speedup for naive = %.2f; speedup for cached = %.2f; speedup for cached over naive = %.2f\n", time_cpu/time_naive, time_cpu/time_cached, time_naive/time_cached);

		check_boards(host_future, host_future_naive);
		check_boards(host_future, host_future_cached);
				
		int *temp;
		
		temp = host_current;
		host_current = host_future;
		host_future = temp;
		
		temp = gpu_current;
		gpu_current = gpu_future;
		gpu_future = temp;
	}
	
	// Host buffers were allocated with cudaMallocHost, so release them with cudaFreeHost
	cudaFreeHost(host_future);
	cudaFreeHost(host_future_naive);
	cudaFreeHost(host_future_cached);
	cudaFreeHost(host_current);
	cudaFree(gpu_current);
	cudaFree(gpu_future);
	
	return 0;
}
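The timings above use clock(), which measures host wall-clock time and therefore includes the blocking cudaMemcpy in each GPU measurement. If only the kernel phase is of interest, CUDA events give device-side timings; a sketch of how one of the phases could be timed that way (not part of the original code):

	// Sketch: time a GPU phase with CUDA events instead of clock().
	cudaEvent_t ev_start, ev_stop;
	cudaEventCreate(&ev_start);
	cudaEventCreate(&ev_stop);

	cudaEventRecord(ev_start);
	naive_game_of_life_wrapper(gpu_current, gpu_future);
	cudaEventRecord(ev_stop);
	cudaEventSynchronize(ev_stop);

	float ms = 0.0f;
	cudaEventElapsedTime(&ms, ev_start, ev_stop);
	printf("Time for Naive GPU kernel only: %.5f s\n", ms / 1000.0f);

	cudaEventDestroy(ev_start);
	cudaEventDestroy(ev_stop);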
float WFIRFilterCuda::cudaFilter( WLEMData::ScalarT* const output, const WLEMData::ScalarT* const input,
                const WLEMData::ScalarT* const previous, size_t channels, size_t samples, const WLEMData::ScalarT* const coeffs,
                size_t coeffSize )
{
    CuScalarT *dev_in = NULL;
    size_t pitchIn;

    CuScalarT *dev_prev = NULL;
    size_t pitchPrev;

    CuScalarT *dev_out = NULL;
    size_t pitchOut;

    CuScalarT *dev_co = NULL;

    try
    {
        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_in, &pitchIn, samples * sizeof( CuScalarT ), channels ) );
        CudaThrowsCall(
                        cudaMemcpy2D( dev_in, pitchIn, input, samples * sizeof( CuScalarT ), samples * sizeof( CuScalarT ),
                                        channels, cudaMemcpyHostToDevice ) );

        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_prev, &pitchPrev, coeffSize * sizeof( CuScalarT ), channels ) );
        CudaThrowsCall(
                        cudaMemcpy2D( dev_prev, pitchPrev, previous, coeffSize * sizeof( CuScalarT ),
                                        coeffSize * sizeof( CuScalarT ), channels, cudaMemcpyHostToDevice ) );

        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_out, &pitchOut, samples * sizeof( CuScalarT ), channels ) );

        CudaThrowsCall( cudaMalloc( ( void** )&dev_co, coeffSize * sizeof( CuScalarT ) ) );
        CudaThrowsCall( cudaMemcpy( dev_co, coeffs, coeffSize * sizeof( CuScalarT ), cudaMemcpyHostToDevice ) );
    }
    catch( const WException& e )
    {
        wlog::error( CLASS ) << e.what();
        if( dev_in )
        {
            CudaSafeCall( cudaFree( ( void* )dev_in ) );
        }
        if( dev_prev )
        {
            CudaSafeCall( cudaFree( ( void* )dev_prev ) );
        }
        if( dev_out )
        {
            CudaSafeCall( cudaFree( ( void* )dev_out ) );
        }
        if( dev_co )
        {
            CudaSafeCall( cudaFree( ( void* )dev_co ) );
        }
        throw WLBadAllocException( "Could not allocate CUDA memory!" );
    }

    size_t threadsPerBlock = 32;
    size_t blocksPerGrid = ( samples + threadsPerBlock - 1 ) / threadsPerBlock;
    size_t sharedMem = coeffSize * sizeof( CuScalarT );

    cudaEvent_t start, stop;
    cudaEventCreate( &start );
    cudaEventCreate( &stop );

    cudaEventRecord( start, 0 );
    cuFirFilter( blocksPerGrid, threadsPerBlock, sharedMem, dev_out, dev_in, dev_prev, channels, samples, dev_co, coeffSize,
                    pitchOut, pitchIn, pitchPrev );
    cudaError_t kernelError = cudaGetLastError();

    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );

    float elapsedTime;
    cudaEventElapsedTime( &elapsedTime, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );

    try
    {
        if( kernelError != cudaSuccess )
        {
            const std::string err( cudaGetErrorString( kernelError ) );
            throw WException( "CUDA kernel failed: " + err );
        }
        CudaThrowsCall(
                        cudaMemcpy2D( output, samples * sizeof( CuScalarT ), dev_out, pitchOut, samples * sizeof( CuScalarT ),
                                        channels, cudaMemcpyDeviceToHost ) );
    }
    catch( const WException& e )
    {
        wlog::error( CLASS ) << e.what();
        elapsedTime = -1.0;
    }

    CudaSafeCall( cudaFree( ( void* )dev_in ) );
    CudaSafeCall( cudaFree( ( void* )dev_prev ) );
    CudaSafeCall( cudaFree( ( void* )dev_out ) );
    CudaSafeCall( cudaFree( ( void* )dev_co ) );

    if( elapsedTime > -1.0 )
    {
        return elapsedTime;
    }
    else
    {
        throw WException( "Error in cudaFilter()" );
    }
}
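The catch block above has to free each of the four device buffers by hand. One way to avoid that repetition (a sketch only, not part of this code) is a small RAII guard that frees its pointer on destruction, so a throw from any CudaThrowsCall unwinds cleanly:

// Sketch: RAII guard for a raw device allocation (illustrative, not the
// project's code). Destruction releases the buffer even if an exception is
// thrown while later allocations or copies are in flight.
struct DevBuffer
{
    void *ptr = nullptr;
    explicit DevBuffer( size_t bytes ) { CudaThrowsCall( cudaMalloc( &ptr, bytes ) ); }
    ~DevBuffer() { if( ptr ) cudaFree( ptr ); }
    DevBuffer( const DevBuffer& ) = delete;
    DevBuffer& operator=( const DevBuffer& ) = delete;
};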
Example #25
//--------------------------------------------------------------------------
// CUDA init
//--------------------------------------------------------------------------
bool CUDAContext::configInit( )
{
#ifdef EQUALIZER_USE_CUDA
    cudaDeviceProp props;
    uint32_t device = getPipe()->getDevice();

    // Setup the CUDA device
    if( device == LB_UNDEFINED_UINT32 )
    {
        device = _getFastestDeviceID();
        LBWARN << "No CUDA device, using the fastest device: " << device
               << std::endl;
    }

    int device_count = 0;
    cudaGetDeviceCount( &device_count );
    LBINFO << "CUDA devices found: " << device_count << std::endl;
    LBASSERT( static_cast< uint32_t >( device_count ) > device );
    if( static_cast< uint32_t >( device_count ) <= device )
    {
        sendError( ERROR_CUDACONTEXT_DEVICE_NOTFOUND )
            << lexical_cast< std::string >( device );
        return false;
    }

    // We assume GL interop here, otherwise use cudaSetDevice( device );
    // Attention: this call requires a valid GL context!
    cudaGLSetGLDevice( device );

    int usedDevice = static_cast< int >( device );
#ifdef _WIN32

    HGPUNV handle = 0;

    if( !WGLEW_NV_gpu_affinity )
    {
        LBWARN <<"WGL_NV_gpu_affinity unsupported, ignoring device setting"
               << std::endl;
        return true;
    }

    if( !wglEnumGpusNV( device, &handle ))
    {
        LBWARN << "wglEnumGpusNV failed : " << lunchbox::sysError << std::endl;
        return false;
    }

    cudaWGLGetDevice( &usedDevice, handle );
#else
    cudaGetDevice( &usedDevice );
#endif
    LBASSERT( device == static_cast< uint32_t >( usedDevice ));
    cudaGetDeviceProperties( &props, usedDevice );

    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err)
    {
        sendError( ERROR_CUDACONTEXT_INIT_FAILED ) <<
            std::string( cudaGetErrorString( err ));
        return false;
    }

    LBINFO << "Using CUDA device: " << device << std::endl;
    return true;
#else
    sendError( ERROR_CUDACONTEXT_MISSING_SUPPORT );
    return false;
#endif
}
Example #26
    static void check_status() {
	throw_(cudaGetLastError());
    }
Example #27
/*===========================================================================*/
cudaError_t LastError()
{
    return cudaGetLastError();
}
Example #28
int main(int argc, char **argv) {
  uchar4        *h_rgbaImage, *d_rgbaImage;
  unsigned char *h_greyImage, *d_greyImage;

  std::string input_file;
  std::string output_file;
  std::string reference_file;
  double perPixelError = 0.0;
  double globalError   = 0.0;
  bool useEpsCheck = false;
  switch (argc)
  {
	case 2:
	  input_file = std::string(argv[1]);
	  output_file = "HW1_output.png";
	  reference_file = "HW1_reference.png";
	  break;
	case 3:
	  input_file  = std::string(argv[1]);
      output_file = std::string(argv[2]);
	  reference_file = "HW1_reference.png";
	  break;
	case 4:
	  input_file  = std::string(argv[1]);
      output_file = std::string(argv[2]);
	  reference_file = std::string(argv[3]);
	  break;
	case 6:
	  useEpsCheck=true;
	  input_file  = std::string(argv[1]);
	  output_file = std::string(argv[2]);
	  reference_file = std::string(argv[3]);
	  perPixelError = atof(argv[4]);
      globalError   = atof(argv[5]);
	  break;
	default:
      std::cerr << "Usage: ./HW1 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
      exit(1);
  }
  //load the image and give us our input and output pointers
  preProcess(&h_rgbaImage, &h_greyImage, &d_rgbaImage, &d_greyImage, input_file);

  GpuTimer timer;
  timer.Start();
  //call the students' code
  lineDetect(h_rgbaImage, d_rgbaImage, d_greyImage, numRows(), numCols());
  timer.Stop();
  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());

  int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());

  if (err < 0) {
    //Couldn't print! Probably the student closed stdout - bad news
    std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
    exit(1);
  }

  size_t numPixels = numRows()*numCols();
  checkCudaErrors(cudaMemcpy(h_greyImage, d_greyImage, sizeof(unsigned char) * numPixels, cudaMemcpyDeviceToHost));

  //check results and output the grey image
  postProcess(output_file, h_greyImage);

  referenceCalculation(h_rgbaImage, h_greyImage, numRows(), numCols());

  postProcess(reference_file, h_greyImage);

  //generateReferenceImage(input_file, reference_file);
  compareImages(reference_file, output_file, useEpsCheck, perPixelError, 
                globalError);

  cleanup();

  return 0;
}
Example #29
int main(int argc, char **argv) {
  float *d_luminance;
  unsigned int *d_cdf;

  size_t numRows, numCols;
  unsigned int numBins;

  std::string input_file;
  std::string output_file;
  std::string reference_file;
  double perPixelError = 0.0;
  double globalError   = 0.0;
  bool useEpsCheck = false;

  switch (argc)
  {
	case 2:
	  input_file = std::string(argv[1]);
	  output_file = "HW3_output.png";
	  reference_file = "HW3_reference.png";
	  break;
	case 3:
	  input_file  = std::string(argv[1]);
      output_file = std::string(argv[2]);
	  reference_file = "HW3_reference.png";
	  break;
	case 4:
	  input_file  = std::string(argv[1]);
      output_file = std::string(argv[2]);
	  reference_file = std::string(argv[3]);
	  break;
	case 6:
	  useEpsCheck=true;
	  input_file  = std::string(argv[1]);
	  output_file = std::string(argv[2]);
	  reference_file = std::string(argv[3]);
	  perPixelError = atof(argv[4]);
      globalError   = atof(argv[5]);
	  break;
	default:
      std::cerr << "Usage: ./HW3 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
      exit(1);
  }
  //load the image and give us our input and output pointers
  preProcess(&d_luminance, &d_cdf,
             &numRows, &numCols, &numBins, input_file);

  GpuTimer timer;
  float min_logLum, max_logLum;
  min_logLum = 0.f;
  max_logLum = 1.f;
  timer.Start();
  //call the students' code
  your_histogram_and_prefixsum(d_luminance, d_cdf, min_logLum, max_logLum,
                               numRows, numCols, numBins);
  timer.Stop();
  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
  int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());

  if (err < 0) {
    //Couldn't print! Probably the student closed stdout - bad news
    std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
    exit(1);
  }

  float *h_luminance = (float *) malloc(sizeof(float)*numRows*numCols);
  unsigned int *h_cdf = (unsigned int *) malloc(sizeof(unsigned int)*numBins);

  checkCudaErrors(cudaMemcpy(h_luminance, d_luminance, numRows*numCols*sizeof(float), cudaMemcpyDeviceToHost));

  //check results and output the tone-mapped image
  postProcess(output_file, numRows, numCols, min_logLum, max_logLum);

  for (size_t i = 1; i < numCols * numRows; ++i) {
    min_logLum = std::min(h_luminance[i], min_logLum);
    max_logLum = std::max(h_luminance[i], max_logLum);
  }

  referenceCalculation(h_luminance, h_cdf, numRows, numCols, numBins, min_logLum, max_logLum);

  checkCudaErrors(cudaMemcpy(d_cdf, h_cdf, sizeof(unsigned int) * numBins, cudaMemcpyHostToDevice));

  //check results and output the tone-mapped image
  postProcess(reference_file, numRows, numCols, min_logLum, max_logLum);

  cleanupGlobalMemory();

  compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);

  return 0;
}
Example #30
int kmeans_cuda(bool kmpp, float tolerance, float yinyang_t, uint32_t samples_size,
                uint16_t features_size, uint32_t clusters_size, uint32_t seed,
                uint32_t device, int32_t verbosity, const float *samples,
                float *centroids, uint32_t *assignments) {
  DEBUG("arguments: %d %.3f %.2f %" PRIu32 " %" PRIu16 " %" PRIu32 " %" PRIu32
        " %" PRIu32 " %" PRIi32 " %p %p %p\n",
        kmpp, tolerance, yinyang_t, samples_size, features_size, clusters_size,
        seed, device, verbosity, samples, centroids, assignments);
  auto check_result = check_args(
      tolerance, yinyang_t, samples_size, features_size, clusters_size,
      samples, centroids, assignments);
  if (check_result != kmcudaSuccess) {
    return check_result;
  }
  if (cudaSetDevice(device) != cudaSuccess) {
    return kmcudaNoSuchDevice;
  }

  void *device_samples;
  size_t device_samples_size = samples_size;
  device_samples_size *= features_size * sizeof(float);
  CUMALLOC(device_samples, device_samples_size, "samples");
  CUMEMCPY(device_samples, samples, device_samples_size, cudaMemcpyHostToDevice);
  unique_devptr device_samples_sentinel(device_samples);

  void *device_centroids;
  size_t centroids_size = clusters_size * features_size * sizeof(float);
  CUMALLOC(device_centroids, centroids_size, "centroids");
  unique_devptr device_centroids_sentinel(device_centroids);

  void *device_assignments;
  size_t assignments_size = samples_size * sizeof(uint32_t);
  CUMALLOC(device_assignments, assignments_size, "assignments");
  unique_devptr device_assignments_sentinel(device_assignments);

  void *device_assignments_prev;
  CUMALLOC(device_assignments_prev, assignments_size, "assignments_prev");
  unique_devptr device_assignments_prev_sentinel(device_assignments_prev);

  void *device_ccounts;
  CUMALLOC(device_ccounts, clusters_size * sizeof(uint32_t), "ccounts");
  unique_devptr device_ccounts_sentinel(device_ccounts);

  uint32_t yinyang_groups = yinyang_t * clusters_size;
  DEBUG("yinyang groups: %" PRIu32 "\n", yinyang_groups);
  void *device_assignments_yy = NULL, *device_bounds_yy = NULL,
      *device_drifts_yy = NULL, *device_passed_yy = NULL,
      *device_centroids_yy = NULL;
  if (yinyang_groups >= 1) {
    CUMALLOC(device_assignments_yy, clusters_size * sizeof(uint32_t),
             "yinyang assignments");
    size_t yyb_size = samples_size;
    yyb_size *= (yinyang_groups + 1) * sizeof(float);
    CUMALLOC(device_bounds_yy, yyb_size, "yinyang bounds");
    CUMALLOC(device_drifts_yy, centroids_size + clusters_size * sizeof(float),
             "yinyang drifts");
    CUMALLOC(device_passed_yy, assignments_size, "yinyang passed");
    size_t yyc_size = yinyang_groups * features_size * sizeof(float);
    if (yyc_size + (clusters_size + yinyang_groups) * sizeof(uint32_t)
        <= assignments_size) {
      device_centroids_yy = device_passed_yy;
    } else {
      CUMALLOC(device_centroids_yy, yyc_size, "yinyang group centroids");
    }
  }
  unique_devptr device_centroids_yinyang_sentinel(
      (device_centroids_yy != device_passed_yy)? device_centroids_yy : NULL);
  unique_devptr device_assignments_yinyang_sentinel(device_assignments_yy);
  unique_devptr device_bounds_yinyang_sentinel(device_bounds_yy);
  unique_devptr device_drifts_yinyang_sentinel(device_drifts_yy);
  unique_devptr device_passed_yinyang_sentinel(device_passed_yy);

  if (verbosity > 1) {
    RETERR(print_memory_stats());
  }
  RETERR(kmeans_cuda_setup(samples_size, features_size, clusters_size,
                           yinyang_groups, device, verbosity),
         DEBUG("kmeans_cuda_setup failed: %s\n",
               cudaGetErrorString(cudaGetLastError())));
  RETERR(kmeans_init_centroids(
      static_cast<KMCUDAInitMethod>(kmpp), samples_size, features_size,
      clusters_size, seed, verbosity, reinterpret_cast<float*>(device_samples),
      device_assignments, reinterpret_cast<float*>(device_centroids)),
         DEBUG("kmeans_init_centroids failed: %s\n",
               cudaGetErrorString(cudaGetLastError())));
  RETERR(kmeans_cuda_yy(
      tolerance, yinyang_groups, samples_size, clusters_size, features_size, verbosity,
      reinterpret_cast<float*>(device_samples),
      reinterpret_cast<float*>(device_centroids),
      reinterpret_cast<uint32_t*>(device_ccounts),
      reinterpret_cast<uint32_t*>(device_assignments_prev),
      reinterpret_cast<uint32_t*>(device_assignments),
      reinterpret_cast<uint32_t*>(device_assignments_yy),
      reinterpret_cast<float*>(device_centroids_yy),
      reinterpret_cast<float*>(device_bounds_yy),
      reinterpret_cast<float*>(device_drifts_yy),
      reinterpret_cast<uint32_t*>(device_passed_yy)),
         DEBUG("kmeans_cuda_internal failed: %s\n",
               cudaGetErrorString(cudaGetLastError())));
  CUMEMCPY(centroids, device_centroids, centroids_size, cudaMemcpyDeviceToHost);
  CUMEMCPY(assignments, device_assignments, assignments_size, cudaMemcpyDeviceToHost);
  DEBUG("return kmcudaSuccess\n");
  return kmcudaSuccess;
}
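The CUMALLOC, CUMEMCPY and RETERR macros used above belong to the surrounding project and their definitions are not shown here. The allocate-or-bail pattern they imply can be sketched with the plain runtime API as follows (function and message names are illustrative, not the project's):

#include <cuda_runtime.h>
#include <cstdio>

// Illustrative stand-in for an allocate-or-bail helper in the CUMALLOC style.
static bool dev_alloc(void **ptr, size_t bytes, const char *what)
{
    cudaError_t err = cudaMalloc(ptr, bytes);
    if (err != cudaSuccess) {
        fprintf(stderr, "allocation of %s (%zu bytes) failed: %s\n",
                what, bytes, cudaGetErrorString(err));
        return false;
    }
    return true;
}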