예제 #1
0
		/**
		 * Creates the SGM engine for the requested pixel depth / disparity range and
		 * allocates the device buffers used by the stereo pipeline.
		 *
		 * @param width_             image width in pixels
		 * @param height_            image height in pixels
		 * @param disparity_size_    disparity range; must be 64 or 128
		 * @param input_depth_bits_  bits per input pixel; must be 8 or 16
		 * @param output_depth_bits_ bits per output pixel (not used by this constructor)
		 * @param inout_type_        tells whether the input images already live on the device
		 * @throws std::logic_error  for an unsupported depth / disparity combination
		 */
		CudaStereoSGMResources(int width_, int height_, int disparity_size_, int input_depth_bits_, int output_depth_bits_, EXECUTE_INOUT inout_type_) {

			if (input_depth_bits_ == 8 && disparity_size_ == 64)
				sgm_engine = new SemiGlobalMatchingImpl<uint8_t, 64>();
			else if (input_depth_bits_ == 8 && disparity_size_ == 128)
				sgm_engine = new SemiGlobalMatchingImpl<uint8_t, 128>();
			else if (input_depth_bits_ == 16 && disparity_size_ == 64)
				sgm_engine = new SemiGlobalMatchingImpl<uint16_t, 64>();
			else if (input_depth_bits_ == 16 && disparity_size_ == 128)
				sgm_engine = new SemiGlobalMatchingImpl<uint16_t, 128>();
			else
				throw std::logic_error("depth bits must be 8 or 16, and disparity size must be 64 or 128");

			// All byte counts are computed in size_t: the previous int expression
			// (depth / 8 * width * height) could overflow for very large images.
			const size_t pixel_count = static_cast<size_t>(width_) * static_cast<size_t>(height_);
			const size_t src_bytes = static_cast<size_t>(input_depth_bits_ / 8) * pixel_count;
			const size_t disp_bytes = sizeof(uint16_t) * pixel_count;

			if (is_cuda_input(inout_type_)) {
				// Caller supplies device images; no staging buffers are needed.
				this->d_src_left = NULL;
				this->d_src_right = NULL;
			}
			else {
				// Staging buffers for host images that get uploaded on every execute().
				CudaSafeCall(cudaMalloc(&this->d_src_left, src_bytes));
				CudaSafeCall(cudaMalloc(&this->d_src_right, src_bytes));
			}

			// Final and intermediate 16-bit disparity maps for both views.
			CudaSafeCall(cudaMalloc(&this->d_left_disp, disp_bytes));
			CudaSafeCall(cudaMalloc(&this->d_right_disp, disp_bytes));

			CudaSafeCall(cudaMalloc(&this->d_tmp_left_disp, disp_bytes));
			CudaSafeCall(cudaMalloc(&this->d_tmp_right_disp, disp_bytes));

			// Start from all-zero disparity maps.
			CudaSafeCall(cudaMemset(this->d_left_disp, 0, disp_bytes));
			CudaSafeCall(cudaMemset(this->d_right_disp, 0, disp_bytes));
			CudaSafeCall(cudaMemset(this->d_tmp_left_disp, 0, disp_bytes));
			CudaSafeCall(cudaMemset(this->d_tmp_right_disp, 0, disp_bytes));
		}
예제 #2
0
ga_solver_cuda<TFloat>::ga_solver_cuda(population_set_cuda<TFloat>* population,
                                       evaluator_cuda<TFloat>* evaluator,
                                       prngenerator_cuda<TFloat>* prn_generator,
                                       uint32_t generation_target,
                                       TFloat* upper_bounds,
                                       TFloat* lower_bounds)

  : evolutionary_solver_cuda<TFloat>(population, evaluator, prn_generator,
                                     generation_target, upper_bounds,
                                     lower_bounds)
{
  // Default migration settings.
  _migration_step = 0;
  _migration_size = 1;
  _migration_selection_size = 2;

  // Default selection settings.
  _selection_size = 2;
  _selection_stochastic_factor = 0;

  // Default breeding settings.
  _crossover_rate = 0.9;
  _mutation_rate = 0.1;

  // Device buffers owned by the GA solver: one couple index per agent, and a
  // candidates reservoir sized isles * agents * agents.
  const size_t agent_count = _population->_TOTAL_AGENTS;
  const size_t couples_bytes = agent_count * sizeof(uint32_t);
  const size_t reservoir_bytes = _ISLES * _AGENTS * _AGENTS * sizeof(uint32_t);

  CudaSafeCall(cudaMalloc((void**)&_dev_couples_idx_array, couples_bytes));
  CudaSafeCall(cudaMalloc((void**)&_dev_candidates_reservoir_array,
                          reservoir_bytes));
}
예제 #3
0
// Wraps the given VTK opacity / color functions, builds the composite lookup
// table on the host and publishes it to CUDA as a 1D float4 texture object.
TransferFunction::TransferFunction(vtkSmartPointer<vtkPiecewiseFunction> otf, vtkSmartPointer<vtkColorTransferFunction> ctf, QObject *parent) : QObject(parent)
{
    // Keep the VTK functions; note the parameters shadow the ctk members of
    // the same name, which are therefore reached through this->.
    opacityTF = otf;
    colorTF = ctf;

    this->otf = QSharedPointer<ctkTransferFunction>(new ctkVTKPiecewiseFunction(opacityTF));
    this->ctf = QSharedPointer<ctkTransferFunction>(new ctkVTKColorTransferFunction(colorTF));

    // Re-upload the texture whenever either function is edited.
    connect(this->otf.data(), SIGNAL(changed()), this, SLOT(onOpacityTFChanged()));
    connect(this->ctf.data(), SIGNAL(changed()), this, SLOT(onColorTFChanged()));

    // No texture object exists yet.
    compositeTex = 0;

    // Sample both functions over [0, 1] and merge them into compositeTable.
    opacityTF->GetTable(0.0, 1.0, TABLE_SIZE, opacityTable);
    colorTF->GetTable(0.0, 1.0, TABLE_SIZE, colorTable);
    CompositeTable();

    // Sampling state: clamped, linearly filtered, normalized coordinates,
    // values returned as stored.
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = cudaAddressModeClamp;
    texDesc.filterMode = cudaFilterModeLinear;
    texDesc.normalizedCoords = true;
    texDesc.readMode = cudaReadModeElementType;

    // Four 32-bit float channels per table entry.
    channelDesc = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat);
    CudaSafeCall(cudaMallocArray(&array, &channelDesc, TABLE_SIZE));
    CudaSafeCall(cudaMemcpyToArray(array, 0, 0, compositeTable, sizeof(float) * TABLE_SIZE * 4, cudaMemcpyHostToDevice));

    // Bind the freshly filled array as the texture's backing resource.
    memset(&resourceDesc, 0, sizeof(resourceDesc));
    resourceDesc.resType = cudaResourceTypeArray;
    resourceDesc.res.array.array = array;

    CudaSafeCall(cudaCreateTextureObject(&compositeTex, &resourceDesc, &texDesc, NULL));
}
예제 #4
0
// Releases the texture object (when one exists) and then the CUDA array.
TransferFunction::~TransferFunction()
{
    if(compositeTex != 0)
    {
        CudaSafeCall(cudaDestroyTextureObject(compositeTex));
    }

    CudaSafeCall(cudaFreeArray(array));
}
예제 #5
0
// Slot: the color function changed. Refreshes the three color channels of the
// composite table, re-uploads it and recreates the texture object.
void TransferFunction::onColorTFChanged()
{
    // Drop the current texture object before its backing array is rewritten.
    if(compositeTex)
    {
        CudaSafeCall(cudaDestroyTextureObject(compositeTex));
        compositeTex = 0;
    }

    colorTF->GetTable(0.0, 1.0, TABLE_SIZE, colorTable);

    // colorTable holds 3 values per entry, compositeTable 4; the fourth slot of
    // each composite entry is deliberately left as it was.
    for(size_t entry = 0; entry < TABLE_SIZE; ++entry)
    {
        const size_t dstBase = entry * 4;
        const size_t srcBase = entry * 3;
        for(size_t channel = 0; channel < 3; ++channel)
        {
            compositeTable[dstBase + channel] = colorTable[srcBase + channel];
        }
    }

    CudaSafeCall(cudaMemcpyToArray(array, 0, 0, compositeTable, sizeof(float) * TABLE_SIZE * 4, cudaMemcpyHostToDevice));
    CudaSafeCall(cudaCreateTextureObject(&compositeTex, &resourceDesc, &texDesc, NULL));

    // Notify listeners.
    Changed();
}
예제 #6
0
void plan_fft(FFT_plans *plans, Arrays *arr,
	      Detector_settings *sett, Command_line_opts *opts)
{
  /*
    ############ FFT Plans ################
  */

  //arrlen is maximum of Ninterp and fftpad*nfft
  arr->arr_len = (sett->fftpad * sett->nfft > sett->Ninterp ? sett->fftpad * sett->nfft : sett->Ninterp);

  CudaSafeCall ( cudaMalloc((void**)&arr->cu_xa, arr->arr_len*sizeof(cufftDoubleComplex)) );
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_xb, arr->arr_len*sizeof(cufftDoubleComplex)) );

  CudaSafeCall ( cudaMalloc((void**)&arr->cu_xar, arr->arr_len*sizeof(cufftDoubleComplex)) );
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_xbr, arr->arr_len*sizeof(cufftDoubleComplex)) );


  CudaSafeCall ( cudaMalloc((void**)&arr->cu_xa_f, arr->arr_len*sizeof(COMPLEX_TYPE)) );
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_xb_f, arr->arr_len*sizeof(COMPLEX_TYPE)) );

  CudaSafeCall ( cudaMalloc((void**)&arr->cu_xar_f, arr->arr_len*sizeof(COMPLEX_TYPE)) );
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_xbr_f, arr->arr_len*sizeof(COMPLEX_TYPE)) );

  if (opts->fftinterp == INT) { //interbinning
    CudaSafeCall ( cudaMalloc((void**)&arr->cu_xa2_f, sett->nfft*sizeof(COMPLEX_TYPE)) );
    CudaSafeCall ( cudaMalloc((void**)&arr->cu_xb2_f, sett->nfft*sizeof(COMPLEX_TYPE)) );
  }

  sett->nfftf = sett->fftpad*sett->nfft;

  if (opts->fftinterp == INT) { //interbinning
    cufftPlan1d( &(plans->plan),
		 sett->nfft,
		 CUFFT_TRANSFORM_TYPE, 1);
  } else { //fft & zero padding
    cufftPlan1d( &(plans->plan),
		 sett->nfftf,
		 CUFFT_TRANSFORM_TYPE, 1);

  }

  //plans for interpolation with splines

  cufftPlan1d(&(plans->pl_int),
	      sett->nfft,
	      CUFFT_Z2Z, 1);

  cufftPlan1d(&(plans->pl_inv),
	      sett->Ninterp,
	      CUFFT_Z2Z, 1);

  /*
    ############ FFT plans end ################
  */

}
 // Builds the host-side population through the base class and allocates the
 // matching device arrays (genes, transformed genes, fitness values).
 population_set_cuda<TFloat>::population_set_cuda
 (const uint32_t ISLES,
  const uint32_t AGENTS,
  const uint32_t DIMENSIONS)
   : population_set<TFloat>(ISLES, AGENTS, DIMENSIONS)
 {
   // One TFloat per gene, and one TFloat per agent for the fitness.
   const size_t genes_bytes = _TOTAL_GENES * sizeof(TFloat);
   const size_t fitness_bytes = _TOTAL_AGENTS * sizeof(TFloat);

   CudaSafeCall(cudaMalloc((void **) &_dev_data_array, genes_bytes));
   CudaSafeCall(cudaMalloc((void **) &_dev_transformed_data_array, genes_bytes));
   CudaSafeCall(cudaMalloc((void **) &_dev_fitness_array, fitness_bytes));
 }
예제 #8
0
// Selects the CUDA device reported by cutGetMaxGflopsDeviceId() and asks the
// runtime to prefer shared memory over L1 cache on it.
void Application::_init()
{
    const int bestDevice = cutGetMaxGflopsDeviceId();

    CudaSafeCall( cudaSetDevice( bestDevice ) );
    CudaSafeCall( cudaDeviceSetCacheConfig( cudaFuncCachePreferShared ) );
}
예제 #9
0
	/**
	 * Runs one stereo matching pass: SGM engine, median filtering of both
	 * disparity maps, left/right consistency check, then delivery of the left
	 * disparity map to `dst`.
	 *
	 * @param left_pixels  left image; a device pointer when is_cuda_input(inout_type_),
	 *                     otherwise a host pointer that is uploaded first
	 * @param right_pixels right image, same convention as left_pixels
	 * @param dst          receives the left disparity map; device or host memory
	 *                     depending on is_cuda_output(inout_type_), 8 or 16 bits per
	 *                     pixel depending on output_depth_bits_
	 */
	void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst) {

		const void *d_input_left, *d_input_right;

		// Device inputs are used in place; host inputs are copied into the
		// d_src_* staging buffers held by cu_res_.
		if (is_cuda_input(inout_type_)) {
			d_input_left = left_pixels;
			d_input_right = right_pixels;
		}
		else {
			CudaSafeCall(cudaMemcpy(cu_res_->d_src_left, left_pixels, input_depth_bits_ / 8 * width_ * height_, cudaMemcpyHostToDevice));
			CudaSafeCall(cudaMemcpy(cu_res_->d_src_right, right_pixels, input_depth_bits_ / 8 * width_ * height_, cudaMemcpyHostToDevice));
			d_input_left = cu_res_->d_src_left;
			d_input_right = cu_res_->d_src_right;
		}

		void* d_tmp_left_disp = cu_res_->d_tmp_left_disp;
		void* d_tmp_right_disp = cu_res_->d_tmp_right_disp;
		void* d_left_disp = cu_res_->d_left_disp;
		void* d_right_disp = cu_res_->d_right_disp;

		if (is_cuda_output(inout_type_) && output_depth_bits_ == 16)
			d_left_disp = dst; // when there is no device-host copy or type conversion, use the passed buffer
		
		// Raw disparities go to the tmp buffers ...
		cu_res_->sgm_engine->execute((uint16_t*)d_tmp_left_disp, (uint16_t*)d_tmp_right_disp,
			d_input_left, d_input_right, width_, height_, param_.P1, param_.P2, param_.uniqueness, param_.subpixel);

		// ... and are median-filtered into the final buffers, which are then
		// passed to the consistency check.
		sgm::details::median_filter((uint16_t*)d_tmp_left_disp, (uint16_t*)d_left_disp, width_, height_);
		sgm::details::median_filter((uint16_t*)d_tmp_right_disp, (uint16_t*)d_right_disp, width_, height_);
		sgm::details::check_consistency((uint16_t*)d_left_disp, (uint16_t*)d_right_disp, d_input_left, width_, height_, input_depth_bits_, param_.subpixel);

		// Deliver the result according to output location and bit depth.
		if (!is_cuda_output(inout_type_) && output_depth_bits_ == 8) {
			// Host, 8 bit: narrow on the device (reusing the tmp buffer as scratch), then download.
			sgm::details::cast_16bit_8bit_array((const uint16_t*)d_left_disp, (uint8_t*)d_tmp_left_disp, width_ * height_);
			CudaSafeCall(cudaMemcpy(dst, d_tmp_left_disp, sizeof(uint8_t) * width_ * height_, cudaMemcpyDeviceToHost));
		}
		else if (is_cuda_output(inout_type_) && output_depth_bits_ == 8) {
			// Device, 8 bit: narrow straight into the caller's buffer.
			sgm::details::cast_16bit_8bit_array((const uint16_t*)d_left_disp, (uint8_t*)dst, width_ * height_);
		}
		else if (!is_cuda_output(inout_type_) && output_depth_bits_ == 16) {
			// Host, 16 bit: plain download.
			CudaSafeCall(cudaMemcpy(dst, d_left_disp, sizeof(uint16_t) * width_ * height_, cudaMemcpyDeviceToHost));
		}
		else if (is_cuda_output(inout_type_) && output_depth_bits_ == 16) {
			// Device, 16 bit: nothing to do, the result was written to dst directly (see above).
		}
		else {
			// Reached only when output_depth_bits_ is neither 8 nor 16.
			std::cerr << "not impl" << std::endl;
		}
	}
예제 #10
0
  // Creates the bulk cuRAND generator and allocates one curandState per engine
  // on the device.
  prngenerator_cuda<TFloat>::prngenerator_cuda(uint32_t num_engines) : prngenerator<TFloat>::prngenerator(num_engines) {

    CurandSafeCall(curandCreateGenerator(&_dev_bulk_prng_engine,
                                         CURAND_RNG_PSEUDO_DEFAULT));

    const size_t engine_states_bytes = _NUM_ENGINES * sizeof(curandState);
    CudaSafeCall(cudaMalloc((void **) &_dev_prng_engines, engine_states_bytes));
  }
예제 #11
0
파일: GPUData.cpp 프로젝트: rottaca/FreeCV
/**
 * Copies sizeNew bytes from host memory to the GPU buffer, re-allocating the
 * buffer first when its current size differs from sizeNew.
 *
 * @param sizeNew number of bytes to hold / upload
 * @param data    host source pointer; when NULL only the (re)allocation is
 *                performed and no copy is issued
 * @return always true; CUDA failures are reported through CudaSafeCall, like
 *         in downloadData()
 */
bool GPUData::uploadData(size_t sizeNew, const void* data) {
	if (sizeNew != size) {
		// Release old buffer and reset the bookkeeping right away, so the
		// object never refers to freed memory.
		if (gpuPtr != NULL) {
			CudaSafeCall(cudaFree(gpuPtr));
			gpuPtr = NULL;
			size = 0;
		}

		// Allocate new GPU buffer
		CudaSafeCall(cudaMalloc(&gpuPtr, sizeNew));
		size = sizeNew;
	}

	// The copy result used to be ignored; a NULL source is skipped rather than
	// handed to cudaMemcpy, where it could only fail.
	if (data != NULL && size > 0) {
		CudaSafeCall(cudaMemcpy(gpuPtr, data, size, cudaMemcpyHostToDevice));
	}
	return true;
}
예제 #12
0
// Allocates the device block of pseudo random numbers consumed per generation
// and splits it into the selection and breeding sets, then runs the base setup.
void
ga_solver_cuda<TFloat>::setup_solver()
{
  // How many random numbers each operator asks for.
  const uint32_t selection_prns = _selection_functor_ptr->required_prns(this);
  const uint32_t breeding_prns = _breed_functor_ptr->required_prns(this);

  // One contiguous device allocation serves both operators.
  _bulk_size = selection_prns + breeding_prns;
  CudaSafeCall(
    cudaMalloc((void**)&_dev_bulk_prns, _bulk_size * sizeof(TFloat)));

  // Selection numbers come first; breeding numbers follow immediately after.
  TFloat* const selection_begin = _dev_bulk_prns;
  TFloat* const breeding_begin = _dev_bulk_prns + selection_prns;

  _prn_sets = new TFloat*[2];
  _prn_sets[SELECTION_SET] = selection_begin;
  _prn_sets[BREEDING_SET] = breeding_begin;

  evolutionary_solver_cuda<TFloat>::setup_solver();
}
예제 #13
0
		/**
		 * Frees every device buffer and deletes the SGM engine. The source
		 * buffers may be NULL (device-input mode), which cudaFree accepts.
		 */
		~CudaStereoSGMResources() {
			// input staging buffers
			CudaSafeCall(cudaFree(d_src_left));
			CudaSafeCall(cudaFree(d_src_right));

			// final disparity maps
			CudaSafeCall(cudaFree(d_left_disp));
			CudaSafeCall(cudaFree(d_right_disp));

			// intermediate disparity maps
			CudaSafeCall(cudaFree(d_tmp_left_disp));
			CudaSafeCall(cudaFree(d_tmp_right_disp));

			delete sgm_engine;
		}
예제 #14
0
// Reads exactly `count` items of `item_size` bytes from `fp` into `dst`.
// On a short read prints a diagnostic naming `filename` and exits.
static void init_arrays_read_or_exit(void *dst, size_t item_size, size_t count,
				     FILE *fp, const char *filename)
{
  if (fread(dst, item_size, count, fp) != count) {
    printf("Problem with %s (file too short or read error)... Exiting...\n", filename);
    fclose(fp);
    exit(1);
  }
}

// Allocates the host (pinned) and device arrays used by the search, loads the
// time-domain data and the detector ephemeris from disk and uploads them.
//
//   arr  - receives all allocated buffers and the candidate buffer sizes
//   cu_F - receives the device F-statistic array (zero-initialized)
//   opts - command line options (data prefix, ident, band, label, white_flag)
//   sett - detector settings; crf0, sig2 and the sin/cos of phir/epsm are set here
//
// A missing or truncated input file is fatal (message + exit(1)).
void init_arrays(Arrays *arr, FLOAT_TYPE** cu_F,
		 Command_line_opts *opts, Detector_settings *sett)
{

  // Time-domain data: pinned host buffer plus device copy
  CudaSafeCall ( cudaMallocHost((void**)&arr->xDat, sizeof(double)*sett->N) );
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_xDat, sizeof(double)*sett->N) );

  // Detector ephemeris (3 values per data point): pinned host buffer plus device copy
  CudaSafeCall ( cudaMallocHost((void**)&arr->DetSSB, sizeof(double)*3*sett->N) );
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_DetSSB, sizeof(double)*3*sett->N) );

  // F-statistic array, zeroed on the device
  CudaSafeCall ( cudaMalloc((void**)cu_F, sizeof(FLOAT_TYPE)*sett->fftpad*sett->nfft) );
  CudaSafeCall ( cudaMemset(*cu_F, 0, sizeof(FLOAT_TYPE)*sett->fftpad*sett->nfft) );

  char filename[CHAR_BUFFER_SIZE];
  FILE *data;

  // Input time-domain data handling.
  // snprintf bounds the write to the buffer; "rb" because the file is binary.
  snprintf (filename, sizeof(filename), "%s/%03d/xdatc_%03d_%03d%s.bin",
	    opts->dtaprefix, opts->ident, opts->ident, opts->band, opts->label);
  if ((data = fopen (filename, "rb")) != NULL) {
    // read the data
    init_arrays_read_or_exit ((void *)(arr->xDat), sizeof (double), sett->N, data, filename);
    fclose (data);
  } else {
    perror (filename);
    printf("Problem with %s... Exiting...\n", filename);
    exit(1);
  }
  //copy to device
  CudaSafeCall ( cudaMemcpy(arr->cu_xDat, arr->xDat, sizeof(double)*sett->N, cudaMemcpyHostToDevice));


  int Nzeros=0;
  int i;
  // Checking for null values in the data
  for(i=0; i < sett->N; i++)
    if(!arr->xDat[i]) Nzeros++;

  // factor N/(N - Nzeros) to account for null values in the data
  sett->crf0 = (double)sett->N/(sett->N-Nzeros);


  //if white noise...
  if (opts->white_flag)
    sett->sig2 = sett->N*var (arr->xDat, sett->N);
  else
    sett->sig2 = -1.;

  double epsm, phir;

  /*
    ############ Ephemerides ################
  */

  // Ephemeris file handling
  snprintf (filename, sizeof(filename), "%s/%03d/DetSSB.bin", opts->dtaprefix, opts->ident);
  if ((data = fopen (filename, "rb")) != NULL) {
    // Detector position w.r.t solar system baricenter
    // for every datapoint
    init_arrays_read_or_exit ((void *)(arr->DetSSB), sizeof (double), 3*sett->N, data, filename);
    // Deterministic phase defining the position of the Earth
    // in its diurnal motion at t=0
    init_arrays_read_or_exit ((void *)(&phir), sizeof (double), 1, data, filename);
    // Earth's axis inclination to the ecliptic at t=0
    init_arrays_read_or_exit ((void *)(&epsm), sizeof (double), 1, data, filename);
    fclose (data);
  } else {
    perror (filename);
    printf("Problem with %s... Exiting...\n", filename);
    exit(1);
  }

  //copy DetSSB to device
  CudaSafeCall ( cudaMemcpy(arr->cu_DetSSB, arr->DetSSB, sizeof(double)*sett->N*3, cudaMemcpyHostToDevice));


  /*
    ############ Sincos ################
  */


  sett->sphir = sin (phir);
  sett->cphir = cos (phir);
  sett->sepsm = sin (epsm);
  sett->cepsm = cos (epsm);

  //misc. arrays (pinned host memory plus device counterparts)
  CudaSafeCall ( cudaMallocHost((void**)&arr->aa, sizeof(double)*sett->N) );
  CudaSafeCall ( cudaMallocHost((void**)&arr->bb, sizeof(double)*sett->N) );
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_aa, sizeof(double)*sett->nfft));
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_bb, sizeof(double)*sett->nfft));

  CudaSafeCall ( cudaMalloc((void**)&arr->cu_shft, sizeof(double)*sett->N));
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_shftf, sizeof(double)*sett->N));
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_tshift, sizeof(double)*sett->N));

  //for splines
  init_spline_matrices(&arr->cu_d,
		       &arr->cu_dl,
		       &arr->cu_du,
		       &arr->cu_B,
		       sett->Ninterp);

  arr->cand_params_size = (sett->nmax - sett->nmin);
  arr->cand_buffer_size = (sett->nmax - sett->nmin)*CANDIDATE_BUFFER_SCALE;

  //parameters of found signal

  CudaSafeCall (cudaMalloc((void**)&arr->cu_cand_params, sizeof(FLOAT_TYPE)*arr->cand_params_size));
  CudaSafeCall (cudaMalloc((void**)&arr->cu_cand_buffer, sizeof(FLOAT_TYPE)*arr->cand_buffer_size));
  CudaSafeCall (cudaMalloc((void**)&arr->cu_cand_count, sizeof(int)));

  // host-side (pinned) candidate buffer
  CudaSafeCall ( cudaMallocHost((void**)&arr->cand_buffer, sizeof(FLOAT_TYPE)*arr->cand_buffer_size) );
}
예제 #15
0
// Resets the current CUDA device.
void Application::_deInit()
{
    CudaSafeCall( cudaDeviceReset() );
}
예제 #16
0
// Releases the host and device resources of the search.
//
//   sett    - detector settings; sett->M is freed
//   opts    - command line options; fftinterp decides whether the interbinning
//             buffers exist
//   s_range - not used here
//   arr     - holder of the host/device arrays to release
//   plans   - cuFFT plans to destroy (may be NULL)
//   amod    - not used here
//   cu_F    - device F-statistic array
void cleanup(Detector_settings *sett,
	     Command_line_opts *opts,
	     Search_range *s_range,
	     Arrays *arr,
	     FFT_plans *plans,
	     Ampl_mod_coeff *amod,
	     FLOAT_TYPE *cu_F)
{

  // Buffers obtained with cudaMallocHost must be released with cudaFreeHost.
  CudaSafeCall( cudaFreeHost(arr->xDat) );
  CudaSafeCall( cudaFreeHost(arr->DetSSB) );
  CudaSafeCall( cudaFreeHost(arr->aa) );
  CudaSafeCall( cudaFreeHost(arr->bb) );
  CudaSafeCall( cudaFreeHost(arr->cand_buffer) );

  // Plain heap buffers.
  free(arr->sinmodf);
  free(arr->cosmodf);

  // The cuFFT plans created in plan_fft() were previously never destroyed.
  // Results are ignored on purpose: a plan that was never created only yields
  // an error code here.
  if (plans != NULL) {
    (void) cufftDestroy(plans->plan);
    (void) cufftDestroy(plans->pl_int);
    (void) cufftDestroy(plans->pl_inv);
  }

  // Device buffers (return codes left unchecked, as before).
  cudaFree(arr->cu_xa);
  cudaFree(arr->cu_xb);
  cudaFree(arr->cu_xar);
  cudaFree(arr->cu_xbr);
  cudaFree(arr->cu_xa_f);
  cudaFree(arr->cu_xb_f);
  cudaFree(arr->cu_xar_f);
  cudaFree(arr->cu_xbr_f);
  cudaFree(arr->cu_xDat);

  cudaFree(arr->cu_aa);
  cudaFree(arr->cu_bb);

  cudaFree(arr->cu_shft);
  cudaFree(arr->cu_shftf);
  cudaFree(arr->cu_tshift);
  cudaFree(arr->cu_DetSSB);

  cudaFree(arr->cu_d);
  cudaFree(arr->cu_dl);
  cudaFree(arr->cu_du);
  cudaFree(arr->cu_B);

  cudaFree(cu_F);

  cudaFree(arr->cu_sinmodf);
  cudaFree(arr->cu_cosmodf);

  cudaFree(arr->cu_cand_buffer);
  cudaFree(arr->cu_cand_params);
  cudaFree(arr->cu_cand_count);

  free(sett->M);

  if (opts->fftinterp == INT ) {//interbinning
    // allocated only in interbinning mode
    cudaFree(arr->cu_xa2_f);
    cudaFree(arr->cu_xb2_f);
  }

}
예제 #17
0
/**
 * Applies the FIR filter to all channels on the GPU.
 *
 * Uploads input, previous samples and coefficients, launches cuFirFilter(),
 * downloads the result into `output` and frees all device buffers again.
 *
 * \param output    host destination, channels rows of `samples` values
 * \param input     host source, channels rows of `samples` values
 * \param previous  host source, channels rows of `coeffSize` values
 * \param channels  number of rows
 * \param samples   number of values per row in input/output
 * \param coeffs    host filter coefficients, `coeffSize` values
 * \param coeffSize number of coefficients
 *
 * \return the elapsed time reported by cudaEventElapsedTime() for the kernel
 *
 * \throws WLBadAllocException if allocating or uploading the device buffers fails
 * \throws WException if the kernel launch or the download of the result fails
 *
 * NOTE(review): host row pitches are computed with sizeof( CuScalarT ) although
 * the host pointers are WLEMData::ScalarT - assumes both types have the same
 * size; TODO confirm.
 */
float WFIRFilterCuda::cudaFilter( WLEMData::ScalarT* const output, const WLEMData::ScalarT* const input,
                const WLEMData::ScalarT* const previous, size_t channels, size_t samples, const WLEMData::ScalarT* const coeffs,
                size_t coeffSize )
{
    // All device pointers start as NULL so the error path can tell which ones
    // were actually allocated.
    CuScalarT *dev_in = NULL;
    size_t pitchIn;

    CuScalarT *dev_prev = NULL;
    size_t pitchPrev;

    CuScalarT *dev_out = NULL;
    size_t pitchOut;

    CuScalarT *dev_co = NULL;

    // Phase 1: allocate device memory and upload the inputs.
    try
    {
        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_in, &pitchIn, samples * sizeof( CuScalarT ), channels ) );
        CudaThrowsCall(
                        cudaMemcpy2D( dev_in, pitchIn, input, samples * sizeof( CuScalarT ), samples * sizeof( CuScalarT ),
                                        channels, cudaMemcpyHostToDevice ) );

        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_prev, &pitchPrev, coeffSize * sizeof( CuScalarT ), channels ) );
        CudaThrowsCall(
                        cudaMemcpy2D( dev_prev, pitchPrev, previous, coeffSize * sizeof( CuScalarT ),
                                        coeffSize * sizeof( CuScalarT ), channels, cudaMemcpyHostToDevice ) );

        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_out, &pitchOut, samples * sizeof( CuScalarT ), channels ) );

        CudaThrowsCall( cudaMalloc( ( void** )&dev_co, coeffSize * sizeof( CuScalarT ) ) );
        CudaThrowsCall( cudaMemcpy( dev_co, coeffs, coeffSize * sizeof( CuScalarT ), cudaMemcpyHostToDevice ) );
    }
    catch( const WException& e )
    {
        // Log, release whatever was allocated so far, and report the failure
        // as an allocation problem.
        wlog::error( CLASS ) << e.what();
        if( dev_in )
        {
            CudaSafeCall( cudaFree( ( void* )dev_in ) );
        }
        if( dev_prev )
        {
            CudaSafeCall( cudaFree( ( void* )dev_prev ) );
        }
        if( dev_out )
        {
            CudaSafeCall( cudaFree( ( void* )dev_out ) );
        }
        if( dev_co )
        {
            CudaSafeCall( cudaFree( ( void* )dev_co ) );
        }
        throw WLBadAllocException( "Could not allocate CUDA memory!" );
    }

    // Phase 2: launch configuration - 32 threads per block, enough blocks to
    // cover all samples, and one CuScalarT of shared memory per coefficient.
    size_t threadsPerBlock = 32;
    size_t blocksPerGrid = ( samples + threadsPerBlock - 1 ) / threadsPerBlock;
    size_t sharedMem = coeffSize * sizeof( CuScalarT );

    // NOTE(review): the results of the cudaEvent* calls below are not checked;
    // if cudaEventElapsedTime() fails, elapsedTime stays uninitialized.
    cudaEvent_t start, stop;
    cudaEventCreate( &start );
    cudaEventCreate( &stop );

    cudaEventRecord( start, 0 );
    cuFirFilter( blocksPerGrid, threadsPerBlock, sharedMem, dev_out, dev_in, dev_prev, channels, samples, dev_co, coeffSize,
                    pitchOut, pitchIn, pitchPrev );
    // Fetch the launch status right away; it is evaluated after the timing.
    cudaError_t kernelError = cudaGetLastError();

    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );

    float elapsedTime;
    cudaEventElapsedTime( &elapsedTime, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );

    // Phase 3: check the kernel and download the result. Errors are only
    // recorded (elapsedTime = -1.0) so the device buffers below are always freed.
    try
    {
        if( kernelError != cudaSuccess )
        {
            const std::string err( cudaGetErrorString( kernelError ) );
            throw WException( "CUDA kernel failed: " + err );
        }
        CudaThrowsCall(
                        cudaMemcpy2D( output, samples * sizeof( CuScalarT ), dev_out, pitchOut, samples * sizeof( CuScalarT ),
                                        channels, cudaMemcpyDeviceToHost ) );
    }
    catch( const WException& e )
    {
        wlog::error( CLASS ) << e.what();
        elapsedTime = -1.0;
    }

    // Phase 4: release device memory on both the success and the failure path.
    CudaSafeCall( cudaFree( ( void* )dev_in ) );
    CudaSafeCall( cudaFree( ( void* )dev_prev ) );
    CudaSafeCall( cudaFree( ( void* )dev_out ) );
    CudaSafeCall( cudaFree( ( void* )dev_co ) );

    // -1.0 is the failure marker set in phase 3.
    if( elapsedTime > -1.0 )
    {
        return elapsedTime;
    }
    else
    {
        throw WException( "Error in cudaFilter()" );
    }
}
예제 #18
0
// Releases what setup_solver() allocated: the host table of PRN set pointers
// and the device block of bulk random numbers. Both pointers are cleared
// afterwards so that a repeated teardown is harmless (delete[] and cudaFree
// both accept a null pointer) instead of a double free.
void
ga_solver_cuda<TFloat>::teardown_solver()
{
  delete[] _prn_sets;
  _prn_sets = NULL;

  CudaSafeCall(cudaFree(_dev_bulk_prns));
  _dev_bulk_prns = NULL;
}
예제 #19
0
// Frees the device arrays allocated by the constructor.
ga_solver_cuda<TFloat>::~ga_solver_cuda()
{
  // couples index array
  CudaSafeCall(cudaFree(_dev_couples_idx_array));

  // candidates reservoir
  CudaSafeCall(cudaFree(_dev_candidates_reservoir_array));
}
예제 #20
0
 // Destroys the bulk cuRAND generator and frees the device engine states.
 prngenerator_cuda<TFloat>::~prngenerator_cuda()
 {
   // bulk generator handle
   CurandSafeCall(curandDestroyGenerator(_dev_bulk_prng_engine));

   // per-engine curandState array
   CudaSafeCall(cudaFree(_dev_prng_engines));
 }
예제 #21
0
  /**
   * Copies genome data between host and device, converting between the two
   * memory layouts when the copy crosses the host/device boundary.
   *
   * Host layout:   index = (isle * AGENTS + agent) * DIMENSIONS + dimension
   * Device layout: index = dimension * ISLES * AGENTS + (isle * AGENTS + agent)
   *
   * @param dst_data  destination buffer (host or device, per copy_type)
   * @param src_data  source buffer (host or device, per copy_type)
   * @param elements  number of TFloat elements transferred by the copy
   * @param copy_type direction of the copy
   *
   * The layout conversion always walks ISLES * AGENTS * DIMENSIONS genes, so
   * callers are expected to pass that many elements for the cross-boundary
   * kinds.
   */
  void population_set_cuda<TFloat>::gen_cpy(TFloat * dst_data,
                                            const TFloat * src_data,
                                            size_t elements,
                                            GenomeCopyKind copy_type)
  {
    const size_t bytes_2_copy = elements * sizeof(TFloat);

    // Index arithmetic is done in size_t so large populations cannot wrap
    // around 32 bits.
    const size_t isles = _ISLES;
    const size_t agents = _AGENTS;
    const size_t dimensions = _DIMENSIONS;
    const size_t total_genes = isles * agents * dimensions;

    // The staging buffer is sized in elements (it used to be sized in bytes,
    // over-allocating by a factor of sizeof(TFloat)). It covers both the copy
    // and the layout walk, so neither can run past its end.
    const size_t buffer_len = (elements > total_genes) ? elements : total_genes;

    TFloat * store_buffer = NULL;

    switch(copy_type)
      {
      case GencpyHostToHost:
        memcpy(dst_data, src_data, bytes_2_copy);
        break;
      case GencpyDeviceToDevice:
        CudaSafeCall(cudaMemcpy(dst_data, src_data, bytes_2_copy, cudaMemcpyDeviceToDevice));
        break;
      case GencpyDeviceToHost:
        store_buffer = new TFloat[buffer_len];
        // Copy data from CUDA into the temporary buffer.
        CudaSafeCall(cudaMemcpy(store_buffer, src_data, bytes_2_copy, cudaMemcpyDeviceToHost));
        // Rearrange genomes into the CPU scheme.
        for (size_t i = 0; i < isles; ++i)
          {
            for(size_t j = 0; j < agents; ++j)
              {
                const size_t locus_offset = i * agents + j;
                for(size_t k = 0; k < dimensions; ++k)
                  {
                    const size_t cpu_idx = locus_offset * dimensions + k;
                    const size_t cuda_idx = k * isles * agents + locus_offset;
                    dst_data[cpu_idx] = store_buffer[cuda_idx];
                  }
              }
          }

        delete [] store_buffer;
        break;
      case GencpyHostToDevice:
        store_buffer = new TFloat[buffer_len];
        // Rearrange genomes into the CUDA scheme in the temporary buffer.
        for (size_t i = 0; i < isles; ++i)
          {
            for(size_t j = 0; j < agents; ++j)
              {
                const size_t locus_offset = i * agents + j;
                for(size_t k = 0; k < dimensions; ++k)
                  {
                    const size_t cpu_idx = locus_offset * dimensions + k;
                    const size_t cuda_idx = k * isles * agents + locus_offset;
                    store_buffer[cuda_idx] = src_data[cpu_idx];
                  }
              }
          }
        // Copy the rearranged buffer into CUDA.
        CudaSafeCall(cudaMemcpy(dst_data, store_buffer, bytes_2_copy, cudaMemcpyHostToDevice));

        delete [] store_buffer;
        break;
      }
  }
예제 #22
0
 // Frees the device arrays allocated by the constructor.
 population_set_cuda<TFloat>::~population_set_cuda()
 {
   // gene data (raw and transformed)
   CudaSafeCall(cudaFree(_dev_data_array));
   CudaSafeCall(cudaFree(_dev_transformed_data_array));

   // per-agent fitness values
   CudaSafeCall(cudaFree(_dev_fitness_array));
 }
예제 #23
0
파일: GPUData.cpp 프로젝트: rottaca/FreeCV
// Copies the whole GPU buffer (size bytes) into the host memory at data.
void GPUData::downloadData(void* data)
{
	CudaSafeCall(cudaMemcpy(data,
	                        gpuPtr,
	                        size,
	                        cudaMemcpyDeviceToHost));
}