/**
 * Creates the SGM engine matching the requested pixel depth / disparity range
 * and allocates the device buffers used by the stereo pipeline.
 *
 * @param width_             image width; buffers are sized from width_ * height_
 * @param height_            image height
 * @param disparity_size_    disparity range; only 64 and 128 are accepted
 * @param input_depth_bits_  bits per input pixel; only 8 and 16 are accepted
 * @param output_depth_bits_ not used by this constructor
 * @param inout_type_        passed to is_cuda_input() to decide whether source
 *                           staging buffers have to be allocated
 * @throws std::logic_error  for any other depth / disparity combination
 */
CudaStereoSGMResources(int width_, int height_, int disparity_size_, int input_depth_bits_, int output_depth_bits_, EXECUTE_INOUT inout_type_) {
    if (input_depth_bits_ == 8 && disparity_size_ == 64)
        sgm_engine = new SemiGlobalMatchingImpl<uint8_t, 64>();
    else if (input_depth_bits_ == 8 && disparity_size_ == 128)
        sgm_engine = new SemiGlobalMatchingImpl<uint8_t, 128>();
    else if (input_depth_bits_ == 16 && disparity_size_ == 64)
        sgm_engine = new SemiGlobalMatchingImpl<uint16_t, 64>();
    else if (input_depth_bits_ == 16 && disparity_size_ == 128)
        sgm_engine = new SemiGlobalMatchingImpl<uint16_t, 128>();
    else
        throw std::logic_error("depth bits must be 8 or 16, and disparity size must be 64 or 128");

    // Byte counts are computed in size_t: the previous int expression
    // (bits / 8 * width * height) could overflow for very large images.
    const size_t pixel_count = static_cast<size_t>(width_) * static_cast<size_t>(height_);
    const size_t src_bytes = static_cast<size_t>(input_depth_bits_ / 8) * pixel_count;
    const size_t disp_bytes = sizeof(uint16_t) * pixel_count;

    if (is_cuda_input(inout_type_)) {
        // No staging buffers are allocated for this input type.
        this->d_src_left = NULL;
        this->d_src_right = NULL;
    }
    else {
        CudaSafeCall(cudaMalloc(&this->d_src_left, src_bytes));
        CudaSafeCall(cudaMalloc(&this->d_src_right, src_bytes));
    }

    // Disparity maps and their temporaries hold one uint16_t per pixel.
    CudaSafeCall(cudaMalloc(&this->d_left_disp, disp_bytes));
    CudaSafeCall(cudaMalloc(&this->d_right_disp, disp_bytes));
    CudaSafeCall(cudaMalloc(&this->d_tmp_left_disp, disp_bytes));
    CudaSafeCall(cudaMalloc(&this->d_tmp_right_disp, disp_bytes));

    CudaSafeCall(cudaMemset(this->d_left_disp, 0, disp_bytes));
    CudaSafeCall(cudaMemset(this->d_right_disp, 0, disp_bytes));
    CudaSafeCall(cudaMemset(this->d_tmp_left_disp, 0, disp_bytes));
    CudaSafeCall(cudaMemset(this->d_tmp_right_disp, 0, disp_bytes));
}
/**
 * Forwards every argument to the evolutionary_solver_cuda base, applies the
 * default GA parameters and allocates the device arrays owned by this solver.
 */
ga_solver_cuda<TFloat>::ga_solver_cuda(population_set_cuda<TFloat>* population, evaluator_cuda<TFloat>* evaluator, prngenerator_cuda<TFloat>* prn_generator, uint32_t generation_target, TFloat* upper_bounds, TFloat* lower_bounds)
    : evolutionary_solver_cuda<TFloat>(population, evaluator, prn_generator, generation_target, upper_bounds, lower_bounds)
{
    // Default migration parameters.
    _migration_step = 0;
    _migration_size = 1;
    _migration_selection_size = 2;

    // Default selection parameters.
    _selection_size = 2;
    _selection_stochastic_factor = 0;

    // Default breeding parameters.
    _crossover_rate = 0.9;
    _mutation_rate = 0.1;

    // Device arrays: one couple index per agent, plus a candidates reservoir
    // of _ISLES * _AGENTS * _AGENTS entries.
    const size_t agent_total = _population->_TOTAL_AGENTS;
    const size_t couples_bytes = agent_total * sizeof(uint32_t);
    const size_t reservoir_bytes = _ISLES * _AGENTS * _AGENTS * sizeof(uint32_t);

    CudaSafeCall(cudaMalloc((void**)&(_dev_couples_idx_array), couples_bytes));
    CudaSafeCall(cudaMalloc((void**)&(_dev_candidates_reservoir_array), reservoir_bytes));
}
/**
 * Wraps the given VTK opacity / colour transfer functions, builds the
 * composite lookup table and uploads it to a CUDA array that is exposed
 * through the texture object compositeTex.
 *
 * @param otf    opacity transfer function
 * @param ctf    colour transfer function
 * @param parent QObject parent forwarded to the base class
 */
TransferFunction::TransferFunction(vtkSmartPointer<vtkPiecewiseFunction> otf, vtkSmartPointer<vtkColorTransferFunction> ctf, QObject *parent)
    : QObject(parent)
{
    // No texture object exists until the end of this constructor.
    compositeTex = 0;

    opacityTF = otf;
    colorTF = ctf;

    // ctk wrappers around the VTK functions; their changed() signals drive
    // the two slots connected below.
    QSharedPointer<ctkTransferFunction> opacityWrapper(new ctkVTKPiecewiseFunction(opacityTF));
    QSharedPointer<ctkTransferFunction> colorWrapper(new ctkVTKColorTransferFunction(colorTF));
    this->otf = opacityWrapper;
    this->ctf = colorWrapper;

    connect(this->otf.data(), SIGNAL(changed()), this, SLOT(onOpacityTFChanged()));
    connect(this->ctf.data(), SIGNAL(changed()), this, SLOT(onColorTFChanged()));

    // Sample both functions over [0, 1] and merge them into compositeTable.
    opacityTF->GetTable(0.0, 1.0, TABLE_SIZE, opacityTable);
    colorTF->GetTable(0.0, 1.0, TABLE_SIZE, colorTable);
    CompositeTable();

    // Four 32-bit float components per entry, TABLE_SIZE entries.
    channelDesc = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat);
    CudaSafeCall(cudaMallocArray(&array, &channelDesc, TABLE_SIZE));
    CudaSafeCall(cudaMemcpyToArray(array, 0, 0, compositeTable, sizeof(float) * TABLE_SIZE * 4, cudaMemcpyHostToDevice));

    // Resource descriptor: the texture reads from the CUDA array.
    memset(&resourceDesc, 0, sizeof(resourceDesc));
    resourceDesc.resType = cudaResourceTypeArray;
    resourceDesc.res.array.array = array;

    // Texture descriptor: clamped, linearly filtered, normalized coordinates,
    // elements returned as stored.
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = cudaAddressModeClamp;
    texDesc.filterMode = cudaFilterModeLinear;
    texDesc.normalizedCoords = true;
    texDesc.readMode = cudaReadModeElementType;

    CudaSafeCall(cudaCreateTextureObject(&compositeTex, &resourceDesc, &texDesc, NULL));
}
/**
 * Destroys the texture object (when one exists) and then frees the CUDA
 * array it was created on.
 */
TransferFunction::~TransferFunction()
{
    if (compositeTex != 0) {
        CudaSafeCall(cudaDestroyTextureObject(compositeTex));
    }
    CudaSafeCall(cudaFreeArray(array));
}
/**
 * Slot run when the colour transfer function changes: re-samples the colour
 * table, refreshes the first three components of every composite entry,
 * re-uploads the table, recreates the texture object and calls Changed().
 */
void TransferFunction::onColorTFChanged()
{
    // Drop the current texture object before its backing array is rewritten.
    if (compositeTex) {
        CudaSafeCall(cudaDestroyTextureObject(compositeTex));
        compositeTex = 0;
    }

    colorTF->GetTable(0.0, 1.0, TABLE_SIZE, colorTable);

    // colorTable holds 3 components per entry, compositeTable 4. The fourth
    // component of each composite entry is deliberately left untouched
    // (presumably the opacity channel -- confirm against CompositeTable()).
    for (size_t entry = 0; entry < TABLE_SIZE; ++entry) {
        const size_t dst = entry * 4;
        const size_t src = entry * 3;
        compositeTable[dst]     = colorTable[src];
        compositeTable[dst + 1] = colorTable[src + 1];
        compositeTable[dst + 2] = colorTable[src + 2];
    }

    CudaSafeCall(cudaMemcpyToArray(array, 0, 0, compositeTable, sizeof(float) * TABLE_SIZE * 4, cudaMemcpyHostToDevice));
    CudaSafeCall(cudaCreateTextureObject(&compositeTex, &resourceDesc, &texDesc, NULL));

    Changed();
}
/* Terminates the program with a diagnostic when a cuFFT plan could not be
 * created; using such a plan later would only fail further away from the
 * actual cause. */
static void plan_fft_check(cufftResult status, const char *plan_name) {
  if (status != CUFFT_SUCCESS) {
    fprintf(stderr, "Problem with cuFFT plan %s (cufftResult %d)... Exiting...\n",
            plan_name, (int)status);
    exit(1);
  }
}

/*
 * Allocates the device work arrays used by the FFT stage and creates the
 * cuFFT plans.
 *
 *   plans - receives the plan handles (plan, pl_int, pl_inv)
 *   arr   - receives the device pointers and arr_len
 *   sett  - supplies nfft, fftpad and Ninterp; nfftf is written here
 *   opts  - fftinterp selects interbinning (INT) or FFT with zero padding
 */
void plan_fft(FFT_plans *plans, Arrays *arr, Detector_settings *sett, Command_line_opts *opts) {

  /* ############ FFT Plans ################ */

  // arr_len is the maximum of Ninterp and fftpad*nfft
  arr->arr_len = (sett->fftpad * sett->nfft > sett->Ninterp
                  ? sett->fftpad * sett->nfft
                  : sett->Ninterp);

  // double-precision complex work arrays
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_xa, arr->arr_len*sizeof(cufftDoubleComplex)) );
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_xb, arr->arr_len*sizeof(cufftDoubleComplex)) );
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_xar, arr->arr_len*sizeof(cufftDoubleComplex)) );
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_xbr, arr->arr_len*sizeof(cufftDoubleComplex)) );

  // COMPLEX_TYPE counterparts of the arrays above
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_xa_f, arr->arr_len*sizeof(COMPLEX_TYPE)) );
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_xb_f, arr->arr_len*sizeof(COMPLEX_TYPE)) );
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_xar_f, arr->arr_len*sizeof(COMPLEX_TYPE)) );
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_xbr_f, arr->arr_len*sizeof(COMPLEX_TYPE)) );

  sett->nfftf = sett->fftpad*sett->nfft;

  if (opts->fftinterp == INT) { // interbinning
    CudaSafeCall ( cudaMalloc((void**)&arr->cu_xa2_f, sett->nfft*sizeof(COMPLEX_TYPE)) );
    CudaSafeCall ( cudaMalloc((void**)&arr->cu_xb2_f, sett->nfft*sizeof(COMPLEX_TYPE)) );
    plan_fft_check( cufftPlan1d( &(plans->plan), sett->nfft, CUFFT_TRANSFORM_TYPE, 1), "plan" );
  } else { // fft & zero padding
    plan_fft_check( cufftPlan1d( &(plans->plan), sett->nfftf, CUFFT_TRANSFORM_TYPE, 1), "plan" );
  }

  // plans for interpolation with splines
  plan_fft_check( cufftPlan1d( &(plans->pl_int), sett->nfft, CUFFT_Z2Z, 1), "pl_int" );
  plan_fft_check( cufftPlan1d( &(plans->pl_inv), sett->Ninterp, CUFFT_Z2Z, 1), "pl_inv" );

  /* ############ FFT plans end ################ */
}
/**
 * Forwards the population dimensions to the population_set base and
 * allocates the device arrays: two of _TOTAL_GENES values (data and
 * transformed data) and one of _TOTAL_AGENTS values (fitness).
 */
population_set_cuda<TFloat>::population_set_cuda (const uint32_t ISLES, const uint32_t AGENTS, const uint32_t DIMENSIONS)
    : population_set<TFloat>(ISLES, AGENTS, DIMENSIONS)
{
    const size_t genome_bytes = _TOTAL_GENES * sizeof(TFloat);
    const size_t fitness_bytes = _TOTAL_AGENTS * sizeof(TFloat);

    CudaSafeCall(cudaMalloc(reinterpret_cast<void **>(&_dev_data_array), genome_bytes));
    CudaSafeCall(cudaMalloc(reinterpret_cast<void **>(&_dev_transformed_data_array), genome_bytes));
    CudaSafeCall(cudaMalloc(reinterpret_cast<void **>(&_dev_fitness_array), fitness_bytes));
}
/**
 * Makes the device reported by cutGetMaxGflopsDeviceId() current and asks
 * the runtime to prefer shared memory in the device cache configuration.
 */
void Application::_init()
{
    const int bestDevice = cutGetMaxGflopsDeviceId();
    CudaSafeCall( cudaSetDevice( bestDevice ) );

    CudaSafeCall( cudaDeviceSetCacheConfig( cudaFuncCachePreferShared ) );
}
/**
 * Runs the stereo pipeline on one image pair: SGM engine, median filter on
 * both disparity maps, consistency check, then delivery of the left
 * disparity map to dst.
 *
 * @param left_pixels  left image; used directly when is_cuda_input() is true
 *                     for inout_type_, otherwise copied host -> device first
 * @param right_pixels right image, handled the same way as left_pixels
 * @param dst          receives the left disparity map; written through a
 *                     device-to-host copy unless is_cuda_output() is true,
 *                     in which case it is used as a device buffer
 */
void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst) {
    const bool device_input = is_cuda_input(inout_type_);
    const bool device_output = is_cuda_output(inout_type_);

    // Computed in size_t: the former int expression
    // (bits / 8 * width * height) could overflow for very large images.
    const size_t pixel_count = static_cast<size_t>(width_) * static_cast<size_t>(height_);

    const void* d_input_left = left_pixels;
    const void* d_input_right = right_pixels;
    if (!device_input) {
        const size_t src_bytes = static_cast<size_t>(input_depth_bits_ / 8) * pixel_count;
        CudaSafeCall(cudaMemcpy(cu_res_->d_src_left, left_pixels, src_bytes, cudaMemcpyHostToDevice));
        CudaSafeCall(cudaMemcpy(cu_res_->d_src_right, right_pixels, src_bytes, cudaMemcpyHostToDevice));
        d_input_left = cu_res_->d_src_left;
        d_input_right = cu_res_->d_src_right;
    }

    void* d_tmp_left_disp = cu_res_->d_tmp_left_disp;
    void* d_tmp_right_disp = cu_res_->d_tmp_right_disp;
    void* d_left_disp = cu_res_->d_left_disp;
    void* d_right_disp = cu_res_->d_right_disp;

    // When there is neither a device-to-host copy nor a type conversion,
    // write the result straight into the caller's buffer.
    if (device_output && output_depth_bits_ == 16)
        d_left_disp = dst;

    cu_res_->sgm_engine->execute((uint16_t*)d_tmp_left_disp, (uint16_t*)d_tmp_right_disp,
        d_input_left, d_input_right, width_, height_,
        param_.P1, param_.P2, param_.uniqueness, param_.subpixel);

    sgm::details::median_filter((uint16_t*)d_tmp_left_disp, (uint16_t*)d_left_disp, width_, height_);
    sgm::details::median_filter((uint16_t*)d_tmp_right_disp, (uint16_t*)d_right_disp, width_, height_);
    sgm::details::check_consistency((uint16_t*)d_left_disp, (uint16_t*)d_right_disp, d_input_left,
        width_, height_, input_depth_bits_, param_.subpixel);

    switch (output_depth_bits_) {
    case 8:
        if (device_output) {
            // Convert directly into the caller's device buffer.
            sgm::details::cast_16bit_8bit_array((const uint16_t*)d_left_disp, (uint8_t*)dst, width_ * height_);
        }
        else {
            // The temporary left buffer is reused as the 8-bit staging area.
            sgm::details::cast_16bit_8bit_array((const uint16_t*)d_left_disp, (uint8_t*)d_tmp_left_disp, width_ * height_);
            CudaSafeCall(cudaMemcpy(dst, d_tmp_left_disp, sizeof(uint8_t) * pixel_count, cudaMemcpyDeviceToHost));
        }
        break;
    case 16:
        if (!device_output) {
            CudaSafeCall(cudaMemcpy(dst, d_left_disp, sizeof(uint16_t) * pixel_count, cudaMemcpyDeviceToHost));
        }
        // else: the result was already produced in dst, nothing to copy.
        break;
    default:
        std::cerr << "not impl" << std::endl;
        break;
    }
}
/**
 * Forwards num_engines to the prngenerator base, creates the cuRAND bulk
 * generator and allocates one curandState per engine on the device.
 */
prngenerator_cuda<TFloat>::prngenerator_cuda(uint32_t num_engines) : prngenerator<TFloat>::prngenerator(num_engines)
{
    // Bulk generator using cuRAND's default pseudo-random algorithm.
    CurandSafeCall(curandCreateGenerator(&(_dev_bulk_prng_engine), CURAND_RNG_PSEUDO_DEFAULT));

    // Per-engine state storage.
    const size_t state_bytes = _NUM_ENGINES * sizeof(curandState);
    CudaSafeCall(cudaMalloc((void **) &(_dev_prng_engines), state_bytes));
}
/**
 * Copies sizeNew bytes from host memory into the GPU buffer, reallocating
 * the buffer first when the requested size differs from the current one.
 *
 * Every CUDA call is routed through CudaSafeCall; previously the results of
 * cudaFree and cudaMemcpy were ignored and the function returned true even
 * when the upload had failed.
 *
 * @param sizeNew number of bytes to upload
 * @param data    host source buffer of at least sizeNew bytes
 * @return        always true; failures are reported by CudaSafeCall
 */
bool GPUData::uploadData(size_t sizeNew, const void* data)
{
    if (sizeNew != size)
    {
        // Release the old buffer and forget it, so the object never keeps a
        // dangling pointer or a stale size if the allocation below fails.
        if (gpuPtr != NULL)
        {
            CudaSafeCall(cudaFree(gpuPtr));
            gpuPtr = NULL;
            size = 0;
        }

        // Allocate the new GPU buffer.
        CudaSafeCall(cudaMalloc(&gpuPtr, sizeNew));
        size = sizeNew;
    }

    CudaSafeCall(cudaMemcpy(gpuPtr, data, size, cudaMemcpyHostToDevice));
    return true;
}
/**
 * Allocates one device block of pseudo-random numbers sized by the selection
 * and breeding functors, records where each of the two sets starts inside
 * it, then delegates to evolutionary_solver_cuda::setup_solver().
 */
void ga_solver_cuda<TFloat>::setup_solver()
{
    // How many numbers each stage asks for.
    const uint32_t selection_prns = _selection_functor_ptr->required_prns(this);
    const uint32_t breeding_prns = _breed_functor_ptr->required_prns(this);

    _bulk_size = selection_prns + breeding_prns;
    CudaSafeCall(cudaMalloc((void**)&(_dev_bulk_prns), _bulk_size * sizeof(TFloat)));

    // The selection set starts at the beginning of the block, the breeding
    // set right after the selection numbers.
    _prn_sets = new TFloat*[2];
    _prn_sets[SELECTION_SET] = _dev_bulk_prns;
    _prn_sets[BREEDING_SET] = _dev_bulk_prns + selection_prns;

    evolutionary_solver_cuda<TFloat>::setup_solver();
}
/**
 * Frees the six device buffers held by this object (source, disparity and
 * temporary disparity, left and right) and then deletes the SGM engine.
 */
~CudaStereoSGMResources() {
    void* const device_buffers[] = {
        this->d_src_left,
        this->d_src_right,
        this->d_left_disp,
        this->d_right_disp,
        this->d_tmp_left_disp,
        this->d_tmp_right_disp
    };
    const size_t buffer_count = sizeof(device_buffers) / sizeof(device_buffers[0]);

    for (size_t idx = 0; idx < buffer_count; ++idx) {
        CudaSafeCall(cudaFree(device_buffers[idx]));
    }

    delete sgm_engine;
}
/* Reads exactly `count` doubles from `stream` into `dst`. A short read is
 * reported against `filename` and terminates the program, because the
 * destination would otherwise keep uninitialised values. */
static void read_doubles_or_exit(double *dst, size_t count, FILE *stream, const char *filename) {
  if (fread((void *)dst, sizeof (double), count, stream) != count) {
    fclose (stream);
    printf("Problem while reading %s... Exiting...\n", filename);
    exit(1);
  }
}

/*
 * Allocates the host and device arrays used by the search, loads the
 * time-domain data and the detector ephemeris from disk, uploads both to
 * the device and fills the derived fields of `sett`.
 *
 *   arr  - receives all host/device pointers and the candidate buffer sizes
 *   cu_F - receives the device F-statistic array (zero-initialised)
 *   opts - supplies dtaprefix, ident, band, label and white_flag
 *   sett - supplies N, nfft, fftpad, Ninterp, nmin, nmax; receives crf0,
 *          sig2, sphir, cphir, sepsm, cepsm
 *
 * Exits with status 1 when an input file cannot be opened or is too short.
 */
void init_arrays(Arrays *arr, FLOAT_TYPE** cu_F, Command_line_opts *opts, Detector_settings *sett) {

  // Data and detector ephemeris: host buffers from cudaMallocHost plus
  // device copies. These host buffers are NOT zero-initialised.
  CudaSafeCall( cudaMallocHost((void**)&arr->xDat, sizeof(double)*sett->N));
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_xDat, sizeof(double)*sett->N));

  CudaSafeCall( cudaMallocHost((void**)&arr->DetSSB, sizeof(double)*3*sett->N) );
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_DetSSB, sizeof(double)*3*sett->N));

  // F-statistic array on the device, set to zero
  CudaSafeCall ( cudaMalloc((void**)cu_F, sizeof(FLOAT_TYPE)*sett->fftpad*sett->nfft));
  CudaSafeCall ( cudaMemset(*cu_F, 0, sizeof(FLOAT_TYPE)*sett->fftpad*sett->nfft));

  char filename[CHAR_BUFFER_SIZE];
  FILE *data;

  // Input time-domain data handling.
  // snprintf bounds the write to the buffer; a truncated name simply fails
  // to open below and is reported there.
  snprintf (filename, sizeof(filename), "%s/%03d/xdatc_%03d_%03d%s.bin",
            opts->dtaprefix, opts->ident, opts->ident, opts->band, opts->label);

  if ((data = fopen (filename, "rb")) != NULL) {
    // load the data
    read_doubles_or_exit (arr->xDat, (size_t)sett->N, data, filename);
    fclose (data);
  } else {
    perror (filename);
    printf("Problem with %s... Exiting...\n", filename);
    exit(1);
  }

  // copy to device
  CudaSafeCall ( cudaMemcpy(arr->cu_xDat, arr->xDat, sizeof(double)*sett->N, cudaMemcpyHostToDevice));

  // Checking for null values in the data
  int Nzeros=0;
  int i;
  for(i=0; i < sett->N; i++)
    if(!arr->xDat[i]) Nzeros++;

  // factor N/(N - Nzeros) to account for null values in the data
  sett->crf0 = (double)sett->N/(sett->N-Nzeros);

  // if white noise...
  if (opts->white_flag)
    sett->sig2 = sett->N*var (arr->xDat, sett->N);
  else
    sett->sig2 = -1.;

  double epsm, phir;

  /* ############ Ephemerides ################ */

  // Ephemeris file handling
  snprintf (filename, sizeof(filename), "%s/%03d/DetSSB.bin", opts->dtaprefix, opts->ident);

  if ((data = fopen (filename, "rb")) != NULL) {
    // Detector position w.r.t solar system baricenter
    // for every datapoint
    read_doubles_or_exit (arr->DetSSB, (size_t)3*sett->N, data, filename);

    // Deterministic phase defining the position of the Earth
    // in its diurnal motion at t=0
    read_doubles_or_exit (&phir, 1, data, filename);

    // Earth's axis inclination to the ecliptic at t=0
    read_doubles_or_exit (&epsm, 1, data, filename);

    fclose (data);
  } else {
    perror (filename);
    printf("Problem with %s... Exiting...\n", filename);
    exit(1);
  }

  // copy DetSSB to device
  CudaSafeCall ( cudaMemcpy(arr->cu_DetSSB, arr->DetSSB, sizeof(double)*sett->N*3, cudaMemcpyHostToDevice));

  /* ############ Sincos ################ */

  sett->sphir = sin (phir);
  sett->cphir = cos (phir);
  sett->sepsm = sin (epsm);
  sett->cepsm = cos (epsm);

  // misc. arrays
  CudaSafeCall( cudaMallocHost((void**)&arr->aa, sizeof(double)*sett->N) );
  CudaSafeCall( cudaMallocHost((void**)&arr->bb, sizeof(double)*sett->N) );

  CudaSafeCall ( cudaMalloc((void**)&arr->cu_aa, sizeof(double)*sett->nfft));
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_bb, sizeof(double)*sett->nfft));

  CudaSafeCall ( cudaMalloc((void**)&arr->cu_shft, sizeof(double)*sett->N));
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_shftf, sizeof(double)*sett->N));
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_tshift, sizeof(double)*sett->N));

  // for splines
  init_spline_matrices(&arr->cu_d, &arr->cu_dl, &arr->cu_du, &arr->cu_B, sett->Ninterp);

  arr->cand_params_size = (sett->nmax - sett->nmin);
  arr->cand_buffer_size = (sett->nmax - sett->nmin)*CANDIDATE_BUFFER_SCALE;

  // parameters of found signal
  CudaSafeCall (cudaMalloc((void**)&arr->cu_cand_params, sizeof(FLOAT_TYPE)*arr->cand_params_size));
  CudaSafeCall (cudaMalloc((void**)&arr->cu_cand_buffer, sizeof(FLOAT_TYPE)*arr->cand_buffer_size));
  CudaSafeCall (cudaMalloc((void**)&arr->cu_cand_count, sizeof(int)));

  CudaSafeCall( cudaMallocHost((void**)&arr->cand_buffer, sizeof(FLOAT_TYPE)*arr->cand_buffer_size) );
}
/**
 * Resets the current CUDA device via cudaDeviceReset().
 */
void Application::_deInit()
{
    CudaSafeCall( cudaDeviceReset() );
}
/* Destroys one cuFFT plan. A failure is only reported, not fatal: cleanup
 * runs at shutdown and should release as much as it can. */
static void cleanup_destroy_plan(cufftHandle plan, const char *plan_name) {
  cufftResult status = cufftDestroy(plan);
  if (status != CUFFT_SUCCESS)
    fprintf(stderr, "Warning: could not destroy cuFFT plan %s (cufftResult %d)\n",
            plan_name, (int)status);
}

/*
 * Releases the host and device memory referenced by `arr`, `sett->M` and
 * `cu_F`, and destroys the cuFFT plans held in `plans`.
 *
 * The plans were previously never destroyed even though they are passed in.
 * `s_range` and `amod` are not used here.
 */
void cleanup(Detector_settings *sett, Command_line_opts *opts, Search_range *s_range, Arrays *arr, FFT_plans *plans, Ampl_mod_coeff *amod, FLOAT_TYPE *cu_F) {

  // host buffers obtained from cudaMallocHost
  CudaSafeCall( cudaFreeHost(arr->xDat) );
  CudaSafeCall( cudaFreeHost(arr->DetSSB) );
  CudaSafeCall( cudaFreeHost(arr->aa) );
  CudaSafeCall( cudaFreeHost(arr->bb) );
  CudaSafeCall( cudaFreeHost(arr->cand_buffer) );

  // ordinary heap buffers
  free(arr->sinmodf);
  free(arr->cosmodf);
  free(sett->M);

  // cuFFT plans: plan, pl_int, pl_inv
  cleanup_destroy_plan(plans->plan, "plan");
  cleanup_destroy_plan(plans->pl_int, "pl_int");
  cleanup_destroy_plan(plans->pl_inv, "pl_inv");

  // Device buffers. The frees stay unchecked as in the original code, so one
  // failing free does not prevent the remaining ones.
  cudaFree(arr->cu_xa);
  cudaFree(arr->cu_xb);
  cudaFree(arr->cu_xar);
  cudaFree(arr->cu_xbr);
  cudaFree(arr->cu_xa_f);
  cudaFree(arr->cu_xb_f);
  cudaFree(arr->cu_xar_f);
  cudaFree(arr->cu_xbr_f);
  cudaFree(arr->cu_xDat);

  cudaFree(arr->cu_aa);
  cudaFree(arr->cu_bb);

  cudaFree(arr->cu_shft);
  cudaFree(arr->cu_shftf);
  cudaFree(arr->cu_tshift);
  cudaFree(arr->cu_DetSSB);

  cudaFree(arr->cu_d);
  cudaFree(arr->cu_dl);
  cudaFree(arr->cu_du);
  cudaFree(arr->cu_B);

  cudaFree(cu_F);

  cudaFree(arr->cu_sinmodf);
  cudaFree(arr->cu_cosmodf);

  cudaFree(arr->cu_cand_buffer);
  cudaFree(arr->cu_cand_params);
  cudaFree(arr->cu_cand_count);

  if (opts->fftinterp == INT) { // interbinning
    cudaFree(arr->cu_xa2_f);
    cudaFree(arr->cu_xb2_f);
  }
}
/**
 * Runs the FIR filter on the GPU: uploads the input block, the previous
 * samples and the coefficients, launches cuFirFilter, downloads the result
 * and releases the device memory.
 *
 * \param output    host destination, `channels` rows of `samples` values
 * \param input     host source, `channels` rows of `samples` values
 * \param previous  host source, `channels` rows of `coeffSize` values
 * \param channels  number of rows
 * \param samples   number of values per input/output row
 * \param coeffs    host array of `coeffSize` filter coefficients
 * \param coeffSize number of coefficients
 *
 * \return Time measured by the CUDA events around the kernel launch.
 *
 * \throw WLBadAllocException when allocating or uploading device data fails.
 * \throw WException when the kernel or the download of the result fails.
 */
float WFIRFilterCuda::cudaFilter( WLEMData::ScalarT* const output, const WLEMData::ScalarT* const input, const WLEMData::ScalarT* const previous, size_t channels, size_t samples, const WLEMData::ScalarT* const coeffs, size_t coeffSize )
{
    // Row widths in bytes on the host side (rows are tightly packed there).
    const size_t sampleRowBytes = samples * sizeof( CuScalarT );
    const size_t coeffRowBytes = coeffSize * sizeof( CuScalarT );

    CuScalarT* dev_in = NULL;
    CuScalarT* dev_prev = NULL;
    CuScalarT* dev_out = NULL;
    CuScalarT* dev_co = NULL;
    size_t pitchIn;
    size_t pitchPrev;
    size_t pitchOut;

    // Stage 1: allocate device memory and upload the inputs.
    try
    {
        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_in, &pitchIn, sampleRowBytes, channels ) );
        CudaThrowsCall( cudaMemcpy2D( dev_in, pitchIn, input, sampleRowBytes, sampleRowBytes, channels,
                        cudaMemcpyHostToDevice ) );

        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_prev, &pitchPrev, coeffRowBytes, channels ) );
        CudaThrowsCall( cudaMemcpy2D( dev_prev, pitchPrev, previous, coeffRowBytes, coeffRowBytes, channels,
                        cudaMemcpyHostToDevice ) );

        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_out, &pitchOut, sampleRowBytes, channels ) );

        CudaThrowsCall( cudaMalloc( ( void** )&dev_co, coeffRowBytes ) );
        CudaThrowsCall( cudaMemcpy( dev_co, coeffs, coeffRowBytes, cudaMemcpyHostToDevice ) );
    }
    catch( const WException& e )
    {
        wlog::error( CLASS ) << e.what();

        // Release whatever had been allocated before the failure.
        if( dev_in )
        {
            CudaSafeCall( cudaFree( ( void* )dev_in ) );
        }
        if( dev_prev )
        {
            CudaSafeCall( cudaFree( ( void* )dev_prev ) );
        }
        if( dev_out )
        {
            CudaSafeCall( cudaFree( ( void* )dev_out ) );
        }
        if( dev_co )
        {
            CudaSafeCall( cudaFree( ( void* )dev_co ) );
        }
        throw WLBadAllocException( "Could not allocate CUDA memory!" );
    }

    // Stage 2: launch configuration -- 32 threads per block, enough blocks
    // to cover all samples, one coefficient set of shared memory.
    const size_t threadsPerBlock = 32;
    const size_t blocksPerGrid = ( samples + threadsPerBlock - 1 ) / threadsPerBlock;
    const size_t sharedMem = coeffRowBytes;

    cudaEvent_t start;
    cudaEvent_t stop;
    cudaEventCreate( &start );
    cudaEventCreate( &stop );

    cudaEventRecord( start, 0 );
    cuFirFilter( blocksPerGrid, threadsPerBlock, sharedMem, dev_out, dev_in, dev_prev, channels, samples, dev_co,
                 coeffSize, pitchOut, pitchIn, pitchPrev );
    // Fetch the launch status right away; it is evaluated after the timing.
    const cudaError_t kernelError = cudaGetLastError();
    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );

    float elapsedTime;
    cudaEventElapsedTime( &elapsedTime, start, stop );

    cudaEventDestroy( start );
    cudaEventDestroy( stop );

    // Stage 3: check the kernel and download the result. A failure is
    // remembered through elapsedTime so that the device memory below is
    // released before the exception is thrown.
    try
    {
        if( kernelError != cudaSuccess )
        {
            const std::string err( cudaGetErrorString( kernelError ) );
            throw WException( "CUDA kernel failed: " + err );
        }
        CudaThrowsCall( cudaMemcpy2D( output, sampleRowBytes, dev_out, pitchOut, sampleRowBytes, channels,
                        cudaMemcpyDeviceToHost ) );
    }
    catch( const WException& e )
    {
        wlog::error( CLASS ) << e.what();
        elapsedTime = -1.0;
    }

    CudaSafeCall( cudaFree( ( void* )dev_in ) );
    CudaSafeCall( cudaFree( ( void* )dev_prev ) );
    CudaSafeCall( cudaFree( ( void* )dev_out ) );
    CudaSafeCall( cudaFree( ( void* )dev_co ) );

    if( !( elapsedTime > -1.0 ) )
    {
        throw WException( "Error in cudaFilter()" );
    }
    return elapsedTime;
}
/**
 * Releases what setup_solver() created: the host array of set pointers and
 * the device block of pseudo-random numbers.
 */
void ga_solver_cuda<TFloat>::teardown_solver()
{
    // Host-side table of pointers into the device block.
    delete[] _prn_sets;

    // The device block itself.
    CudaSafeCall(cudaFree(_dev_bulk_prns));
}
/**
 * Frees the two device arrays allocated by the constructor.
 */
ga_solver_cuda<TFloat>::~ga_solver_cuda()
{
    CudaSafeCall(cudaFree(_dev_couples_idx_array));
    CudaSafeCall(cudaFree(_dev_candidates_reservoir_array));
}
/**
 * Destroys the cuRAND bulk generator and frees the per-engine device states.
 */
prngenerator_cuda<TFloat>::~prngenerator_cuda()
{
    CurandSafeCall(curandDestroyGenerator(_dev_bulk_prng_engine));
    CudaSafeCall(cudaFree(_dev_prng_engines));
}
/**
 * Copies genome data between host and device buffers.
 *
 * Host buffers are indexed [isle][agent][dimension]; device buffers are
 * indexed [dimension][isle][agent]. Copies that cross the host/device
 * boundary go through a temporary host buffer in which the data is
 * rearranged; same-side copies are plain memory copies.
 *
 * @param dst_data  destination buffer
 * @param src_data  source buffer
 * @param elements  number of TFloat values transferred by the memory copy
 * @param copy_type direction of the copy
 */
void population_set_cuda<TFloat>::gen_cpy(TFloat * dst_data, const TFloat * src_data, size_t elements, GenomeCopyKind copy_type)
{
    const size_t bytes_2_copy = elements * sizeof(TFloat);

    // Index arithmetic is done in size_t so that large populations cannot
    // wrap around a 32-bit index.
    const size_t isles = _ISLES;
    const size_t agents = _AGENTS;
    const size_t dimensions = _DIMENSIONS;
    const size_t loci = isles * agents;

    // The staging buffer is sized in ELEMENTS (it used to be sized with the
    // byte count, over-allocating by a factor of sizeof(TFloat)). It must
    // hold both the copied range and every index the rearranging loops touch.
    const size_t layout_elements = loci * dimensions;
    const size_t staging_elements = (elements > layout_elements) ? elements : layout_elements;

    switch (copy_type)
    {
        case GencpyHostToHost:
            memcpy(dst_data, src_data, bytes_2_copy);
            break;

        case GencpyDeviceToDevice:
            CudaSafeCall(cudaMemcpy(dst_data, src_data, bytes_2_copy, cudaMemcpyDeviceToDevice));
            break;

        case GencpyDeviceToHost:
        {
            // Zero-initialised so that entries beyond the copied range are
            // well defined.
            TFloat * store_buffer = new TFloat[staging_elements]();

            // Copy data from CUDA into the temporary buffer.
            CudaSafeCall(cudaMemcpy(store_buffer, src_data, bytes_2_copy, cudaMemcpyDeviceToHost));

            // Rearrange genomes into the CPU scheme.
            for (size_t i = 0; i < isles; ++i)
            {
                for (size_t j = 0; j < agents; ++j)
                {
                    const size_t locus_offset = i * agents + j;
                    for (size_t k = 0; k < dimensions; ++k)
                    {
                        const size_t cpu_idx = locus_offset * dimensions + k;
                        const size_t cuda_idx = k * loci + locus_offset;
                        dst_data[cpu_idx] = store_buffer[cuda_idx];
                    }
                }
            }

            delete [] store_buffer;
            break;
        }

        case GencpyHostToDevice:
        {
            TFloat * store_buffer = new TFloat[staging_elements]();

            // Rearrange genomes into the CUDA scheme in the temporary buffer.
            for (size_t i = 0; i < isles; ++i)
            {
                for (size_t j = 0; j < agents; ++j)
                {
                    const size_t locus_offset = i * agents + j;
                    for (size_t k = 0; k < dimensions; ++k)
                    {
                        const size_t cpu_idx = locus_offset * dimensions + k;
                        const size_t cuda_idx = k * loci + locus_offset;
                        store_buffer[cuda_idx] = src_data[cpu_idx];
                    }
                }
            }

            // Copy the rearranged buffer into CUDA.
            CudaSafeCall(cudaMemcpy(dst_data, store_buffer, bytes_2_copy, cudaMemcpyHostToDevice));

            delete [] store_buffer;
            break;
        }
    }
}
/**
 * Frees the three device arrays allocated by the constructor.
 */
population_set_cuda<TFloat>::~population_set_cuda()
{
    CudaSafeCall(cudaFree(_dev_data_array));
    CudaSafeCall(cudaFree(_dev_transformed_data_array));
    CudaSafeCall(cudaFree(_dev_fitness_array));
}
/**
 * Copies the whole GPU buffer (`size` bytes) into host memory.
 *
 * @param data host destination; must provide room for `size` bytes
 */
void GPUData::downloadData(void* data)
{
    CudaSafeCall(cudaMemcpy(data, gpuPtr, size, cudaMemcpyDeviceToHost));
}