/**
 * Advances the simulation by nSteps steps and copies the particle state back
 * to the host.
 *
 * Every step launches runCalcU, uploads the changeable parameters with
 * fillGloabalChangable, launches runOneStep and runApplyDeltas, and then adds
 * chPar->dTimeCurrent to chPar->time. When the field type is
 * ConstParams::ROTATING, the external field components hExtX / hExtY are
 * recomputed on the host from the current time before the upload.
 *
 * @param nSteps number of steps to run; for nSteps <= 0 only the copy-back
 *               to the host is performed.
 * @return partPos after x, y, z, theta and phy were copied from devMatrixes.
 */
Fflcun2::ParticlesPosition Fflcun2::run(int nSteps)
{
    for (int step = 0; step < nSteps; ++step) {
        runCalcU(blocks, threads, devMatrixes);
        // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its replacement.
        cudaDeviceSynchronize();

        if (cPar->ft == ConstParams::ROTATING) {
            chPar->hExtX = cPar->hExtXInit * sin(2 * M_PI * cPar->rff * chPar->time);
            chPar->hExtY = cPar->hExtYInit * cos(2 * M_PI * cPar->rff * chPar->time);
        }

        fillGloabalChangable(chPar);
        cudaDeviceSynchronize();

        runOneStep(blocks, threads, devMatrixes);
        cudaDeviceSynchronize();

        runApplyDeltas(blocks, threads, devMatrixes);
        cudaDeviceSynchronize();

        chPar->time += chPar->dTimeCurrent;
    }

    // One float per particle in each of the five arrays.
    // NOTE(review): the cudaMemcpy return codes are not checked; this function
    // has no error channel visible here - confirm how failures should surface.
    const size_t bytes = sizeof(float) * cPar->nPart;
    cudaMemcpy(partPos.x,     devMatrixes.x,     bytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(partPos.y,     devMatrixes.y,     bytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(partPos.z,     devMatrixes.z,     bytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(partPos.theta, devMatrixes.theta, bytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(partPos.phy,   devMatrixes.phy,   bytes, cudaMemcpyDeviceToHost);

    return partPos;
}
void benchmark(int iterations) { // allocate memory for result unsigned int *d_result; unsigned int size = width * height * sizeof(unsigned int); cutilSafeCall( cudaMalloc( (void**) &d_result, size)); // warm-up gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutStartTimer( timer)); // execute the kernel for(int i=0; i<iterations; i++) { gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); } cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutStopTimer( timer)); // check if kernel execution generated an error cutilCheckMsg("Kernel execution failed"); printf("Processing time: %f (ms)\n", cutGetTimerValue( timer)); printf("%.2f Mpixels/sec\n", (width*height*iterations / (cutGetTimerValue( timer) / 1000.0f)) / 1e6); cutilSafeCall(cudaFree(d_result)); }
/**
 * Trains the network in three timed stages; elapsed clock() ticks of each
 * stage are stored in times[0..2].
 *
 *   0. KMeans on the input produces dCenters (network_size centres requested).
 *   1. AdjustWidths(number_neighbours).
 *   2. The activation matrix is built, its pseudoinverse computed, and
 *      dWeights = pseudoinverse * one-hot(Target).
 *
 * @param Input  training samples (uploaded to the device as-is).
 * @param Target column 0 holds the class label of each row; labels are used
 *               as 1-based column indices into a Rows x NumClasses matrix.
 */
void RadialBasisFunction::Train(HostMatrix<float> &Input, HostMatrix<float> &Target)
{
    DeviceMatrix<float> device_X(Input);

    // Stage 0: clustering.
    clock_t initialTime = clock();
    KMeans KM;
    KM.SetSeed(seed);
    dCenters = KM.Execute(device_X, network_size);
    // cudaThreadSynchronize() is deprecated; sync so the stage timing
    // includes the outstanding GPU work.
    cudaDeviceSynchronize();
    times[0] = (clock() - initialTime);

    // Stage 1: adjust widths using mean of distance to neighbours.
    initialTime = clock();
    AdjustWidths(number_neighbours);
    cudaDeviceSynchronize();
    times[1] = (clock() - initialTime);

    // One-hot encode the targets.
    // NOTE(review): the label is not range-checked; a value outside
    // [1, NumClasses] indexes outside TargetArr - confirm callers guarantee it.
    HostMatrix<float> TargetArr(Target.Rows(), NumClasses);
    memset(TargetArr.Pointer(), 0, sizeof(float) * TargetArr.Elements());
    for (int i = 0; i < Target.Rows(); i++) {
        TargetArr(i, ((int)Target(i,0) - 1)) = 1;
    }
    DeviceMatrix<float> d_Target(TargetArr);

    // Stage 2: training weights and scaling factor.
    initialTime = clock();
    DeviceMatrix<float> device_activ_matrix(device_X.Rows(), dCenters.Rows(), ColumnMajor);
    KernelActivationMatrix(device_activ_matrix.Pointer(),
                           device_X.Pointer(),
                           dCenters.Pointer(),
                           device_X.Columns(),
                           dCenters.Columns(),
                           device_activ_matrix.Columns(),
                           device_activ_matrix.Rows(),
                           scaling_factor,
                           device_c_width.Pointer());

    DeviceMatrix<float> d_Aplus = UTILS::pseudoinverse(device_activ_matrix);
    dWeights = DeviceMatrix<float>(d_Aplus.Rows(), d_Target.Columns());
    d_Aplus.Multiply(d_Aplus, d_Target, dWeights);

    cudaDeviceSynchronize();
    times[2] = (clock() - initialTime);
}
//----------------------------------------------------------------------------- void QGLImageGpuWidget::fillPbo(iu::ImageGpu_8u_C4* output) { // map GL <-> CUDA resource uchar4 *d_dst = NULL; size_t start; cudaGraphicsMapResources(1, &cuda_pbo_resource_, 0); cudaGraphicsResourceGetMappedPointer((void**)&d_dst, &start, cuda_pbo_resource_); // get image data iuprivate::cuCopyImageToPbo(image_, num_channels_, bit_depth_, d_dst, min_, max_); cudaThreadSynchronize(); // get overlays iuprivate::OverlayList::iterator it; for ( it=overlay_list_.begin() ; it != overlay_list_.end(); it++ ) if ((*it)->isActive()) cuCopyOverlayToPbo((*it), d_dst, image_->size()); cudaThreadSynchronize(); if (output != NULL) { // copy final pbo to output iu::ImageGpu_8u_C4 temp(d_dst, image_->width(), image_->height(), image_->width()*sizeof(uchar4), true); iu::copy(&temp, output); } // unmap GL <-> CUDA resource cudaGraphicsUnmapResources(1, &cuda_pbo_resource_, 0); }
/**
 * Releases the CUDA/GL interop resource and deletes the GL buffer.
 *
 * Order: wait for the device, unmap, unregister the CUDA resource, wait
 * again, then delete the VBO.
 */
OsdCudaGLVertexBuffer::~OsdCudaGLVertexBuffer()
{
    // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() replaces it.
    cudaDeviceSynchronize();
    unmap();
    cudaGraphicsUnregisterResource(_cudaResource);
    cudaDeviceSynchronize();
    glDeleteBuffers(1, &_vbo);
}
int main(int argc, char** argv) { float fTotalTime = 0; // int TARGET_WIDTH=atoi(argv[2]); // int TARGET_HEIGHT=atoi(argv[3]); // bool visualize_results=atoi(argv[4]); // unsigned int kernel_size=atoi(argv[2]); int gpuNr=atoi(argv[2]); checkCudaErrors(cudaSetDevice(gpuNr)); IplImage* gray_image = cvLoadImage(argv[1],CV_LOAD_IMAGE_GRAYSCALE); unsigned char * d_input_image; unsigned char * d_output_image; int widthImage=gray_image->width; int heightImage=gray_image->height; IplImage *output_image = cvCreateImage(cvSize(widthImage,heightImage), IPL_DEPTH_8U, 1); for( int i=0;i<heightImage;i++) for( int j=0;j<widthImage;j++) output_image->imageData[i*widthImage+j]=255; unsigned int * d_histogram; int total_threads=256; cudaMalloc(&d_histogram,sizeof(unsigned int)*256*total_threads); checkCudaErrors(cudaMalloc(&d_input_image,widthImage*heightImage*sizeof(unsigned char))); checkCudaErrors(cudaMalloc(&d_output_image,widthImage*heightImage*sizeof(unsigned char))); checkCudaErrors(cudaMemcpy(d_input_image,gray_image->imageData,widthImage*heightImage*sizeof(unsigned char),cudaMemcpyHostToDevice)); unsigned int windows_array[4]={15,17,25,31}; int total_implementations=4; double elapsed_time; for (int i=1;i<=total_implementations;i++) { for( int j=0;j<4;j++) { timer my_timer; MedianFilterUcharCUDA(d_input_image,d_output_image,d_histogram,widthImage,heightImage,windows_array[j],16,16,i); cudaThreadSynchronize(); elapsed_time=my_timer.elapsed(); printf("elapsed_time for implementation %d for window size %d was %f \n",i,windows_array[j],elapsed_time); } } timer array_timer; arrayFireRows(d_input_image,d_output_image,widthImage,heightImage,3,16,16); cudaThreadSynchronize(); elapsed_time=array_timer.elapsed(); printf("elapsed_time for array fire was %f \n",elapsed_time); checkCudaErrors(cudaMemcpy(output_image->imageData,d_output_image,widthImage*heightImage*sizeof(unsigned char),cudaMemcpyDeviceToHost)); // _medianfilter((unsigned char *)gray_image->imageData, (unsigned char 
*)output_image->imageData, widthImage, heightImage); cvSaveImage("output.jpg",output_image); }
/**
 * Fills a parameter memory block with a byte value.
 *
 * Selects device mDevID, then sets pmem.totalSize * sizeof(float) bytes
 * starting at pmem.base to byteVal: with cudaMemset (bracketed by device
 * synchronisations) when pmem.device is set, with memset otherwise.
 *
 * @param pmem    memory block descriptor (base pointer, element count, device flag).
 * @param byteVal value written to every byte (byte-wise fill, not a float fill).
 */
void CudaInterface::fillParamMem(ParamMem_t& pmem, int byteVal)
{
    checkCudaErrors(cudaSetDevice(mDevID));
    checkCudaErrors(cudaGetDevice(&mDevID));

    const size_t numBytes = pmem.totalSize * sizeof(float);
    std::cout << " setting " << numBytes << " bytes to " << pmem.base << "\n";

    if (pmem.device)
    {
        // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() replaces it.
        checkCudaErrors(cudaDeviceSynchronize());
        checkCudaErrors(cudaMemset(pmem.base, byteVal, numBytes));
        checkCudaErrors(cudaDeviceSynchronize());
    }
    else
    {
        memset(pmem.base, byteVal, numBytes);
    }
}
void time_ongpu(int TA, int TB, int m, int k, int n) { int iter = 10; float *a = random_matrix(m,k); float *b = random_matrix(k,n); int lda = (!TA)?k:m; int ldb = (!TB)?n:k; float *c = random_matrix(m,n); float *a_cl = cuda_make_array(a, m*k); float *b_cl = cuda_make_array(b, k*n); float *c_cl = cuda_make_array(c, m*n); int i; clock_t start = clock(), end; for(i = 0; i<iter; ++i){ gemm_ongpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n); cudaThreadSynchronize(); } double flop = ((double)m)*n*(2.*k + 2.)*iter; double gflop = flop/pow(10., 9); end = clock(); double seconds = sec(end-start); printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s, %lf GFLOPS\n",m,k,k,n, TA, TB, seconds, gflop/seconds); cuda_free(a_cl); cuda_free(b_cl); cuda_free(c_cl); free(a); free(b); free(c); }
void Fflcun2::setConfig(std::string fname) { //WARNING - test it this->freeMemory(); cPar = new ConstParams; chPar = new ChangableParams; configFileName = fname; this->setSettings(); cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, 0); //TODO - throw exception if no device found blocks = ceil((float)cPar->nPart / SHARED_ARRAY); blocks += blocks % deviceProp.multiProcessorCount; threads = cPar->nPart / blocks; cPar->nPart = threads * blocks; std::cout << "After correction number of particles = " << cPar->nPart << std::endl; srand(time(NULL)); this->allocMemory(); this->initDevMatrixes(); fillGloabalChangable(chPar); fillGloabalConstant(cPar); runSetupKernel(blocks, threads, devMatrixes); cudaThreadSynchronize(); }
void gpuNUFFT::GpuNUFFTOperator::freeDeviceMemory(int n_coils) { if (!gpuMemAllocated) return; cufftDestroy(fft_plan); // Destroy the cuFFT plan. if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 9: %s\n",cudaGetErrorString(cudaGetLastError())); freeLookupTable(); freeTotalDeviceMemory(data_indices_d,data_sorted_d,crds_d,gdata_d,sectors_d,sector_centers_d,NULL);//NULL as stop if (n_coils > 1 && deapo_d != NULL) cudaFree(deapo_d); if (this->applySensData()) cudaFree(sens_d); if (this->applyDensComp()) cudaFree(density_comp_d); showMemoryInfo(); gpuMemAllocated = false; }
void mpla_generic_dgemv(struct mpla_vector* b, struct mpla_generic_matrix* A, struct mpla_vector* x, void (*mpla_dgemv_core)(struct mpla_vector*, struct mpla_generic_matrix*, struct mpla_vector*, struct mpla_instance*), struct mpla_instance* instance) { // allocate redistributed vector struct mpla_vector x_redist; mpla_init_vector_for_block_rows(&x_redist, instance, x->vec_row_count); // redistribute input vector with row-block parallel distribution to column-block parallel distribution mpla_redistribute_vector_for_generic_dgesv(&x_redist, x, A, instance); // generic computation core: matrix-vector product mpla_dgemv_core(b, A, &x_redist, instance); // create sub-communicator for each process row int remain_dims[2]; remain_dims[0]=0; remain_dims[1]=1; MPI_Comm row_comm; MPI_Cart_sub(instance->comm, remain_dims, &row_comm); // summation of block row results double* sum; cudaMalloc((void**)&sum, sizeof(double)*b->cur_proc_row_count); cudaThreadSynchronize(); checkCUDAError("cudaMalloc"); MPI_Allreduce(b->data, sum, b->cur_proc_row_count, MPI_DOUBLE, MPI_SUM, row_comm); cudaMemcpy(b->data, sum, sizeof(double)*b->cur_proc_row_count, cudaMemcpyDeviceToDevice); // cleanup cudaFree(sum); mpla_free_vector(&x_redist, instance); MPI_Comm_free(&row_comm); }
cudaError_t cudaLaunch(const void *entry) { static cudaError_t (*nv_cudaLaunch)(const char *) = NULL; cudaError_t ret; struct timeval t; if(!nv_cudaLaunch) { nv_cudaLaunch = dlsym(RTLD_NEXT, "cudaLaunch"); if(!nv_cudaLaunch) { fprintf(stderr, "failed to find symbol cudaLaunch: %s\n", dlerror()); return cudaErrorSharedObjectSymbolNotFound; } } gettimeofday(&t, NULL); printf("[gvm] %lf intercepting cudaLaunch\n", t.tv_sec + t.tv_usec / 1000000.0); ret = nv_cudaLaunch(entry); cudaThreadSynchronize(); gettimeofday(&t, NULL); printf("[gvm] %lf intercepted cudaLaunch\n", t.tv_sec + t.tv_usec / 1000000.0); return ret; }
/**
 * Synchronize with device.
 *
 * Compiles to a no-op when ENABLE_CUDA is not defined.
 *
 * @param sync True to synchronize, false if not.
 */
inline void synchronize(const bool sync = true) {
  #ifdef ENABLE_CUDA
  if (sync) {
    // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() replaces it.
    CUDA_CHECKED_CALL(cudaDeviceSynchronize());
  }
  #else
  (void)sync;  // silence the unused-parameter warning in non-CUDA builds
  #endif
}
void Fflcun2::setN(int n) { this->freeMemory(); cPar = new ConstParams; chPar = new ChangableParams; this->setSettings(); this->cPar->nPart = n; cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, 0); blocks = ceil(cPar->nPart * 1.0 / SHARED_ARRAY); blocks += blocks % deviceProp.multiProcessorCount; threads = cPar->nPart / blocks; cPar->nPart = threads * blocks; std::cout << "After correction number of particles = " << cPar->nPart << std::endl; srand(time(NULL)); this->allocMemory(); this->initDevMatrixes(); fillGloabalChangable(chPar); fillGloabalConstant(cPar); runSetupKernel(blocks, threads, devMatrixes); cudaThreadSynchronize(); }
void mvReductArraysToHost ( int reduct_bytes ) { cutilSafeCall ( cudaMemcpy ( OP_reduct_h, OP_reduct_d, reduct_bytes, cudaMemcpyDeviceToHost ) ); cutilSafeCall ( cudaThreadSynchronize ( ) ); }
void mvConstArraysToDevice ( int consts_bytes ) { cutilSafeCall ( cudaMemcpy ( OP_consts_d, OP_consts_h, consts_bytes, cudaMemcpyHostToDevice ) ); cutilSafeCall ( cudaThreadSynchronize ( ) ); }
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) { static cudaError_t (*nv_cudaMemcpy)(void *, const void *, size_t, enum cudaMemcpyKind) = NULL; cudaError_t ret; struct timeval t; if(!nv_cudaMemcpy) { nv_cudaMemcpy = dlsym(RTLD_NEXT, "cudaMemcpy"); if(!nv_cudaMemcpy) { fprintf(stderr, "failed to find symbol cudaMemcpy: %s\n", dlerror()); return cudaErrorSharedObjectSymbolNotFound; } } gettimeofday(&t, NULL); printf("[gvm] %lf intercepting cudaMemcpy\n", t.tv_sec + t.tv_usec / 1000000.0); ret = nv_cudaMemcpy(dst, src, count, kind); cudaThreadSynchronize(); gettimeofday(&t, NULL); printf("[gvm] %lf intercepted cudaMemcpy( %lx %lx %ld %d ) = %d\n", t.tv_sec + t.tv_usec / 1000000.0, (unsigned long)dst, (unsigned long)src, count, kind, (int)ret); return ret; }
double time_invocation_cuda(std::size_t num_trials, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3) { cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start); for(std::size_t i = 0; i < num_trials; ++i) { f(arg1,arg2,arg3); } cudaEventRecord(stop); cudaThreadSynchronize(); float msecs = 0; cudaEventElapsedTime(&msecs, start, stop); cudaEventDestroy(start); cudaEventDestroy(stop); // return mean msecs return msecs / num_trials; }
void savegpuann(const gpuann& nn, fann *ann, unsigned int instanceIndex) { unsigned int neuronCount = ann->total_neurons; unsigned int weightsCount = ((ann->last_layer - 1)->last_neuron - 1)->last_con; chekedcudaMemcpyAsync(nn.h_tmp_sumArray, nn.d_sumArray + neuronCount * instanceIndex, neuronCount * sizeof(fann_type), cudaMemcpyDeviceToHost); chekedcudaMemcpyAsync(nn.h_tmp_valuesArray, nn.d_valuesArray + neuronCount * instanceIndex, neuronCount * sizeof(fann_type), cudaMemcpyDeviceToHost); chekedcudaMemcpyAsync(ann->weights, nn.d_weightsArray + weightsCount * instanceIndex, weightsCount * sizeof(fann_type), cudaMemcpyDeviceToHost); cudaThreadSynchronize(); struct fann_neuron *neuronsArray = ann->first_layer->first_neuron; struct fann_layer *last_layer = ann->last_layer; struct fann_layer *layer_it = ann->first_layer; for(; layer_it != last_layer; layer_it++) { struct fann_neuron * last_neuron = layer_it->last_neuron; struct fann_neuron * neuron_it = layer_it->first_neuron; for(; neuron_it != last_neuron; neuron_it++) { unsigned int currentNeuronShift = neuron_it - neuronsArray; neuron_it->value = nn.h_tmp_valuesArray[currentNeuronShift]; neuron_it->sum = nn.h_tmp_sumArray[currentNeuronShift]; } } }
// keplereq_wrapper_C: // C wrapper function to solve's Kepler's equation num times. // inputs: // ph_ma: pointer to beginning element of array of doubles containing mean anomaly in radians // ph_ecc: pointer to beginning element of array of doubles containing eccentricity // num: integer size of input arrays // ph_eccanom: pointer to beginning element of array of doubles eccentric anomaly in radians // outputs: // ph_eccanom: values overwritten with eccentric anomaly // assumptions: // input mean anomalies between 0 and 2pi // input eccentricities between 0 and 1 // all three arrays have at least num elements // void keplereq_wrapper_c(double *ph_ma, double *ph_ecc, int num, double *ph_eccanom) { int gpuid = init_cuda(); // put vectors in thrust format from raw points thrust::host_vector<double> h_ecc(ph_ecc,ph_ecc+num); thrust::host_vector<double> h_ma(ph_ma,ph_ma+num); cutCreateTimer(&memoryTime); cutCreateTimer(&kernelTime); cutResetTimer(memoryTime); cutResetTimer(kernelTime); if(gpuid>=0) { cutStartTimer(memoryTime); // transfer input params to GPU thrust::device_vector<double> d_ecc = h_ecc; thrust::device_vector<double> d_ma = h_ma; // allocate mem on GPU thrust::device_vector<double> d_eccanom(num); cudaThreadSynchronize(); cutStopTimer(memoryTime); // distribute the computation to the GPU cutStartTimer(kernelTime); thrust::for_each( thrust::make_zip_iterator(thrust::make_tuple(d_ma.begin(),d_ecc.begin(),d_eccanom.begin())), thrust::make_zip_iterator(thrust::make_tuple(d_ma.end(), d_ecc.end(), d_eccanom.end())), keplereq_functor() ); cudaThreadSynchronize(); cutStopTimer(kernelTime); // transfer results back to host cutStartTimer(memoryTime); thrust::copy(d_eccanom.begin(),d_eccanom.end(),ph_eccanom); cudaThreadSynchronize(); cutStopTimer(memoryTime); } else { // distribute the computation to the CPU cutStartTimer(kernelTime); thrust::for_each( thrust::make_zip_iterator(thrust::make_tuple(h_ma.begin(),h_ecc.begin(),ph_eccanom)), 
thrust::make_zip_iterator(thrust::make_tuple(h_ma.end(), h_ecc.end(), ph_eccanom+num)), keplereq_functor() ); cutStopTimer(kernelTime); } }
/*
 * Copies a dataset from its device buffer (dat->data_d) to its host buffer
 * (dat->data), then waits for the device. The byte count is
 * dat->size * dat->set->size.
 */
void op_fetch_data ( op_dat dat )
{
  // Widen before multiplying so large datasets do not overflow int arithmetic.
  const size_t bytes = (size_t) dat->size * (size_t) dat->set->size;

  cutilSafeCall ( cudaMemcpy ( dat->data, dat->data_d, bytes,
                               cudaMemcpyDeviceToHost ) );
  // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() replaces it.
  cutilSafeCall ( cudaDeviceSynchronize ( ) );
}
void op_cpHostToDevice ( void ** data_d, void ** data_h, int size ) { cutilSafeCall ( cudaMalloc ( data_d, size ) ); cutilSafeCall ( cudaMemcpy ( *data_d, *data_h, size, cudaMemcpyHostToDevice ) ); cutilSafeCall ( cudaThreadSynchronize ( ) ); }
void MulC_I(std::vector<T* >& h_Imgs, T c, int n, int nImgs, T* d_scratchI[]) { #if 0 assert(nImgs < h_Imgs.size()); // load the first image copyArrayToDevice(d_scratchI[0], h_Imgs[0], n); cplVectorOpers::MulC_I(d_scratchI[0], c, n); // load the second image if (nImgs > 1) copyArrayToDevice(d_scratchI[1], h_Imgs[1], n); int i=2; for (; i < nImgs; ++i){ copyArrayToDeviceAsync(d_scratchI[i % 3], h_Imgs[i], n, STM_H2D); cplVectorOpers::MulC_I(d_scratchI[(i-1) % 3], c, n, STM_D2D); copyArrayFromDeviceAsync(h_Imgs[i-2], d_scratchI[(i-2)%3], n, STM_D2H); cudaThreadSynchronize(); } if (nImgs> 1) cplVectorOpers::MulC_I(d_scratchI[(i-1) % 3], c, n, STM_D2D); copyArrayFromDeviceAsync(h_Imgs[i-2], d_scratchI[(i-2) % 3], n, STM_D2H); cudaThreadSynchronize(); ++i; if (nImgs> 1) copyArrayFromDeviceAsync(h_Imgs[i-2], d_scratchI[(i-2) % 3], n, STM_D2H); #else for (int i=0; i < nImgs + 2; ++i) { if (i < nImgs) copyArrayToDeviceAsync(d_scratchI[i % 3], h_Imgs[i], n, STM_H2D); if ((i >=1) && ((i-1) < nImgs)) cplVectorOpers::MulC_I(d_scratchI[(i-1) % 3], c, n, STM_D2D); if ((i >=2) && ((i-2) < nImgs)) copyArrayFromDeviceAsync(h_Imgs[i-2], d_scratchI[(i-2)%3], n, STM_D2H); cudaThreadSynchronize(); } #endif }
void OsdCudaGLVertexBuffer::unmap() { cudaThreadSynchronize(); if (_devicePtr == NULL) return; cudaError_t err = cudaGraphicsUnmapResources(1, &_cudaResource, 0); if (err != cudaSuccess) OsdError(OSD_CUDA_GL_ERROR, "OsdCudaGLVertexBuffer::unmap failed.\n%s\n", cudaGetErrorString(err)); _devicePtr = NULL; }
// Fills the device buffer byte-wise with `val` (cudaMemset semantics, so only
// byte patterns can be produced, not arbitrary Type values).
//
// val   byte value written to each of getSize() * sizeof(Type) bytes.
// sync  when true, waits for the device after the fill.
//
// Does nothing when no buffer is allocated.
void DeviceMemory<Type, Dim>::
initMem(int val, bool sync)
{
    if(this->buffer == 0)
        return;

    CUDA_CHECK(cudaMemset(this->buffer, val, this->getSize() * sizeof(Type)));

    if(sync)
    {
        // cudaThreadSynchronize() is deprecated; the result is now checked
        // like the memset above instead of being discarded.
        CUDA_CHECK(cudaDeviceSynchronize());
    }
}
void runAutoTest(int argc, char **argv) { printf("[%s] (automated testing w/ readback)\n", sSDKsample); if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilDeviceInit(argc, argv); } else { cudaSetDevice( cutGetMaxGflopsDeviceId() ); } loadDefaultImage( argv[0] ); if (argc > 1) { char *filename; if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) { initializeData(filename); } } else { loadDefaultImage( argv[0]); } g_CheckRender = new CheckBackBuffer(imWidth, imHeight, sizeof(Pixel), false); g_CheckRender->setExecPath(argv[0]); Pixel *d_result; cutilSafeCall( cudaMalloc( (void **)&d_result, imWidth*imHeight*sizeof(Pixel)) ); while (g_SobelDisplayMode <= 2) { printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]); sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale ); cutilSafeCall( cudaThreadSynchronize() ); cudaMemcpy(g_CheckRender->imageData(), d_result, imWidth*imHeight*sizeof(Pixel), cudaMemcpyDeviceToHost); g_CheckRender->savePGM(sOriginal[g_Index], false, NULL); if (!g_CheckRender->PGMvsPGM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.15f)) { g_TotalErrors++; } g_Index++; g_SobelDisplayMode = (SobelDisplayMode)g_Index; } cutilSafeCall( cudaFree( d_result ) ); delete g_CheckRender; if (!g_TotalErrors) printf("TEST PASSED!\n"); else printf("TEST FAILED!\n"); }
void op_mvHostToDevice ( void ** map, int size ) { void *tmp; cutilSafeCall ( cudaMalloc ( &tmp, size ) ); cutilSafeCall ( cudaMemcpy ( tmp, *map, size, cudaMemcpyHostToDevice ) ); cutilSafeCall ( cudaThreadSynchronize ( ) ); free ( *map ); *map = tmp; }
///////////////////////////////////// // error checking ///////////////////////////////////// magma_int_t magma_dcheckerr(const char *label) { cudaThreadSynchronize(); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { const char *e = cudaGetErrorString(err); fprintf(stderr, "CUDA Error: %s (at %s)", e, label); } return MAGMA_SUCCESS; }
// cleanup void free_myriad(int thr_id) { if (!init[thr_id]) return; cudaThreadSynchronize(); myriadgroestl_cpu_free(thr_id); init[thr_id] = false; cudaDeviceSynchronize(); }
// Debug-only CUDA error check. With CUDA_DEBUG defined it waits for the
// device, reads the last CUDA error and, if one is pending, prints it with the
// given location and terminates the process with exit(-1). Without CUDA_DEBUG
// it compiles to a no-op.
//
// filename     source file reported in the message.
// line_number  source line reported in the message.
inline void check_cuda_errors(const char *filename, const int line_number)
{
#ifdef CUDA_DEBUG
  // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() replaces it.
  cudaDeviceSynchronize();
  cudaError_t error = cudaGetLastError();
  if(error != cudaSuccess)
  {
    printf("CUDA error at %s:%i: %s\n", filename, line_number, cudaGetErrorString(error));
    exit(-1);
  }
#else
  // silence unused-parameter warnings in non-debug builds
  (void)filename;
  (void)line_number;
#endif
}