Example #1
0
Fflcun2::ParticlesPosition Fflcun2::run(int nSteps) {
	for(int j = 0; j < nSteps; j++) {
		runCalcU(blocks, threads, devMatrixes);
		cudaThreadSynchronize();

		if (cPar->ft == ConstParams::ROTATING) {
			chPar->hExtX = cPar->hExtXInit * sin(2 * M_PI * cPar->rff * chPar->time);
			chPar->hExtY = cPar->hExtYInit * cos(2 * M_PI * cPar->rff * chPar->time);
		}
		fillGloabalChangable(chPar);
		cudaThreadSynchronize();

		runOneStep(blocks, threads, devMatrixes);
		cudaThreadSynchronize();

		runApplyDeltas(blocks, threads, devMatrixes);
		cudaThreadSynchronize();
		chPar->time += chPar->dTimeCurrent;
	}

/*	cudaMemcpy(partPos.x, devMatrixes.x, sizeof(float) * cPar->nPart, cudaMemcpyDeviceToHost);
	cudaMemcpy(partPos.y, devMatrixes.y, sizeof(float) * cPar->nPart, cudaMemcpyDeviceToHost);
	cudaMemcpy(partPos.z, devMatrixes.z, sizeof(float) * cPar->nPart, cudaMemcpyDeviceToHost);
	cudaMemcpy(partPos.theta, devMatrixes.theta, sizeof(float) * cPar->nPart, cudaMemcpyDeviceToHost);
	cudaMemcpy(partPos.phy, devMatrixes.phy, sizeof(float) * cPar->nPart, cudaMemcpyDeviceToHost);
*/
	cudaMemcpy(partPos.x, devMatrixes.x, sizeof(float) * cPar->nPart, cudaMemcpyDeviceToHost);
	cudaMemcpy(partPos.y, devMatrixes.y, sizeof(float) * cPar->nPart, cudaMemcpyDeviceToHost);
	cudaMemcpy(partPos.z, devMatrixes.z, sizeof(float) * cPar->nPart, cudaMemcpyDeviceToHost);
	cudaMemcpy(partPos.theta, devMatrixes.theta, sizeof(float) * cPar->nPart, cudaMemcpyDeviceToHost);
	cudaMemcpy(partPos.phy, devMatrixes.phy, sizeof(float) * cPar->nPart, cudaMemcpyDeviceToHost);

	return partPos;
}
void
benchmark(int iterations) 
{
    // allocate memory for result
    unsigned int *d_result;
    unsigned int size = width * height * sizeof(unsigned int);
    cutilSafeCall( cudaMalloc( (void**) &d_result, size));

    // warm-up
    gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);

    cutilSafeCall( cudaThreadSynchronize() );
    cutilCheckError( cutStartTimer( timer));

    // execute the kernel
    for(int i=0; i<iterations; i++) {
        gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);
    }

    cutilSafeCall( cudaThreadSynchronize() );
    cutilCheckError( cutStopTimer( timer));

    // check if kernel execution generated an error
    cutilCheckMsg("Kernel execution failed");

    printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
    printf("%.2f Mpixels/sec\n", (width*height*iterations / (cutGetTimerValue( timer) / 1000.0f)) / 1e6);

    cutilSafeCall(cudaFree(d_result));
}
void RadialBasisFunction::Train(HostMatrix<float> &Input, HostMatrix<float> &Target){

	//std::cout << "Training" << std::endl;

	//	c_width = (float*) malloc(sizeof(float)*network_size);
	//	memset(c_width,0,sizeof(float)*network_size);

	DeviceMatrix<float> device_X(Input);

	//std::cout << "KMeans" << std::endl;	
	clock_t initialTime = clock();
	KMeans KM;
	KM.SetSeed(seed);
	dCenters = KM.Execute(device_X,network_size);

	cudaThreadSynchronize();
	times[0] = (clock() - initialTime);

	//std::cout << "Adjust Widths" << std::endl;
	/*Adjust width using mean of distance to neighbours*/
	initialTime = clock();
	AdjustWidths(number_neighbours);

	cudaThreadSynchronize();
	times[1] = (clock() - initialTime);

	/*Training weights and scaling factor*/
	HostMatrix<float> TargetArr(Target.Rows(),NumClasses);
	memset(TargetArr.Pointer(),0,sizeof(float)*TargetArr.Elements());

	for(int i = 0; i < Target.Rows(); i++){
		TargetArr(i,((int)Target(i,0)-1)) = 1;
	}

	DeviceMatrix<float> d_Target(TargetArr);

	//std::cout << "Calculating Weights" << std::endl;

	initialTime = clock();

	DeviceMatrix<float> device_activ_matrix(device_X.Rows(),dCenters.Rows(),ColumnMajor);

	KernelActivationMatrix(device_activ_matrix.Pointer(),device_X.Pointer(),dCenters.Pointer(),device_X.Columns(),dCenters.Columns(),device_activ_matrix.Columns(),device_activ_matrix.Rows(),scaling_factor,device_c_width.Pointer());

	DeviceMatrix<float> d_Aplus = UTILS::pseudoinverse(device_activ_matrix);

	dWeights = DeviceMatrix<float>(d_Aplus.Rows(),d_Target.Columns());

	d_Aplus.Multiply(d_Aplus,d_Target,dWeights);


	/*Return Weights and Centers*/
	cudaThreadSynchronize();
	times[2] = (clock() - initialTime);

	// cudaMemcpy(c_width,device_c_width.Pointer(),sizeof(float)*device_c_width.Length(),cudaMemcpyDeviceToHost);
	//	this->Weights = HostMatrix<float>(dWeights);		
	//	this->Centers = HostMatrix<float>(dCenters);

}
//-----------------------------------------------------------------------------
void QGLImageGpuWidget::fillPbo(iu::ImageGpu_8u_C4* output)
{
  // map GL <-> CUDA resource
  uchar4 *d_dst = NULL;
  size_t start;
  cudaGraphicsMapResources(1, &cuda_pbo_resource_, 0);
  cudaGraphicsResourceGetMappedPointer((void**)&d_dst, &start, cuda_pbo_resource_);

  // get image data
  iuprivate::cuCopyImageToPbo(image_, num_channels_, bit_depth_, d_dst, min_, max_);
  cudaThreadSynchronize();

  // get overlays
  iuprivate::OverlayList::iterator it;
  for ( it=overlay_list_.begin() ; it != overlay_list_.end(); it++ )
    if ((*it)->isActive())
      cuCopyOverlayToPbo((*it), d_dst, image_->size());
  cudaThreadSynchronize();

  if (output != NULL)
  {
    // copy final pbo to output
    iu::ImageGpu_8u_C4 temp(d_dst, image_->width(), image_->height(),
                            image_->width()*sizeof(uchar4), true);
    iu::copy(&temp, output);
  }

  // unmap GL <-> CUDA resource
  cudaGraphicsUnmapResources(1, &cuda_pbo_resource_, 0);
}
OsdCudaGLVertexBuffer::~OsdCudaGLVertexBuffer() {

    cudaThreadSynchronize();
    unmap();
    cudaGraphicsUnregisterResource(_cudaResource);
    cudaThreadSynchronize();
    glDeleteBuffers(1, &_vbo);
}
int main(int argc, char** argv)
{

	float fTotalTime = 0;

// 	int TARGET_WIDTH=atoi(argv[2]);
// 	int TARGET_HEIGHT=atoi(argv[3]);
// 	bool visualize_results=atoi(argv[4]);
// 	unsigned int kernel_size=atoi(argv[2]);
 	int gpuNr=atoi(argv[2]);
 	checkCudaErrors(cudaSetDevice(gpuNr));

	
	IplImage* gray_image = cvLoadImage(argv[1],CV_LOAD_IMAGE_GRAYSCALE);
	unsigned char * d_input_image;
	unsigned char * d_output_image;
	int widthImage=gray_image->width;
	int heightImage=gray_image->height;
	
	IplImage *output_image = cvCreateImage(cvSize(widthImage,heightImage), IPL_DEPTH_8U, 1);
	for( int i=0;i<heightImage;i++)
	  for( int j=0;j<widthImage;j++)
	    output_image->imageData[i*widthImage+j]=255;
	
	  	  unsigned int * d_histogram;
	int total_threads=256;	  
	  cudaMalloc(&d_histogram,sizeof(unsigned int)*256*total_threads);
	checkCudaErrors(cudaMalloc(&d_input_image,widthImage*heightImage*sizeof(unsigned char)));  
	checkCudaErrors(cudaMalloc(&d_output_image,widthImage*heightImage*sizeof(unsigned char)));
	checkCudaErrors(cudaMemcpy(d_input_image,gray_image->imageData,widthImage*heightImage*sizeof(unsigned char),cudaMemcpyHostToDevice));
	unsigned int windows_array[4]={15,17,25,31};
	int total_implementations=4;
	double elapsed_time;
	for (int i=1;i<=total_implementations;i++)
	{
	  for( int j=0;j<4;j++)
	  {
	timer my_timer;  
	MedianFilterUcharCUDA(d_input_image,d_output_image,d_histogram,widthImage,heightImage,windows_array[j],16,16,i);
	cudaThreadSynchronize();
	elapsed_time=my_timer.elapsed();
	printf("elapsed_time for implementation %d for window size %d was %f \n",i,windows_array[j],elapsed_time);
	  }
	}
	timer array_timer;
	arrayFireRows(d_input_image,d_output_image,widthImage,heightImage,3,16,16);
		cudaThreadSynchronize();
	elapsed_time=array_timer.elapsed();
	printf("elapsed_time for array fire was %f \n",elapsed_time);
	checkCudaErrors(cudaMemcpy(output_image->imageData,d_output_image,widthImage*heightImage*sizeof(unsigned char),cudaMemcpyDeviceToHost));
// 	_medianfilter((unsigned char *)gray_image->imageData, (unsigned char *)output_image->imageData, widthImage, heightImage);
	
	cvSaveImage("output.jpg",output_image);
	



}
void
CudaInterface::fillParamMem(ParamMem_t& pmem, int byteVal) {
    checkCudaErrors(cudaSetDevice(mDevID));
    checkCudaErrors(cudaGetDevice(&mDevID));
    std::cout << "  setting " << pmem.totalSize * sizeof(float) << " bytes to " <<  pmem.base << "\n";
    if (pmem.device) {
        checkCudaErrors(cudaThreadSynchronize());
        checkCudaErrors(cudaMemset(pmem.base, byteVal, pmem.totalSize * sizeof(float)));
        checkCudaErrors(cudaThreadSynchronize());
    } else
        memset(pmem.base, byteVal, pmem.totalSize * sizeof(float));
}
Example #8
0
void time_ongpu(int TA, int TB, int m, int k, int n)
{
    int iter = 10;
    float *a = random_matrix(m,k);
    float *b = random_matrix(k,n);

    int lda = (!TA)?k:m;
    int ldb = (!TB)?n:k;

    float *c = random_matrix(m,n);

    float *a_cl = cuda_make_array(a, m*k);
    float *b_cl = cuda_make_array(b, k*n);
    float *c_cl = cuda_make_array(c, m*n);

    int i;
    clock_t start = clock(), end;
    for(i = 0; i<iter; ++i){
        gemm_ongpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
        cudaThreadSynchronize();
    }
    double flop = ((double)m)*n*(2.*k + 2.)*iter;
    double gflop = flop/pow(10., 9);
    end = clock();
    double seconds = sec(end-start);
    printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s, %lf GFLOPS\n",m,k,k,n, TA, TB, seconds, gflop/seconds);
    cuda_free(a_cl);
    cuda_free(b_cl);
    cuda_free(c_cl);
    free(a);
    free(b);
    free(c);
}
Example #9
0
void Fflcun2::setConfig(std::string fname) {	//WARNING - test it
	this->freeMemory();
	cPar = new ConstParams;
	chPar = new ChangableParams;
	configFileName = fname;
	this->setSettings();

	cudaDeviceProp deviceProp;
	cudaGetDeviceProperties(&deviceProp, 0);
	//TODO - throw exception if no device found

	blocks = ceil((float)cPar->nPart / SHARED_ARRAY);
	blocks += blocks % deviceProp.multiProcessorCount;
	threads = cPar->nPart / blocks;
	cPar->nPart = threads * blocks;
	std::cout << "After correction number of particles = " << cPar->nPart << std::endl;

	srand(time(NULL));

	this->allocMemory();
	this->initDevMatrixes();

	fillGloabalChangable(chPar);
	fillGloabalConstant(cPar);

	runSetupKernel(blocks, threads, devMatrixes);
	cudaThreadSynchronize();
}
Example #10
0
void gpuNUFFT::GpuNUFFTOperator::freeDeviceMemory(int n_coils)
{
  if (!gpuMemAllocated)
    return;

  cufftDestroy(fft_plan);
  // Destroy the cuFFT plan.
  if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
    printf("error at thread synchronization 9: %s\n",cudaGetErrorString(cudaGetLastError()));
  freeLookupTable();
  
  freeTotalDeviceMemory(data_indices_d,data_sorted_d,crds_d,gdata_d,sectors_d,sector_centers_d,NULL);//NULL as stop
  
  if (n_coils > 1 && deapo_d != NULL)
    cudaFree(deapo_d);
  
  if (this->applySensData())
    cudaFree(sens_d);
  
  if (this->applyDensComp())
    cudaFree(density_comp_d);

  showMemoryInfo();
  gpuMemAllocated = false;
}
Example #11
0
File: mpla.cpp Project: zaspel/MPLA
void mpla_generic_dgemv(struct mpla_vector* b, struct mpla_generic_matrix* A, struct mpla_vector* x, void (*mpla_dgemv_core)(struct mpla_vector*, struct mpla_generic_matrix*, struct mpla_vector*, struct mpla_instance*), struct mpla_instance* instance)
{
	// allocate redistributed vector
	struct mpla_vector x_redist;
	mpla_init_vector_for_block_rows(&x_redist, instance, x->vec_row_count);

	// redistribute input vector with row-block parallel distribution to column-block parallel distribution
	mpla_redistribute_vector_for_generic_dgesv(&x_redist, x, A, instance);
	
	// generic computation core: matrix-vector product
	mpla_dgemv_core(b, A, &x_redist, instance);

	// create sub-communicator for each process row
	int remain_dims[2];
	remain_dims[0]=0;
	remain_dims[1]=1;
	MPI_Comm row_comm;
	MPI_Cart_sub(instance->comm, remain_dims, &row_comm);

	// summation of block row results
	double* sum;
	cudaMalloc((void**)&sum, sizeof(double)*b->cur_proc_row_count);
	cudaThreadSynchronize();
	checkCUDAError("cudaMalloc");
	MPI_Allreduce(b->data, sum, b->cur_proc_row_count, MPI_DOUBLE, MPI_SUM, row_comm);
	cudaMemcpy(b->data, sum, sizeof(double)*b->cur_proc_row_count, cudaMemcpyDeviceToDevice);

	// cleanup
	cudaFree(sum);
	mpla_free_vector(&x_redist, instance);

	MPI_Comm_free(&row_comm);
}
Example #12
0
cudaError_t cudaLaunch(const void *entry)
{
	static cudaError_t (*nv_cudaLaunch)(const char *) = NULL;
	cudaError_t ret;
	struct timeval t;

	if(!nv_cudaLaunch) {
		nv_cudaLaunch = dlsym(RTLD_NEXT, "cudaLaunch");
		if(!nv_cudaLaunch) {
			fprintf(stderr, "failed to find symbol cudaLaunch: %s\n", dlerror());
			return cudaErrorSharedObjectSymbolNotFound;
		}
	}

	gettimeofday(&t, NULL);
	printf("[gvm] %lf intercepting cudaLaunch\n", t.tv_sec + t.tv_usec / 1000000.0);

	ret = nv_cudaLaunch(entry);

	cudaThreadSynchronize();

	gettimeofday(&t, NULL);
	printf("[gvm] %lf intercepted cudaLaunch\n", t.tv_sec + t.tv_usec / 1000000.0);


	return ret;
}
Example #13
0
 /**
  * Synchronize with device.
  *
  * @param sync True to synchronize, false if not.
  */
 inline void synchronize(const bool sync = true) {
   #ifdef ENABLE_CUDA
   if (sync) {
     CUDA_CHECKED_CALL(cudaThreadSynchronize());
   }
   #endif
 }
Example #14
0
void Fflcun2::setN(int n) {
	this->freeMemory();

	cPar = new ConstParams;
	chPar = new ChangableParams;
	this->setSettings();
	this->cPar->nPart = n;

	cudaDeviceProp deviceProp;
	cudaGetDeviceProperties(&deviceProp, 0);

	blocks = ceil(cPar->nPart * 1.0 / SHARED_ARRAY);
	blocks += blocks % deviceProp.multiProcessorCount;
	threads = cPar->nPart / blocks;
	cPar->nPart = threads * blocks;
	std::cout << "After correction number of particles = " << cPar->nPart << std::endl;

	srand(time(NULL));

	this->allocMemory();
	this->initDevMatrixes();

	fillGloabalChangable(chPar);
	fillGloabalConstant(cPar);

	runSetupKernel(blocks, threads, devMatrixes);
	cudaThreadSynchronize();
}
Example #15
0
void
mvReductArraysToHost ( int reduct_bytes )
{
  cutilSafeCall ( cudaMemcpy ( OP_reduct_h, OP_reduct_d, reduct_bytes,
                               cudaMemcpyDeviceToHost ) );
  cutilSafeCall ( cudaThreadSynchronize (  ) );
}
Example #16
0
void
mvConstArraysToDevice ( int consts_bytes )
{
  cutilSafeCall ( cudaMemcpy ( OP_consts_d, OP_consts_h, consts_bytes,
                               cudaMemcpyHostToDevice ) );
  cutilSafeCall ( cudaThreadSynchronize (  ) );
}
Example #17
0
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind)
{
	static cudaError_t (*nv_cudaMemcpy)(void *, const void *, size_t, enum cudaMemcpyKind) = NULL;
	cudaError_t ret;
	struct timeval t;

	if(!nv_cudaMemcpy) {
		nv_cudaMemcpy = dlsym(RTLD_NEXT, "cudaMemcpy");
		if(!nv_cudaMemcpy) {
			fprintf(stderr, "failed to find symbol cudaMemcpy: %s\n", dlerror());
			return cudaErrorSharedObjectSymbolNotFound;
		}
	}

	gettimeofday(&t, NULL);
	printf("[gvm] %lf intercepting cudaMemcpy\n", t.tv_sec + t.tv_usec / 1000000.0);

	ret = nv_cudaMemcpy(dst, src, count, kind);

	cudaThreadSynchronize();

	gettimeofday(&t, NULL);
	printf("[gvm] %lf intercepted cudaMemcpy( %lx %lx %ld %d ) = %d\n", t.tv_sec + t.tv_usec / 1000000.0, (unsigned long)dst, (unsigned long)src, count, kind, (int)ret);

	return ret;
}
Example #18
0
  double time_invocation_cuda(std::size_t num_trials, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3)
{
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start);
  for(std::size_t i = 0;
      i < num_trials;
      ++i)
  {
    f(arg1,arg2,arg3);
  }
  cudaEventRecord(stop);
  cudaThreadSynchronize();

  float msecs = 0;
  cudaEventElapsedTime(&msecs, start, stop);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);

  // return mean msecs
  return msecs / num_trials;
}
void savegpuann(const gpuann& nn, fann *ann, unsigned int instanceIndex)
{
  unsigned int neuronCount = ann->total_neurons;
  unsigned int weightsCount = ((ann->last_layer - 1)->last_neuron - 1)->last_con;

  chekedcudaMemcpyAsync(nn.h_tmp_sumArray,     nn.d_sumArray + neuronCount * instanceIndex,      neuronCount  * sizeof(fann_type), cudaMemcpyDeviceToHost);
  chekedcudaMemcpyAsync(nn.h_tmp_valuesArray,  nn.d_valuesArray + neuronCount * instanceIndex,   neuronCount  * sizeof(fann_type), cudaMemcpyDeviceToHost);
  chekedcudaMemcpyAsync(ann->weights,          nn.d_weightsArray + weightsCount * instanceIndex, weightsCount * sizeof(fann_type), cudaMemcpyDeviceToHost);
  cudaThreadSynchronize();

  struct fann_neuron *neuronsArray = ann->first_layer->first_neuron;
  struct fann_layer *last_layer = ann->last_layer;
  struct fann_layer *layer_it   = ann->first_layer;

  for(; layer_it != last_layer; layer_it++)
  {
    struct fann_neuron * last_neuron = layer_it->last_neuron;
    struct fann_neuron * neuron_it   = layer_it->first_neuron;
    for(; neuron_it != last_neuron; neuron_it++)
    {
      unsigned int currentNeuronShift  = neuron_it - neuronsArray;
      neuron_it->value = nn.h_tmp_valuesArray[currentNeuronShift];
      neuron_it->sum = nn.h_tmp_sumArray[currentNeuronShift];
    }
  }
}
Example #20
0
// keplereq_wrapper_C:
//         C wrapper function to solve's Kepler's equation num times.  
// inputs: 
//         ph_ma:  pointer to beginning element of array of doubles containing mean anomaly in radians 
//         ph_ecc: pointer to beginning element of array of doubles containing eccentricity 
//         num:    integer size of input arrays 
//         ph_eccanom: pointer to beginning element of array of doubles eccentric anomaly in radians 
// outputs:
//         ph_eccanom: values overwritten with eccentric anomaly
// assumptions:
//         input mean anomalies between 0 and 2pi
//         input eccentricities between 0 and 1
//         all three arrays have at least num elements 
//
void keplereq_wrapper_c(double *ph_ma, double *ph_ecc, int num, double *ph_eccanom)
{
	int gpuid = init_cuda();
	// put vectors in thrust format from raw points
	thrust::host_vector<double> h_ecc(ph_ecc,ph_ecc+num);
	thrust::host_vector<double> h_ma(ph_ma,ph_ma+num);

	cutCreateTimer(&memoryTime);  	cutCreateTimer(&kernelTime);
	cutResetTimer(memoryTime);    	cutResetTimer(kernelTime);

	if(gpuid>=0)
	{
	cutStartTimer(memoryTime);
	// transfer input params to GPU
	thrust::device_vector<double> d_ecc = h_ecc;
	thrust::device_vector<double> d_ma = h_ma;
	// allocate mem on GPU
	thrust::device_vector<double> d_eccanom(num);
	cudaThreadSynchronize();
	cutStopTimer(memoryTime);
	
	// distribute the computation to the GPU
	cutStartTimer(kernelTime);
	thrust::for_each(
	   thrust::make_zip_iterator(thrust::make_tuple(d_ma.begin(),d_ecc.begin(),d_eccanom.begin())),
	   thrust::make_zip_iterator(thrust::make_tuple(d_ma.end(),  d_ecc.end(),  d_eccanom.end())), 
	   keplereq_functor() );
	cudaThreadSynchronize();
	cutStopTimer(kernelTime);

	// transfer results back to host
	cutStartTimer(memoryTime);
	thrust::copy(d_eccanom.begin(),d_eccanom.end(),ph_eccanom);
	cudaThreadSynchronize();
	cutStopTimer(memoryTime);
	}
	else
	{
	// distribute the computation to the CPU
	cutStartTimer(kernelTime);
	thrust::for_each(
	   thrust::make_zip_iterator(thrust::make_tuple(h_ma.begin(),h_ecc.begin(),ph_eccanom)),
	   thrust::make_zip_iterator(thrust::make_tuple(h_ma.end(),  h_ecc.end(),  ph_eccanom+num)), 
	   keplereq_functor() );
	cutStopTimer(kernelTime);	
	}
}
Example #21
0
void
op_fetch_data ( op_dat dat )
{
  cutilSafeCall ( cudaMemcpy ( dat->data, dat->data_d,
                               dat->size * dat->set->size,
                               cudaMemcpyDeviceToHost ) );
  cutilSafeCall ( cudaThreadSynchronize (  ) );
}
Example #22
0
void
op_cpHostToDevice ( void ** data_d, void ** data_h, int size )
{
  cutilSafeCall ( cudaMalloc ( data_d, size ) );
  cutilSafeCall ( cudaMemcpy ( *data_d, *data_h, size,
                               cudaMemcpyHostToDevice ) );
  cutilSafeCall ( cudaThreadSynchronize (  ) );
}
Example #23
0
    void MulC_I(std::vector<T* >& h_Imgs, T c,
                                int n, int nImgs, T* d_scratchI[])
    {
#if 0
        assert(nImgs < h_Imgs.size());
        // load the first image
        copyArrayToDevice(d_scratchI[0], h_Imgs[0], n);
        cplVectorOpers::MulC_I(d_scratchI[0], c, n);

        // load the second image
        if (nImgs > 1)
            copyArrayToDevice(d_scratchI[1], h_Imgs[1], n);

        int i=2;
        for (; i < nImgs; ++i){
            copyArrayToDeviceAsync(d_scratchI[i % 3], h_Imgs[i], n, STM_H2D);
            cplVectorOpers::MulC_I(d_scratchI[(i-1) % 3], c, n, STM_D2D);
            copyArrayFromDeviceAsync(h_Imgs[i-2], d_scratchI[(i-2)%3], n, STM_D2H);
            cudaThreadSynchronize();
        }
    
        if (nImgs> 1)
            cplVectorOpers::MulC_I(d_scratchI[(i-1) % 3], c, n, STM_D2D);
        copyArrayFromDeviceAsync(h_Imgs[i-2], d_scratchI[(i-2) % 3], n, STM_D2H);
        cudaThreadSynchronize();
    
        ++i;
        if (nImgs> 1)
            copyArrayFromDeviceAsync(h_Imgs[i-2], d_scratchI[(i-2) % 3], n, STM_D2H);
#else
        for (int i=0; i < nImgs + 2; ++i) {
            if (i < nImgs)
                copyArrayToDeviceAsync(d_scratchI[i % 3], h_Imgs[i], n, STM_H2D);

            if ((i >=1) && ((i-1) < nImgs))
                cplVectorOpers::MulC_I(d_scratchI[(i-1) % 3], c, n, STM_D2D);

            if ((i >=2) && ((i-2) < nImgs))
                copyArrayFromDeviceAsync(h_Imgs[i-2], d_scratchI[(i-2)%3], n, STM_D2H);
        
            cudaThreadSynchronize();
        }
#endif
    }
void
OsdCudaGLVertexBuffer::unmap() {

    cudaThreadSynchronize();
    if (_devicePtr == NULL) return;
    cudaError_t err = cudaGraphicsUnmapResources(1, &_cudaResource, 0);
    if (err != cudaSuccess)
        OsdError(OSD_CUDA_GL_ERROR, "OsdCudaGLVertexBuffer::unmap failed.\n%s\n", cudaGetErrorString(err));
    _devicePtr = NULL;
}
void DeviceMemory<Type, Dim>::
initMem(int val, bool sync)
{
  if(this->buffer == 0)
    return;

  CUDA_CHECK(cudaMemset(this->buffer, val, this->getSize() * sizeof(Type)));

  if(sync)
    cudaThreadSynchronize();
}
Example #26
0
void runAutoTest(int argc, char **argv)
{
    printf("[%s] (automated testing w/ readback)\n", sSDKsample);

    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
        cutilDeviceInit(argc, argv);
    } else {
        cudaSetDevice( cutGetMaxGflopsDeviceId() );
    }

    loadDefaultImage( argv[0] );

    if (argc > 1) {
        char *filename;
        if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) {
            initializeData(filename);
        }
    } else {
        loadDefaultImage( argv[0]);
    }

    g_CheckRender       = new CheckBackBuffer(imWidth, imHeight, sizeof(Pixel), false);
    g_CheckRender->setExecPath(argv[0]);

    Pixel *d_result;
    cutilSafeCall( cudaMalloc( (void **)&d_result, imWidth*imHeight*sizeof(Pixel)) );

    while (g_SobelDisplayMode <= 2) 
    {
        printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]);

        sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale );

        cutilSafeCall( cudaThreadSynchronize() );

        cudaMemcpy(g_CheckRender->imageData(), d_result, imWidth*imHeight*sizeof(Pixel), cudaMemcpyDeviceToHost);

        g_CheckRender->savePGM(sOriginal[g_Index], false, NULL);

        if (!g_CheckRender->PGMvsPGM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.15f)) {
            g_TotalErrors++;
        }
        g_Index++;
        g_SobelDisplayMode = (SobelDisplayMode)g_Index;
    }

    cutilSafeCall( cudaFree( d_result ) );
    delete g_CheckRender;

    if (!g_TotalErrors) 
        printf("TEST PASSED!\n");
    else 
        printf("TEST FAILED!\n");
}
Example #27
0
void
op_mvHostToDevice ( void ** map, int size )
{
  void *tmp;
  cutilSafeCall ( cudaMalloc ( &tmp, size ) );
  cutilSafeCall ( cudaMemcpy ( tmp, *map, size,
                               cudaMemcpyHostToDevice ) );
  cutilSafeCall ( cudaThreadSynchronize (  ) );
  free ( *map );
  *map = tmp;
}
Example #28
0
/////////////////////////////////////
// error checking 
/////////////////////////////////////
magma_int_t
magma_dcheckerr(const char *label)
{
    cudaThreadSynchronize();
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        const char *e = cudaGetErrorString(err);
        fprintf(stderr, "CUDA Error: %s (at %s)", e, label);
    }
    return MAGMA_SUCCESS; 
}
Example #29
0
// cleanup
void free_myriad(int thr_id)
{
	if (!init[thr_id])
		return;

	cudaThreadSynchronize();

	myriadgroestl_cpu_free(thr_id);
	init[thr_id] = false;

	cudaDeviceSynchronize();
}
Example #30
0
inline void check_cuda_errors(const char *filename, const int line_number)
{
#ifdef CUDA_DEBUG
   cudaThreadSynchronize();
   cudaError_t error = cudaGetLastError();
   if(error != cudaSuccess)
   {
      printf("CUDA error at %s:%i: %s\n", filename, line_number, cudaGetErrorString(error));
      exit(-1);
   }
#endif
}