int matrixMult(float *A, float *B, float *C, int n, int m, int o, int flag) {
    // Sizes are in bytes of float (the original used sizeof(int), which only
    // works by accident because sizeof(int) == sizeof(float) on most targets).
    int sizeA = n * m * sizeof(float);
    int sizeB = m * o * sizeof(float);
    int sizeC = n * o * sizeof(float);
    float *d_A, *d_B, *d_C;

    // Allocate device memory
    cudaMalloc((void **)&d_A, sizeA);
    cudaMalloc((void **)&d_B, sizeB);
    cudaMalloc((void **)&d_C, sizeC);

    clock_t t = clock();

    // Copy the input data to the device
    cudaMemcpy(d_A, A, sizeA, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, sizeB, cudaMemcpyHostToDevice);

    dim3 dimBlock(32, 32); // maximum number of threads per block
    // C is n x o: x covers the o columns, y the n rows (the original used n for both).
    dim3 dimGrid(ceil((float)o / dimBlock.x), ceil((float)n / dimBlock.y));

    // Launch the kernel on the device
    if (flag == 1) {
        matMultParallelTiled<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, n, m, o);
        cudaMemcpy(C, d_C, sizeC, cudaMemcpyDeviceToHost);
        printf("Parallel multiplication with tiling\t: %.8f\n",
               (clock() - t) / (double)CLOCKS_PER_SEC);
    } else {
        // Assumed completion: the source is truncated here. The non-tiled path
        // presumably mirrors the branch above with a plain kernel (name hypothetical).
        matMultParallel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, n, m, o);
        cudaMemcpy(C, d_C, sizeC, cudaMemcpyDeviceToHost);
        printf("Parallel multiplication\t\t: %.8f\n",
               (clock() - t) / (double)CLOCKS_PER_SEC);
    }

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return 0;
}
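The wrapper above launches matMultParallelTiled, whose definition is not part of this excerpt. A minimal sketch of a shared-memory tiled kernel with that signature, assuming the 32x32 tile implied by dimBlock (the bounds handling and everything beyond the signature are assumptions, not the original kernel):

#define TILE 32
// Assumed sketch of the tiled kernel launched above; not the original source.
__global__ void matMultParallelTiled(float *A, float *B, float *C, int n, int m, int o) {
    __shared__ float sA[TILE][TILE];
    __shared__ float sB[TILE][TILE];
    int row = blockIdx.y * TILE + threadIdx.y;
    int col = blockIdx.x * TILE + threadIdx.x;
    float acc = 0.0f;
    for (int t = 0; t < (m + TILE - 1) / TILE; ++t) {
        // Stage one tile of A and one tile of B in shared memory, guarding the edges.
        sA[threadIdx.y][threadIdx.x] =
            (row < n && t * TILE + threadIdx.x < m) ? A[row * m + t * TILE + threadIdx.x] : 0.0f;
        sB[threadIdx.y][threadIdx.x] =
            (col < o && t * TILE + threadIdx.y < m) ? B[(t * TILE + threadIdx.y) * o + col] : 0.0f;
        __syncthreads();
        for (int k = 0; k < TILE; ++k)
            acc += sA[threadIdx.y][k] * sB[k][threadIdx.x];
        __syncthreads();
    }
    if (row < n && col < o)
        C[row * o + col] = acc;
}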
int main() {
    // Initialize variables: host array for the full 3-D domain
    double *h_u = (double*)malloc(sizeof(double) * (NX * NY * NZ));

    // Set domain initial condition and BCs
    Call_IC(h_u);

    // GPU memory arrays. The original allocated and copied only NX*NY doubles
    // here, while the grid, the kernels, and the final device-to-host copy all
    // work on NX*NY*NZ; use the full 3-D size throughout.
    double *d_u;  checkCuda(cudaMalloc((void**)&d_u,  sizeof(double) * (NX * NY * NZ)));
    double *d_un; checkCuda(cudaMalloc((void**)&d_un, sizeof(double) * (NX * NY * NZ)));

    // Copy initial condition from host to device
    checkCuda(cudaMemcpy(d_u,  h_u, sizeof(double) * (NX * NY * NZ), cudaMemcpyHostToDevice));
    checkCuda(cudaMemcpy(d_un, h_u, sizeof(double) * (NX * NY * NZ), cudaMemcpyHostToDevice));

    // GPU kernel launch parameters
    dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z);
    dim3 dimGrid (DIVIDE_INTO(NX, BLOCK_SIZE_X),
                  DIVIDE_INTO(NY, BLOCK_SIZE_Y),
                  DIVIDE_INTO(NZ, BLOCK_SIZE_Z));

    // Start the CPU timer
    clock_t t = clock();

    // Solver loop
    for (int step = 0; step < NO_STEPS; step += 2) {
        if (step % 10000 == 0) printf("Step %d of %d\n", step, (int)NO_STEPS);
        // Compute Laplace
        Call_Laplace(dimGrid, dimBlock, d_u, d_un);
        // Call_Laplace_Texture(dimGrid, dimBlock, d_u, d_un);
    }
    if (DEBUG) printf("CUDA error (Jacobi_Method) %s\n",
                      cudaGetErrorString(cudaPeekAtLastError()));

    // Measure and report computation time
    t = clock() - t;
    printf("Computing time (%f seconds).\n", ((float)t) / CLOCKS_PER_SEC);

    // Copy data from device to host
    checkCuda(cudaMemcpy(h_u, d_u, sizeof(double) * (NX * NY * NZ), cudaMemcpyDeviceToHost));

    // Print the solution to the terminal when debugging
    if (DEBUG) Print2D(h_u);

    // Write solution to file
    Save_Results(h_u);

    // Free device memory
    checkCuda(cudaFree(d_u));
    checkCuda(cudaFree(d_un));

    // Reset device
    checkCuda(cudaDeviceReset());

    // Free host memory
    free(h_u);
    return 0;
}
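checkCuda is used for every runtime call above but is not defined in this excerpt; a common definition of such a wrapper (this sketch is an assumption, not the original) is:

// Assumed definition of the checkCuda() helper used above (not in the original source):
// wraps a runtime call, prints any error, and aborts, so failures surface immediately.
#include <cstdio>
#include <cstdlib>

inline cudaError_t checkCuda(cudaError_t result) {
    if (result != cudaSuccess) {
        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
        exit(EXIT_FAILURE);
    }
    return result;
}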
bool try_convolution0_mcuda(const MCudaMatrix3D::Ptr& video,
                            const MCudaMatrix3D::Ptr& kernel,
                            MCudaMatrix3D::Ptr& output) {
    // Flatten the y and t dimensions so a 2-D grid can cover the 3-D output.
    unsigned int yt = output->dim_y * output->dim_t;
    dim3 dimBlock(16, 16);
    dim3 dimGrid((output->dim_x - 1) / 16 + 1, (yt - 1) / 16 + 1); // ceiling division
    do_convolution0(*video, *kernel, *output, dimGrid, dimBlock);
    return true;
}
void runbench_warmup(double *cd, long size) {
    const long reduced_grid_size = size / (UNROLLED_MEMORY_ACCESSES) / 32;
    const int BLOCK_SIZE = 256;
    const int TOTAL_REDUCED_BLOCKS = reduced_grid_size / BLOCK_SIZE;

    dim3 dimBlock(BLOCK_SIZE, 1, 1);
    dim3 dimReducedGrid(TOTAL_REDUCED_BLOCKS, 1, 1);

    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func<short, BLOCK_SIZE, 0>),
                    dim3(dimReducedGrid), dim3(dimBlock), 0, 0,
                    (short)1, (short*)cd);
    CUDA_SAFE_CALL(hipGetLastError());
    CUDA_SAFE_CALL(hipDeviceSynchronize());
}
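CUDA_SAFE_CALL wraps the HIP status checks above but its definition is not shown. A plausible definition (an assumption, not the original macro) is:

// Assumed definition of the CUDA_SAFE_CALL macro used above (not the original):
// checks a HIP status code and aborts with the failing call's location.
#define CUDA_SAFE_CALL(call)                                          \
    do {                                                              \
        hipError_t err = (call);                                      \
        if (err != hipSuccess) {                                      \
            fprintf(stderr, "HIP error %s at %s:%d\n",                \
                    hipGetErrorString(err), __FILE__, __LINE__);      \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)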
void runbench(double *cd, long size) {
    if (memory_ratio > UNROLL_ITERATIONS) {
        fprintf(stderr, "ERROR: memory_ratio exceeds UNROLL_ITERATIONS\n");
        exit(1);
    }

    const long compute_grid_size = size / (UNROLLED_MEMORY_ACCESSES) / 2;
    const int BLOCK_SIZE = 256;
    const int TOTAL_BLOCKS = compute_grid_size / BLOCK_SIZE;
    const long long computations = 2 * (long long)(COMP_ITERATIONS) * REGBLOCK_SIZE * compute_grid_size;
    const long long memoryoperations = (long long)(COMP_ITERATIONS) * compute_grid_size;

    dim3 dimBlock(BLOCK_SIZE, 1, 1);
    dim3 dimGrid(TOTAL_BLOCKS, 1, 1);

    hipEvent_t start, stop;

    initializeEvents(&start, &stop);
    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func<float, BLOCK_SIZE, memory_ratio>),
                    dim3(dimGrid), dim3(dimBlock), 0, 0, 1.0f, (float*)cd);
    float kernel_time_mad_sp = finalizeEvents(start, stop);

    initializeEvents(&start, &stop);
    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func<double, BLOCK_SIZE, memory_ratio>),
                    dim3(dimGrid), dim3(dimBlock), 0, 0, 1.0, cd);
    float kernel_time_mad_dp = finalizeEvents(start, stop);

    initializeEvents(&start, &stop);
    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func<int, BLOCK_SIZE, memory_ratio>),
                    dim3(dimGrid), dim3(dimBlock), 0, 0, 1, (int*)cd);
    float kernel_time_mad_int = finalizeEvents(start, stop);

    const double memaccesses_ratio  = (double)(memory_ratio) / UNROLL_ITERATIONS;
    const double computations_ratio = 1.0 - memaccesses_ratio;

    printf(" %4d, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f\n",
           UNROLL_ITERATIONS - memory_ratio,
           (computations_ratio * (double)computations) / (memaccesses_ratio * (double)memoryoperations * sizeof(float)),
           kernel_time_mad_sp,
           (computations_ratio * (double)computations) / kernel_time_mad_sp * 1000. / (double)(1000 * 1000 * 1000),
           (memaccesses_ratio * (double)memoryoperations * sizeof(float)) / kernel_time_mad_sp * 1000. / (1000. * 1000. * 1000.),
           (computations_ratio * (double)computations) / (memaccesses_ratio * (double)memoryoperations * sizeof(double)),
           kernel_time_mad_dp,
           (computations_ratio * (double)computations) / kernel_time_mad_dp * 1000. / (double)(1000 * 1000 * 1000),
           (memaccesses_ratio * (double)memoryoperations * sizeof(double)) / kernel_time_mad_dp * 1000. / (1000. * 1000. * 1000.),
           (computations_ratio * (double)computations) / (memaccesses_ratio * (double)memoryoperations * sizeof(int)),
           kernel_time_mad_int,
           (computations_ratio * (double)computations) / kernel_time_mad_int * 1000. / (double)(1000 * 1000 * 1000),
           (memaccesses_ratio * (double)memoryoperations * sizeof(int)) / kernel_time_mad_int * 1000. / (1000. * 1000. * 1000.));
}
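initializeEvents and finalizeEvents, used by both runbench variants, are also not shown; a plausible HIP implementation (assumed, not the original) brackets the launch with an event pair and returns elapsed milliseconds:

// Assumed sketch of the timing helpers used above (not the original definitions).
void initializeEvents(hipEvent_t *start, hipEvent_t *stop) {
    CUDA_SAFE_CALL(hipEventCreate(start));
    CUDA_SAFE_CALL(hipEventCreate(stop));
    CUDA_SAFE_CALL(hipEventRecord(*start, 0)); // mark the beginning of the timed region
}

float finalizeEvents(hipEvent_t start, hipEvent_t stop) {
    CUDA_SAFE_CALL(hipEventRecord(stop, 0));   // mark the end of the timed region
    CUDA_SAFE_CALL(hipEventSynchronize(stop));
    float ms;
    CUDA_SAFE_CALL(hipEventElapsedTime(&ms, start, stop));
    CUDA_SAFE_CALL(hipEventDestroy(start));
    CUDA_SAFE_CALL(hipEventDestroy(stop));
    return ms;                                 // elapsed time in milliseconds
}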
ErrorCode GpuDilate<InputPixelType, InputBandCount, OutputPixelType, OutputBandCount>::launchKernel(unsigned blockWidth, unsigned blockHeight)
{
    dim3 dimBlock(blockWidth, blockHeight);
    // Ceiling division so partial tiles at the right/bottom edges get a block.
    size_t gridWidth  = this->dataSize.width  / dimBlock.x + (((this->dataSize.width  % dimBlock.x) == 0) ? 0 : 1);
    size_t gridHeight = this->dataSize.height / dimBlock.y + (((this->dataSize.height % dimBlock.y) == 0) ? 0 : 1);
    dim3 dimGrid(gridWidth, gridHeight);

    // Bind the texture to the array and set up the access parameters
    cvt::gpu::bind_texture<InputPixelType, 0>(this->gpuInputDataArray);
    cudaError cuer = cudaGetLastError();
    if (cudaSuccess != cuer) {
        return CudaError; // needs to be changed
    }

    // ====================================================
    // Really launch, after one last error check!
    // ====================================================
    cuer = cudaGetLastError();
    if (cudaSuccess != cuer) {
        return CudaError; // needs to be changed
    }

    //TODO: Use this line when updating to use shared memory
    //const unsigned int shmem_bytes = neighbor_coordinates_.size() * sizeof(double) * blockWidth * blockHeight;

    cvt::gpu::launch_dilate<InputPixelType, OutputPixelType>(
        dimGrid, dimBlock, 0, this->stream,
        (OutputPixelType *)this->gpuOutputData,
        this->roiSize_.width, this->roiSize_.height,
        this->relativeOffsetsGpu_, this->relativeOffsets_.size(),
        this->bufferWidth_);

    cuer = cudaGetLastError();
    if (cuer != cudaSuccess) {
        std::cout << "CUDA ERROR = " << cuer << std::endl;
        throw std::runtime_error("KERNEL LAUNCH FAILURE");
    }
    return CudaError; // needs to be changed
}
void runbench(double *cd, long size) {
    const long compute_grid_size = size / ELEMENTS_PER_THREAD;
    const int BLOCK_SIZE = 256;
    const int TOTAL_BLOCKS = compute_grid_size / BLOCK_SIZE;
    const long long computations =
        ELEMENTS_PER_THREAD * (long long)compute_grid_size +
        (2 * ELEMENTS_PER_THREAD * compute_iterations) * (long long)compute_grid_size;
    const long long memoryoperations = size;

    dim3 dimBlock(BLOCK_SIZE, 1, 1);
    dim3 dimGrid(TOTAL_BLOCKS, 1, 1);

    hipEvent_t start, stop;

    initializeEvents(&start, &stop);
    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func<float, BLOCK_SIZE, ELEMENTS_PER_THREAD, compute_iterations>),
                    dim3(dimGrid), dim3(dimBlock), 0, 0, 1.0f, (float*)cd);
    float kernel_time_mad_sp = finalizeEvents(start, stop);

    initializeEvents(&start, &stop);
    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func<double, BLOCK_SIZE, ELEMENTS_PER_THREAD, compute_iterations>),
                    dim3(dimGrid), dim3(dimBlock), 0, 0, 1.0, cd);
    float kernel_time_mad_dp = finalizeEvents(start, stop);

    initializeEvents(&start, &stop);
    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func<int, BLOCK_SIZE, ELEMENTS_PER_THREAD, compute_iterations>),
                    dim3(dimGrid), dim3(dimBlock), 0, 0, 1, (int*)cd);
    float kernel_time_mad_int = finalizeEvents(start, stop);

    printf(" %4d, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f\n",
           compute_iterations,
           ((double)computations) / ((double)memoryoperations * sizeof(float)),
           kernel_time_mad_sp,
           ((double)computations) / kernel_time_mad_sp * 1000. / (double)(1000 * 1000 * 1000),
           ((double)memoryoperations * sizeof(float)) / kernel_time_mad_sp * 1000. / (1000. * 1000. * 1000.),
           ((double)computations) / ((double)memoryoperations * sizeof(double)),
           kernel_time_mad_dp,
           ((double)computations) / kernel_time_mad_dp * 1000. / (double)(1000 * 1000 * 1000),
           ((double)memoryoperations * sizeof(double)) / kernel_time_mad_dp * 1000. / (1000. * 1000. * 1000.),
           ((double)computations) / ((double)memoryoperations * sizeof(int)),
           kernel_time_mad_int,
           ((double)computations) / kernel_time_mad_int * 1000. / (double)(1000 * 1000 * 1000),
           ((double)memoryoperations * sizeof(int)) / kernel_time_mad_int * 1000. / (1000. * 1000. * 1000.));
}
int main() {
    const int size = 33;
    float* in  = new float[size];
    float* out = new float[size];
    for (int i = 0; i < size; i++) {
        in[i] = (float)(i + 1);
    }

    dim3 dimGrid (1, 1, 1);
    dim3 dimBlock(BLOCK_SIZE, 1, 1);

    // Cannot support CUDA's <<<x,y>>> syntax.
    schedule(code, in, out, size)
        .setBlockSize(dimBlock)
        .setGridSize(dimGrid)
        .run();

    for (int i = 0; i < size; i++) {
        printf("%0.2f %0.2f\n", in[i], out[i]);
    }

    delete[] in;
    delete[] out;
    return 0;
}
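For reference, on a toolchain that does support triple-chevron launches, the schedule(...) chain above corresponds roughly to the following (an assumed mapping; the kernel name `code` and its argument order are taken from the call above):

// Assumed native-CUDA equivalent of the schedule(...) chain above:
// code<<<dimGrid, dimBlock>>>(in, out, size); // launch with the same geometry
// cudaDeviceSynchronize();                    // wait before reading `out` on the host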
/* compute N time steps */
int calc_path(DATATYPE *gpuWall, DATATYPE *gpuResult[2], int rows, int cols,
              int pyramid_height, int blockCols, int borderCols)
{
    dim3 dimBlock(BLOCK_SIZE);
    dim3 dimGrid(blockCols);

    int size = rows * cols;
    int src = 1, dst = 0;

#ifdef NOC
    DATATYPE *memport = (DATATYPE*)malloc(sizeof(DATATYPE) * ((size - cols) + cols + cols));
    memcpy(memport, gpuWall, sizeof(DATATYPE) * (size - cols));
#endif

    for (int t = 0; t < rows - 1; t += pyramid_height) {
        // Ping-pong the source and destination result buffers each iteration.
        int temp = src;
        src = dst;
        dst = temp;
        dynproc_kernel(MIN(pyramid_height, rows - t - 1), gpuWall,
                       gpuResult[src], gpuResult[dst],
                       cols, rows, t, borderCols,
                       dimGrid, dimBlock, 1, 0);
    }
    return dst;
}
ErrorCode GpuAbsoluteDifference<InputPixelType, InputBandCount, OutputPixelType, OutputBandCount>::launchKernel(unsigned blockWidth, unsigned blockHeight)
{
    dim3 dimBlock(blockWidth, blockHeight);
    // Ceiling division so partial tiles at the right/bottom edges get a block.
    size_t gridWidth  = this->dataSize.width  / dimBlock.x + (((this->dataSize.width  % dimBlock.x) == 0) ? 0 : 1);
    size_t gridHeight = this->dataSize.height / dimBlock.y + (((this->dataSize.height % dimBlock.y) == 0) ? 0 : 1);
    dim3 dimGrid(gridWidth, gridHeight);

    // Bind the textures to the arrays and set up the access parameters
    bind_texture<InputPixelType, 0>(this->gpuInputDataArray);
    bind_texture<InputPixelType, 1>(this->gpuInputDataArrayTwo_);

    cvt::gpu::launch_absDifference<InputPixelType, OutputPixelType>(
        dimGrid, dimBlock, 0, this->stream,
        (OutputPixelType *)this->gpuOutputData,
        this->dataSize.width, this->dataSize.height);

    cudaError cuer = cudaGetLastError();
    if (cuer != cudaSuccess) {
        std::cout << "CUDA ERROR = " << cuer << std::endl;
        throw std::runtime_error("KERNEL LAUNCH FAILURE");
    }
    return CudaError; // needs to be changed
}
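Both launchKernel methods above compute the grid with the same explicit remainder test; a small ceiling-division helper (hypothetical, not part of the cvt::gpu library) expresses the idiom more compactly:

// Hypothetical helper capturing the grid-sizing idiom used by both launchKernel
// methods above: one block per tile, rounding up so edge pixels are covered.
static inline dim3 makeGrid(const dim3& block, size_t width, size_t height) {
    return dim3((width  + block.x - 1) / block.x,
                (height + block.y - 1) / block.y);
}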
void runTest(int argc, char** argv)
{
    int rows, cols, size_I, size_R, niter = 10, iter;
    double *I, *J, lambda, q0sqr, sum, sum2, tmp, meanROI, varROI;

#ifdef CPU
    double Jc, G2, L, num, den, qsqr;
    int *iN, *iS, *jE, *jW, k;
    double *dN, *dS, *dW, *dE;
    double cN, cS, cW, cE, D;
#endif

#ifdef GPU
    double *J_cuda;
    double *C_cuda;
    double *E_C, *W_C, *N_C, *S_C;
#endif

    unsigned int r1, r2, c1, c2;
    double *c;

    if (argc == 9) {
        rows = atoi(argv[1]);   // number of rows in the domain
        cols = atoi(argv[2]);   // number of cols in the domain
        if ((rows % 16 != 0) || (cols % 16 != 0)) {
            fprintf(stderr, "rows and cols must be multiples of 16\n");
            exit(1);
        }
        r1 = atoi(argv[3]);     // y1 position of the speckle
        r2 = atoi(argv[4]);     // y2 position of the speckle
        c1 = atoi(argv[5]);     // x1 position of the speckle
        c2 = atoi(argv[6]);     // x2 position of the speckle
        lambda = atof(argv[7]); // lambda value
        niter = atoi(argv[8]);  // number of iterations
    } else {
        usage(argc, argv);
    }

    size_I = cols * rows;
    size_R = (r2 - r1 + 1) * (c2 - c1 + 1);

    I = (double *)malloc(size_I * sizeof(double));
    J = (double *)malloc(size_I * sizeof(double));
    c = (double *)malloc(sizeof(double) * size_I);

#ifdef CPU
    // The original allocated these with sizeof(unsigned int*); the arrays hold
    // ints, so sizeof(int) is the correct element size.
    iN = (int *)malloc(sizeof(int) * rows);
    iS = (int *)malloc(sizeof(int) * rows);
    jW = (int *)malloc(sizeof(int) * cols);
    jE = (int *)malloc(sizeof(int) * cols);

    dN = (double *)malloc(sizeof(double) * size_I);
    dS = (double *)malloc(sizeof(double) * size_I);
    dW = (double *)malloc(sizeof(double) * size_I);
    dE = (double *)malloc(sizeof(double) * size_I);

    for (int i = 0; i < rows; i++) {
        iN[i] = i - 1;
        iS[i] = i + 1;
    }
    for (int j = 0; j < cols; j++) {
        jW[j] = j - 1;
        jE[j] = j + 1;
    }
    iN[0] = 0;
    iS[rows - 1] = rows - 1;
    jW[0] = 0;
    jE[cols - 1] = cols - 1;
#endif

#ifdef GPU
    printf("size_I = %d\n", size_I);
    // Allocate "device" memory; this port stages the data in host buffers,
    // and the original cudaMalloc calls are kept for reference.
    //cudaMalloc((void**)& J_cuda, sizeof(double)* size_I);
    J_cuda = (double*)malloc(sizeof(double) * size_I);
    //cudaMalloc((void**)& C_cuda, sizeof(double)* size_I);
    C_cuda = (double*)malloc(sizeof(double) * size_I);
    //cudaMalloc((void**)& E_C, sizeof(double)* size_I);
    E_C = (double*)malloc(sizeof(double) * size_I);
    //cudaMalloc((void**)& W_C, sizeof(double)* size_I);
    W_C = (double*)malloc(sizeof(double) * size_I);
    //cudaMalloc((void**)& S_C, sizeof(double)* size_I);
    S_C = (double*)malloc(sizeof(double) * size_I);
    //cudaMalloc((void**)& N_C, sizeof(double)* size_I);
    N_C = (double*)malloc(sizeof(double) * size_I);
#endif

    printf("Randomizing the input matrix\n");
    // Generate a random matrix
    random_matrix(I, rows, cols);

    for (int k = 0; k < size_I; k++) {
        J[k] = exp(I[k]);
    }

    printf("Start the SRAD main loop\n");
    for (iter = 0; iter < niter; iter++) {
        sum = 0;
        sum2 = 0;
        for (int i = r1; i <= r2; i++) {
            for (int j = c1; j <= c2; j++) {
                tmp = J[i * cols + j];
                sum += tmp;
                sum2 += tmp * tmp;
            }
        }
        meanROI = sum / size_R;
        varROI = (sum2 / size_R) - meanROI * meanROI;
        q0sqr = varROI / (meanROI * meanROI);

#ifdef CPU
        for (int i = 0; i < rows; i++) {
            for (int j = 0; j < cols; j++) {
                k = i * cols + j;
                Jc = J[k];

                // directional derivatives
                dN[k] = J[iN[i] * cols + j] - Jc;
                dS[k] = J[iS[i] * cols + j] - Jc;
                dW[k] = J[i * cols + jW[j]] - Jc;
                dE[k] = J[i * cols + jE[j]] - Jc;

                G2 = (dN[k]*dN[k] + dS[k]*dS[k] + dW[k]*dW[k] + dE[k]*dE[k]) / (Jc*Jc);
                L = (dN[k] + dS[k] + dW[k] + dE[k]) / Jc;

                num = (0.5*G2) - ((1.0/16.0)*(L*L));
                den = 1.0 + (.25*L);
                qsqr = num / (den*den);

                // diffusion coefficient (equ 33)
                den = (qsqr - q0sqr) / (q0sqr * (1.0 + q0sqr));
                c[k] = 1.0 / (1.0 + den);

                // saturate diffusion coefficient
                if (c[k] < 0) { c[k] = 0; }
                else if (c[k] > 1) { c[k] = 1; }
            }
        }

        for (int i = 0; i < rows; i++) {
            for (int j = 0; j < cols; j++) {
                // current index
                k = i * cols + j;

                // diffusion coefficient
                cN = c[k];
                cS = c[iS[i] * cols + j];
                cW = c[k];
                cE = c[i * cols + jE[j]];

                // divergence (equ 58)
                D = cN*dN[k] + cS*dS[k] + cW*dW[k] + cE*dE[k];

                // image update (equ 61)
                J[k] = J[k] + 0.25*lambda*D;
            }
        }
#endif // CPU

#ifdef GPU
        // Currently the input size must be divisible by 16 - the block size
        int block_x = cols / BLOCK_SIZE;
        int block_y = rows / BLOCK_SIZE;

        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 dimGrid(block_x, block_y);

        // Copy data from main memory to device memory
        //cudaMemcpy(J_cuda, J, sizeof(double) * size_I, cudaMemcpyHostToDevice);
        memcpy(J_cuda, J, sizeof(double) * size_I);

        // Run kernels
        //srad_cuda_1<<<dimGrid, dimBlock>>>(E_C, W_C, N_C, S_C, J_cuda, C_cuda, cols, rows, q0sqr);
        srad_cuda_1(E_C, W_C, N_C, S_C, J_cuda, C_cuda, cols, rows, q0sqr,
                    dimGrid, dimBlock, 1, 0);
        //srad_cuda_2<<<dimGrid, dimBlock>>>(E_C, W_C, N_C, S_C, J_cuda, C_cuda, cols, rows, lambda, q0sqr);
        srad_cuda_2(E_C, W_C, N_C, S_C, J_cuda, C_cuda, cols, rows, lambda, q0sqr,
                    dimGrid, dimBlock, 1, 0);

        // Copy data from device memory to main memory
        //cudaMemcpy(J, J_cuda, sizeof(double) * size_I, cudaMemcpyDeviceToHost);
        memcpy(J, J_cuda, sizeof(double) * size_I);
#endif
    }

    //cudaThreadSynchronize();

#define OUTPUT
#ifdef OUTPUT
    // Compare the result against the gold output
    printf("Printing Output:\n");
    int passed = 1;
    FILE *gp = fopen("cuda/gold_output.txt", "r");
    if (gp == NULL) {
        printf("Cannot open file.\n");
    }
    double gold_J_val;
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            fscanf(gp, "%lf", &gold_J_val);
            //printf("%.8f ", J[i * cols + j]);
            if (fabs(gold_J_val - J[i * cols + j]) > EPSILON) {
                printf("Mismatch at %d: gold = %f, calc = %f.\n",
                       i * cols + j, gold_J_val, J[i * cols + j]);
                passed = 0;
                break;
            }
        }
        if (passed == 0) break;
        //printf("\n");
    }
    fclose(gp);
    if (passed == 1) printf("PASSED.\n");
    else printf("FAILED.\n");
#endif

    printf("Computation Done\n");

    free(I);
    free(J);
#ifdef CPU
    free(iN); free(iS); free(jW); free(jE);
    free(dN); free(dS); free(dW); free(dE);
#endif
#ifdef GPU
    /*cudaFree(C_cuda);
    cudaFree(J_cuda);
    cudaFree(E_C);
    cudaFree(W_C);
    cudaFree(N_C);
    cudaFree(S_C);*/
    free(C_cuda); free(J_cuda);
    free(E_C); free(W_C); free(N_C); free(S_C);
#endif
    free(c);
}
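random_matrix is called above but not defined in this excerpt; a minimal sketch (an assumption, not the original helper) fills the image with uniform values and fixes the seed so the gold-file comparison stays deterministic:

// Assumed sketch of the random_matrix() helper used above (not the original):
// fills the input image with uniform values in [0, 1).
void random_matrix(double *I, int rows, int cols) {
    srand(7); // fixed seed so runs are comparable against the gold output
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            I[i * cols + j] = rand() / (double)RAND_MAX;
}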
QList<resType> calculateOnGPU(const char *seqLib, int seqLibLength, ScoreType *queryProfile,
                              ScoreType qProfLen, int queryLength, ScoreType gapOpen,
                              ScoreType gapExtension, ScoreType maxScore,
                              U2::SmithWatermanSettings::SWResultView resultView)
{
    //TODO: calculate maximum alignment length
    const int overlapLength = calcOverlap(queryLength);
    int partsNumber = calcPartsNumber(seqLibLength, overlapLength);

    int queryDevider = 1;
    if (queryLength > sw_cuda_cpp::MAX_SHARED_VECTOR_LENGTH) {
        queryDevider = (queryLength + sw_cuda_cpp::MAX_SHARED_VECTOR_LENGTH - 1) / sw_cuda_cpp::MAX_SHARED_VECTOR_LENGTH;
    }
    int partQuerySize = (queryLength + queryDevider - 1) / queryDevider;
    int partSeqSize = calcPartSeqSize(seqLibLength, overlapLength, partsNumber);
    int sizeRow = calcSizeRow(seqLibLength, overlapLength, partsNumber, partSeqSize);

    u2log.details(QString("partsNumber: %1 queryDevider: %2").arg(partsNumber).arg(queryDevider));
    u2log.details(QString("seqLen: %1 partSeqSize: %2 overlapSize: %3").arg(seqLibLength).arg(partSeqSize).arg(overlapLength));
    u2log.details(QString("queryLen %1 partQuerySize: %2").arg(queryLength).arg(partQuerySize));

    //************************** declare some temp variables on host
    ScoreType *tempRow = new ScoreType[sizeRow];
    ScoreType *zerroArr = new ScoreType[sizeRow];
    for (int i = 0; i < sizeRow; i++) {
        zerroArr[i] = 0;
    }
    ScoreType *directionRow = new ScoreType[sizeRow];

    size_t directionMatrixSize = 0;
    size_t backtraceBeginsSize = 0;
    int *globalMatrix = NULL;
    int *backtraceBegins = NULL;
    if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
        directionMatrixSize = seqLibLength * queryLength * sizeof(int);
        backtraceBeginsSize = 2 * sizeRow * sizeof(int);
        globalMatrix = new int[directionMatrixSize / sizeof(int)];
        backtraceBegins = new int[backtraceBeginsSize / sizeof(int)];
        memset(globalMatrix, 0, directionMatrixSize);
        memset(backtraceBegins, 0, backtraceBeginsSize);
    }

    //************************** sizes of arrays
    size_t sizeQ = sizeRow * sizeof(ScoreType);
    size_t sizeQQ = sizeRow * sizeof(ScoreType);
    size_t sizeP = qProfLen * sizeof(ScoreType);
    size_t sizeL = seqLibLength * sizeof(char);

    //************************** declare arrays on device
    char *g_seqLib;
    ScoreType *g_queryProfile;
    ScoreType *g_HdataMax;
    ScoreType *g_HdataUp;
    ScoreType *g_HdataRec;
    ScoreType *g_HdataTmp;
    ScoreType *g_FdataUp;
    ScoreType *g_directionsUp;
    ScoreType *g_directionsMax;
    ScoreType *g_directionsRec;
    int *g_directionsMatrix = NULL;
    int *g_backtraceBegins = NULL;

    //************************** allocate global memory on device
    cudaMalloc((void **)&g_seqLib, sizeL);
    cudaMalloc((void **)&g_queryProfile, sizeP);
    cudaMalloc((void **)&g_HdataMax, sizeQ);
    cudaMalloc((void **)&g_HdataUp, sizeQ);
    cudaMalloc((void **)&g_FdataUp, sizeQ);
    cudaMalloc((void **)&g_directionsUp, sizeQ);
    cudaMalloc((void **)&g_directionsMax, sizeQ);
    cudaMalloc((void **)&g_HdataRec, sizeQ);
    cudaMalloc((void **)&g_directionsRec, sizeQ);
    if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
        cudaError errorMatrix = cudaMalloc(reinterpret_cast<void **>(&g_directionsMatrix), directionMatrixSize);
        cudaError errorBacktrace = cudaMalloc(reinterpret_cast<void **>(&g_backtraceBegins), backtraceBeginsSize);
    }
    u2log.details(QString("GLOBAL MEMORY USED %1 KB").arg((sizeL + sizeP + sizeQ * 7 + directionMatrixSize + backtraceBeginsSize) / 1024));

    //************************** copy from host to device
    cudaMemcpy(g_seqLib, seqLib, sizeL, cudaMemcpyHostToDevice);
    cudaMemcpy(g_queryProfile, queryProfile, sizeP, cudaMemcpyHostToDevice);
    cudaMemcpy(g_HdataMax, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_HdataUp, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_FdataUp, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_directionsUp, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_directionsMax, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_directionsRec, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_HdataRec, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
        cudaMemcpy(g_directionsMatrix, globalMatrix, directionMatrixSize, cudaMemcpyHostToDevice);
        cudaMemcpy(g_backtraceBegins, backtraceBegins, backtraceBeginsSize, cudaMemcpyHostToDevice);
    }

    //************************** start calculation
    int BLOCK_SIZE = partsNumber;
    dim3 dimBlock(BLOCK_SIZE);
    dim3 dimGrid(partQuerySize);

    // move constant parameters to CUDA constant memory
    setConstants(partSeqSize, partsNumber, overlapLength, seqLibLength, queryLength,
                 gapOpen, gapExtension, maxScore, partQuerySize,
                 U2::SmithWatermanAlgorithm::UP, U2::SmithWatermanAlgorithm::LEFT,
                 U2::SmithWatermanAlgorithm::DIAG, U2::SmithWatermanAlgorithm::STOP);

    size_t sh_mem_size = sizeof(ScoreType) * (dimGrid.x + 1) * 3;
    u2log.details(QString("SHARED MEM SIZE USED: %1 B").arg(sh_mem_size));

    // start main loop
    for (int i = 0; i < queryDevider; i++) {
        calculateMatrix_wrap(dimBlock.x, dimGrid.x, g_seqLib, g_queryProfile,
                             g_HdataUp, g_HdataRec, g_HdataMax, g_FdataUp,
                             g_directionsUp, g_directionsRec, g_directionsMax,
                             i * partQuerySize, g_directionsMatrix, g_backtraceBegins);
        cudaError hasErrors = cudaThreadSynchronize();
        if (hasErrors != 0) {
            u2log.trace(QString("CUDA ERROR HAPPEN, errorId: ") + QString::number(hasErrors));
        }

        // swap the "up" and "recurrence" rows for the next pass
        g_HdataTmp = g_HdataRec;
        g_HdataRec = g_HdataUp;
        g_HdataUp = g_HdataTmp;

        g_HdataTmp = g_directionsRec;
        g_directionsRec = g_directionsUp;
        g_directionsUp = g_HdataTmp;
    }

    // Copy vectors to the host and find the actual results
    cudaMemcpy(tempRow, g_HdataMax, sizeQQ, cudaMemcpyDeviceToHost);
    cudaMemcpy(directionRow, g_directionsMax, sizeQQ, cudaMemcpyDeviceToHost);
    if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
        cudaMemcpy(globalMatrix, g_directionsMatrix, directionMatrixSize, cudaMemcpyDeviceToHost);
        cudaMemcpy(backtraceBegins, g_backtraceBegins, backtraceBeginsSize, cudaMemcpyDeviceToHost);
    }

    QList<resType> pas;
    resType res;
    for (int j = 0; j < sizeRow; j++) {
        if (tempRow[j] >= maxScore) {
            res.refSubseq.startPos = directionRow[j];
            res.refSubseq.length = j - res.refSubseq.startPos + 1
                                 - j / (partSeqSize + 1) * overlapLength
                                 - j / (partSeqSize + 1);
            res.score = tempRow[j];
            if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
                qint32 pairAlignOffset = 0;
                qint32 row = backtraceBegins[2 * j];
                qint32 column = backtraceBegins[2 * j + 1];
                while (U2::SmithWatermanAlgorithm::STOP != globalMatrix[seqLibLength * row + column]) {
                    if (U2::SmithWatermanAlgorithm::DIAG == globalMatrix[seqLibLength * row + column]) {
                        res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::DIAG;
                        row--;
                        column--;
                    } else if (U2::SmithWatermanAlgorithm::LEFT == globalMatrix[seqLibLength * row + column]) {
                        res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::UP;
                        column--;
                    } else if (U2::SmithWatermanAlgorithm::UP == globalMatrix[seqLibLength * row + column]) {
                        res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::LEFT;
                        row--;
                    }
                    if (0 >= row || 0 >= column) {
                        break;
                    }
                }
                res.patternSubseq.startPos = row;
                res.patternSubseq.length = backtraceBegins[2 * j] - row + 1;
            }
            pas.append(res);
        }
    }

    // free device and host memory
    cudaFree(g_seqLib);
    cudaFree(g_queryProfile);
    cudaFree(g_HdataMax);
    cudaFree(g_HdataUp);
    cudaFree(g_HdataRec);
    cudaFree(g_FdataUp);
    cudaFree(g_directionsUp);
    cudaFree(g_directionsMax);
    cudaFree(g_directionsRec);
    if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
        cudaFree(g_directionsMatrix);
        cudaFree(g_backtraceBegins);
    }
    delete[] tempRow;
    delete[] directionRow;
    delete[] zerroArr;
    delete[] globalMatrix;
    delete[] backtraceBegins;

    return pas;
}