/*
 * Host-side driver that runs d_sobel1 on the device and copies the results back.
 *
 * h_result : host buffer of at least xsize * ysize * 3 ints; it is uploaded
 *            before the launch and overwritten with the device contents after.
 * h_pic    : host buffer of at least xsize * ysize unsigned ints; uploaded
 *            before the launch and copied back afterwards.
 * xsize, ysize, thresh : forwarded unchanged to d_sobel1.
 *
 * Exits the process with status -1 if a device allocation or a copy fails.
 */
void sobel1(int *h_result, unsigned int *h_pic, int xsize, int ysize, int thresh)
{
	// Start from NULL so failure paths can call cudaFree unconditionally and so
	// the NULL checks below never read an indeterminate pointer.
	int *d_result = NULL;
	unsigned int *d_pic = NULL;

	// Byte counts are computed in size_t so large images do not overflow int.
	size_t numPixels = (size_t)xsize * (size_t)ysize;
	size_t resultSize = numPixels * 3 * sizeof(int);
	size_t picSize = numPixels * sizeof(unsigned int);

	// Device allocations: check the API status as well as the pointer.
	if (cudaMalloc((void**)&d_result, resultSize) != cudaSuccess || !d_result) {
		exit(-1);
	}
	if (cudaMalloc((void**)&d_pic, picSize) != cudaSuccess || !d_pic) {
		cudaFree(d_result);
		exit(-1);
	}

	// Upload both buffers.
	cudaError_t err = cudaMemcpy(d_result, h_result, resultSize, cudaMemcpyHostToDevice);
	if (err == cudaSuccess) {
		err = cudaMemcpy(d_pic, h_pic, picSize, cudaMemcpyHostToDevice);
	}
	if (err != cudaSuccess) {
		cudaFree(d_result);
		cudaFree(d_pic);
		exit(-1);
	}

	// BLOCKSIZE x BLOCKSIZE threads per block; the grid is the integer ceiling
	// of ysize (x dimension) and xsize (y dimension) over the block edge.
	dim3 threadsPerBlock(BLOCKSIZE, BLOCKSIZE);
	dim3 numBlocks((ysize + threadsPerBlock.x - 1) / threadsPerBlock.x,
	               (xsize + threadsPerBlock.y - 1) / threadsPerBlock.y);

	// Timing events. They must be recorded around the launch; synchronizing on
	// or measuring between events that were never recorded is an error.
	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	cudaEventRecord(start, 0);
{	__set_CUDAConfig(numBlocks, threadsPerBlock ); 
          
	d_sobel1 (d_result, d_pic, xsize, ysize, thresh);}
	cudaEventRecord(stop, 0);

	// Wait for the stop event, then read the elapsed time in milliseconds.
	cudaEventSynchronize(stop);
	float elapsedTime = 0.0f;
	cudaEventElapsedTime(&elapsedTime, start, stop);
	(void)elapsedTime;  // measured but not reported by this function
	cudaEventDestroy(start);
	cudaEventDestroy(stop);

	// Download both buffers.
	err = cudaMemcpy(h_result, d_result, resultSize, cudaMemcpyDeviceToHost);
	if (err == cudaSuccess) {
		err = cudaMemcpy(h_pic, d_pic, picSize, cudaMemcpyDeviceToHost);
	}

	// Release device memory on every path before reporting a copy failure.
	cudaFree(d_result);
	cudaFree(d_pic);

	if (err != cudaSuccess) {
		exit(-1);
	}
}
// Example #2
// 0
// File: race_01.cpp  Project: soarlab/Gklee
/*
 * Fills two device arrays of SIZE unsigned ints from a host staging buffer,
 * runs device_global on them with BLOCKS blocks of SIZE/BLOCKS threads, then
 * copies the first array back and prints it in rows of SIZE/BLOCKS values.
 *
 * Returns 1, 2 or 3 if the host buffer, the first device buffer or the second
 * device buffer could not be allocated; returns 0 otherwise.
 */
int main(int argc, char* argv[]) {
  // All transfers move SIZE elements, so the byte count must include the
  // element size (passing SIZE alone copies only a fraction of the array).
  const size_t numBytes = SIZE * sizeof(unsigned int);

  unsigned int *host_array = (unsigned int*) malloc(numBytes);
  unsigned int *device_array_a = 0;
  cudaMalloc((void **) &device_array_a, numBytes);
  unsigned int *device_array_b = 0;
  cudaMalloc((void **) &device_array_b, numBytes);

  // Pick the exit status for the first failed allocation, then release
  // whatever did get allocated before leaving (free/cudaFree accept NULL).
  int status = 0;
  if (host_array == 0) { status = 1; }
  else if (device_array_a == 0) { status = 2; }
  else if (device_array_b == 0) { status = 3; }
  if (status != 0) {
    free(host_array);
    cudaFree(device_array_a);
    cudaFree(device_array_b);
    return status;
  }

  // Array A: even indices hold their own index, odd indices hold 0.
  for (int i=0; i<SIZE; i++) {
    if (i%2 == 0) {
      host_array[i] = i;
    } else {
      host_array[i] = 0;
    }
  }
  cudaMemcpy(device_array_a, host_array, numBytes, cudaMemcpyHostToDevice);

  // Array B: indices divisible by both 2 and 3 hold their own index, the
  // rest hold 0. The same host buffer is reused as the staging area.
  for (int i=0; i<SIZE; i++) {
    if (i%2 == 0 && i%3 == 0) {
      host_array[i] = i;
    } else {
      host_array[i] = 0;
    }
  }
  cudaMemcpy(device_array_b, host_array, numBytes, cudaMemcpyHostToDevice);
{ __set_CUDAConfig(BLOCKS, (SIZE/BLOCKS)); 
          
 device_global(device_array_a, device_array_b, SIZE);}
          
  cudaMemcpy(host_array, device_array_a, numBytes, cudaMemcpyDeviceToHost);

  // Print one row per block. The extra guards keep the loops in bounds when
  // SIZE is not a multiple of BLOCKS and stop a zero row width from looping
  // forever.
  const int rowWidth = SIZE/BLOCKS;
  for (int i=0; rowWidth > 0 && i<SIZE; i += rowWidth) {
    for (int j=0; j<rowWidth && i+j<SIZE; j++){
      printf("%u, ", host_array[i+j]);
    }
    printf("\n");
  }

  // Release host and device buffers.
  free(host_array);
  cudaFree(device_array_a);
  cudaFree(device_array_b);
  return 0;
}
/*
 * Uploads N zero-initialised ints, runs iwarp on them with one block of N
 * threads, copies the buffer back and prints the N values on one line.
 *
 * Returns 0 on success and 1 if the device allocation or a copy fails.
 */
int main()
{
  // Device buffer; initialised to NULL so cudaFree is safe on every path.
  int* din = NULL;
  if (cudaMalloc((void**)&din, N*sizeof(int)) != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed\n");
    return 1;
  }

  // Host input: all zeros.
  int in[N];
  for(int i = 0; i < N; i++)
    in[i] = 0;
  if (cudaMemcpy(din, in, N*sizeof(int), cudaMemcpyHostToDevice) != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy to device failed\n");
    cudaFree(din);
    return 1;
  }
{ __set_CUDAConfig(1, N); 
          
 iwarp(din);}
          
  // Copy the results back; do not print the buffer if the copy failed, since
  // it would still be uninitialised.
  int output[N];
  if (cudaMemcpy(output, din, N*sizeof(int), cudaMemcpyDeviceToHost) != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy to host failed\n");
    cudaFree(din);
    return 1;
  }
  for(int i = 0; i < N; i++)
	printf("%d ", output[i]);
  printf("\n");

  // Release the device buffer (the original never freed it).
  cudaFree(din);
  return 0;
}
/*
 * Driver for matrixMulCUDA on integer matrices.
 *
 * Builds a 32x32 matrix A and a 64x32 matrix B on the host (both filled via
 * constantInit with the value 1), uploads them, runs matrixMulCUDA once as a
 * warm-up and then 300 times between two CUDA events to measure the average
 * time per launch, prints the timing figures, copies C back and checks every
 * element against a reference value.
 *
 * Returns EXIT_SUCCESS when every element passes the check and EXIT_FAILURE
 * otherwise; any allocation, copy or event error terminates the process with
 * EXIT_FAILURE.
 */
int main(int argc, char **argv)
{
    int block_size = 32;

    dim3 dimsA(1*1*block_size, 1*1*block_size, 1);
    dim3 dimsB(1*2*block_size, 1*1*block_size, 1);

    // Host buffers for A and B. Both are checked before use, the same way
    // h_C is checked below.
    unsigned int size_A = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(int) * size_A;
    int *h_A = (int *)malloc(mem_size_A);

    if (h_A == NULL)
    {
        fprintf(stderr, "Failed to allocate host matrix A!\n");
        exit(EXIT_FAILURE);
    }

    unsigned int size_B = dimsB.x * dimsB.y;
    unsigned int mem_size_B = sizeof(int) * size_B;
    int *h_B = (int *)malloc(mem_size_B);

    if (h_B == NULL)
    {
        fprintf(stderr, "Failed to allocate host matrix B!\n");
        exit(EXIT_FAILURE);
    }

    // NOTE(review): B is initialised with 1 below, but the correctness check
    // at the end uses valB (0.01f) as B's element value. That looks
    // inconsistent and would make the check fail for a correct product;
    // confirm against constantInit and matrixMulCUDA before changing it.
    const float valB = 0.01f;

    // Fill both host matrices with the constant 1.
    constantInit(h_A, size_A, 1);
    constantInit(h_B, size_B, 1);

    // Device pointers.
    int *d_A, *d_B, *d_C;

    // Host buffer for the result C (dimsB.x columns by dimsA.y rows).
    dim3 dimsC(dimsB.x, dimsA.y, 1);
    unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(int);
    int *h_C = (int *) malloc(mem_size_C);

    if (h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host matrix C!\n");
        exit(EXIT_FAILURE);
    }

    cudaError_t error;

    // Device allocations.
    error = cudaMalloc((void **) &d_A, mem_size_A);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_A returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **) &d_B, mem_size_B);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_B returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **) &d_C, mem_size_C);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_C returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // Upload A and B.
    error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_A,h_A) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_B,h_B) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // Launch configuration: block_size x block_size threads, one block per
    // block_size x block_size tile of C.
    dim3 threads(block_size, block_size);
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

    printf("Computing result using CUDA Kernel...\n");

    // Warm-up launch; the template argument must match block_size.
    if (block_size == 16)
    {
{ __set_CUDAConfig(grid, threads ); 
          
 matrixMulCUDA<16>(d_C, d_A, d_B, dimsA.x, dimsB.x);}
          
    }
    else
    {
{ __set_CUDAConfig(grid, threads ); 
          
 matrixMulCUDA<32>(d_C, d_A, d_B, dimsA.x, dimsB.x);}
          
    }

    printf("done\n");

    // Wait for the warm-up launch and surface any error it produced instead
    // of silently discarding the status.
    error = cudaDeviceSynchronize();

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to synchronize after warm-up launch (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Events used to time the benchmark loop.
    cudaEvent_t start;
    error = cudaEventCreate(&start);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    cudaEvent_t stop;
    error = cudaEventCreate(&stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Record the start event.
    error = cudaEventRecord(start, NULL);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Benchmark loop: nIter back-to-back launches.
    int nIter = 300;

    for (int j = 0; j < nIter; j++)
    {
        if (block_size == 16)
        {
{ __set_CUDAConfig(grid, threads ); 
          
 matrixMulCUDA<16>(d_C, d_A, d_B, dimsA.x, dimsB.x);}
          
        }
        else
        {
{ __set_CUDAConfig(grid, threads ); 
          
 matrixMulCUDA<32>(d_C, d_A, d_B, dimsA.x, dimsB.x);}
          
        }
    }

    // Record the stop event.
    error = cudaEventRecord(stop, NULL);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Wait until the stop event has completed.
    error = cudaEventSynchronize(stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    float msecTotal = 0.0f;
    error = cudaEventElapsedTime(&msecTotal, start, stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // The events are no longer needed once the elapsed time has been read.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // Derive and print the per-launch time and throughput figures.
    float msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x;
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf(
        "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n",
        gigaFlops,
        msecPerMatrixMul,
        flopsPerMatrixMul,
        threads.x * threads.y);

    // Download the result.
    error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (h_C,d_C) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    printf("Checking computed result for correctness: ");
    bool correct = true;

    // Relative-error test per element: |value - ref| / |value| / dimsA.x
    // must not exceed eps, where ref = dimsA.x * valB.
    double eps = 1.e-6 ;  
    for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++)
    {
        double abs_err = fabs(h_C[i] - (dimsA.x * valB));
        double dot_length = dimsA.x;
        double abs_val = fabs(h_C[i]);
        double rel_err = abs_err/abs_val/dot_length ;
        if (rel_err > eps)
        {
            // h_C holds ints; convert explicitly because %.8f expects a double.
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, (double)h_C[i], dimsA.x*valB, eps);
            correct = false;
        }
    }

    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    // Release host and device memory.
    free(h_A);
    free(h_B);
    free(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    printf("\nNote: For peak performance, please refer to the matrixMulCUBLAS example.\n");

    cudaDeviceReset();

    if (correct)
    {
        return EXIT_SUCCESS;
    }
    else
    {
        return EXIT_FAILURE;
    }
}