Ejemplo n.º 1
0
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[]){
	float *h_Data, 
	      *h_Kernel,
	      *h_ResultCPU, 
	      *h_ResultGPU;

	float *d_Data,
	      *d_Kernel;

	double delta, ref, sum_delta2, sum_ref2, L2norm, gpuTime;

	unsigned int hTimer;
	int i;

	//    shrQAStart(argc, argv);

	// use command-line specified CUDA device, otherwise use device with highest Gflops/s
	/*  if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
	    cutilDeviceInit(argc, argv);
	    else
	    cudaSetDevice( cutGetMaxGflopsDeviceId() );

	    cutilCheckError( cutCreateTimer(&hTimer) );
	    */
	//	printf("%d",sizeof(float));
	printf(" Initializing data...\n");
	printf("...allocating CPU memory\n");
	//      cutilSafeMalloc( h_Kernel    = (float *)malloc(KERNEL_SIZE) );
	//      cutilSafeMalloc( h_Data      = (float *)malloc(DATA_SIZE)   );
	//     cutilSafeMalloc( h_ResultCPU = (float *)malloc(DATA_SIZE));
	//     cutilSafeMalloc( h_ResultGPU = (float *)malloc(DATA_SIZE)   );
	h_Kernel    = (float *)malloc(KERNEL_SIZE);
	h_Data      = (float *)malloc(DATA_SIZE) ;
	h_ResultCPU = (float *)malloc(DATA_SIZE);
	h_ResultGPU = (float *)malloc(DATA_SIZE);
	printf("...allocating GPU memory\n");
	// cutilSafeCall( cudaMalloc((void **)&d_Kernel, DATA_SIZE) );
	// cutilSafeCall( cudaMalloc((void **)&d_Data,   DATA_SIZE) );

	printf("...generating data\n");
	printf("Data length: %i; kernel length: %i\n", dataN, kernelN);
	srand(2007);
	for (i = 0; i < kernelN; i++)
		h_Kernel[i] = (float)rand() / (float)RAND_MAX;
	printf("\n test");
	printf("\n test");   
	for (i = 0; i < dataN; i++)
		h_Data[i] = (float)rand() / (float)RAND_MAX;
	//  cutilSafeCall( memset(d_Kernel, 0, DATA_SIZE) );
	// cutilSafeCall( memcpy(d_Kernel, h_Kernel, KERNEL_SIZE, cudaMemcpyHostToDevice) );
	// cutilSafeCall( memcpy(d_Data,   h_Data,     DATA_SIZE, cudaMemcpyHostToDevice) );

	//  printf("Running GPU dyadic convolution using Fast Walsh Transform...\n");
	// cutilSafeCall( cutilDeviceSynchronize() );
	// cutilCheckError( cutResetTimer(hTimer) );
	// cutilCheckError( cutStartTimer(hTimer) );
	//     fwtBatchGPU(d_Data, 1, log2Data);
	//     fwtBatchGPU(d_Kernel, 1, log2Data);
	//     modulateGPU(d_Data, d_Kernel, dataN);
	//     fwtBatchGPU(d_Data, 1, log2Data);
	//  cutilSafeCall( cutilDeviceSynchronize() );
	//  cutilCheckError( cutStopTimer(hTimer) );
	//  gpuTime = cutGetTimerValue(hTimer);
	//  printf("GPU time: %f ms; GOP/s: %f\n", gpuTime, NOPS / (gpuTime * 0.001 * 1E+9));

	//  printf("Reading back GPU results...\n");
	// cutilSafeCall( cudaMemcpy(h_ResultGPU, d_Data, DATA_SIZE, cudaMemcpyDeviceToHost) );
	printf("Running on GPU...\n");
	dyadicConvolutionGPU(h_ResultGPU, h_Data, h_Kernel, log2Data, log2Kernel);

	printf("Running straightforward CPU dyadic convolution...\n");
	dyadicConvolutionCPU(h_ResultCPU, h_Data, h_Kernel, log2Data, log2Kernel);

	printf("Comparing the results...\n");
	sum_delta2 = 0;
	sum_ref2   = 0;
	for(i = 0; i < dataN; i++){
		delta       = h_ResultCPU[i] - h_ResultGPU[i];
		ref         = h_ResultCPU[i];
		sum_delta2 += delta * delta;
		sum_ref2   += ref * ref;
	}
	L2norm = sqrt(sum_delta2 / sum_ref2);

	printf("Shutting down...\n");
	//   cutilCheckError(  cutDeleteTimer(hTimer) );
	//  cutilSafeCall( cudaFree(d_Data)   );
	//  cutilSafeCall( cudaFree(d_Kernel) );
	//      free(h_ResultGPU);
	free(h_ResultGPU);
	free(h_ResultCPU);
	free(h_Data);
	free(h_Kernel);

	// cutilDeviceReset();
	printf("L2 norm: %E\n", L2norm);
	// shrQAFinishExit(argc, (const char **)argv, (L2norm < 1e-6) ? QA_PASSED : QA_FAILED);
}
Ejemplo n.º 2
0
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
/*
 * Test driver for dyadic convolution via the Fast Walsh Transform.
 *
 * Allocates host buffers, fills a kernel (2^log2Kernel floats) and a data
 * array (2^log2Data floats) with pseudo-random values using a fixed seed,
 * and then, depending on build flags:
 *   HW      - runs fwtBatchGPU()/modulateGPU() on d_Data/d_Kernel through
 *             the XFcuda core;
 *   SW      - runs dyadicConvolutionCPU() into h_ResultCPU and prints the
 *             elapsed time;
 *   VERIFY  - (only together with HW and SW) prints the first ten results
 *             of both paths and the relative L2 norm of their difference;
 *   VERBOSE - prints extra progress messages.
 *
 * d_Data and d_Kernel are ordinary malloc'ed host buffers in this program.
 *
 * argc/argv are unused.
 *
 * Returns 0 on completion, 1 if core initialization or any allocation fails.
 */
int main(int argc, char *argv[])
{
  init_timer(timer_ctrl, timer_counter_l, timer_counter_h);
  start_timer(timer_ctrl);

  int i;
  int alloc_failed = 0;

#ifdef HW
  int Status;
  XFcuda xcore;

  Status = XFcuda_Initialize(&xcore, 0);
  if (Status != XST_SUCCESS) {
    printf("Initialization fwt1 failed %d\n", Status);
    return 1;
  }
#endif

  const int log2Kernel = 7;

#ifndef __DEVICE_EMULATION__
  const int log2Data = 23;
#else
  const int log2Data = 15;
#endif
  const int   dataN = 1 << log2Data;
  const int kernelN = 1 << log2Kernel;

  const int   DATA_SIZE = dataN   * sizeof(float);
  const int KERNEL_SIZE = kernelN * sizeof(float);

  float *h_Data, *h_Kernel, *h_ResultCPU;
  float *d_Data, *d_Kernel;

  printf("Initializing data...\n");
  printf("...allocating CPU memory\n");
  h_Kernel = (float*) malloc(KERNEL_SIZE);
  if (h_Kernel == NULL) {
    printf("Unable to allocate memory for h_Kernel.\n");
    alloc_failed = 1;
  }
  h_Data = (float*) malloc(DATA_SIZE);
  if (h_Data == NULL) {
    printf("Unable to allocate memory for h_Data.\n");
    alloc_failed = 1;
  }
  h_ResultCPU = (float*) malloc(DATA_SIZE);
  if (h_ResultCPU == NULL) {
    printf("Unable to allocate memory for h_ResultCPU.\n");
    alloc_failed = 1;
  }

  printf("...allocating GPU memory\n");
  // d_Kernel is allocated at the full data length (not KERNEL_SIZE): it is
  // zero-filled and then transformed with log2Data below.
  d_Kernel = (float*) malloc(DATA_SIZE);
  if (d_Kernel == NULL) {
    printf("Unable to allocate memory for d_Kernel.\n");
    alloc_failed = 1;
  }
  d_Data = (float*) malloc(DATA_SIZE);
  if (d_Data == NULL) {
    printf("Unable to allocate memory for d_Data.\n");
    alloc_failed = 1;
  }

  // Stop here instead of dereferencing a NULL buffer in the fill loops.
  // Every pointer has been assigned by now and free(NULL) is a no-op.
  if (alloc_failed) {
    free(d_Data);
    free(d_Kernel);
    free(h_ResultCPU);
    free(h_Data);
    free(h_Kernel);
    return 1;
  }

  printf("...generating data\n");
  printf("Data length: %i; kernel length: %i\n", dataN, kernelN);
  srand(2007);  // fixed seed so every run uses the same inputs
  for (i = 0; i < kernelN; i++)
    h_Kernel[i] = (float)rand() / (float)RAND_MAX;

  for (i = 0; i < dataN; i++)
    h_Data[i] = (float)rand() / (float)RAND_MAX;

  // Zero the whole d_Kernel buffer, then place the kernel at its start, so
  // the kernelN values are followed by zeros up to dataN elements.
  for(i = 0; i < dataN; i++)
    d_Kernel[i] = 0;
  memcpy(d_Kernel, h_Kernel, KERNEL_SIZE);
  memcpy(d_Data, h_Data, DATA_SIZE);

#ifdef VERBOSE
  printf("Running GPU dyadic convolution using Fast Walsh Transform...\n");
#endif

#ifdef HW
  // NOTE(review): the (int) casts truncate pointers on targets where
  // pointers are wider than int -- confirm the target is 32-bit, or switch
  // to uintptr_t if the XFcuda_SetD_*_addr parameter type allows it.
  // The core is given the buffer address divided by sizeof(float).
  XFcuda_SetD_output_addr(&xcore, (int)d_Data / sizeof(float));
  XFcuda_SetD_input_addr(&xcore, (int)d_Data / sizeof(float));

  fwtBatchGPU(d_Data, 1, log2Data, &xcore);

  XFcuda_SetD_output_addr(&xcore, (int)d_Kernel / sizeof(float));
  XFcuda_SetD_input_addr(&xcore, (int)d_Kernel / sizeof(float));

  fwtBatchGPU(d_Kernel, 1, log2Data, &xcore);
  modulateGPU(d_Data, d_Kernel, dataN);

  XFcuda_SetD_output_addr(&xcore, (int)d_Data / sizeof(float));
  XFcuda_SetD_input_addr(&xcore, (int)d_Data / sizeof(float));

  fwtBatchGPU(d_Data, 1, log2Data, &xcore);
#endif

#ifdef VERBOSE
  printf("Reading back GPU results...\n");
  printf("Running straightforward CPU dyadic convolution...\n");
#endif

#ifdef SW
  dyadicConvolutionCPU(h_ResultCPU, h_Data, h_Kernel, log2Data, log2Kernel);
  printf("CPU impl time %lld us\n\r", elapsed_time());
#endif

  // `#ifdef` accepts a single identifier and ignores anything after it, so
  // the former `#ifdef HW && SW && VERIFY` only tested HW. The comparison
  // needs both result buffers populated, hence all three flags are required.
#if defined(HW) && defined(SW) && defined(VERIFY)
  double delta, ref, sum_delta2, sum_ref2, L2norm;

  for (i = 0; i < 10; i++)
    printf("at %d, cpu=%f, fpga=%f\n", i, h_ResultCPU[i], d_Data[i]);
  printf("Comparing the results...\n");

  // Relative error: sqrt( sum((cpu - fpga)^2) / sum(cpu^2) ), accumulated
  // in double precision with the CPU result as the reference.
  sum_delta2 = 0;
  sum_ref2   = 0;
  for(i = 0; i < dataN; i++) {
    delta = h_ResultCPU[i] - d_Data[i];
    ref = h_ResultCPU[i];
    sum_delta2 += delta * delta;
    sum_ref2   += ref * ref;
  }
  L2norm = sqrt(sum_delta2 / sum_ref2);
  printf("L2 norm: %E\n", L2norm);
  printf("sum_delta2: %E \t sum_ref2: %E\n", sum_delta2, sum_ref2);
  printf("%s", (L2norm < 1e-4) ? "PASSED.\n" : "FAILED.\n");
#endif
  free(d_Data);
  free(d_Kernel);
  free(h_ResultCPU);
  free(h_Data);
  free(h_Kernel);

  stop_timer(timer_ctrl);
  printf("Execution time %lld us\n\r", elapsed_time());

  return 0;
}