コード例 #1
0
ファイル: main.cpp プロジェクト: abhishek111226/sach
int main (int argc, char* argv[]){
  struct pb_Parameters* prms;
  struct pb_TimerSet timers;

  prms = pb_ReadParameters(&argc,argv);
  pb_InitializeTimerSet(&timers);
  
  pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);

  pb_SwitchToTimer(&timers, pb_TimerID_NONE);

  char uksdata[250];
  parameters params;

  FILE* uksfile_f = NULL;
  FILE* uksdata_f = NULL;

  strcpy(uksdata,prms->inpFiles[0]);
  strcat(uksdata,".data");

  uksfile_f = fopen(prms->inpFiles[0],"r");
  if (uksfile_f == NULL){
    printf("ERROR: Could not open %s\n",prms->inpFiles[0]);
    exit(1);
  }

  printf("\nReading parameters\n");

  if (argc >= 2){
    params.binsize = atoi(argv[1]);
  } else { //default binsize value;
    params.binsize = 128;
  }

  setParameters(uksfile_f, &params);

  pb_SwitchToTimer(&timers, pb_TimerID_IO);

  ReconstructionSample* samples; //Input Data
//  cl_mem samplesPin; 
  float* LUT; //use look-up table for faster execution on CPU (intermediate data)
  unsigned int sizeLUT; //set in the function calculateLUT (intermediate data)

  cmplx* gridData; //Output Data
  float* sampleDensity; //Output Data
//  cl_mem gridDataPin;
//  cl_mem sampleDensityPin;

  cmplx* gridData_gold; //Gold Output Data
  float* sampleDensity_gold; //Gold Output Data
  
  cl_int ciErrNum;
  cl_platform_id clPlatform;
  cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
  cl_device_id clDevice;
  cl_context clContext;

  int deviceFound = getOpenCLDevice(&clPlatform, &clDevice, &deviceType, 0);

  size_t max_alloc_size = 0;
  (void) clGetDeviceInfo(clDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_alloc_size, 0);
  size_t global_mem_size = 0;
  (void) clGetDeviceInfo(clDevice, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &global_mem_size, 0);

  size_t samples_size = params.numSamples*sizeof(ReconstructionSample);
  int gridNumElems = params.gridSize[0] * params.gridSize[1] * params.gridSize[2];
  size_t output_size = gridNumElems*sizeof(cmplx);

  if ( (deviceFound < 0) ||
       ((samples_size+output_size) > global_mem_size) ||
       (samples_size > max_alloc_size) || 
       (output_size > max_alloc_size ) ) {
    fprintf(stderr, "No suitable device was found\n");
    if(deviceFound >= 0) {
      fprintf(stderr, "Memory requirements for this dataset exceed device capabilities\n");
    }
    exit(1);
  }
  
  cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) clPlatform, 0};
  clContext = clCreateContextFromType(cps, deviceType, NULL, NULL, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);

  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  
  cl_uint workItemDimensions;
  OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &workItemDimensions, NULL) );
  
  size_t workItemSizes[workItemDimensions];
  OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, workItemDimensions*sizeof(size_t), workItemSizes, NULL) );
  
  pb_SetOpenCL(&clContext, &clCommandQueue);
    
    /*
  samplesPin = clCreateBuffer(clContext, CL_MEM_ALLOC_HOST_PTR, 
      params.numSamples*sizeof(ReconstructionSample),
      NULL, &ciErrNum);
*/
  samples = (ReconstructionSample *) malloc ( params.numSamples*sizeof(ReconstructionSample) );
  
  /*(ReconstructionSample *) clEnqueueMapBuffer(clCommandQueue, samplesPin, CL_TRUE, CL_MAP_WRITE, 0, params.numSamples*sizeof(ReconstructionSample), 0, NULL, NULL, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
*/
  if (samples == NULL){
    printf("ERROR: Unable to allocate and map memory for input data\n");
    exit(1);
  }


  uksdata_f = fopen(uksdata,"rb");

  if(uksdata_f == NULL){
    printf("ERROR: Could not open data file\n");
    exit(1);
  }

  printf("Reading input data from files\n");

  unsigned int n = readSampleData(params, uksdata_f, samples);
  fclose(uksdata_f);

  if (params.useLUT){
    printf("Generating Look-Up Table\n");
    float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8);
    calculateLUT(beta, params.kernelWidth, &LUT, &sizeLUT);
  }

  pb_SwitchToTimer(&timers, pb_TimerID_NONE);

  gridData_gold = (cmplx*) calloc (gridNumElems, sizeof(cmplx));
  sampleDensity_gold = (float*) calloc (gridNumElems, sizeof(float));
  if (sampleDensity_gold == NULL || gridData_gold == NULL){
    printf("ERROR: Unable to allocate memory for output data\n");
    exit(1);
  }

  printf("Running gold version\n");

  gridding_Gold(n, params, samples, LUT, sizeLUT, gridData_gold, sampleDensity_gold);

  printf("Running OpenCL version\n");

  pb_SwitchToTimer(&timers, pb_TimerID_COPY);

/*
  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, samplesPin, CL_TRUE, 
                          0, // Offset in bytes
                          n*sizeof(ReconstructionSample), // Size of data to write
                          samples, // Host Source
  
                          0, NULL, NULL) );*/
 // OCL_ERRCK_RETVAL ( clFinish(clCommandQueue) );
 
 /*
  gridDataPin = clCreateBuffer(clContext, CL_MEM_ALLOC_HOST_PTR, 
      gridNumElems*sizeof(cmplx), NULL, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  */
  gridData = (cmplx *) malloc ( gridNumElems*sizeof(cmplx) );
  if (gridData == NULL) { fprintf(stderr, "Could not allocate memory on host! (%s: %d)\n", __FILE__, __LINE__); exit(1); }
  
  /*(cmplx *) clEnqueueMapBuffer(clCommandQueue, gridDataPin, CL_TRUE, CL_MAP_READ, 0, gridNumElems*sizeof(cmplx), 0, NULL, NULL, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  */
  
  /*
  sampleDensityPin = clCreateBuffer(clContext, CL_MEM_ALLOC_HOST_PTR, 
      gridNumElems*sizeof(float), NULL, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  */
  
  sampleDensity = (float *) malloc ( gridNumElems*sizeof(float) );
  if (sampleDensity == NULL) { fprintf(stderr, "Could not allocate memory on host! (%s: %d)\n", __FILE__, __LINE__); exit(1); }
  
  /*(float *) clEnqueueMapBuffer(clCommandQueue, sampleDensityPin, CL_TRUE, CL_MAP_READ, 0, gridNumElems*sizeof(float), 0, NULL, NULL, &ciErrNum);
  */
  
  OCL_ERRCK_VAR(ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  
  if (sampleDensity == NULL || gridData == NULL){
    printf("ERROR: Unable to allocate memory for output data\n");
    exit(1);
  }

  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
  
  //Interface function to GPU implementation of gridding
  OpenCL_interface(&timers, n, params, samples, LUT, sizeLUT, gridData, sampleDensity, clContext, clCommandQueue, clDevice, workItemSizes);

  pb_SwitchToTimer(&timers, pb_TimerID_NONE);

  int passed=1;
  for (int i=0; i<gridNumElems; i++){
    if(sampleDensity[i] != sampleDensity_gold[i]) {
      passed=0;
      break;
    }
  }
  //(passed) ? printf("Comparing GPU and Gold results... PASSED\n"):printf("Comparing GPU and Gold results... FAILED\n");

  pb_SwitchToTimer(&timers, pb_TimerID_IO);

  FILE* outfile;
  if(!(outfile=fopen(prms->outFile,"w")))
  {
        printf("Cannot open output file!\n");
  } else {
        fwrite(&passed,sizeof(int),1,outfile);
        fclose(outfile);
  }

  pb_SwitchToTimer(&timers, pb_TimerID_NONE);

  if (params.useLUT){
    free(LUT);
  }
  
  /*
  OCL_ERRCK_RETVAL ( clEnqueueUnmapMemObject(clCommandQueue, samplesPin, samples, 0, NULL, NULL) );
  OCL_ERRCK_RETVAL ( clEnqueueUnmapMemObject(clCommandQueue, gridDataPin, gridData, 0, NULL, NULL) );
  OCL_ERRCK_RETVAL ( clEnqueueUnmapMemObject(clCommandQueue, sampleDensityPin, sampleDensity, 0, NULL, NULL) );
  
  clReleaseMemObject(samplesPin);
  clReleaseMemObject(gridDataPin);
  clReleaseMemObject(sampleDensityPin);
  */
  
  free(samples);
  free(gridData);
  free(sampleDensity);
  
  
  free(gridData_gold);
  free(sampleDensity_gold);

  printf("\n");
  pb_PrintTimerSet(&timers);
  pb_FreeParameters(prms);

  return 0;
}
コード例 #2
0
ファイル: fft.cpp プロジェクト: abduld/Parboil
int main( int argc, char **argv ) {

  int n_bytes; 
  int N, B;
  struct pb_TimerSet timers;
  struct pb_Parameters *params;
  params = pb_ReadParameters(&argc, argv);
  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL))
    {
      fprintf(stderr, "Expecting one input filename\n");
      exit(-1);
    }

  int err = 0;
  if(argc != 3)
    err |= 1;
  else {
    char* numend;
    N = strtol(argv[1], &numend, 10);
    if(numend == argv[1])
      err |= 2;
    B = strtol(argv[2], &numend, 10);
    if(numend == argv[2])
      err |= 4;
  }

  if(err)
  {
    fprintf(stderr, "Expecting two integers for N and B\n");
    exit(-1);
  }

  n_bytes = N*B*sizeof(float2);
    
  pb_InitializeTimerSet(&timers);
  
  pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
  
  pb_SwitchToTimer(&timers, pb_TimerID_IO);

  float2 *source    = (float2 *)malloc( n_bytes );
  float2 *result    = (float2 *)calloc( N*B, sizeof(float2) );

  inputData(params->inpFiles[0],(float*)source,N*B*2);

  // OpenCL Code
  cl_int clErrNum;
  pb_Context* pb_context;
  pb_context = pb_InitOpenCLContext(params);
  if (pb_context == NULL) {
    fprintf (stderr, "Error: No OpenCL platform/device can be found."); 
    return -1;
  }

  cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
  cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
  cl_context clContext = (cl_context) pb_context->clContext;
  cl_command_queue clCommandQueue;
  
  cl_program clProgram;
  
  cl_kernel fft_kernel;

  cl_mem d_source, d_work;//float2 *d_source, *d_work;
  cl_mem *data0, *data1;

  clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clErrNum);
  OCL_ERRCK_VAR(clErrNum);
  
  pb_SetOpenCL(&clContext, &clCommandQueue);
  pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
  
  const char *source_path = "src/opencl_base/fft_kernel.cl";
  char *sourceCode;
  sourceCode = readFile(source_path);
  if (sourceCode == NULL) {
    fprintf(stderr, "Could not load program source of '%s'\n", source_path); exit(1);
  }
  
  clProgram = clCreateProgramWithSource(clContext, 1, (const char **)&sourceCode, NULL, &clErrNum);
  OCL_ERRCK_VAR(clErrNum);
  	  	
  free(sourceCode);
  
  /*
    char compileOptions[1024];
  //                -cl-nv-verbose // Provides register info for NVIDIA devices
  // Set all Macros referenced by kernels
  sprintf(compileOptions, "\
                -D PRESCAN_THREADS=%u\
                -D KB=%u -D UNROLL=%u\
                -D BINS_PER_BLOCK=%u -D BLOCK_X=%u",

                prescanThreads,
                lmemKB, UNROLL,
                bins_per_block, blockX
            ); 
  */
  OCL_ERRCK_RETVAL ( clBuildProgram(clProgram, 1, &clDevice, NULL /*compileOptions*/, NULL, NULL) );
  
  
  char *build_log;
  size_t ret_val_size;
  OCL_ERRCK_RETVAL ( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size) );
  build_log = (char *)malloc(ret_val_size+1);
  OCL_ERRCK_RETVAL ( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL) );
  
  // to be careful, terminate with \0
  build_log[ret_val_size] = '\0';

  fprintf(stderr, "%s\n", build_log );
  
  
  fft_kernel = clCreateKernel(clProgram, "GPU_FFT_Global", &clErrNum);
  OCL_ERRCK_VAR(clErrNum);
  
  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
  // allocate & copy device memory
  d_source = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, n_bytes, source, &clErrNum);  OCL_ERRCK_VAR(clErrNum);
  
  //result is initially zero'd out
  d_work = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, n_bytes, result, &clErrNum);  OCL_ERRCK_VAR(clErrNum);
  


  size_t block[1] = { N/R };
  size_t grid[1] = { B*block[0] };
  
  OCL_ERRCK_RETVAL( clSetKernelArg(fft_kernel, 3, sizeof(int), &N) );

  data0 = &d_source;
  data1 = &d_work;

  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);


  for (int Ns = 1; Ns < N; Ns *= R) {
    OCL_ERRCK_RETVAL( clSetKernelArg(fft_kernel, 0, sizeof(int), &Ns) );
    OCL_ERRCK_RETVAL( clSetKernelArg(fft_kernel, 1, sizeof(cl_mem), (void *)data0) );
    OCL_ERRCK_RETVAL( clSetKernelArg(fft_kernel, 2, sizeof(cl_mem), (void *)data1) ); 

    OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, fft_kernel, 1, 0,
                            grid, block, 0, 0, 0) );
    
    cl_mem *tmp = data0;
    data0 = data1;
    data1 = tmp;
  }
  
  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
  // copy device memory to host
  //cudaMemcpy(result, d_source, n_bytes,cudaMemcpyDeviceToHost);
  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, *data0, CL_TRUE, 
                        0, // Offset in bytes
                        n_bytes, // Size of data to read
                        result, // Host Source
                        0, NULL, NULL) );

  OCL_ERRCK_RETVAL ( clReleaseMemObject(d_source) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(d_work) );

  if (params->outFile) {
    /* Write result to file */
    pb_SwitchToTimer(&timers, pb_TimerID_IO);
    outputData(params->outFile, (float*)result, N*B*2);
    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
  }

  free(source);
  free(result);
  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
  pb_PrintTimerSet(&timers);
  
  pb_DestroyTimerSet(&timers);
  pb_FreeParameters(params);
  
  return 0;
}
コード例 #3
0
ファイル: main.cpp プロジェクト: abduld/Parboil
int main(int argc, char* argv[]) {
  struct pb_Parameters *parameters;

  parameters = pb_ReadParameters(&argc, argv);
  if (!parameters)
    return -1;

  if(!parameters->inpFiles[0]){
    fputs("Input file expected\n", stderr);
    return -1;
  }

  
  struct pb_TimerSet timers;
  
  char oclOverhead[] = "OCL Overhead";
  char intermediates[] = "IntermediatesKernel";
  char finals[] = "FinalKernel";

  pb_InitializeTimerSet(&timers);
  
  pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
  pb_AddSubTimer(&timers, intermediates, pb_TimerID_KERNEL);
  pb_AddSubTimer(&timers, finals, pb_TimerID_KERNEL);
    
  pb_SwitchToTimer(&timers, pb_TimerID_IO);
  
  int numIterations;
  if (argc >= 2){
    numIterations = atoi(argv[1]);
  } else {
    fputs("Expected at least one command line argument\n", stderr);
    return -1;
  }

  unsigned int img_width, img_height;
  unsigned int histo_width, histo_height;

  FILE* f = fopen(parameters->inpFiles[0],"rb");
  int result = 0;

  result += fread(&img_width,    sizeof(unsigned int), 1, f);
  result += fread(&img_height,   sizeof(unsigned int), 1, f);
  result += fread(&histo_width,  sizeof(unsigned int), 1, f);
  result += fread(&histo_height, sizeof(unsigned int), 1, f);

  if (result != 4){
    fputs("Error reading input and output dimensions from file\n", stderr);
    return -1;
  }

  unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int));
  unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char));

  result = fread(img, sizeof(unsigned int), img_width*img_height, f);

  fclose(f);

  if (result != img_width*img_height){
    fputs("Error reading input array from file\n", stderr);
    return -1;
  }

  cl_int ciErrNum;
  pb_Context* pb_context;
  pb_context = pb_InitOpenCLContext();
  if (pb_context == NULL) {
    fprintf (stderr, "Error: No OpenCL platform/device can be found."); 
    return -1;
  }

  cl_int clStatus;
  cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
  cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
  cl_context clContext = (cl_context) pb_context->clContext;
  cl_command_queue clCommandQueue;
  
  cl_program clProgram[2];
  
  cl_kernel histo_intermediates_kernel;
  cl_kernel histo_final_kernel;
  
  cl_mem input;
  cl_mem ranges;
  cl_mem sm_mappings;
  cl_mem global_subhisto;
  cl_mem global_overflow;
  cl_mem final_histo;
  
  clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  
  pb_SetOpenCL(&clContext, &clCommandQueue);
  pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);

  cl_uint workItemDimensions;
  OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &workItemDimensions, NULL) );
  
  size_t workItemSizes[workItemDimensions];
  OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, workItemDimensions*sizeof(size_t), workItemSizes, NULL) );
  
  size_t program_length[2];
  const char *source_path[2] = { 
    "src/opencl_naive/histo_intermediates.cl", 
   "src/opencl_naive/histo_final.cl"};
  char *source[4];

  for (int i = 0; i < 2; ++i) {
    // Dynamically allocate buffer for source
    source[i] = oclLoadProgSource(source_path[i], "", &program_length[i]);
    if(!source[i]) {
      fprintf(stderr, "Could not load program source\n"); exit(1);
    }
  	
  	clProgram[i] = clCreateProgramWithSource(clContext, 1, (const char **)&source[i], &program_length[i], &ciErrNum);
  	OCL_ERRCK_VAR(ciErrNum);
  	  	
  	free(source[i]);
  }
  	
  	  	  	  	  	  	  	
  for (int i = 0; i < 2; ++i) {
    //fprintf(stderr, "Building Program #%d...\n", i);
    OCL_ERRCK_RETVAL ( clBuildProgram(clProgram[i], 1, &clDevice, NULL, NULL, NULL) );
       
    #if 1
       char *build_log;
       size_t ret_val_size;
       ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);	OCL_ERRCK_VAR(ciErrNum);
       build_log = (char *)malloc(ret_val_size+1);
       ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
       	OCL_ERRCK_VAR(ciErrNum);
       	

       // to be carefully, terminate with \0
       // there's no information in the reference whether the string is 0 terminated or not
       build_log[ret_val_size] = '\0';

       fprintf(stderr, "%s\n", build_log );
     #endif
  }
  	
  histo_intermediates_kernel = clCreateKernel(clProgram[0], "histo_intermediates_kernel", &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  histo_final_kernel = clCreateKernel(clProgram[1], "histo_final_kernel", &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  
  pb_SwitchToTimer(&timers, pb_TimerID_COPY);  

  input =           clCreateBuffer(clContext, CL_MEM_READ_WRITE, 
      img_width*img_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
  ranges =          clCreateBuffer(clContext, CL_MEM_READ_WRITE, 2*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);  
  sm_mappings =     clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*img_height*4*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
  global_subhisto = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
  global_overflow = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
  final_histo =     clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);

  // Must dynamically allocate. Too large for stack
  unsigned int *zeroData;
  zeroData = (unsigned int *) calloc(img_width*histo_height, sizeof(unsigned int));
  if (zeroData == NULL) {
    fprintf(stderr, "Failed to allocate %ld bytes of memory on host!\n", sizeof(unsigned int) * img_width * histo_height);
    exit(1);
  }
   
  for (int y=0; y < img_height; y++){
    OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, input, CL_TRUE, 
                          y*img_width*sizeof(unsigned int), // Offset in bytes
                          img_width*sizeof(unsigned int), // Size of data to write
                          &img[y*img_width], // Host Source
                          0, NULL, NULL) );
  }
 
  pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);

  unsigned int img_dim = img_height*img_width;
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 0, sizeof(cl_mem), (void *)&input) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 1, sizeof(unsigned int), &img_width) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) );
  
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 0, sizeof(unsigned int), &histo_height) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 1, sizeof(unsigned int), &histo_width) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 3, sizeof(cl_mem), (void *)&final_histo) );

  size_t inter_localWS[1] = { workItemSizes[0] };
  size_t inter_globalWS[1] = { img_height * inter_localWS[0] };
  
  size_t final_localWS[1] = { workItemSizes[0] };
  size_t final_globalWS[1] = {((histo_height*histo_width+(final_localWS[0]-1)) /
                                          final_localWS[0])*final_localWS[0] };
  
  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);

  for (int iter = 0; iter < numIterations; iter++) {
    unsigned int ranges_h[2] = {UINT32_MAX, 0};
    
    // how about something like
    // __global__ unsigned int ranges[2];
    // ...kernel
    // __shared__ unsigned int s_ranges[2];
    // if (threadIdx.x == 0) {s_ranges[0] = ranges[0]; s_ranges[1] = ranges[1];}
    // __syncthreads();
    
    // Although then removing the blocking cudaMemcpy's might cause something about
    // concurrent kernel execution.
    // If kernel launches are synchronous, then how can 2 kernels run concurrently? different host threads?


  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, ranges, CL_TRUE, 
                          0, // Offset in bytes
                          2*sizeof(unsigned int), // Size of data to write
                          ranges_h, // Host Source
                          0, NULL, NULL) );
                          
  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, global_subhisto, CL_TRUE, 
                          0, // Offset in bytes
                          histo_width*histo_height*sizeof(unsigned int), // Size of data to write
                          zeroData, // Host Source
                          0, NULL, NULL) );
                          
  pb_SwitchToSubTimer(&timers, intermediates, pb_TimerID_KERNEL);

  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_intermediates_kernel /*histo_intermediates_kernel*/, 1, 0,
                            inter_globalWS, inter_localWS, 0, 0, 0) );              
  pb_SwitchToSubTimer(&timers, finals, pb_TimerID_KERNEL);                            
  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_final_kernel, 1, 0,
                            final_globalWS, final_localWS, 0, 0, 0) );                           
  }

  pb_SwitchToTimer(&timers, pb_TimerID_IO);

  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, final_histo, CL_TRUE, 
                          0, // Offset in bytes
                          histo_height*histo_width*sizeof(unsigned char), // Size of data to read
                          histo, // Host Source
                          0, NULL, NULL) );                         

  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_intermediates_kernel) );
  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_final_kernel) );
  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[0]) );
  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[1]) );
  
  OCL_ERRCK_RETVAL ( clReleaseMemObject(input) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(ranges) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(sm_mappings) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_subhisto) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_overflow) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(final_histo) );

  if (parameters->outFile) {
    dump_histo_img(histo, histo_height, histo_width, parameters->outFile);
  }

  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);

  free(zeroData);
  free(img);
  free(histo);

  pb_SwitchToTimer(&timers, pb_TimerID_NONE);

  printf("\n");
  pb_PrintTimerSet(&timers);
  pb_FreeParameters(parameters);
  
  pb_DestroyTimerSet(&timers);

  OCL_ERRCK_RETVAL ( clReleaseCommandQueue(clCommandQueue) );
  OCL_ERRCK_RETVAL ( clReleaseContext(clContext) );

  return 0;
}
コード例 #4
0
ファイル: main.cpp プロジェクト: abduld/Parboil
int main(int argc, char* argv[]) {
  struct pb_TimerSet timers;
  struct pb_Parameters *parameters;

  parameters = pb_ReadParameters(&argc, argv);
  if (!parameters)
    return -1;

  if(!parameters->inpFiles[0]){
    fputs("Input file expected\n", stderr);
    return -1;
  }
  
  char oclOverhead[] = "OCL Overhead";
  char prescans[] = "PreScanKernel";
  char postpremems[] = "PostPreMems";
  char intermediates[] = "IntermediatesKernel";
  char mains[] = "MainKernel";
  char finals[] = "FinalKernel";

  pb_InitializeTimerSet(&timers);
  
  pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
  pb_AddSubTimer(&timers, prescans, pb_TimerID_KERNEL);
  pb_AddSubTimer(&timers, postpremems, pb_TimerID_KERNEL);
  pb_AddSubTimer(&timers, intermediates, pb_TimerID_KERNEL);
  pb_AddSubTimer(&timers, mains, pb_TimerID_KERNEL);
  pb_AddSubTimer(&timers, finals, pb_TimerID_KERNEL);

  pb_SwitchToTimer(&timers, pb_TimerID_IO);

  int numIterations;
  if (argc >= 2){
    numIterations = atoi(argv[1]);
  } else {
    fputs("Expected at least one command line argument\n", stderr);
    return -1;
  }

  unsigned int img_width, img_height;
  unsigned int histo_width, histo_height;
  unsigned int lmemKB;
  unsigned int nThreads;
  unsigned int bins_per_block;

  FILE* f = fopen(parameters->inpFiles[0],"rb");
  int result = 0;

  result += fread(&img_width,    sizeof(unsigned int), 1, f);
  result += fread(&img_height,   sizeof(unsigned int), 1, f);
  result += fread(&histo_width,  sizeof(unsigned int), 1, f);
  result += fread(&histo_height, sizeof(unsigned int), 1, f);

  if (result != 4){
    fputs("Error reading input and output dimensions from file\n", stderr);
    return -1;
  }

  unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int));
  unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char));

  result = fread(img, sizeof(unsigned int), img_width*img_height, f);

  fclose(f);

  if (result != img_width*img_height){
    fputs("Error reading input array from file\n", stderr);
    return -1;
  }

  cl_int ciErrNum;
  pb_Context* pb_context;
  pb_context = pb_InitOpenCLContext(parameters);
  if (pb_context == NULL) {
    fprintf (stderr, "Error: No OpenCL platform/device can be found."); 
    return -1;
  }

  cl_int clStatus;
  cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
  cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
  cl_context clContext = (cl_context) pb_context->clContext;
  cl_command_queue clCommandQueue;
  
  cl_program clProgram[4];
  
  cl_kernel histo_prescan_kernel;
  cl_kernel histo_intermediates_kernel;
  cl_kernel histo_main_kernel;
  cl_kernel histo_final_kernel;

  int even_width = ((img_width+1)/2)*2;

  cl_mem input;
  cl_mem ranges;
  cl_mem sm_mappings;
  cl_mem global_subhisto;
  cl_mem global_histo;
  cl_mem global_overflow;
  cl_mem final_histo;
  
  clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  
  pb_SetOpenCL(&clContext, &clCommandQueue);  
  pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
  
  long unsigned int lmemSize = 0;
  OCL_ERRCK_RETVAL ( clGetDeviceInfo(clDevice, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &lmemSize, NULL) );
  
  // lmemKB = lmemSize / 1024; // Should be valid, but not taken into consideration for initial programming
  
  if (lmemSize >= 48*1024) {
    lmemKB = 48;
  } else if (lmemSize >= 24*1024) {
    lmemKB = 24;
  } else {
    lmemKB = 8;
  }
  
  lmemKB = 24;
  
  bins_per_block = lmemKB * 1024;
  
  switch (lmemKB) {
    case 48: nThreads = 1024; break;
    case 24: nThreads = 768; break;
    default: nThreads = 512; break;
  }
  
  
  
  size_t program_length[4];
  const char *source_path[4] = { "src/opencl_nvidia/histo_prescan.cl",
    "src/opencl_nvidia/histo_intermediates.cl", "src/opencl_nvidia/histo_main.cl","src/opencl_nvidia/histo_final.cl"};
  char *source[4];

  for (int i = 0; i < 4; ++i) {
    // Dynamically allocate buffer for source
    source[i] = oclLoadProgSource(source_path[i], "", &program_length[i]);
    if(!source[i]) {
      fprintf(stderr, "Could not load program source\n"); exit(1);
    }
  	
  	clProgram[i] = clCreateProgramWithSource(clContext, 1, (const char **)&source[i], &program_length[i], &ciErrNum);
  	OCL_ERRCK_VAR(ciErrNum);
  	  	
  	free(source[i]);
  }
  	
  	  	  	  	  	  	  	
  char compileOptions[1024];
  //                -cl-nv-verbose // Provides register info for NVIDIA devices
  // Set all Macros referenced by kernels
  sprintf(compileOptions, "\
                -D PRESCAN_THREADS=%u\
                -D KB=%u -D UNROLL=%u\
                -D BINS_PER_BLOCK=%u -D BLOCK_X=%u",
                
                PRESCAN_THREADS,
                lmemKB, UNROLL,
                bins_per_block, BLOCK_X
            ); 
  
  for (int i = 0; i < 4; ++i) {
//fprintf(stderr, "Building Program #%d...\n", i);
    OCL_ERRCK_RETVAL ( clBuildProgram(clProgram[i], 1, &clDevice, compileOptions, NULL, NULL) );
       
          /*
       char *build_log;
       size_t ret_val_size;
       ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);	OCL_ERRCK_VAR(ciErrNum);
       build_log = (char *)malloc(ret_val_size+1);
       ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
       	OCL_ERRCK_VAR(ciErrNum);
       	

       // to be carefully, terminate with \0
       // there's no information in the reference whether the string is 0 terminated or not
       build_log[ret_val_size] = '\0';

       fprintf(stderr, "%s\n", build_log );
       */
  }
  	
  histo_prescan_kernel = clCreateKernel(clProgram[0], "histo_prescan_kernel", &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  histo_intermediates_kernel = clCreateKernel(clProgram[1], "histo_intermediates_kernel", &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  histo_main_kernel = clCreateKernel(clProgram[2], "histo_main_kernel", &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  histo_final_kernel = clCreateKernel(clProgram[3], "histo_final_kernel", &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);  	

  	
  pb_SwitchToTimer(&timers, pb_TimerID_IO);  

  input = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 
      even_width*(((img_height+UNROLL)/UNROLL)*UNROLL)*sizeof(unsigned int), NULL, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  ranges = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 
      2*sizeof(unsigned int), NULL, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);  
  sm_mappings = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 
      img_width*img_height*4*sizeof(unsigned char), NULL, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  global_subhisto = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 
      img_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  global_histo = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 
      img_width*histo_height*sizeof(unsigned short), NULL, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  global_overflow = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 
      img_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  final_histo = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 
      img_width*histo_height*sizeof(unsigned char), NULL, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  

  // Must dynamically allocate. Too large for stack
  unsigned int *zeroData;
  zeroData = (unsigned int *) malloc(sizeof(unsigned int) *img_width*histo_height);
  if (zeroData == NULL) {
    fprintf(stderr, "Failed to allocate %ld bytes of memory!\n", sizeof(unsigned int) * img_width * histo_height);
    exit(1);
  }
  memset(zeroData, 0, img_width*histo_height*sizeof(unsigned int));
   
  for (int y=0; y < img_height; y++){
    OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, input, CL_FALSE, 
                          y*even_width*sizeof(unsigned int), // Offset in bytes
                          img_width*sizeof(unsigned int), // Size of data to write
                          &img[y*img_width], // Host Source
                          0, NULL, NULL) );
  }
 
  pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);

  unsigned int img_dim = img_height*img_width;
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_prescan_kernel, 0, sizeof(cl_mem), (void *)&input) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_prescan_kernel, 1, sizeof(unsigned int), &img_dim) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_prescan_kernel, 2, sizeof(cl_mem), (void *)&ranges) );

  unsigned int half_width = (img_width+1)/2;
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 0, sizeof(cl_mem), (void *)&input) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 1, sizeof(unsigned int), &img_height) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 2, sizeof(unsigned int), &img_width) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 3, sizeof(unsigned int), &half_width) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 4, sizeof(cl_mem), (void *)&sm_mappings) );

  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 0, sizeof(cl_mem), (void *)&sm_mappings) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 1, sizeof(unsigned int), &img_dim) );

  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 4, sizeof(unsigned int), &histo_height) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 5, sizeof(unsigned int), &histo_width) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 6, sizeof(cl_mem), (void *)&global_subhisto) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 7, sizeof(cl_mem), (void *)&global_histo) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 8, sizeof(cl_mem), (void *)&global_overflow) );
  

  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 2, sizeof(unsigned int), &histo_height) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 3, sizeof(unsigned int), &histo_width) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 4, sizeof(cl_mem), (void *)&global_subhisto) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 5, sizeof(cl_mem), (void *)&global_histo) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 6, sizeof(cl_mem), (void *)&global_overflow) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 7, sizeof(cl_mem), (void *)&final_histo) );

  size_t prescan_localWS[1] = {PRESCAN_THREADS};
  size_t prescan_globalWS[1] = {PRESCAN_BLOCKS_X*prescan_localWS[0]};
  size_t inter_localWS[1] = {(img_width+1)/2};
  size_t inter_globalWS[1] = {((img_height + UNROLL-1)/UNROLL) * inter_localWS[0]};
  size_t main_localWS[2] = {nThreads, 1};
  size_t main_globalWS[2];  main_globalWS[0] = BLOCK_X * main_localWS[0];
  size_t final_localWS[1] = {512};
  size_t final_globalWS[1] = {BLOCK_X*3 * final_localWS[0]};
    

  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);

  for (int iter = 0; iter < numIterations; iter++) {
    unsigned int ranges_h[2] = {UINT32_MAX/2, 0};
    
    // how about something like
    // __global__ unsigned int ranges[2];
    // ...kernel
    // __shared__ unsigned int s_ranges[2];
    // if (threadIdx.x == 0) {s_ranges[0] = ranges[0]; s_ranges[1] = ranges[1];}
    // __syncthreads();
    
    // Although then removing the blocking cudaMemcpy's might cause something about
    // concurrent kernel execution.
    // If kernel launches are synchronous, then how can 2 kernels run concurrently? different host threads?

  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, ranges, CL_TRUE, 
                          0, // Offset in bytes
                          2*sizeof(unsigned int), // Size of data to write
                          ranges_h, // Host Source
                          0, NULL, NULL) );
                                                    
  pb_SwitchToSubTimer(&timers, prescans , pb_TimerID_KERNEL);
                         
  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_prescan_kernel, 1, 0,
                            prescan_globalWS, prescan_localWS, 0, 0, 0) );

  pb_SwitchToSubTimer(&timers, postpremems , pb_TimerID_KERNEL);
    
  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, ranges, CL_TRUE, 
                          0, // Offset in bytes
                          2*sizeof(unsigned int), // Size of data to read
                          ranges_h, // Host Source
                          0, NULL, NULL) );

  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, global_subhisto, CL_TRUE, 
                          0, // Offset in bytes
                          img_width*histo_height*sizeof(unsigned int), // Size of data to write
                          zeroData, // Host Source
                          0, NULL, NULL) );

  pb_SwitchToSubTimer(&timers, intermediates, pb_TimerID_KERNEL);
                     
  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_intermediates_kernel, 1, 0,
                            inter_globalWS, inter_localWS, 0, 0, 0) );                          

  main_globalWS[1] = ranges_h[1]-ranges_h[0]+1;
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 2, sizeof(unsigned int), &ranges_h[0]) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 3, sizeof(unsigned int), &ranges_h[1]) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 0, sizeof(unsigned int), &ranges_h[0]) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 1, sizeof(unsigned int), &ranges_h[1]) );

  pb_SwitchToSubTimer(&timers, mains, pb_TimerID_KERNEL);


  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_main_kernel, 2, 0,
                            main_globalWS, main_localWS, 0, 0, 0) );

  pb_SwitchToSubTimer(&timers, finals, pb_TimerID_KERNEL);

  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_final_kernel, 1, 0,
                            final_globalWS, final_localWS, 0, 0, 0) );
  
  }

  pb_SwitchToTimer(&timers, pb_TimerID_IO);


  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, final_histo, CL_TRUE, 
                          0, // Offset in bytes
                          histo_height*histo_width*sizeof(unsigned char), // Size of data to read
                          histo, // Host Source
                          0, NULL, NULL) );

  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_prescan_kernel) );
  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_intermediates_kernel) );
  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_main_kernel) );
  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_final_kernel) );
  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[0]) );
  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[1]) );
  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[2]) );
  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[3]) );

  OCL_ERRCK_RETVAL ( clReleaseMemObject(input) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(ranges) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(sm_mappings) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_subhisto) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_histo) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_overflow) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(final_histo) );
  


  if (parameters->outFile) {
    dump_histo_img(histo, histo_height, histo_width, parameters->outFile);
  }


  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);


  free(zeroData);
  free(img);
  free(histo);

  pb_SwitchToTimer(&timers, pb_TimerID_NONE);

  printf("\n");
  pb_PrintTimerSet(&timers);
  pb_FreeParameters(parameters);
  
  OCL_ERRCK_RETVAL ( clReleaseCommandQueue(clCommandQueue) );
  OCL_ERRCK_RETVAL ( clReleaseContext(clContext) );
  
  pb_DestroyTimerSet(&timers);

  sleep(1);

  return 0;
}
コード例 #5
0
int
main(int argc, char **argv)
{
  struct image_i16 *ref_image;
  struct image_i16 *cur_image;
  unsigned short *sads_computed; /* SADs generated by the program */

  int image_size_bytes;
  int image_width_macroblocks, image_height_macroblocks;
  int image_size_macroblocks;

  struct pb_TimerSet timers;
  struct pb_Parameters *params;

  char oclOverhead[]= "OpenCL Overhead";

  pb_InitializeTimerSet(&timers);
  pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);

  params = pb_ReadParameters(&argc, argv);

  if (pb_Parameters_CountInputs(params) != 2)
    {
      fprintf(stderr, "Expecting two input filenames\n");
      exit(-1);
    }

  /* Read input files */
  pb_SwitchToTimer(&timers, pb_TimerID_IO);
  ref_image = load_image(params->inpFiles[0]);
  cur_image = load_image(params->inpFiles[1]);
  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);

  if ((ref_image->width != cur_image->width) ||
      (ref_image->height != cur_image->height))
    {
      fprintf(stderr, "Input images must be the same size\n");
      exit(-1);
    }
  if ((ref_image->width % 16) || (ref_image->height % 16))
    {
      fprintf(stderr, "Input image size must be an integral multiple of 16\n");
      exit(-1);
    }

  /* Compute parameters, allocate memory */
  image_size_bytes = ref_image->width * ref_image->height * sizeof(short);
  image_width_macroblocks = ref_image->width >> 4;
  image_height_macroblocks = ref_image->height >> 4;
  image_size_macroblocks = image_width_macroblocks * image_height_macroblocks;

  sads_computed = (unsigned short *)
    malloc(41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(short));

  // Run the kernel code
  // ************************************************************************

  	cl_int ciErrNum;
	cl_command_queue clCommandQueue;

	cl_kernel mb_sad_calc;
	cl_kernel larger_sad_calc_8;
	cl_kernel larger_sad_calc_16;

	cl_mem imgRef;		/* Reference image on the device */
	cl_mem d_cur_image;	/* Current image on the device */
	cl_mem d_sads;		/* SADs on the device */

    // x : image_width_macroblocks
    // y : image_height_macroblocks

  pb_Context* pb_context;
  pb_context = pb_InitOpenCLContext(params);
  if (pb_context == NULL) {
    fprintf (stderr, "Error: No OpenCL platform/device can be found.");
    return -1;
  }

    cl_int clStatus;
    cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
    cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
    cl_context clContext = (cl_context) pb_context->clContext;

  	clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
  	OCL_ERRCK_VAR(ciErrNum);

    pb_SetOpenCL(&clContext, &clCommandQueue);
    pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);

  	// Read Source Code File
  	size_t program_length;
    const char* source_path = "src/opencl_base/kernel.cl";
    char* source = oclLoadProgSource(source_path, "", &program_length);
    if(!source) {
        fprintf(stderr, "Could not load program source\n"); exit(1);
    }

  	cl_program clProgram = clCreateProgramWithSource(clContext, 1, (const char **)&source, &program_length, &ciErrNum);
  	OCL_ERRCK_VAR(ciErrNum);

  	free(source);

    // JIT Compilation Options
    char compileOptions[1024];
    //                -cl-nv-verbose
    sprintf(compileOptions, "\
                -D MAX_POS=%u -D CEIL_POS=%u\
                -D POS_PER_THREAD=%u -D MAX_POS_PADDED=%u\
                -D THREADS_W=%u -D THREADS_H=%u\
                -D SEARCH_RANGE=%u -D SEARCH_DIMENSION=%u\
                \0",
                MAX_POS, CEIL(MAX_POS, POS_PER_THREAD),
                POS_PER_THREAD,   MAX_POS_PADDED,
                THREADS_W,   THREADS_H,
                SEARCH_RANGE, SEARCH_DIMENSION
            );
    printf ("options = %s\n", compileOptions);

    OCL_ERRCK_RETVAL( clBuildProgram(clProgram, 1, &clDevice, compileOptions, NULL, NULL) );

   /*
   char *build_log;
       size_t ret_val_size;
       OCL_ERRCK_RETVAL( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size) );
       build_log = (char *)malloc(ret_val_size+1);
       OCL_ERRCK_RETVAL( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL) );

       // Null terminate (original writer wasn't sure)
       build_log[ret_val_size] = '\0';

       fprintf(stderr, "%s\n", build_log );
   */

    mb_sad_calc = clCreateKernel(clProgram, "mb_sad_calc", &ciErrNum);
   	OCL_ERRCK_VAR(ciErrNum);
   	larger_sad_calc_8 = clCreateKernel(clProgram, "larger_sad_calc_8", &ciErrNum);
   	OCL_ERRCK_VAR(ciErrNum);
   	larger_sad_calc_16 = clCreateKernel(clProgram, "larger_sad_calc_16", &ciErrNum);
   	OCL_ERRCK_VAR(ciErrNum);

    size_t wgSize;
    size_t comp_wgSize[3];
    cl_ulong localMemSize;
    size_t prefwgSizeMult;
    cl_ulong privateMemSize;

    pb_SwitchToTimer(&timers, pb_TimerID_COPY);

#if 0
    cl_image_format img_format;
    img_format.image_channel_order = CL_R;
    img_format.image_channel_data_type = CL_UNSIGNED_INT16;

    /* Transfer reference image to device */
	imgRef = clCreateImage2D(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &img_format,
                                      ref_image->width /** sizeof(unsigned short)*/, // width
                                      ref_image->height, // height
                                      ref_image->width * sizeof(unsigned short), // row_pitch
                                      ref_image->data, &ciErrNum);
#endif

#if 1
    imgRef = clCreateBuffer(clContext, CL_MEM_READ_ONLY,
                            ref_image->width * ref_image->height * sizeof(unsigned short),
                            NULL, &ciErrNum);
    OCL_ERRCK_VAR(ciErrNum);
    OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, imgRef, CL_TRUE,
        0,
        ref_image->width * ref_image->height * sizeof(unsigned short),
        ref_image->data, 0, NULL, NULL) );
#else
    imgRef = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                            ref_image->width * ref_image->height * sizeof(unsigned short),
                            ref_image->data, &ciErrNum);
    printf ("Allocating %d bytes\n", ref_image->width * ref_image->height * sizeof(unsigned short));

#endif
    OCL_ERRCK_VAR(ciErrNum);

    /* Allocate SAD data on the device */

    unsigned short *tmpZero = (unsigned short *)calloc(41 * MAX_POS_PADDED * image_size_macroblocks, sizeof(unsigned short));

/*
    size_t max_alloc_size = 0;
    clGetDeviceInfo(clDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
                    sizeof(max_alloc_size), &max_alloc_size, NULL);
    if (max_alloc_size < (41 * MAX_POS_PADDED *
            image_size_macroblocks * sizeof(unsigned short))) {
      fprintf(stderr, "Can't allocate sad buffer: max alloc size is %dMB\n",
              (int) (max_alloc_size >> 20));
      exit(-1);
    }
*/

    d_sads = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, 41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(unsigned short), tmpZero, &ciErrNum);
    OCL_ERRCK_VAR(ciErrNum);
    free(tmpZero);

    d_cur_image = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, image_size_bytes, cur_image->data, &ciErrNum);
    OCL_ERRCK_VAR(ciErrNum);

	/* Set Kernel Parameters */

	OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 0, sizeof(cl_mem), (void *)&d_sads) );
	OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 1, sizeof(cl_mem), (void *)&d_cur_image) );
	OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 2, sizeof(int), &image_width_macroblocks) );
	OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 3, sizeof(int), &image_height_macroblocks) );
	OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 4, sizeof(cl_mem), (void *)&imgRef) );

	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_8, 0, sizeof(cl_mem), (void *)&d_sads) );
	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_8, 1, sizeof(int), &image_width_macroblocks) );
	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_8, 2, sizeof(int), &image_height_macroblocks) );

	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_16, 0, sizeof(cl_mem), (void *)&d_sads) );
	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_16, 1, sizeof(int), &image_width_macroblocks) );
	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_16, 2, sizeof(int), &image_height_macroblocks) );

	size_t mb_sad_calc_localWorkSize[2] = {
	    CEIL(MAX_POS, POS_PER_THREAD) * THREADS_W * THREADS_H,
	    1 };
	size_t mb_sad_calc_globalWorkSize[2] = {
        mb_sad_calc_localWorkSize[0] * CEIL(ref_image->width / 4, THREADS_W),
	    mb_sad_calc_localWorkSize[1] * CEIL(ref_image->height / 4, THREADS_H) };

	size_t larger_sad_calc_8_localWorkSize[2] = {32,4};
	size_t larger_sad_calc_8_globalWorkSize[2] = {image_width_macroblocks * 32,
	  image_height_macroblocks * 4};

	size_t larger_sad_calc_16_localWorkSize[2] = {32, 1};
	size_t larger_sad_calc_16_globalWorkSize[2] = {image_width_macroblocks * 32,
	  image_height_macroblocks * 1};

    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);

    /* Run the 4x4 kernel */
    printf ("DBlock = %dx%d\n", mb_sad_calc_localWorkSize[1], mb_sad_calc_localWorkSize[0]);
	OCL_ERRCK_RETVAL( clEnqueueNDRangeKernel(clCommandQueue, mb_sad_calc, 2, 0, mb_sad_calc_globalWorkSize, mb_sad_calc_localWorkSize, 0, 0, 0) );

	/* Run the larger-blocks kernels */
	OCL_ERRCK_RETVAL( clEnqueueNDRangeKernel(clCommandQueue, larger_sad_calc_8, 2, 0, larger_sad_calc_8_globalWorkSize, larger_sad_calc_8_localWorkSize, 0, 0, 0) );

	OCL_ERRCK_RETVAL( clEnqueueNDRangeKernel(clCommandQueue, larger_sad_calc_16, 2, 0, larger_sad_calc_16_globalWorkSize, larger_sad_calc_16_localWorkSize, 0, 0, 0) );

    OCL_ERRCK_RETVAL( clFinish(clCommandQueue) );
    pb_SwitchToTimer(&timers, pb_TimerID_COPY);

    /* Transfer SAD data to the host */
    OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, d_sads, CL_TRUE,
        0,
        41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(unsigned short),
        sads_computed, 0, NULL, NULL) );

    /* Free GPU memory */
    OCL_ERRCK_RETVAL( clReleaseKernel(larger_sad_calc_8) );
    OCL_ERRCK_RETVAL( clReleaseKernel(larger_sad_calc_16) );
    OCL_ERRCK_RETVAL( clReleaseProgram(clProgram) );

    OCL_ERRCK_RETVAL( clReleaseMemObject(d_sads) );
    OCL_ERRCK_RETVAL( clReleaseMemObject(imgRef) );
    OCL_ERRCK_RETVAL( clReleaseMemObject(d_cur_image) );

    OCL_ERRCK_RETVAL( clFinish(clCommandQueue) );
    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);

  // ************************************************************************
  // End GPU Code

  /* Print output */
  if (params->outFile)
    {
      pb_SwitchToTimer(&timers, pb_TimerID_IO);
      write_sads(params->outFile,
		 image_width_macroblocks,
		 image_height_macroblocks,
		 sads_computed);
      pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
    }

#if 0  /* Debugging */
  print_test_sads(sads_computed, image_size_macroblocks);
  write_sads_directly("sad-debug.bin",
		      ref_image->width / 16, ref_image->height / 16,
		      sads_computed);
#endif

  /* Free memory */
  free(sads_computed);
  free_image(ref_image);
  free_image(cur_image);

  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
  pb_PrintTimerSet(&timers);
  pb_FreeParameters(params);

  OCL_ERRCK_RETVAL( clReleaseCommandQueue(clCommandQueue) );
  OCL_ERRCK_RETVAL( clReleaseContext(clContext) );

  pb_DestroyTimerSet(&timers);

  return 0;
}