示例#1
0
int main(int argc, char *argv[])
{
  std::string matmul_kernel_str;
 
  /* Provide names of the OpenCL kernels
   * and cl file that they're kept in */
  std::string matmul_name_str = 
    std::string("matmul");
  std::string matmul_kernel_file = 
    std::string("matmul.cl");

  cl_vars_t cv; 
  cl_kernel matmul;

  /* Read OpenCL file into STL string */
  readFile(matmul_kernel_file,
	   matmul_kernel_str);
  
  /* Initialize the OpenCL runtime 
   * Source in clhelp.cpp */
  initialize_ocl(cv);
  
  // Compile all OpenCL kernels.
  compile_ocl_program(matmul, cv, matmul_kernel_str.c_str(),
		      matmul_name_str.c_str());
  
  // Arrays on the host (CPU)
  float *h_A, *h_B, *h_Y, *h_YY;
  // Arrays on the device (GPU)
  cl_mem g_A, g_B, g_Y;

  /* Allocate arrays on the host
   * and fill with random data */
  int n = (1<<10);
  h_A = new float[n*n];
  assert(h_A);
  h_B = new float[n*n];
  assert(h_B);
  h_Y = new float[n*n];
  assert(h_Y);
  h_YY = new float[n*n];
  assert(h_YY);
  bzero(h_Y, sizeof(float)*n*n);
  bzero(h_YY, sizeof(float)*n*n);
  
  for(int i = 0; i < (n*n); i++)
    {
      h_A[i] = (float)drand48();
      h_B[i] = (float)drand48();
    }

  // Allocate memory for arrays on the GPU
  cl_int err = CL_SUCCESS;

  /* CS194: Allocate Buffers on the GPU.
   *...We're already allocating the Y buffer
   * on the GPU for you */
  g_Y = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,
		       sizeof(float)*n*n,NULL,&err);
  CHK_ERR(err);
  g_A = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,
           sizeof(float)*n*n,NULL,&err);
  CHK_ERR(err);
  g_B = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,
           sizeof(float)*n*n,NULL,&err);
  CHK_ERR(err);
  
  /* CS194: Copy data from host CPU to GPU */
  err = clEnqueueWriteBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n*n,
            h_Y, 0, NULL, NULL);
  CHK_ERR(err);
  err = clEnqueueWriteBuffer(cv.commands, g_A, true, 0, sizeof(float)*n*n,
            h_A, 0, NULL, NULL);
  CHK_ERR(err);
  err = clEnqueueWriteBuffer(cv.commands, g_B, true, 0, sizeof(float)*n*n,
            h_B, 0, NULL, NULL);
  CHK_ERR(err);


  /* CS194: Create appropriately sized workgroups */
  size_t global_work_size[2] = {n,n};
  size_t local_work_size[2] = {4,4};
  
  /* CS194: Set kernel arguments */
  err = clSetKernelArg(matmul, 0, sizeof(cl_mem), &g_Y);
  CHK_ERR(err);
  err = clSetKernelArg(matmul, 1, sizeof(cl_mem), &g_A);
  CHK_ERR(err);
  err = clSetKernelArg(matmul, 2, sizeof(cl_mem), &g_B);
  CHK_ERR(err);
  err = clSetKernelArg(matmul, 3, sizeof(int), &n);
  CHK_ERR(err);


  double t0 = timestamp();

  /* CS194: Launch matrix multiply kernel
    Here's a little code to get you started..  */
   err = clEnqueueNDRangeKernel(cv.commands, matmul, 2, NULL,
                    global_work_size, local_work_size, 0, NULL, NULL);
   CHK_ERR(err);
   err = clFinish(cv.commands);
   CHK_ERR(err);

  t0 = timestamp()-t0;


  /* Read result of GPU on host CPU */
  err = clEnqueueReadBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n*n,
			    h_Y, 0, NULL, NULL);
  CHK_ERR(err);
  err = clFinish(cv.commands);
  CHK_ERR(err);

  double t1 = timestamp();
  sqr_sgemm(h_YY, h_A, h_B, n);
  t1 = timestamp()-t1;

  for(int i = 0; i < (n*n); i++)
    {
      double d = h_YY[i] - h_Y[i];
      d *= d;
      if(d > 0.0001)
	{
	  printf("CPU and GPU results do not match!\n");
	  break;
	}
    }
  uninitialize_ocl(cv);
  
  delete [] h_A; 
  delete [] h_B; 
  delete [] h_Y;
  delete [] h_YY;

  clReleaseMemObject(g_A); 
  clReleaseMemObject(g_B); 
  clReleaseMemObject(g_Y);
  
  double gpu_flops_s = (2.0 * pow((double)n, 3.0)) / t0;
  printf("GPU: %g gflops/sec\n", gpu_flops_s / (1e9));

  double cpu_flops_s = (2.0 * pow((double)n, 3.0)) / t1;
  printf("CPU: %g gflops/sec\n", cpu_flops_s / (1e9));
  return 0;
}
示例#2
0
文件: reduce.cpp 项目: ldfaiztt/CS194
int main(int argc, char *argv[])
{
  std::string reduce_kernel_str;
  
  std::string reduce_name_str = 
    std::string("reduce");
  std::string reduce_kernel_file = 
    std::string("reduce.cl");

  cl_vars_t cv; 
  cl_kernel reduce;
  
  readFile(reduce_kernel_file,
	   reduce_kernel_str);
  
  initialize_ocl(cv);
  
  compile_ocl_program(reduce, cv, reduce_kernel_str.c_str(),
		      reduce_name_str.c_str());

  int *h_A, *h_Y;
  cl_mem g_Out, g_In;
  int n = (1<<24);

  int c;
  /* how long do you want your arrays? */
  while((c = getopt(argc, argv, "n:"))!=-1){
    switch(c){
      case 'n':
        n = atoi(optarg);
        break;
    }
  }
  
  if(n==0)
    return 0;

  // pad the array is not power of 2
  int padded_size = 1;
  
  while(padded_size < n){
    padded_size <<= 1;
  } 

  h_A = new int[padded_size];
  h_Y = new int[padded_size];

  for(int i = 0; i < n; i++){
    h_A[i] = 1;
    h_Y[i] = 0;
  }

  for (int i = n; i < padded_size; ++i)
  {
    h_A[i] = 0;
    h_Y[i] = 0;
  }

  cl_int err = CL_SUCCESS;
  g_Out = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,
			 sizeof(int)*n,NULL,&err);
  CHK_ERR(err);  
  g_In = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,
			sizeof(int)*n,NULL,&err);
  CHK_ERR(err);

  //copy data from host CPU to GPU
  err = clEnqueueWriteBuffer(cv.commands, g_Out, true, 0, sizeof(int)*n,
			     h_Y, 0, NULL, NULL);
  CHK_ERR(err);

  err = clEnqueueWriteBuffer(cv.commands, g_In, true, 0, sizeof(int)*n,
			     h_A, 0, NULL, NULL);
  CHK_ERR(err);
 
  size_t local_work_size[1] = {512};
  size_t global_work_size[1] = {padded_size};

  err = clSetKernelArg(reduce, 0, sizeof(cl_mem), &g_In);
  CHK_ERR(err);
  err = clSetKernelArg(reduce, 1, sizeof(cl_mem), &g_Out);
  CHK_ERR(err);
  err = clSetKernelArg(reduce, 2, sizeof(int)*512, NULL);
  CHK_ERR(err);
  err = clSetKernelArg(reduce, 3, sizeof(int), &padded_size);
  CHK_ERR(err);
  
  double t0 = timestamp();

  // calls the recursion function
  recursive_reduce(cv.commands, cv.context, reduce, g_In, g_Out, padded_size);
  t0 = timestamp()-t0;
  
  //read result of GPU on host CPU
  err = clEnqueueReadBuffer(cv.commands, g_Out, true, 0, sizeof(int)*n,
			    h_Y, 0, NULL, NULL);
  CHK_ERR(err);
  
  int sum=0.0f;
  for(int i = 0; i < n; i++)
  {
    sum += h_A[i];
  }

  if(sum!=h_Y[0])
  {
    printf("WRONG: CPU sum = %d, GPU sum = %d\n", sum, h_Y[0]);
    printf("WRONG: difference = %d\n", sum-h_Y[0]);
    printf("Other parts = %d, %d, %d, %d\n", h_Y[1], h_Y[2], h_Y[3], h_Y[4]);
    int z=0;
    while(h_Y[z] == h_Y[z+1]){
	z++;
    }
    printf("red: %d\n", z);
  }
  else
  {
    printf("CORRECT: %d,%g\n",n,t0);
  }

  uninitialize_ocl(cv);
  
  delete [] h_A; 
  delete [] h_Y;
  
  clReleaseMemObject(g_Out); 
  clReleaseMemObject(g_In);
  
  return 0;
}
示例#3
0
int main(int argc, char *argv[])
{
  std::string incr_kernel_str;

  /* Provide names of the OpenCL kernels
   * and cl file that they're kept in */
  std::string incr_name_str = 
    std::string("incr");
  std::string incr_kernel_file = 
    std::string("incr.cl");

  cl_vars_t cv; 
  cl_kernel incr;

  /* Read OpenCL file into STL string */
  readFile(incr_kernel_file,
	   incr_kernel_str);
  
  /* Initialize the OpenCL runtime 
   * Source in clhelp.cpp */
  initialize_ocl(cv);
  
  /* Compile all OpenCL kernels */
  compile_ocl_program(incr, cv, incr_kernel_str.c_str(),
		      incr_name_str.c_str());
  
  /* Arrays on the host (CPU) */
  float *h_Y, *h_YY;
  /* Arrays on the device (GPU) */
  cl_mem g_Y;

  // Allocate arrays on the host and fill with random data.
  int n = (1<<20);
  h_Y = new float[n];
  h_YY = new float[n];
   
  for(int i = 0; i < n; i++)
    {
      h_YY[i] = h_Y[i] = (float)drand48();
    }

  cl_int err = CL_SUCCESS;
  /* CS194: Allocate memory for arrays on 
   * the GPU */

  // Allocate the buffer memory objects.
  g_Y = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err);
  CHK_ERR(err);

  // Write data from CPU to GPU.(this is opposite of clEnqueueReadBuffer())
  err = clEnqueueWriteBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n,
			     h_Y, 0, NULL, NULL);
  CHK_ERR(err);
   
  // Define the global and local workgroup sizes.
  size_t global_work_size[1] = {n};
  size_t local_work_size[1] = {128};
    
  // Set the kernel args values.
  err = clSetKernelArg(incr, 0, sizeof(cl_mem), &g_Y);
  CHK_ERR(err);
  err = clSetKernelArg(incr, 1, sizeof(int), &n);
  CHK_ERR(err);

  // Call kernel on the GPU.
  err = clEnqueueNDRangeKernel(cv.commands,
			       incr,
			       1,//work_dim,
			       NULL, //global_work_offset
			       global_work_size, //global_work_size
			       local_work_size, //local_work_size
			       0, //num_events_in_wait_list
			       NULL, //event_wait_list
			       NULL //
			       );
  CHK_ERR(err);

  /* Read result of GPU on host CPU */
  err = clEnqueueReadBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n,
			    h_Y, 0, NULL, NULL);
  CHK_ERR(err);

  /* Check answer */
  bool er = false;
  for(int i = 0; i < n; i++)
    {
      float d = (h_YY[i] + 1.0f);
      if(h_Y[i] != d)
	{
	  printf("error at %d :(\n", i);
	  er = true;
	  break;
	}
    }
  if(!er)
    {
      printf("CPU and GPU results match\n");
    }

  uninitialize_ocl(cv);
  
  delete [] h_Y;
  delete [] h_YY;

  clReleaseMemObject(g_Y);
  
  return 0;
}
示例#4
0
文件: vvadd.cpp 项目: AtonDev/cs194
int main(int argc, char *argv[])
{
  std::string vvadd_kernel_str;

  /* Provide names of the OpenCL kernels
   * and cl file that they're kept in */
  std::string vvadd_name_str = 
    std::string("vvadd");
  std::string vvadd_kernel_file = 
    std::string("vvadd.cl");

  cl_vars_t cv; 
  cl_kernel vvadd;

  /* Read OpenCL file into STL string */
  readFile(vvadd_kernel_file,
	   vvadd_kernel_str);
  
  /* Initialize the OpenCL runtime 
   * Source in clhelp.cpp */
  initialize_ocl(cv);
  
  /* Compile all OpenCL kernels */
  compile_ocl_program(vvadd, cv, vvadd_kernel_str.c_str(),
		      vvadd_name_str.c_str());
  
  /* Arrays on the host (CPU) */
  float *h_A, *h_B, *h_Y;
  /* Arrays on the device (GPU) */
  cl_mem g_A, g_B, g_Y;

  /* Allocate arrays on the host
   * and fill with random data */
  int n = (1<<20);
  h_A = new float[n];
  h_B = new float[n];
  h_Y = new float[n];
  bzero(h_Y, sizeof(float)*n);
  
  for(int i = 0; i < n; i++)
    {
      h_A[i] = (float)drand48();
      h_B[i] = (float)drand48();
    }

  /* CS194: Allocate memory for arrays on 
   * the GPU */
  cl_int err = CL_SUCCESS;
  
  /* CS194: Here's something to get you started  */
  g_Y = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err);
  CHK_ERR(err);
  g_A = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err);
  CHK_ERR(err);
  g_B = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err);
  CHK_ERR(err);
  

  /* CS194: Copy data from host CPU to GPU */
  err = clEnqueueWriteBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n, h_Y, 0, NULL, NULL);
  CHK_ERR(err);
  err = clEnqueueWriteBuffer(cv.commands, g_A, true, 0, sizeof(float)*n, h_A, 0, NULL, NULL);
  CHK_ERR(err);
  err = clEnqueueWriteBuffer(cv.commands, g_B, true, 0, sizeof(float)*n, h_B, 0, NULL, NULL);
  CHK_ERR(err);
 
  /* CS194: Define the global and local workgroup sizes */
  size_t global_work_size[1] = {n};
  size_t local_work_size[1] = {128};
  
  /* CS194: Set Kernel Arguments */
  err  = clSetKernelArg(vvadd, 0, sizeof(cl_mem), &g_Y);
  CHK_ERR(err);
  err = clSetKernelArg(vvadd, 1, sizeof(cl_mem), &g_A);
  CHK_ERR(err);
  err = clSetKernelArg(vvadd, 2, sizeof(cl_mem), &g_B);
  CHK_ERR(err);
  err = clSetKernelArg(vvadd, 3, sizeof(int), &n);
  CHK_ERR(err);

  /* CS194: Call kernel on the GPU */
  err = clEnqueueNDRangeKernel(cv.commands,
                               vvadd,
                               1,//work_dim,
                               NULL, //global_work_offset
                               global_work_size, //global_work_size
                               local_work_size, //local_work_size
                               0, //num_events_in_wait_list
                               NULL, //event_wait_list
                               NULL //
                               );
  /* Read result of GPU on host CPU */
  err = clEnqueueReadBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n,
			    h_Y, 0, NULL, NULL);
  CHK_ERR(err);

  /* Check answer */
  for(int i = 0; i < n; i++)
    {
      float d = h_A[i] + h_B[i];
      if(h_Y[i] != d)
    	{
    	  printf("error at %d :(\n", i);
    	  break;
    	}
    }

  /* Shut down the OpenCL runtime */
  uninitialize_ocl(cv);
  
  delete [] h_A; 
  delete [] h_B; 
  delete [] h_Y;
  
  clReleaseMemObject(g_A); 
  clReleaseMemObject(g_B); 
  clReleaseMemObject(g_Y);
  
  return 0;
}
示例#5
0
文件: incr.cpp 项目: AtonDev/cs194
int main(int argc, char *argv[])
{
  std::string incr_kernel_str;

  /* Provide names of the OpenCL kernels
   * and cl file that they're kept in */
  std::string incr_name_str =
    std::string("incr");
  std::string incr_kernel_file =
    std::string("incr.cl");


  cl_vars_t cv;
  cl_kernel incr;

  /* Read OpenCL file into STL string */
  readFile(incr_kernel_file,
	   incr_kernel_str);

  /* Initialize the OpenCL runtime
   * Source in clhelp.cpp */
  initialize_ocl(cv);

  /* Compile all OpenCL kernels */
  compile_ocl_program(incr, cv, incr_kernel_str.c_str(),
		      incr_name_str.c_str());

  /* Arrays on the host (CPU) */
  float *h_Y, *h_YY;
  /* Arrays on the device (GPU) */
  cl_mem g_Y;

  int n = (1<<20);
  h_Y = new float[n];
  h_YY = new float[n];

  for(int i = 0; i < n; i++)
    {
      h_YY[i] = h_Y[i] = (float)drand48();
    }

  cl_int err = CL_SUCCESS;
  /* CS194: Allocate memory for arrays on
   * the GPU */
  /* Creates a buffer in the cv.context context, with read and write access
   * at the global host adress g_Y, of size sizeof(float)*n. */
  g_Y = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err);
  CHK_ERR(err);

  /* enqueue commands to write to the buffer g_Y from hos memory.
   * Commands will be queued in cv.commands.
   * true indicates that the write is put on the commands queue.
   * 0 is the offset in bytes in the buffer object to write to.
   * sizeof(float)*n is the size in byte of data being wirtten.
   * h_Y is the address in host memory of the data being written from.
   */
   err = clEnqueueWriteBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n,
			     h_Y, 0, NULL, NULL);
   /* checks whether the write buffer command was succesful. */
  CHK_ERR(err);

  /* declaring the global size of th y dimension to be n. */
  size_t global_work_size[1] = {n};
  /* declaring the size of work groups to be 128 work items. */
  size_t local_work_size[1] = {128};

  /* Sets specific arguments for the kernel incr.
   * 0 is the argument index, sizeof(cl_mem) is the size
   * of the argument, which is the pointer to g_Y.*/
  err = clSetKernelArg(incr, 0, sizeof(cl_mem), &g_Y);
  CHK_ERR(err);

  /* Sets specific arguments for the kernel incr.
   * 1 is the argument index, sizeof(int) is the size
   * of the argument, which is the pointer to n.*/
  err = clSetKernelArg(incr, 1, sizeof(int), &n);
  CHK_ERR(err);

  /* Enqueues a command on cv.commands to execute the
   * kernel incr.cl on the device. Uses linear dimension
   * to specify work groups and items and specifies to use
   * global_work_size work items for the execution and local_work_size
   * as the size of a work group.  */
  err = clEnqueueNDRangeKernel(cv.commands,
			       incr,
			       1,//work_dim,
			       NULL, //global_work_offset
			       global_work_size, //global_work_size
			       local_work_size, //local_work_size
			       0, //num_events_in_wait_list
			       NULL, //event_wait_list
			       NULL //
			       );
  CHK_ERR(err);

  /* Read result of GPU on host CPU */
  err = clEnqueueReadBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n,
			    h_Y, 0, NULL, NULL);
  CHK_ERR(err);

  /* Check answer */
  bool er = false;
  for(int i = 0; i < n; i++)
    {
      float d = (h_YY[i] + 1.0f);
      if(h_Y[i] != d)
	{
	  printf("error at %d :(\n", i);
	  er = true;
	  break;
	}
    }
  if(!er)
    {
      printf("CPU and GPU results match\n");
    }

  uninitialize_ocl(cv);

  delete [] h_Y;
  delete [] h_YY;

  clReleaseMemObject(g_Y);

  return 0;
}
示例#6
0
/*
 * Seam-carving driver: parses the command line, sets up OpenCL, loads an
 * image, carves it and writes the result to "scarved.jpg".
 *
 * Command line:
 *   -f <path>   input image (required)
 *   -h <count>  value stored in cut_horizontal
 *   -v <count>  value stored in cut_vertical
 *   -t          print the elapsed carving time
 *   -s          show the original and the carved image in windows
 *
 * Returns 0 on success and -1 when no readable input image was given; an
 * unknown option terminates the program with exit(1).
 *
 * NOTE: the "..." tokens below are elisions that are part of this excerpt
 * (kernel name, carving call, buffer cleanup); they are kept verbatim.
 */
int main(int argc, char **argv) 
{
    /* getopt() returns int. Storing it in a char breaks the != -1 test on
     * platforms where char is unsigned, so the loop would never end. */
    int c;
    /* Stays NULL when -f is not supplied */
    char *filepath = NULL;
    int cut_horizontal = 0;
    int cut_vertical = 0;
    int timed = 0;
    int show = 0;

    while ((c = getopt(argc, argv, "f:h:v:ts")) != -1) {
        switch (c) {
        case 'f': filepath = optarg; break;
        case 'h': cut_horizontal = (int)strtol(optarg, NULL, 10); break;
        case 'v': cut_vertical = (int)strtol(optarg, NULL, 10); break;
        case 't': timed = 1; break;
        case 's': show = 1; break;
        default: exit(1); 
        }
    }

    /* Without -f there is nothing to load; bail out before any OpenCL state
     * is created and before the pointer reaches imread(). */
    if (filepath == NULL) {
        cout << "Invalid input";
        return -1;
    }

    // OpenCL boilerplate
    std::string ..._kernel_str;

    std::string ..._name_str = std::string("...");
    std::string ..._kernel_file = std::string("...");

    cl_vars_t cv; 
    cl_kernel ...;

    readFile(..._kernel_file, ..._kernel_str);

    initialize_ocl(cv);

    compile_ocl_program(..., cv, ..._kernel_str.c_str(), ..._name_str.c_str());

    // Read image
    Mat_<Vec3b> image = imread(filepath);

    if (!image.data) {
        cout << "Invalid input";
        image.release();
        // The runtime is already up at this point, so shut it down again
        uninitialize_ocl(cv);
        return -1;
    }

    if (show) {
        imshow("Original Image", image);
    }

    SeamCarver s(image);

    // imshow("Gradient", s.energy);
    // Mat tmp = s.energy/195075.0*255.0;
    // s.energy.convertTo(tmp,CV_8U,-1);
    // imwrite("bench_gradient.jpg", tmp);
    // vector<uint> sm = s.findVerticalSeam();
    // s.showVerticalSeam(sm);


    // Carving happens here
    double start = get_time();
    ...;
    double elapsed = get_time() - start;
    // --------------------

    // double start = get_time();
    // for (int i = 0; i < cut_horizontal; ++i) {
    //     vector<uint> seam = s.findHorizontalSeam();
    //     // s.showHorizontalSeam(seam);
    //     s.removeHorizontalSeam(seam);
    // }
    // for (int i = 0; i < cut_vertical; ++i) {
    //     vector<uint> seam = s.findVerticalSeam();
    //     // s.showVerticalSeam(seam);
    //     s.removeVerticalSeam(seam);
    // }
    // double elapsed = get_time() - start;

    if (timed) {
        printf("Elapsed time: %.3lf seconds\n", elapsed);
    }

    Mat_<Vec3b> output = s.getImage();
    imwrite("scarved.jpg", output);

    if (show) {
        imshow("Carved Image", output);
        // Keep the windows open until key code 27 (Escape) is pressed
        while (waitKey(20) != 27);
    }

    // cout << "Seam Length: " << seam.size() << endl;
    // s.showImage();
    // s.showEnergy();

    // imwrite("bench_carved.jpg", s.getImage());

    // for (int i = 0; i < 5; ++i) {
    //     for (int j = 0; j < 5; ++j) {
    //         cout << s.energy.at<uint32_t>(i,j) << " ";
    //     }
    //     cout << endl;
    // }

    uninitialize_ocl(cv);

    ...;

    clReleaseMemObject(...); 

    image.release();

    return 0;
}