cl_vars_t setupRuntime(kernel* kernels, std::map<std::string, cl_kernel>* 
    kernel_map, int num_kerns)
{
  /* Variable-length arrays of std::string are non-standard C++;
   * use runtime-sized vectors instead. */
  std::vector<std::string> kernel_source_str(num_kerns - 1);
  std::vector<std::string> arraycompact_kernel_file(num_kerns - 1);
  cl_vars_t cv;
  std::list<std::string> kernel_names; 
  // Get the names of the kernel files (kernels[0] is skipped).
  for (int i = 1; i<num_kerns; i++) {
    arraycompact_kernel_file[i-1] = kernels[i].name + ".cl";
    kernel_names.push_back(kernels[i].name);
  }

  cl_int err = CL_SUCCESS;
 

  // Read the kernel files.
  readFile(arraycompact_kernel_file.data(),
           kernel_source_str.data(), num_kerns - 1);
  
  initialize_ocl(cv);
  
  compile_ocl_program(*kernel_map, cv,
                      kernel_source_str.data(),
                      num_kerns - 1,
                      kernel_names);

  return cv;
}
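For reference, a minimal caller sketch. The kernel-descriptor type and helper header are not shown in this example, so the struct kernel, the "clhelp.h" include, and the kernel names below are hypothetical stand-ins; only the .name member is actually used by setupRuntime, and the map is presumably filled by compile_ocl_program keyed by kernel name.

#include <map>
#include <string>
#include "clhelp.h"   /* assumed helper header providing cl_vars_t, readFile, etc. */

/* Hypothetical descriptor type; only .name is used by setupRuntime above. */
struct kernel {
  std::string name;
};

int main()
{
  /* kernels[0] is skipped by setupRuntime, so it can be a placeholder. */
  kernel kernels[3] = { {"host_only"}, {"scan"}, {"compact"} };
  std::map<std::string, cl_kernel> kernel_map;

  cl_vars_t cv = setupRuntime(kernels, &kernel_map, 3);

  /* kernel_map["scan"] and kernel_map["compact"] should now hold the
   * kernels compiled from scan.cl and compact.cl. */

  uninitialize_ocl(cv);
  return 0;
}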
Example #2
int main(int argc, char *argv[])
{
  std::string matmul_kernel_str;
 
  /* Provide names of the OpenCL kernels
   * and cl file that they're kept in */
  std::string matmul_name_str = 
    std::string("matmul");
  std::string matmul_kernel_file = 
    std::string("matmul.cl");

  cl_vars_t cv; 
  cl_kernel matmul;

  /* Read OpenCL file into STL string */
  readFile(matmul_kernel_file,
	   matmul_kernel_str);
  
  /* Initialize the OpenCL runtime 
   * Source in clhelp.cpp */
  initialize_ocl(cv);
  
  // Compile all OpenCL kernels.
  compile_ocl_program(matmul, cv, matmul_kernel_str.c_str(),
		      matmul_name_str.c_str());
  
  // Arrays on the host (CPU)
  float *h_A, *h_B, *h_Y, *h_YY;
  // Arrays on the device (GPU)
  cl_mem g_A, g_B, g_Y;

  /* Allocate arrays on the host
   * and fill with random data */
  int n = (1<<10);
  h_A = new float[n*n];
  assert(h_A);
  h_B = new float[n*n];
  assert(h_B);
  h_Y = new float[n*n];
  assert(h_Y);
  h_YY = new float[n*n];
  assert(h_YY);
  bzero(h_Y, sizeof(float)*n*n);
  bzero(h_YY, sizeof(float)*n*n);
  
  for(int i = 0; i < (n*n); i++)
    {
      h_A[i] = (float)drand48();
      h_B[i] = (float)drand48();
    }

  // Allocate memory for arrays on the GPU
  cl_int err = CL_SUCCESS;

  /* CS194: Allocate buffers on the GPU.
   * The Y buffer is already allocated for you. */
  g_Y = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,
		       sizeof(float)*n*n,NULL,&err);
  CHK_ERR(err);
  g_A = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,
           sizeof(float)*n*n,NULL,&err);
  CHK_ERR(err);
  g_B = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,
           sizeof(float)*n*n,NULL,&err);
  CHK_ERR(err);
  
  /* CS194: Copy data from host CPU to GPU */
  err = clEnqueueWriteBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n*n,
            h_Y, 0, NULL, NULL);
  CHK_ERR(err);
  err = clEnqueueWriteBuffer(cv.commands, g_A, true, 0, sizeof(float)*n*n,
            h_A, 0, NULL, NULL);
  CHK_ERR(err);
  err = clEnqueueWriteBuffer(cv.commands, g_B, true, 0, sizeof(float)*n*n,
            h_B, 0, NULL, NULL);
  CHK_ERR(err);


  /* CS194: Create appropriately sized workgroups */
  size_t global_work_size[2] = {(size_t)n, (size_t)n};
  size_t local_work_size[2] = {4, 4};
  
  /* CS194: Set kernel arguments */
  err = clSetKernelArg(matmul, 0, sizeof(cl_mem), &g_Y);
  CHK_ERR(err);
  err = clSetKernelArg(matmul, 1, sizeof(cl_mem), &g_A);
  CHK_ERR(err);
  err = clSetKernelArg(matmul, 2, sizeof(cl_mem), &g_B);
  CHK_ERR(err);
  err = clSetKernelArg(matmul, 3, sizeof(int), &n);
  CHK_ERR(err);


  double t0 = timestamp();

  /* CS194: Launch the matrix-multiply kernel.
   * Here's a little code to get you started. */
  err = clEnqueueNDRangeKernel(cv.commands, matmul, 2, NULL,
                               global_work_size, local_work_size, 0, NULL, NULL);
  CHK_ERR(err);
  err = clFinish(cv.commands);
  CHK_ERR(err);

  t0 = timestamp()-t0;


  /* Read result of GPU on host CPU */
  err = clEnqueueReadBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n*n,
			    h_Y, 0, NULL, NULL);
  CHK_ERR(err);
  err = clFinish(cv.commands);
  CHK_ERR(err);

  double t1 = timestamp();
  sqr_sgemm(h_YY, h_A, h_B, n);
  t1 = timestamp()-t1;

  for(int i = 0; i < (n*n); i++)
    {
      double d = h_YY[i] - h_Y[i];
      d *= d;
      if(d > 0.0001)
	{
	  printf("CPU and GPU results do not match!\n");
	  break;
	}
    }
  uninitialize_ocl(cv);
  
  delete [] h_A; 
  delete [] h_B; 
  delete [] h_Y;
  delete [] h_YY;

  clReleaseMemObject(g_A); 
  clReleaseMemObject(g_B); 
  clReleaseMemObject(g_Y);
  
  double gpu_flops_s = (2.0 * pow((double)n, 3.0)) / t0;
  printf("GPU: %g gflops/sec\n", gpu_flops_s / (1e9));

  double cpu_flops_s = (2.0 * pow((double)n, 3.0)) / t1;
  printf("CPU: %g gflops/sec\n", cpu_flops_s / (1e9));
  return 0;
}
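The host code above assumes a "matmul" kernel taking (Y, A, B, n) in that order and launched over an n-by-n two-dimensional NDRange. matmul.cl itself is not shown, so the following is only a sketch consistent with that launch configuration; the row/column mapping of the two NDRange dimensions is an assumption.

/* Hypothetical matmul.cl: one work-item per output element.
 * Argument order matches the clSetKernelArg calls above: Y, A, B, n. */
__kernel void matmul(__global float *Y,
                     __global const float *A,
                     __global const float *B,
                     int n)
{
  int col = get_global_id(0);   /* assumed: dimension 0 indexes columns */
  int row = get_global_id(1);   /* assumed: dimension 1 indexes rows */
  if (row < n && col < n) {
    float acc = 0.0f;
    for (int k = 0; k < n; k++)
      acc += A[row * n + k] * B[k * n + col];
    Y[row * n + col] = acc;
  }
}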
Example #3
int main(int argc, char *argv[])
{
  std::string reduce_kernel_str;
  
  std::string reduce_name_str = 
    std::string("reduce");
  std::string reduce_kernel_file = 
    std::string("reduce.cl");

  cl_vars_t cv; 
  cl_kernel reduce;
  
  readFile(reduce_kernel_file,
	   reduce_kernel_str);
  
  initialize_ocl(cv);
  
  compile_ocl_program(reduce, cv, reduce_kernel_str.c_str(),
		      reduce_name_str.c_str());

  int *h_A, *h_Y;
  cl_mem g_Out, g_In;
  int n = (1<<24);

  int c;
  /* how long do you want your arrays? */
  while((c = getopt(argc, argv, "n:"))!=-1){
    switch(c){
      case 'n':
        n = atoi(optarg);
        break;
    }
  }
  
  if(n==0)
    return 0;

  // Pad the array if n is not a power of 2.
  int padded_size = 1;
  
  while(padded_size < n){
    padded_size <<= 1;
  } 

  h_A = new int[padded_size];
  h_Y = new int[padded_size];

  for(int i = 0; i < n; i++){
    h_A[i] = 1;
    h_Y[i] = 0;
  }

  for (int i = n; i < padded_size; ++i)
  {
    h_A[i] = 0;
    h_Y[i] = 0;
  }

  cl_int err = CL_SUCCESS;
  /* Size the buffers and copies by padded_size (not n): the kernel is
   * launched over padded_size work-items, so the padding zeros must be
   * resident on the device as well. */
  g_Out = clCreateBuffer(cv.context, CL_MEM_READ_WRITE,
                         sizeof(int)*padded_size, NULL, &err);
  CHK_ERR(err);
  g_In = clCreateBuffer(cv.context, CL_MEM_READ_WRITE,
                        sizeof(int)*padded_size, NULL, &err);
  CHK_ERR(err);

  // Copy data from host CPU to GPU.
  err = clEnqueueWriteBuffer(cv.commands, g_Out, true, 0, sizeof(int)*padded_size,
                             h_Y, 0, NULL, NULL);
  CHK_ERR(err);

  err = clEnqueueWriteBuffer(cv.commands, g_In, true, 0, sizeof(int)*padded_size,
                             h_A, 0, NULL, NULL);
  CHK_ERR(err);
 
  size_t local_work_size[1] = {512};
  size_t global_work_size[1] = {(size_t)padded_size};

  err = clSetKernelArg(reduce, 0, sizeof(cl_mem), &g_In);
  CHK_ERR(err);
  err = clSetKernelArg(reduce, 1, sizeof(cl_mem), &g_Out);
  CHK_ERR(err);
  err = clSetKernelArg(reduce, 2, sizeof(int)*512, NULL);
  CHK_ERR(err);
  err = clSetKernelArg(reduce, 3, sizeof(int), &padded_size);
  CHK_ERR(err);
  
  double t0 = timestamp();

  // Call the recursive reduction helper.
  recursive_reduce(cv.commands, cv.context, reduce, g_In, g_Out, padded_size);
  t0 = timestamp()-t0;
  
  //read result of GPU on host CPU
  err = clEnqueueReadBuffer(cv.commands, g_Out, true, 0, sizeof(int)*n,
			    h_Y, 0, NULL, NULL);
  CHK_ERR(err);
  
  int sum = 0;
  for(int i = 0; i < n; i++)
  {
    sum += h_A[i];
  }

  if(sum!=h_Y[0])
  {
    printf("WRONG: CPU sum = %d, GPU sum = %d\n", sum, h_Y[0]);
    printf("WRONG: difference = %d\n", sum-h_Y[0]);
    printf("Other parts = %d, %d, %d, %d\n", h_Y[1], h_Y[2], h_Y[3], h_Y[4]);
    int z=0;
    while(z + 1 < n && h_Y[z] == h_Y[z+1]){
      z++;
    }
    printf("red: %d\n", z);
  }
  else
  {
    printf("CORRECT: %d,%g\n",n,t0);
  }

  uninitialize_ocl(cv);
  
  delete [] h_A; 
  delete [] h_Y;
  
  clReleaseMemObject(g_Out); 
  clReleaseMemObject(g_In);
  
  return 0;
}
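The host code sets the reduce arguments to (g_In, g_Out, 512 ints of __local scratch, padded_size) and uses 512-item work-groups; recursive_reduce is not shown, but its name suggests it re-launches the kernel on the partial sums until a single value remains in g_Out[0]. Below is a sketch of a standard local-memory tree reduction consistent with that signature; it is an assumption, since reduce.cl is not included here.

/* Hypothetical reduce.cl: each work-group reduces its slice of the input
 * into one partial sum, written at out[group_id]. */
__kernel void reduce(__global const int *in,
                     __global int *out,
                     __local int *scratch,
                     int n)
{
  int gid = get_global_id(0);
  int lid = get_local_id(0);

  /* Load one element per work-item; out-of-range items contribute 0. */
  scratch[lid] = (gid < n) ? in[gid] : 0;
  barrier(CLK_LOCAL_MEM_FENCE);

  /* Tree reduction within the work-group. */
  for (int offset = (int)get_local_size(0) / 2; offset > 0; offset >>= 1) {
    if (lid < offset)
      scratch[lid] += scratch[lid + offset];
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  if (lid == 0)
    out[get_group_id(0)] = scratch[0];
}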
Example #4
void initialize_ocl(cl_vars_t& cv)
{
  cl_uint num_platforms;
  cv.err = clGetPlatformIDs(1, &(cv.platform), &(num_platforms));
  if(cv.err != CL_SUCCESS)
  {
    std::cout << "Could not get platform ID" << std::endl;
    exit(1);
  }

  if(getenv("HM_CPU0"))
  {
    std::cout << "Running on CPU 0" << std::endl;
    cl_uint max_devices = 1;
    cv.err = clGetDeviceIDs(cv.platform, CL_DEVICE_TYPE_CPU, max_devices, cv.device_ids, &(cv.num_devices));
    cv.num_devices = 1;
  }
  else if(getenv("HM_CPU0_SUB1"))
  {
    std::cout << "Running on Subdivided1 CPU 0" << std::endl;
    cl_uint max_devices = 1;
    cl_device_id dev0;
    cv.err = clGetDeviceIDs(cv.platform, CL_DEVICE_TYPE_CPU, max_devices, &dev0, &(cv.num_devices));
    cl_uint num_subdevices;
    cl_device_partition_property props[3];
    props[0] = CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN;
    props[1] = CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE;
    props[2] = 0;
    cl_device_id id4[4];
    cv.err = clCreateSubDevices(dev0, props, 2, id4, &num_subdevices);
    std::cout << "num subdevices: " << num_subdevices << std::endl;
    cv.device_ids[0] = id4[1];
    cv.num_devices = 1;
  }
  else if(getenv("HM_GPU0"))
  {
    std::cout << "Running on GPU 0" << std::endl;
    cl_uint max_devices = 1;
    cv.err = clGetDeviceIDs(cv.platform, CL_DEVICE_TYPE_GPU, max_devices, cv.device_ids, &(cv.num_devices));
    cv.num_devices = 1;
  }
  else if(getenv("HM_GPU01"))
  {
    std::cout << "Running on GPU 0 and GPU 1" << std::endl;
    cl_uint max_devices = 2;
    cv.err = clGetDeviceIDs(cv.platform, CL_DEVICE_TYPE_GPU, max_devices, cv.device_ids, &(cv.num_devices));
    cv.num_devices = 2;
  }
  else if(getenv("HM_GPU1"))
  {
    std::cout << "Running on GPU 1" << std::endl;
    cl_uint max_devices = 2;
    cv.err = clGetDeviceIDs(cv.platform, CL_DEVICE_TYPE_GPU, max_devices, cv.device_ids, &(cv.num_devices));
    assert(cv.num_devices > 1);
    cv.device_ids[0] = cv.device_ids[1];
    cv.num_devices = 1;
  }
  else
  {
    std::cout << "Error: Specify a target device via HM_CPU0, HM_CPU0_SUB1, HM_GPU0, HM_GPU01, or HM_GPU1" << std::endl;
    exit(1);
  }
  if(cv.err != CL_SUCCESS)
  {
    std::cout << "Could not get GPU device ID" << std::endl;
    exit(1);
  }

  cv.context = clCreateContext(0, cv.num_devices, cv.device_ids, NULL, NULL, &(cv.err));
  if(!cv.context)
  {
    std::cout << "Could not create context" << std::endl;
    exit(1);
  }

  //cv.commands = clCreateCommandQueue(cv.context, cv.device_id, 0, &(cv.err));
  for(size_t devId = 0 ; devId < cv.num_devices ; devId++)
  {
    cv.commands[devId] = clCreateCommandQueue(cv.context, cv.device_ids[devId], CL_QUEUE_PROFILING_ENABLE, &(cv.err));
    if(!cv.commands[devId])
    {
      std::cout << "Could not create command queue" << std::endl;
      exit(1);
    }
  }
  compile_ocl_program(cv.memset_program, cv.memset_kernel, cv, memset_kernel_str, "memset_kernel");

#ifdef VERBOSE_COMPILATION
  docs.opencl_ss << "CL fill vars success" << std::endl;

  // Device info
  for(size_t devId = 0 ; devId < cv.num_devices ; devId++)
  {
    docs.opencl_ss << "Device ID: " << devId << std::endl;

    char device_name[255];
    cv.err = clGetDeviceInfo(cv.device_ids[devId], CL_DEVICE_NAME, 255, device_name, NULL);
    docs.opencl_ss << "Device Name: " << device_name << std::endl;

    cl_ulong mem_size;
    cv.err = clGetDeviceInfo(cv.device_ids[devId], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &mem_size, NULL);
    docs.opencl_ss << "Global mem size: " << mem_size << std::endl;

    size_t max_work_item[3];
    cv.err = clGetDeviceInfo(cv.device_ids[devId], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_work_item), max_work_item, NULL);
    docs.opencl_ss << "Max work item sizes: " << max_work_item[0] << ", " << max_work_item[1] << ", " << max_work_item[2] << std::endl;
  }
#endif
}
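initialize_ocl fills in a cl_vars_t defined in the helper header, which is not shown. From the fields used above it looks roughly like the reconstruction below; the array bound and any additional members are assumptions. Note that the single-queue examples earlier pass cv.commands directly as a cl_command_queue, so they presumably come from a variant of the helper where commands is a scalar rather than a per-device array.

/* Rough reconstruction of cl_vars_t from its usage above; not the actual
 * definition. MAX_CL_DEVICES is an assumed constant. */
#define MAX_CL_DEVICES 8

typedef struct {
  cl_int           err;
  cl_platform_id   platform;
  cl_device_id     device_ids[MAX_CL_DEVICES];
  cl_uint          num_devices;
  cl_context       context;
  cl_command_queue commands[MAX_CL_DEVICES]; /* one queue per device */
  cl_program       memset_program;           /* built at the end of initialize_ocl */
  cl_kernel        memset_kernel;
} cl_vars_t;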
Example #5
int main(int argc, char *argv[])
{
  std::string incr_kernel_str;

  /* Provide names of the OpenCL kernels
   * and cl file that they're kept in */
  std::string incr_name_str = 
    std::string("incr");
  std::string incr_kernel_file = 
    std::string("incr.cl");

  cl_vars_t cv; 
  cl_kernel incr;

  /* Read OpenCL file into STL string */
  readFile(incr_kernel_file,
	   incr_kernel_str);
  
  /* Initialize the OpenCL runtime 
   * Source in clhelp.cpp */
  initialize_ocl(cv);
  
  /* Compile all OpenCL kernels */
  compile_ocl_program(incr, cv, incr_kernel_str.c_str(),
		      incr_name_str.c_str());
  
  /* Arrays on the host (CPU) */
  float *h_Y, *h_YY;
  /* Arrays on the device (GPU) */
  cl_mem g_Y;

  // Allocate arrays on the host and fill with random data.
  int n = (1<<20);
  h_Y = new float[n];
  h_YY = new float[n];
   
  for(int i = 0; i < n; i++)
    {
      h_YY[i] = h_Y[i] = (float)drand48();
    }

  cl_int err = CL_SUCCESS;
  /* CS194: Allocate memory for arrays on 
   * the GPU */

  // Allocate the buffer memory objects.
  g_Y = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err);
  CHK_ERR(err);

  // Write data from CPU to GPU (the opposite direction of clEnqueueReadBuffer()).
  err = clEnqueueWriteBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n,
			     h_Y, 0, NULL, NULL);
  CHK_ERR(err);
   
  // Define the global and local workgroup sizes.
  size_t global_work_size[1] = {(size_t)n};
  size_t local_work_size[1] = {128};
    
  // Set the kernel args values.
  err = clSetKernelArg(incr, 0, sizeof(cl_mem), &g_Y);
  CHK_ERR(err);
  err = clSetKernelArg(incr, 1, sizeof(int), &n);
  CHK_ERR(err);

  // Call kernel on the GPU.
  err = clEnqueueNDRangeKernel(cv.commands,
			       incr,
			       1,//work_dim,
			       NULL, //global_work_offset
			       global_work_size, //global_work_size
			       local_work_size, //local_work_size
			       0, //num_events_in_wait_list
			       NULL, //event_wait_list
			       NULL //
			       );
  CHK_ERR(err);

  /* Read result of GPU on host CPU */
  err = clEnqueueReadBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n,
			    h_Y, 0, NULL, NULL);
  CHK_ERR(err);

  /* Check answer */
  bool er = false;
  for(int i = 0; i < n; i++)
    {
      float d = (h_YY[i] + 1.0f);
      if(h_Y[i] != d)
	{
	  printf("error at %d :(\n", i);
	  er = true;
	  break;
	}
    }
  if(!er)
    {
      printf("CPU and GPU results match\n");
    }

  uninitialize_ocl(cv);
  
  delete [] h_Y;
  delete [] h_YY;

  clReleaseMemObject(g_Y);
  
  return 0;
}
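The verification loop expects every element of h_Y to grow by exactly 1.0f, so incr.cl (not shown) is presumably a per-element increment matching the argument list (Y, n) set above. A minimal sketch under that assumption:

/* Hypothetical incr.cl: one work-item per element, guarded by n. */
__kernel void incr(__global float *Y, int n)
{
  int i = get_global_id(0);
  if (i < n)
    Y[i] += 1.0f;
}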
Example #6
int main(int argc, char *argv[])
{
  std::string vvadd_kernel_str;

  /* Provide names of the OpenCL kernels
   * and cl file that they're kept in */
  std::string vvadd_name_str = 
    std::string("vvadd");
  std::string vvadd_kernel_file = 
    std::string("vvadd.cl");

  cl_vars_t cv; 
  cl_kernel vvadd;

  /* Read OpenCL file into STL string */
  readFile(vvadd_kernel_file,
	   vvadd_kernel_str);
  
  /* Initialize the OpenCL runtime 
   * Source in clhelp.cpp */
  initialize_ocl(cv);
  
  /* Compile all OpenCL kernels */
  compile_ocl_program(vvadd, cv, vvadd_kernel_str.c_str(),
		      vvadd_name_str.c_str());
  
  /* Arrays on the host (CPU) */
  float *h_A, *h_B, *h_Y;
  /* Arrays on the device (GPU) */
  cl_mem g_A, g_B, g_Y;

  /* Allocate arrays on the host
   * and fill with random data */
  int n = (1<<20);
  h_A = new float[n];
  h_B = new float[n];
  h_Y = new float[n];
  bzero(h_Y, sizeof(float)*n);
  
  for(int i = 0; i < n; i++)
    {
      h_A[i] = (float)drand48();
      h_B[i] = (float)drand48();
    }

  /* CS194: Allocate memory for arrays on 
   * the GPU */
  cl_int err = CL_SUCCESS;
  
  /* CS194: Here's something to get you started  */
  g_Y = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err);
  CHK_ERR(err);
  g_A = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err);
  CHK_ERR(err);
  g_B = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err);
  CHK_ERR(err);
  

  /* CS194: Copy data from host CPU to GPU */
  err = clEnqueueWriteBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n, h_Y, 0, NULL, NULL);
  CHK_ERR(err);
  err = clEnqueueWriteBuffer(cv.commands, g_A, true, 0, sizeof(float)*n, h_A, 0, NULL, NULL);
  CHK_ERR(err);
  err = clEnqueueWriteBuffer(cv.commands, g_B, true, 0, sizeof(float)*n, h_B, 0, NULL, NULL);
  CHK_ERR(err);
 
  /* CS194: Define the global and local workgroup sizes */
  size_t global_work_size[1] = {(size_t)n};
  size_t local_work_size[1] = {128};
  
  /* CS194: Set Kernel Arguments */
  err  = clSetKernelArg(vvadd, 0, sizeof(cl_mem), &g_Y);
  CHK_ERR(err);
  err = clSetKernelArg(vvadd, 1, sizeof(cl_mem), &g_A);
  CHK_ERR(err);
  err = clSetKernelArg(vvadd, 2, sizeof(cl_mem), &g_B);
  CHK_ERR(err);
  err = clSetKernelArg(vvadd, 3, sizeof(int), &n);
  CHK_ERR(err);

  /* CS194: Call kernel on the GPU */
  err = clEnqueueNDRangeKernel(cv.commands,
                               vvadd,
                               1,//work_dim,
                               NULL, //global_work_offset
                               global_work_size, //global_work_size
                               local_work_size, //local_work_size
                               0, //num_events_in_wait_list
                               NULL, //event_wait_list
                               NULL //
                               );
  CHK_ERR(err);

  /* Read result of GPU on host CPU */
  err = clEnqueueReadBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n,
			    h_Y, 0, NULL, NULL);
  CHK_ERR(err);

  /* Check answer */
  for(int i = 0; i < n; i++)
    {
      float d = h_A[i] + h_B[i];
      if(h_Y[i] != d)
    	{
    	  printf("error at %d :(\n", i);
    	  break;
    	}
    }

  /* Shut down the OpenCL runtime */
  uninitialize_ocl(cv);
  
  delete [] h_A; 
  delete [] h_B; 
  delete [] h_Y;
  
  clReleaseMemObject(g_A); 
  clReleaseMemObject(g_B); 
  clReleaseMemObject(g_Y);
  
  return 0;
}
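The check at the end compares h_Y[i] against h_A[i] + h_B[i], so vvadd.cl (not shown) is presumably an element-wise add with the argument order (Y, A, B, n) set above. A minimal sketch under that assumption:

/* Hypothetical vvadd.cl: Y[i] = A[i] + B[i], one work-item per element. */
__kernel void vvadd(__global float *Y,
                    __global const float *A,
                    __global const float *B,
                    int n)
{
  int i = get_global_id(0);
  if (i < n)
    Y[i] = A[i] + B[i];
}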
Example #7
int main(int argc, char *argv[])
{
  std::string incr_kernel_str;

  /* Provide names of the OpenCL kernels
   * and cl file that they're kept in */
  std::string incr_name_str =
    std::string("incr");
  std::string incr_kernel_file =
    std::string("incr.cl");


  cl_vars_t cv;
  cl_kernel incr;

  /* Read OpenCL file into STL string */
  readFile(incr_kernel_file,
	   incr_kernel_str);

  /* Initialize the OpenCL runtime
   * Source in clhelp.cpp */
  initialize_ocl(cv);

  /* Compile all OpenCL kernels */
  compile_ocl_program(incr, cv, incr_kernel_str.c_str(),
		      incr_name_str.c_str());

  /* Arrays on the host (CPU) */
  float *h_Y, *h_YY;
  /* Arrays on the device (GPU) */
  cl_mem g_Y;

  int n = (1<<20);
  h_Y = new float[n];
  h_YY = new float[n];

  for(int i = 0; i < n; i++)
    {
      h_YY[i] = h_Y[i] = (float)drand48();
    }

  cl_int err = CL_SUCCESS;
  /* CS194: Allocate memory for arrays on
   * the GPU */
  /* Create a buffer of size sizeof(float)*n in the cv.context context,
   * with read/write access; the device memory handle is stored in g_Y. */
  g_Y = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err);
  CHK_ERR(err);

  /* Enqueue a command on cv.commands to write to the buffer g_Y from host
   * memory.
   * 'true' makes this a blocking write.
   * 0 is the offset in bytes into the buffer object to write to.
   * sizeof(float)*n is the size in bytes of the data being written.
   * h_Y is the address in host memory that the data is read from.
   */
  err = clEnqueueWriteBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n,
                             h_Y, 0, NULL, NULL);
  /* Check whether the write-buffer command was successful. */
  CHK_ERR(err);

  /* The global work size in the single NDRange dimension is n. */
  size_t global_work_size[1] = {(size_t)n};
  /* Each work-group contains 128 work-items. */
  size_t local_work_size[1] = {128};

  /* Set argument 0 of the kernel incr: sizeof(cl_mem) is the size of the
   * argument value, which is passed via a pointer to g_Y. */
  err = clSetKernelArg(incr, 0, sizeof(cl_mem), &g_Y);
  CHK_ERR(err);

  /* Set argument 1 of the kernel incr: sizeof(int) is the size of the
   * argument value, which is passed via a pointer to n. */
  err = clSetKernelArg(incr, 1, sizeof(int), &n);
  CHK_ERR(err);

  /* Enqueue a command on cv.commands to execute the kernel incr on the
   * device, using a one-dimensional NDRange: global_work_size work-items
   * in total, grouped into work-groups of local_work_size items. */
  err = clEnqueueNDRangeKernel(cv.commands,
			       incr,
			       1,//work_dim,
			       NULL, //global_work_offset
			       global_work_size, //global_work_size
			       local_work_size, //local_work_size
			       0, //num_events_in_wait_list
			       NULL, //event_wait_list
			       NULL //
			       );
  CHK_ERR(err);

  /* Read result of GPU on host CPU */
  err = clEnqueueReadBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n,
			    h_Y, 0, NULL, NULL);
  CHK_ERR(err);

  /* Check answer */
  bool er = false;
  for(int i = 0; i < n; i++)
    {
      float d = (h_YY[i] + 1.0f);
      if(h_Y[i] != d)
	{
	  printf("error at %d :(\n", i);
	  er = true;
	  break;
	}
    }
  if(!er)
    {
      printf("CPU and GPU results match\n");
    }

  uninitialize_ocl(cv);

  delete [] h_Y;
  delete [] h_YY;

  clReleaseMemObject(g_Y);

  return 0;
}