static void build_program_callback(cl_program program, void *user_data) {
  cl_int err;
  cl_build_status build_status;
  bp_data_t *bp_data = (bp_data_t *)user_data;

  // Check the build status.
  err = clGetProgramBuildInfo(program, bp_data->dev,
                              CL_PROGRAM_BUILD_STATUS,
                              sizeof(cl_build_status),
                              &build_status, NULL);
  CHECK_ERROR(err);
  if (build_status != CL_BUILD_SUCCESS) {
    print_build_log(program, bp_data->dev);
    exit(EXIT_FAILURE);
  }

  // Set the event status
  err = clSetUserEventStatus(*(bp_data->event), CL_COMPLETE);
  CHECK_ERROR(err);
}
Exemple #2
0
cl_int
set_kernel(int did,
           cl_prop *prop) {
  cl_int status;

  prop->context = clCreateContext(0, prop->num_devices,
      (const cl_device_id *)prop->devices, NULL, NULL, &status);

  prop->queue = clCreateCommandQueueWithProperties(prop->context,
      prop->devices[did], 0, &status);

  prop->program = clCreateProgramWithSource(prop->context,
      prop->kcode.count, (const char **)prop->kcode.codes, NULL, &status);

  const char *options = "-I./include";
  status = clBuildProgram(prop->program, prop->num_devices,
      (const cl_device_id *)prop->devices, options, NULL, NULL);

  if(status != CL_SUCCESS) {
    printf("%s[Build Error Log]%s\n", ERR_STR, CLR_STR);
  } else {
    printf("%s[Build Log]%s\n", WHT_STR, CLR_STR);
  }
  print_build_log(did, prop);
  if(status != CL_SUCCESS) getchar();

  prop->gabor   =
    clCreateKernel(prop->program, (const char *)"enable_gabor",   NULL);
  prop->pooling =
    clCreateKernel(prop->program, (const char *)"enable_pooling", NULL);
  prop->feature =
    clCreateKernel(prop->program, (const char *)"feature_rfcn",   NULL);
  prop->cls     =
    clCreateKernel(prop->program, (const char *)"class_rfcn",     NULL);

  return status;
}
Exemple #3
0
int main(int argc, char **argv) {
  cl_int status;

  const char *platform_name = "NVIDIA";

  if (!find_platform(platform_name, &platform)) {
    fprintf(stderr,"Error: Platform \"%s\" not found\n", platform_name);
    print_platforms();
    teardown(-1);
  }

  status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
  checkError (status, "Error: could not query devices");

  context = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
  checkError(status, "could not create context");

  const char name[] = KERNELDIR "/reduce.cl";

  unsigned char *source;
  size_t size;
  if (!load_file(name, &source, &size)) {
    teardown(-1);
  }

  program = clCreateProgramWithSource(context, 1, (const char **) &source, &size, &status);
  checkError(status, "Error: failed to create program %s: ", name);

  status = clBuildProgram(program, 1, &device, "-I.", NULL, NULL);

  if (status != CL_SUCCESS) {
    print_build_log(program, device);
    checkError(status, "Error: failed to create build %s: ", name);
  }

  free(source);

  print_device_info(device, 0);

  queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
  checkError(status, "could not create command queue");

  cl_ulong start, end;
  cl_event event;

  size_t width  = 1024+1024;
  size_t buf_size = width*sizeof(cl_float);

  kernel = clCreateKernel(program, "reduce", &status);
  checkError(status, "could not create kernel");

  size_t work_size = width;
  size_t local_size = 64;
  size_t local_buf_size = local_size * sizeof(cl_float);
  size_t groups = width / local_size;
  size_t res_buf_size = groups * sizeof(cl_float);

  float *data_in  = malloc(buf_size);
  float *data_out = malloc(res_buf_size);
  if (!data_in || !data_out) {
    fprintf(stderr,"\nError: malloc failed\n");
    teardown(-1);
  }

  for (unsigned int i = 0; i < width; ++i) {
    data_in[i] = (float) (i % 16);
  }

  buffer_in = clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &status);
  checkError(status, "Error: could not create buffer_in");

  buffer_out = clCreateBuffer(context, CL_MEM_READ_WRITE, res_buf_size, NULL, &status);
  checkError(status, "Error: could not create buffer_out");

  status = clEnqueueWriteBuffer(queue, buffer_in, CL_FALSE, 0, buf_size, data_in, 0, NULL, NULL);
  checkError(status, "Error: could not copy data into device");

  // execute kernel
  int arg = 0;
  status  = clSetKernelArg(kernel, arg++, sizeof(cl_mem), &buffer_in);
  status  = clSetKernelArg(kernel, arg++, sizeof(cl_mem), &buffer_out);
  status  = clSetKernelArg(kernel, arg++, local_buf_size, NULL);
  status  = clSetKernelArg(kernel, arg++, sizeof(cl_int), &width);
  checkError(status, "Error: could not set args");

  status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_size, &local_size, 0, NULL, &event);
  checkError(status, "Error: could not enqueue kernel");

  status = clWaitForEvents(1, &event);
  checkError(status, "Error: could not wait for event");

  status  = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
  checkError(status, "Error: could not get start profile information");

  status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
  checkError(status, "Error: could not get end profile information");

  status = clReleaseEvent(event);
  checkError(status, "Error: could not release event");

  // read results back
  status = clEnqueueReadBuffer(queue, buffer_out, CL_TRUE, 0, res_buf_size, data_out, 0, NULL, NULL);
  checkError(status, "Error: could not copy data into device");

  status  = clFinish(queue);
  checkError(status, "Error: could not finish successfully");

  float clsum = 0;
  for (unsigned int i = 0; i < groups; ++i) {
    clsum += data_out[i];
  }

#ifdef DEBUG
  for (int i = 0; i < groups; ++i) {
    printf("%.0f ", data_out[i]);
  }
#endif

  double elapsed = (end - start) * 1e-9f;
  printf("time: %f\n", elapsed);

  float sum = 0;
  for (unsigned int i = 0; i < width; ++i) {
    sum += data_in[i];
  }

  if (sum != clsum)
    fprintf(stderr, "Compare failed: %f != %f\n", clsum, sum);


  free(data_in);
  free(data_out);
  teardown(0);
}
void mat_mul_opencl_printf(float *M_A, float *M_B, float *M_C,
                           size_t ROW_A, size_t COL_A, size_t COL_B) {
  cl_platform_id   *platform;
  cl_device_type   dev_type = CL_DEVICE_TYPE_DEFAULT;
  cl_device_id     dev;
  cl_context       context;
  cl_command_queue cmd_queue;
  cl_program       program;
  cl_kernel        kernel;
  cl_mem           mem_A, mem_B, mem_C;
  cl_int           err;
  cl_uint          num_platforms;
  cl_uint          num_dev = 0;

  int i;

  // Get the device type to use from the environmental variable.
  char *dtype = getenv("CL_DEV_TYPE");
  if (dtype) {
    if (strcasecmp(dtype, "cpu") == 0) {
      dev_type = CL_DEVICE_TYPE_CPU;
    } else if (strcasecmp(dtype, "gpu") == 0) {
      dev_type = CL_DEVICE_TYPE_GPU;
    }
  }

  // Platform
  err = clGetPlatformIDs(0, NULL, &num_platforms);
  CHECK_ERROR(err);
  if (num_platforms == 0) {
    fprintf(stderr, "[%s:%d] ERROR: No OpenCL platform\n", __FILE__,__LINE__);
    exit(EXIT_FAILURE);
  }
  printf("Number of platforms: %u\n", num_platforms);
  platform = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
  err = clGetPlatformIDs(num_platforms, platform, NULL);
  CHECK_ERROR(err);

  // Device
  for (i = 0; i < num_platforms; i++) {
    err = clGetDeviceIDs(platform[i], dev_type, 1, &dev, &num_dev);
    if (err != CL_DEVICE_NOT_FOUND) CHECK_ERROR(err);
    if (num_dev == 1) break;
  }
  if (num_dev < 1) {
    fprintf(stderr, "[%s:%d] ERROR: No device\n", __FILE__, __LINE__);
    exit(EXIT_FAILURE);
  }
  print_device_name(dev);

  // Context
  context = clCreateContext(NULL, 1, &dev, NULL, NULL, &err);
  CHECK_ERROR(err);

  // Command queue
  cmd_queue = clCreateCommandQueue(context, dev, 0, &err);
  CHECK_ERROR(err);

  // Create a program.
  size_t source_len;
  char *source_code = get_source_code("./kernel_printf.cl", &source_len);
  program = clCreateProgramWithSource(context,
                                      1,
                                      (const char **)&source_code,
                                      &source_len,
                                      &err);
  free(source_code);
  CHECK_ERROR(err);

  // Build the program.
  char build_opts[200];
  sprintf(build_opts, "-DROW_A=%lu -DCOL_A=%lu -DCOL_B=%lu",
          ROW_A, COL_A, COL_B);
  err = clBuildProgram(program, 1, &dev, build_opts, NULL, NULL);
  if (err != CL_SUCCESS) {
    print_build_log(program, dev);
    CHECK_ERROR(err);
  }

  // Kernel
  kernel = clCreateKernel(program, "mat_mul", &err);
  CHECK_ERROR(err);

  // Buffers
  mem_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                         sizeof(float) * ROW_A * COL_A,
                         M_A, &err);
  CHECK_ERROR(err);

  mem_B = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                         sizeof(float) * COL_A * COL_B,
                         M_B, &err);
  CHECK_ERROR(err);

  mem_C = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 
                         sizeof(float) * ROW_A * COL_B,
                         M_C, &err);
  CHECK_ERROR(err);

  // Set the arguments.
  err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_A);
  CHECK_ERROR(err);
  err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_B);
  CHECK_ERROR(err);
  err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_C);
  CHECK_ERROR(err);

  // Enqueue the kernel.
  size_t lws[2] = {16, 16};
  size_t gws[2];
  gws[1] = (size_t)ceil((double)ROW_A / lws[1]) * lws[1];
  gws[0] = (size_t)ceil((double)COL_B / lws[0]) * lws[0];
  timer_start(1);
  err = clEnqueueNDRangeKernel(cmd_queue,
                               kernel,
                               2, NULL,
                               gws, lws,
                               0, NULL, NULL);
  CHECK_ERROR(err);

  // Read the result.
  err = clEnqueueReadBuffer(cmd_queue,
                            mem_C,
                            CL_TRUE, 0,
                            sizeof(float) * ROW_A * COL_B, 
                            M_C,
                            0, NULL, NULL);
  CHECK_ERROR(err);
  timer_stop(1);
  printf("Kernel time : %f sec\n", timer_read(1));

  // Release
  clReleaseMemObject(mem_A);
  clReleaseMemObject(mem_B);
  clReleaseMemObject(mem_C);
  clReleaseKernel(kernel);
  clReleaseProgram(program);
  clReleaseCommandQueue(cmd_queue);
  clReleaseContext(context);
  free(platform);
}
Exemple #5
0
int main(int argc, char **argv) {
  cl_int status;

  const char *platform_name = "NVIDIA";

  if (!find_platform(platform_name, &platform)) {
    fprintf(stderr,"Error: Platform \"%s\" not found\n", platform_name);
    print_platforms();
    teardown(-1);
  }

  status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
  checkError (status, "Error: could not query devices");

  context = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
  checkError(status, "could not create context");

  const char name[] = KERNELDIR "/gauss.cl";

  unsigned char *source;
  size_t size;
  if (!load_file(name, &source, &size)) {
    teardown(-1);
  }

  program = clCreateProgramWithSource(context, 1, (const char **) &source, &size, &status);
  checkError(status, "Error: failed to create program %s: ", name);

  status = clBuildProgram(program, 1, &device, "-I.", NULL, NULL);
  if (status != CL_SUCCESS) {
    print_build_log(program, device);
    checkError(status, "Error: failed to create build %s: ", name);
  }

  free(source);

  print_device_info(device, 0);

  queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
  checkError(status, "could not create command queue");

  cl_ulong start, end;
  cl_event event;

  unsigned char *data;
  size_t datasize;

  if (!load_file("lena.dat", &data, &datasize)) {
    teardown(-1);
  }

  size_t width  = 512;
  size_t height = 512;
  size_t buf_size = width*height*sizeof(cl_float);

  float *data_out = malloc(buf_size);
  if (!data_out) {
    fprintf(stderr,"\nError: malloc failed\n");
    teardown(-1);
  }

  kernel = clCreateKernel(program, "gauss", &status);
  checkError(status, "could not create kernel");

  cl_image_format format = { CL_R, CL_UNORM_INT8};
  buffer_in = clCreateImage2D (context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &format,
    width, height, 0,
    data,
    &status);
  checkError(status, "Error: could not create image");

  buffer_out = clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &status);
  checkError(status, "Error: could not create buffer_out");

  // execute kernel
  int arg = 0;
  status  = clSetKernelArg(kernel, arg++, sizeof(cl_mem), &buffer_in);
  status  = clSetKernelArg(kernel, arg++, sizeof(cl_mem), &buffer_out);
  checkError(status, "Error: could not set args");

  size_t work_size[] = {width, height};
  size_t local_size[] = {1, 1};

  status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, work_size, local_size, 0, NULL, &event);
  checkError(status, "Error: could not enqueue kernel");

  status = clWaitForEvents(1, &event);
  checkError(status, "Error: could not wait for event");

  status  = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
  checkError(status, "Error: could not get start profile information");

  status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
  checkError(status, "Error: could not get end profile information");

  status = clReleaseEvent(event);
  checkError(status, "Error: could not release event");

  // read results back
  status = clEnqueueReadBuffer(queue, buffer_out, CL_FALSE, 0, buf_size, data_out, 0, NULL, NULL);
  checkError(status, "Error: could not copy data into device");

  status  = clFinish(queue);
  checkError(status, "Error: could not finish successfully");

  double elapsed = (end - start) * 1e-9f;
  printf("time: %f\n", elapsed);

  write_bmp("gauss.bmp", data_out, width, height, NORMAL);

  free(data);
  free(data_out);
  teardown(0);
}
void mandelbrot(int m, int n)
{
  cl_platform_id   *platform;
  cl_device_type   dev_type = CL_DEVICE_TYPE_GPU;
  cl_device_id     *devs = NULL;
  cl_context       context;
  cl_command_queue *cmd_queues;
  cl_program       program;
  cl_kernel        *kernels;
  cl_mem           *mem_R;
  cl_mem		   *mem_G;
  cl_mem		   *mem_B;
  cl_int           err;
  cl_uint          num_platforms;
  cl_uint          num_devs = 0;
  cl_event		   *ev_kernels;

	
		
  int count_max = COUNT_MAX;
  int i, j, jhi, jlo;
  char *output_filename = "mandelbrot.ppm";
  FILE *output_unit;
  double wtime;

  float x_max =   1.25;
  float x_min = - 2.25;
//  float x;
//  float x1;
//  float x2;
  float y_max =   1.75;
  float y_min = - 1.75;
  //float y;
  //float y1;
  //float y2;

  size_t size_color;

  size_color = sizeof(int) * m * n;

  int (*r)[n] = (int (*)[n])calloc(m * n, sizeof(int));
  int (*g)[n] = (int (*)[n])calloc(m * n, sizeof(int));
  int (*b)[n] = (int (*)[n])calloc(m * n, sizeof(int));

  printf( "  Sequential C version\n" );
  printf( "\n" );
  printf( "  Create an ASCII PPM image of the Mandelbrot set.\n" );
  printf( "\n" );
  printf( "  For each point C = X + i*Y\n" );
  printf( "  with X range [%g,%g]\n", x_min, x_max );
  printf( "  and  Y range [%g,%g]\n", y_min, y_max );
  printf( "  carry out %d iterations of the map\n", count_max );
  printf( "  Z(n+1) = Z(n)^2 + C.\n" );
  printf( "  If the iterates stay bounded (norm less than 2)\n" );
  printf( "  then C is taken to be a member of the set.\n" );
  printf( "\n" );
  printf( "  An ASCII PPM image of the set is created using\n" );
  printf( "    M = %d pixels in the X direction and\n", m );
  printf( "    N = %d pixels in the Y direction.\n", n );

  timer_init();
  timer_start(0);

  // Platform
  err = clGetPlatformIDs(0, NULL, &num_platforms);
  CHECK_ERROR(err);
  if (num_platforms == 0) {
    fprintf(stderr, "[%s:%d] ERROR: No OpenCL platform\n", __FILE__,__LINE__);
    exit(EXIT_FAILURE);
  }

  printf("Number of platforms: %u\n", num_platforms);
  platform = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
  err = clGetPlatformIDs(num_platforms, platform, NULL);
  CHECK_ERROR(err);
  
  // Device
  for (i = 0; i < num_platforms; i++) {
    err = clGetDeviceIDs(platform[i], dev_type, 0, NULL, &num_devs);
    if (err != CL_DEVICE_NOT_FOUND) CHECK_ERROR(err);
	num_devs = 1; //**
    if (num_devs >= 1)
	{
		devs = (cl_device_id*)malloc(sizeof(cl_device_id) * num_devs);

		err = clGetDeviceIDs(platform[i], dev_type, num_devs, devs, NULL);
		break;
	}
  }
  if ( devs == NULL || num_devs < 1) {
    fprintf(stderr, "[%s:%d] ERROR: No device\n", __FILE__, __LINE__);
    exit(EXIT_FAILURE);
  }

  for( i = 0; i < num_devs; ++i ) {
	printf("dev[%d] : ", i);
  	print_device_name(devs[i]);
  }

  // Context
  context = clCreateContext(NULL, num_devs, devs, NULL, NULL, &err);
  CHECK_ERROR(err);

  // Command queue
  cmd_queues = (cl_command_queue*)malloc(sizeof(cl_command_queue)*num_devs);
  for( i = 0; i < num_devs; ++i) {
	cmd_queues[i] = clCreateCommandQueue(context, devs[i], 0, &err);
  	CHECK_ERROR(err);
  }

  // Create a program.
  size_t source_len;
  char *source_code = get_source_code("./mandelbrot_kernel.cl", &source_len);
  program = clCreateProgramWithSource(context,
                                      1,
                                      (const char **)&source_code,
                                      &source_len,
                                      &err);
  free(source_code);
  CHECK_ERROR(err);

  // Build the program.
  char build_opts[200];
  sprintf(build_opts, "-Dm=%d -Dn=%d -Dnum_devs=%d", m, n, num_devs);
  err = clBuildProgram(program, num_devs, devs, build_opts, NULL, NULL);
  if (err != CL_SUCCESS) {
    print_build_log(program, devs[0]);
    CHECK_ERROR(err);
  }
  
  // Kernel
  kernels = (cl_kernel*)malloc(sizeof(cl_kernel)*num_devs);
  for (i = 0; i < num_devs; i++) {
	  kernels[i] = clCreateKernel(program, "mandelbrot_kernel", NULL);
  }
 
  // Buffers  
  mem_R = (cl_mem*)malloc(sizeof(cl_mem)*num_devs);
  mem_G = (cl_mem*)malloc(sizeof(cl_mem)*num_devs);
  mem_B = (cl_mem*)malloc(sizeof(cl_mem)*num_devs);

  for(i = 0; i < num_devs; i++) {
	  mem_R[i] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                         size_color / num_devs, NULL, NULL);
	  mem_G[i] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                         size_color / num_devs, NULL, NULL);
	  mem_B[i] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                         size_color / num_devs, NULL, NULL);
  }

/*
  // Write to Buffers
  for(i = 0; i < num_devs; i++) {
  	clEnqueueWriteBuffer(cmd_queues[i],
                         mem_CHECK[i], 
                         CL_FALSE, 0,
                         size_CHECK / num_devs,
                         (CHECK + (N / num_devs) * i),
                         0, NULL, NULL);
  }
*/

  // Set the arguments.
  for(i = 0; i < num_devs; i++) {
//	  flag = i * (m * n / num_devs);
  	clSetKernelArg(kernels[i], 0, sizeof(cl_mem), (void*) &mem_R[i]);
	clSetKernelArg(kernels[i], 1, sizeof(cl_mem), (void*) &mem_G[i]);
  	clSetKernelArg(kernels[i], 2, sizeof(cl_mem), (void*) &mem_B[i]);

	clSetKernelArg(kernels[i], 3, sizeof(int), &count_max);
	clSetKernelArg(kernels[i], 4, sizeof(float), &x_max);
	clSetKernelArg(kernels[i], 5, sizeof(float), &x_min);
	clSetKernelArg(kernels[i], 6, sizeof(float), &y_max);
	clSetKernelArg(kernels[i], 7, sizeof(float), &y_min);
  }

  // Enqueue the kernel.
  size_t lws[1] = {256};
  size_t gws[1] = { m * n /num_devs };
  gws[0] = (size_t)ceil((double)m * n / lws[0]) * lws[0];
  ev_kernels = (cl_event*)malloc(sizeof(cl_event)*num_devs);
  for(i = 0; i < num_devs; i++) {
 	 err = clEnqueueNDRangeKernel(cmd_queues[i], kernels[i], 1, NULL, gws, lws, 0, NULL, &ev_kernels[i]);
  	 CHECK_ERROR(err);
  }

  // Read the result.
  for(i = 0; i < num_devs; i++) {
  	err = clEnqueueReadBuffer(cmd_queues[i],
                            mem_R[i],
                            CL_TRUE, 0,
                            size_color / num_devs,
                            r,
                            1, &ev_kernels[i], NULL);
  	err = clEnqueueReadBuffer(cmd_queues[i],
                            mem_G[i],
                            CL_TRUE, 0,
                            size_color / num_devs,
                            g,
							1, &ev_kernels[i], NULL);
   	err = clEnqueueReadBuffer(cmd_queues[i],
                            mem_B[i],
                            CL_TRUE, 0,
							size_color / num_devs,
							b,
                            1, &ev_kernels[i], NULL);
  }

  // Release
  for( i = 0; i < num_devs; ++i ) {
  clFinish(cmd_queues[i]); 
  clReleaseMemObject(mem_R[i]);
  clReleaseMemObject(mem_G[i]);
  clReleaseMemObject(mem_B[i]);
  clReleaseKernel(kernels[i]);
  clReleaseCommandQueue(cmd_queues[i]);
  clReleaseEvent(ev_kernels[i]);
  }
  clReleaseProgram(program);
  clReleaseContext(context);
  free(mem_R);
  free(mem_G);
  free(mem_B);
  free(cmd_queues);
  free(kernels);
  free(devs);
  free(ev_kernels);
  free(platform);

  timer_stop(0);
  wtime = timer_read(0);
  printf( "\n" );
  printf( "  Time = %lf seconds.\n", wtime );

  // Write data to an ASCII PPM file.
  output_unit = fopen( output_filename, "wt" );

  fprintf( output_unit, "P3\n" );
  fprintf( output_unit, "%d  %d\n", n, m );
  fprintf( output_unit, "%d\n", 255 );
  for ( i = 0; i < m; i++ )
  {
    for ( jlo = 0; jlo < n; jlo = jlo + 4 )
    {
      jhi = MIN( jlo + 4, n );
      for ( j = jlo; j < jhi; j++ )
      {
        fprintf( output_unit, "  %d  %d  %d", r[i][j], g[i][j], b[i][j] );
      }
      fprintf( output_unit, "\n" );
    }
  }

  fclose( output_unit );
  printf( "\n" );
  printf( "  Graphics data written to \"%s\".\n\n", output_filename );

  // Terminate.
  free(r);
  free(g);
  free(b);
}
void mat_mul_opencl_1d(float *M_A, float *M_B, float *M_C,
                       size_t ROW_A, size_t COL_A, size_t COL_B) {
  cl_platform_id   *platform;
  cl_device_type   dev_type;
  cl_device_id     dev;
  cl_context       context;
  cl_command_queue cmd_queue;
  cl_program       program;
  cl_kernel        kernel;
  cl_mem           mem_A, mem_B, mem_C;
  cl_event         ev_kernel;
  cl_int           err;
  cl_uint          num_platforms;
  cl_uint          num_dev = 0;
  int i;

  // Platform
  err = clGetPlatformIDs(0, NULL, &num_platforms);
  CHECK_ERROR(err);
  if (num_platforms == 0) {
    fprintf(stderr, "[%s:%d] ERROR: No OpenCL platform\n", __FILE__,__LINE__);
    exit(EXIT_FAILURE);
  }
  printf("Number of platforms: %u\n", num_platforms);
  platform = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
  err = clGetPlatformIDs(num_platforms, platform, NULL);
  CHECK_ERROR(err);

  // Device
  dev_type = get_device_type();
  for (i = 0; i < num_platforms; i++) {
    err = clGetDeviceIDs(platform[i], dev_type, 1, &dev, &num_dev);
    if (err != CL_DEVICE_NOT_FOUND) CHECK_ERROR(err);
    if (num_dev == 1) break;
  }
  if (num_dev < 1) {
    fprintf(stderr, "[%s:%d] ERROR: No device\n", __FILE__, __LINE__);
    exit(EXIT_FAILURE);
  }
  print_device_name(dev);
  free(platform);

  // Context
  context = clCreateContext(NULL, 1, &dev, NULL, NULL, &err);
  CHECK_ERROR(err);

  // Command queue
  cmd_queue = clCreateCommandQueue(context, dev,
                                   CL_QUEUE_PROFILING_ENABLE,
                                   &err);
  CHECK_ERROR(err);

  // Create a program.
  char *source_code = get_source_code("./kernel_1d.cl");
  program = clCreateProgramWithSource(context,
                                      1, (const char **)&source_code,
                                      NULL, &err);
  free(source_code);
  CHECK_ERROR(err);

  // Build the program.
  char build_opts[200];
  sprintf(build_opts, "-DROW_A=%lu -DCOL_A=%lu -DCOL_B=%lu",
          ROW_A, COL_A, COL_B);
  err = clBuildProgram(program, 1, &dev, build_opts, NULL, NULL);
  if (err != CL_SUCCESS) {
    print_build_log(program, dev);
    CHECK_ERROR(err);
  }

  // Kernel
  kernel = clCreateKernel(program, "mat_mul", &err);
  CHECK_ERROR(err);

  // Buffers
  mem_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                         sizeof(float) * ROW_A * COL_A,
                         M_A, &err);
  CHECK_ERROR(err);

  mem_B = clCreateBuffer(context, CL_MEM_READ_ONLY, 
                         sizeof(float) * COL_A * COL_B,
                         NULL, &err);
  CHECK_ERROR(err);
  err = clEnqueueWriteBuffer(cmd_queue,
                             mem_B,
                             CL_FALSE, 0,
                             sizeof(float) * COL_A * COL_B,
                             M_B,
                             0, NULL, NULL);
  CHECK_ERROR(err)

  mem_C = clCreateBuffer(context, CL_MEM_READ_WRITE, 
                         sizeof(float) * ROW_A * COL_B,
                         NULL, &err);
  CHECK_ERROR(err);

  // Set the arguments.
  err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_A);
  CHECK_ERROR(err);
  err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_B);
  CHECK_ERROR(err);
  err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_C);
  CHECK_ERROR(err);

  // Enqueue the kernel.
  size_t lws[1] = {256};
  size_t gws[1];
  gws[0] = (size_t)ceil((double)ROW_A / lws[0]) * lws[0];
  err = clEnqueueNDRangeKernel(cmd_queue,
                               kernel,
                               1, NULL,
                               gws, lws,
                               0, NULL,
                               &ev_kernel);
  CHECK_ERROR(err);

  // Read the result.
  err = clEnqueueReadBuffer(cmd_queue,
                            mem_C,
                            CL_TRUE, 0,
                            sizeof(float) * ROW_A * COL_B, 
                            M_C,
                            0, NULL, NULL);
  CHECK_ERROR(err);

  // Read the profiling info.
  cl_ulong start_time, end_time;
  err = clGetEventProfilingInfo(ev_kernel, CL_PROFILING_COMMAND_START, 
                                sizeof(cl_ulong), &start_time, NULL);
  CHECK_ERROR(err);
  err = clGetEventProfilingInfo(ev_kernel, CL_PROFILING_COMMAND_END, 
                                sizeof(cl_ulong), &end_time, NULL);
  CHECK_ERROR(err);
  printf("Kernel time : %lf sec\n", (double)(end_time - start_time) / 10e9);

  // Release
  clReleaseEvent(ev_kernel);
  clReleaseMemObject(mem_A);
  clReleaseMemObject(mem_B);
  clReleaseMemObject(mem_C);
  clReleaseKernel(kernel);
  clReleaseProgram(program);
  clReleaseCommandQueue(cmd_queue);
  clReleaseContext(context);
}