Esempio n. 1
0
// Main part of the below code is originated from Lode Vandevenne's code.
// Please refer to http://lodev.org/cgtutor/juliamandelbrot.html
void julia(int w, int h) {
  // each iteration, it calculates: new = old*old + c,
  // where c is a constant and old starts at current pixel

  // real and imaginary part of the constant c
  // determinate shape of the Julia Set
  double cRe, cIm;

  // you can change these to zoom and change position
  double zoom = 1, moveX = 0, moveY = 0;

  // after how much iterations the function should stop
  int maxIterations = COUNT_MAX;

  char *output_filename = "julia.ppm";
  FILE *output_unit;
  double wtime;

  // pick some values for the constant c
  // this determines the shape of the Julia Set
  cRe = -0.7;
  cIm = 0.27015;

  /* Host data structures */
  cl_platform_id   *platforms;
  cl_uint          num_platforms;
  cl_device_type   dev_type = CL_DEVICE_TYPE_DEFAULT;
  cl_device_id     dev;
  cl_context       context;
  // NOTE : You might have multiple cmd_queue but whatever
  cl_command_queue cmd_queue;
  cl_program       program;
  cl_kernel        kernel;
  // TODO : define your variables
  cl_mem           R,G,B;
  cl_int           err;
  cl_uint          num_dev = 0;
//  cl_event         ev_bp;
  int i; 
  // TODO : 
/*
    // loop through every pixel
  for (int y = 0; y < h; y++)
  {
    for (int x = 0; x < w; x++)
*/
  size_t lws[2]={16,16};
  size_t gws[2]={h/16,w/16};



  printf( "  Parallel OpenCL version\n" );
  printf( "\n" );
  printf( "  Create an ASCII PPM image of the Julia set.\n" );
  printf( "\n" );
  printf( "  An ASCII PPM image of the set is created using\n" );
  printf( "    W = %d pixels in the X direction and\n", w );
  printf( "    H = %d pixels in the Y direction.\n", h );

  timer_init();
  timer_start(0);
  // Platform
  err = clGetPlatformIDs(0, NULL, &num_platforms);
  CHECK_ERROR(err);
  if(num_platforms == 0) {
    ERROR("No OpenCl platform");
  }
  printf("Number of platforms: %u\n",num_platforms);
  platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
  err = clGetPlatformIDs(num_platforms,platforms,NULL);
  CHECK_ERROR(err);

  //Device
  for(i=0;i<num_platforms;i++) {
    err = clGetDeviceIDs(platforms[i],dev_type,1,&dev,&num_dev);
    if(err != CL_DEVICE_NOT_FOUND) CHECK_ERROR(err);
    if(num_dev == 1) break;
  }
  if(num_dev<1) {
    ERROR("No device");
  }
  
  // Context
  context = clCreateContext(NULL, 1, &dev, NULL, NULL, &err);
  CHECK_ERROR(err);
printf("-4");
  // Command queue
  cmd_queue = clCreateCommandQueue(context, dev, 0, &err);
  CHECK_ERROR(err);
printf("-3");
  // Create a program
  // TODO : Get source code in your favor
  char * source_code=get_source_code("julia.cl");
  
 printf("-2"); 
  program = clCreateProgramWithSource(context, 1, (const char **)&source_code, NULL, &err);
  CHECK_ERROR(err);

  // Callback data for clBuildProgram
  /*
  ev_bp=clCreateUserEvent(context,&err);
  CHECK_ERROR(err);
  bp_data_t bp_data;
  bp_data.dev=dev;
  bp_data.event=&ev_bp;
  */
printf("-1");
  // Build the program.
  err = clBuildProgram(program, 1, &dev, NULL, NULL, NULL);
  if (err != CL_SUCCESS) {
    // Print the build log.
    size_t log_size;
    clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
        0, NULL, &log_size);
    char *log = (char *)malloc(log_size + 1);
    clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
        log_size, log, NULL);
    fprintf(stderr,"\n");
    fprintf(stderr,"---------- BUILD LOG ----------\n");
    fprintf(stderr,"%s\n",log);
    fprintf(stderr,"-------------------------------\n");
    free(log);

    CHECK_ERROR(err);
  }
  printf("0");
  // Buffers
  // TODO: make and buffers
  
  int (*r) = (int (*))calloc(w * h, sizeof(int));
  int (*g) = (int (*))calloc(w * h, sizeof(int));
  int (*b) = (int (*))calloc(w * h, sizeof(int));
printf("1");
  R=clCreateBuffer(context, CL_MEM_READ_WRITE,
      sizeof(int) * w*h, NULL, &err);
  CHECK_ERROR(err);
  err=clEnqueueWriteBuffer(cmd_queue, R,CL_FALSE,0,w*h*sizeof(int),r,0,NULL,NULL);
  CHECK_ERROR(err);
  G=clCreateBuffer(context, CL_MEM_READ_WRITE,
      sizeof(int) * w*h, NULL, &err);
  CHECK_ERROR(err);
  err=clEnqueueWriteBuffer(cmd_queue, G,CL_FALSE,0,w*h*sizeof(int),g,0,NULL,NULL);
  CHECK_ERROR(err);
  B=clCreateBuffer(context, CL_MEM_READ_WRITE,
		         sizeof(int) * w*h, NULL, &err);
 CHECK_ERROR(err);
  err=clEnqueueWriteBuffer(cmd_queue, B,CL_FALSE,0,w*h*sizeof(int),b,0,NULL,NULL);
  CHECK_ERROR(err);
printf("2");
  kernel=clCreateKernel(program,"julia",&err);
  CHECK_ERROR(err);
printf("3");
  err=clSetKernelArg(kernel,0,sizeof(int),&w);
  CHECK_ERROR(err);
  err=clSetKernelArg(kernel,1,sizeof(int),&h);
  CHECK_ERROR(err);
  err=clSetKernelArg(kernel,2,sizeof(int),&cRe);
  CHECK_ERROR(err);
  err=clSetKernelArg(kernel,3,sizeof(int),&cIm);
  CHECK_ERROR(err);
  err=clSetKernelArg(kernel,4,sizeof(cl_mem),&R);
  CHECK_ERROR(err);
  err=clSetKernelArg(kernel,5,sizeof(cl_mem),&G);
  CHECK_ERROR(err);
  err=clSetKernelArg(kernel,6,sizeof(cl_mem),&B);
  CHECK_ERROR(err);
  err=clSetKernelArg(kernel,7,sizeof(int),&zoom);
  CHECK_ERROR(err);
  err=clSetKernelArg(kernel,8,sizeof(int),&moveX);
  CHECK_ERROR(err);
  err=clSetKernelArg(kernel,9,sizeof(int),&moveY);
  CHECK_ERROR(err);
  err=clSetKernelArg(kernel,10,sizeof(int),&maxIterations);
  CHECK_ERROR(err);

  printf("4");
// Enqueue the kernel.
  err=clEnqueueNDRangeKernel(cmd_queue,kernel,1,NULL,gws,lws,0,NULL,NULL);
  CHECK_ERROR(err);
printf("5");
  // Read the result.
  
  err = clEnqueueReadBuffer(cmd_queue,
      R,
      CL_TRUE, 0,
      sizeof(int) * w*h,
      r,
      0, NULL, NULL);
  
  CHECK_ERROR(err);
  err = clEnqueueReadBuffer(cmd_queue,
      G,
      CL_TRUE, 0,
      sizeof(int) * w*h,
      g,
      0, NULL, NULL);
  
  CHECK_ERROR(err);
  err = clEnqueueReadBuffer(cmd_queue,
      B,
      CL_TRUE, 0,
      sizeof(int) * w*h,
      b,
      0, NULL, NULL);
  
  CHECK_ERROR(err);
timer_stop(0);
  wtime = timer_read(0);
  printf( "\n" );
  printf( "  Time = %lf seconds.\n", wtime );

  // Write data to an ASCII PPM file.
  output_unit = fopen( output_filename, "wt" );

  fprintf( output_unit, "P3\n" );
  fprintf( output_unit, "%d  %d\n", h, w );
  fprintf( output_unit, "%d\n", 255 );
  for ( int i = 0; i < h; i++ )
  {
    for ( int jlo = 0; jlo < w; jlo = jlo + 4 )
    {
      int jhi = MIN( jlo + 4, w );
      for ( int j = jlo; j < jhi; j++ )
      {
        fprintf( output_unit, "  %d  %d  %d", r[i*w+j], g[i*w+j], b[i*w+j] );
      }
      fprintf( output_unit, "\n" );
    }
  }

  fclose( output_unit );
  printf( "\n" );
  printf( "  Graphics data written to \"%s\".\n\n", output_filename );


  // Release
  //clReleaseEvent(ev_bp);
  clReleaseMemObject(R);
  clReleaseMemObject(G);
  clReleaseMemObject(B);
  clReleaseKernel(kernel);
  clReleaseProgram(program);
  clReleaseCommandQueue(cmd_queue);
  clReleaseContext(context);
  free(platforms);


  // Terminate.
  free(r);
  free(g);
  free(b);
}
Esempio n. 2
0
void mat_mul_opencl_printf(float *M_A, float *M_B, float *M_C,
                           size_t ROW_A, size_t COL_A, size_t COL_B) {
  cl_platform_id   *platform;
  cl_device_type   dev_type = CL_DEVICE_TYPE_DEFAULT;
  cl_device_id     dev;
  cl_context       context;
  cl_command_queue cmd_queue;
  cl_program       program;
  cl_kernel        kernel;
  cl_mem           mem_A, mem_B, mem_C;
  cl_int           err;
  cl_uint          num_platforms;
  cl_uint          num_dev = 0;

  int i;

  // Get the device type to use from the environmental variable.
  char *dtype = getenv("CL_DEV_TYPE");
  if (dtype) {
    if (strcasecmp(dtype, "cpu") == 0) {
      dev_type = CL_DEVICE_TYPE_CPU;
    } else if (strcasecmp(dtype, "gpu") == 0) {
      dev_type = CL_DEVICE_TYPE_GPU;
    }
  }

  // Platform
  err = clGetPlatformIDs(0, NULL, &num_platforms);
  CHECK_ERROR(err);
  if (num_platforms == 0) {
    fprintf(stderr, "[%s:%d] ERROR: No OpenCL platform\n", __FILE__,__LINE__);
    exit(EXIT_FAILURE);
  }
  printf("Number of platforms: %u\n", num_platforms);
  platform = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
  err = clGetPlatformIDs(num_platforms, platform, NULL);
  CHECK_ERROR(err);

  // Device
  for (i = 0; i < num_platforms; i++) {
    err = clGetDeviceIDs(platform[i], dev_type, 1, &dev, &num_dev);
    if (err != CL_DEVICE_NOT_FOUND) CHECK_ERROR(err);
    if (num_dev == 1) break;
  }
  if (num_dev < 1) {
    fprintf(stderr, "[%s:%d] ERROR: No device\n", __FILE__, __LINE__);
    exit(EXIT_FAILURE);
  }
  print_device_name(dev);

  // Context
  context = clCreateContext(NULL, 1, &dev, NULL, NULL, &err);
  CHECK_ERROR(err);

  // Command queue
  cmd_queue = clCreateCommandQueue(context, dev, 0, &err);
  CHECK_ERROR(err);

  // Create a program.
  size_t source_len;
  char *source_code = get_source_code("./kernel_printf.cl", &source_len);
  program = clCreateProgramWithSource(context,
                                      1,
                                      (const char **)&source_code,
                                      &source_len,
                                      &err);
  free(source_code);
  CHECK_ERROR(err);

  // Build the program.
  char build_opts[200];
  sprintf(build_opts, "-DROW_A=%lu -DCOL_A=%lu -DCOL_B=%lu",
          ROW_A, COL_A, COL_B);
  err = clBuildProgram(program, 1, &dev, build_opts, NULL, NULL);
  if (err != CL_SUCCESS) {
    print_build_log(program, dev);
    CHECK_ERROR(err);
  }

  // Kernel
  kernel = clCreateKernel(program, "mat_mul", &err);
  CHECK_ERROR(err);

  // Buffers
  mem_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                         sizeof(float) * ROW_A * COL_A,
                         M_A, &err);
  CHECK_ERROR(err);

  mem_B = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                         sizeof(float) * COL_A * COL_B,
                         M_B, &err);
  CHECK_ERROR(err);

  mem_C = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 
                         sizeof(float) * ROW_A * COL_B,
                         M_C, &err);
  CHECK_ERROR(err);

  // Set the arguments.
  err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_A);
  CHECK_ERROR(err);
  err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_B);
  CHECK_ERROR(err);
  err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_C);
  CHECK_ERROR(err);

  // Enqueue the kernel.
  size_t lws[2] = {16, 16};
  size_t gws[2];
  gws[1] = (size_t)ceil((double)ROW_A / lws[1]) * lws[1];
  gws[0] = (size_t)ceil((double)COL_B / lws[0]) * lws[0];
  timer_start(1);
  err = clEnqueueNDRangeKernel(cmd_queue,
                               kernel,
                               2, NULL,
                               gws, lws,
                               0, NULL, NULL);
  CHECK_ERROR(err);

  // Read the result.
  err = clEnqueueReadBuffer(cmd_queue,
                            mem_C,
                            CL_TRUE, 0,
                            sizeof(float) * ROW_A * COL_B, 
                            M_C,
                            0, NULL, NULL);
  CHECK_ERROR(err);
  timer_stop(1);
  printf("Kernel time : %f sec\n", timer_read(1));

  // Release
  clReleaseMemObject(mem_A);
  clReleaseMemObject(mem_B);
  clReleaseMemObject(mem_C);
  clReleaseKernel(kernel);
  clReleaseProgram(program);
  clReleaseCommandQueue(cmd_queue);
  clReleaseContext(context);
  free(platform);
}
Esempio n. 3
0
void kmeans(int iteration_n, int class_n, int data_n, Point* centroids, Point* data, int* partitioned)
{
    cl_int err;

    cl_platform_id platform;
    err = clGetPlatformIDs(1, &platform, NULL);
    CHECK_ERROR(err);

    cl_device_id device;
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    CHECK_ERROR(err);

    cl_context context;
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    CHECK_ERROR(err);

    cl_command_queue queueIO;
    queueIO = clCreateCommandQueue(context, device, 0, &err);
    CHECK_ERROR(err);
    cl_command_queue queueSM;
    queueSM = clCreateCommandQueue(context, device, 0, &err);
    CHECK_ERROR(err);

    const char *source_code;
    size_t source_size;
    source_code = get_source_code("kernel.cl", &source_size);

    cl_program program;
    program = clCreateProgramWithSource(context, 1, &source_code, &source_size, &err);
    CHECK_ERROR(err);

    err = clBuildProgram(program, 1, &device, "", NULL, NULL);
    if (err == CL_BUILD_PROGRAM_FAILURE) {
        char *log;
        size_t log_size;
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        log = (char*)malloc(log_size + 1);
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
        log[log_size] = 0;
        printf("Compile error:\n%s\n", log);
        free(log);
    }
    CHECK_ERROR(err);

    cl_kernel kernel;
    kernel = clCreateKernel(program, "classify", &err);
    CHECK_ERROR(err);

    size_t global_size, local_size = 256;
    global_size = (data_n + local_size - 1) / local_size * local_size;
    int n = (data_n + global_size - 1) / global_size * global_size;
    float *D = (float*)malloc(sizeof(float) * 2 * data_n);
    float *C = (float*)malloc(sizeof(float) * 2 * class_n);
    cl_uchar *E = (cl_uchar*)malloc(sizeof(cl_uchar) * n);
    int *F = (int*)malloc(sizeof(int) * class_n);
    for (int i = 0; i < data_n; ++i) {
        D[i * 2] = data[i].x;
        D[i * 2 + 1] = data[i].y;
    }
    for (int i = 0; i < class_n; ++i) {
        C[i * 2] = centroids[i].x;
        C[i * 2 + 1] = centroids[i].y;
    }

    cl_mem memD;
    memD = clCreateBuffer(context, CL_MEM_READ_ONLY,
        sizeof(cl_float2) * n, NULL, &err);
    CHECK_ERROR(err);
    cl_mem memC;
    memC = clCreateBuffer(context, CL_MEM_READ_ONLY,
        sizeof(cl_float2) * class_n, NULL, &err);
    CHECK_ERROR(err);
    cl_mem memE;
    memE = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
        sizeof(cl_uchar) * n, NULL, &err);
    CHECK_ERROR(err);

    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memD);
    CHECK_ERROR(err);
    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &memC);
    CHECK_ERROR(err);
    err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &memE);
    CHECK_ERROR(err);
    err = clSetKernelArg(kernel, 3, sizeof(cl_uchar), &class_n);
    CHECK_ERROR(err);

    err = clEnqueueWriteBuffer(queueIO, memD, CL_TRUE, 0,
        sizeof(cl_float2) * data_n, D, 0, NULL, NULL);
    CHECK_ERROR(err);
    for (int iter = 0; iter < iteration_n; ++iter) {
        err = clEnqueueWriteBuffer(queueIO, memC, CL_TRUE, 0,
            sizeof(cl_float2) * class_n, C, 0, NULL, NULL);
        CHECK_ERROR(err);
        memset(C, 0, sizeof(cl_float2) * class_n);
        memset(F, 0, sizeof(int) * class_n);

        int xSM = -1, xHost = -1;
        cl_uint num_events = 0;
        cl_event event[2];
        for (int i = 0; i < n; i += global_size) {
            if (num_events > 0) {
                err = clWaitForEvents(num_events, event);
                CHECK_ERROR(err);
            }

            if (xSM != -1) {
                err = clEnqueueReadBuffer(queueIO, memE, CL_FALSE,
                    sizeof(cl_uchar) * xSM, sizeof(cl_uchar) * global_size,
                    &E[xSM], 0, NULL, &event[1]);
                CHECK_ERROR(err);
                xHost = xSM;
            }

            num_events = 0;
            size_t global_offset = i;
            err = clEnqueueNDRangeKernel(queueSM, kernel, 1, &global_offset,
                &global_size, &local_size, 0, NULL, &event[num_events++]);
            CHECK_ERROR(err);
            xSM = i;

            if (xHost != -1) {
                err = clWaitForEvents(1, &event[1]);
                CHECK_ERROR(err);
                for (size_t x = 0; x < global_size; ++x) {
                    int idx = xHost + x;
                    C[E[idx] * 2] += D[idx * 2];
                    C[E[idx] * 2 + 1] += D[idx * 2 + 1];
                    ++F[E[idx]];
                }
            }
        }

        if (num_events > 0) {
            err = clWaitForEvents(num_events, event);
            CHECK_ERROR(err);
        }

        if (xSM != -1) {
            err = clEnqueueReadBuffer(queueIO, memE, CL_FALSE,
                sizeof(cl_uchar) * xSM, sizeof(cl_uchar) * global_size,
                &E[xSM], 0, NULL, &event[1]);
            CHECK_ERROR(err);
            xHost = xSM;
        }

        if (xHost != -1) {
            err = clWaitForEvents(1, &event[1]);
            CHECK_ERROR(err);
            for (size_t x = 0; x < global_size; ++x) {
                int idx = xHost + x;
                if (idx >= n) break;
                C[E[idx] * 2] += D[idx * 2];
                C[E[idx] * 2 + 1] += D[idx * 2 + 1];
                ++F[E[idx]];
            }
        }

        for (int x = 0; x < class_n; ++x) {
            if (F[x] > 0) {
                C[x * 2] /= F[x];
                C[x * 2 + 1] /= F[x];
            }
        }
    }

    for (int i = 0; i < class_n; ++i) {
        centroids[i].x = C[i * 2];
        centroids[i].y = C[i * 2 + 1];
    }
    for (int i = 0; i < data_n; ++i) {
        partitioned[i] = E[i];
    }

    free(D);
    free(C);
    free(E);
    free(F);
    clReleaseMemObject(memD);
    clReleaseMemObject(memC);
    clReleaseMemObject(memE);

    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queueIO);
    clReleaseCommandQueue(queueSM);
    clReleaseContext(context);
}
Esempio n. 4
0
void mat_mul_opencl_binary(float *M_A, float *M_B, float *M_C,
                       size_t ROW_A, size_t COL_A, size_t COL_B) {
  cl_platform_id   *platform;
  cl_device_type   dev_type;
  cl_device_id     dev;
  cl_context       context;
  cl_command_queue cmd_queue;
  cl_program       program;
  cl_kernel        kernel;
  cl_mem           mem_A, mem_B, mem_C;
  cl_event         ev_kernel, ev_bp;
  cl_int           err;
  cl_uint          num_platforms;
  cl_uint          num_dev = 0;
  int i;

  // Platform
  err = clGetPlatformIDs(0, NULL, &num_platforms);
  CHECK_ERROR(err);
  if (num_platforms == 0) {
    fprintf(stderr, "[%s:%d] ERROR: No OpenCL platform\n", __FILE__,__LINE__);
    exit(EXIT_FAILURE);
  }
  printf("Number of platforms: %u\n", num_platforms);
  platform = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
  err = clGetPlatformIDs(num_platforms, platform, NULL);
  CHECK_ERROR(err);

  // Device
  dev_type = get_device_type();
  for (i = 0; i < num_platforms; i++) {
    err = clGetDeviceIDs(platform[i], dev_type, 1, &dev, &num_dev);
    if (err != CL_DEVICE_NOT_FOUND) CHECK_ERROR(err);
    if (num_dev == 1) break;
  }
  if (num_dev < 1) {
    fprintf(stderr, "[%s:%d] ERROR: No device\n", __FILE__, __LINE__);
    exit(EXIT_FAILURE);
  }
  print_device_name(dev);
  free(platform);

  // Context
  context = clCreateContext(NULL, 1, &dev, NULL, NULL, &err);
  CHECK_ERROR(err);

  // Create a program.
  char *source_code = get_source_code("./kernel_2d.cl");
  program = clCreateProgramWithSource(context,
                                      1, (const char **)&source_code,
                                      NULL, &err);
  free(source_code);
  CHECK_ERROR(err);

  // Callback data for clBuildProgram
  ev_bp = clCreateUserEvent(context, &err);
  CHECK_ERROR(err);
  bp_data_t bp_data;
  bp_data.dev    = dev;
  bp_data.event  = &ev_bp;

  // Build the program.
  char build_opts[200];
  sprintf(build_opts, "-DROW_A=%lu -DCOL_A=%lu -DCOL_B=%lu",
          ROW_A, COL_A, COL_B);
  err = clBuildProgram(program, 1, &dev, build_opts,
                       build_program_callback, &bp_data);
  CHECK_ERROR(err);

  // Command queue
  cmd_queue = clCreateCommandQueue(context, dev,
                                   CL_QUEUE_PROFILING_ENABLE,
                                   &err);
  CHECK_ERROR(err);

  // Buffers
  mem_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                         sizeof(float) * ROW_A * COL_A,
                         M_A, &err);
  CHECK_ERROR(err);

  mem_B = clCreateBuffer(context, CL_MEM_READ_ONLY, 
                         sizeof(float) * COL_A * COL_B,
                         NULL, &err);
  CHECK_ERROR(err);
  err = clEnqueueWriteBuffer(cmd_queue,
                             mem_B,
                             CL_FALSE, 0,
                             sizeof(float) * COL_A * COL_B,
                             M_B,
                             0, NULL, NULL);
  CHECK_ERROR(err)

  mem_C = clCreateBuffer(context, CL_MEM_READ_WRITE, 
                         sizeof(float) * ROW_A * COL_B,
                         NULL, &err);
  CHECK_ERROR(err);

  // Index space (gws) and work-group size (lws)
  size_t lws[2] = {16, 16};
  size_t gws[2];
  gws[1] = (size_t)ceil((double)ROW_A / lws[1]) * lws[1];
  gws[0] = (size_t)ceil((double)COL_B / lws[0]) * lws[0];

  // Wait for the kernel creation.
  clWaitForEvents(1, bp_data.event);

  // Kernel
  kernel = clCreateKernel(program, "mat_mul", &err);
  CHECK_ERROR(err);

  // Set the arguments.
  err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_A);
  CHECK_ERROR(err);
  err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_B);
  CHECK_ERROR(err);
  err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_C);
  CHECK_ERROR(err);

  // Enqueue the kernel.
  err = clEnqueueNDRangeKernel(cmd_queue,
                               kernel,
                               2, NULL,
                               gws, lws,
                               0, NULL,
                               &ev_kernel);
  CHECK_ERROR(err);

  // Read the result.
  err = clEnqueueReadBuffer(cmd_queue,
                            mem_C,
                            CL_TRUE, 0,
                            sizeof(float) * ROW_A * COL_B, 
                            M_C,
                            0, NULL, NULL);
  CHECK_ERROR(err);

  // Read the profiling info.
  cl_ulong start_time, end_time;
  err = clGetEventProfilingInfo(ev_kernel, CL_PROFILING_COMMAND_START, 
                                sizeof(cl_ulong), &start_time, NULL);
  CHECK_ERROR(err);
  err = clGetEventProfilingInfo(ev_kernel, CL_PROFILING_COMMAND_END, 
                                sizeof(cl_ulong), &end_time, NULL);
  CHECK_ERROR(err);
  printf("Kernel time : %lf sec\n", (double)(end_time - start_time) / 10e9);

  // Release
  clReleaseEvent(ev_kernel);
  clReleaseEvent(ev_bp);
  clReleaseMemObject(mem_A);
  clReleaseMemObject(mem_B);
  clReleaseMemObject(mem_C);
  clReleaseKernel(kernel);
  clReleaseProgram(program);
  clReleaseCommandQueue(cmd_queue);
  clReleaseContext(context);
}
Esempio n. 5
0
void mat_mul(float *a, float *b, float *c,
    size_t *dim, size_t *global_size, size_t *local_size) {
    cl_int err;

    cl_platform_id platform;
    err = clGetPlatformIDs(1, &platform, NULL);
    CHECK_ERROR(err);

    cl_device_id device;
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    CHECK_ERROR(err);

    cl_context context;
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    CHECK_ERROR(err);

    cl_command_queue queueIO;
    queueIO = clCreateCommandQueue(context, device, 0, &err);
    CHECK_ERROR(err);
    cl_command_queue queueSM;
    queueSM = clCreateCommandQueue(context, device, 0, &err);
    CHECK_ERROR(err);

    const char *source_code;
    size_t source_size;
    source_code = get_source_code("kernel.cl", &source_size);

    cl_program program;
    program = clCreateProgramWithSource(context, 1, &source_code, &source_size, &err);
    CHECK_ERROR(err);

    err = clBuildProgram(program, 1, &device, "", NULL, NULL);
    if (err == CL_BUILD_PROGRAM_FAILURE) {
        char *log;
        size_t log_size;
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        log = (char*)malloc(log_size + 1);
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
        log[log_size] = 0;
        printf("Compile error:\n%s\n", log);
        free(log);
    }
    CHECK_ERROR(err);

    cl_kernel kernel;
    kernel = clCreateKernel(program, "mat_mul", &err);
    CHECK_ERROR(err);

    cl_mem memA[2];
    memA[0] = clCreateBuffer(context, CL_MEM_READ_ONLY,
        sizeof(float) * global_size[1] * global_size[2], NULL, &err);
    CHECK_ERROR(err);
    memA[1] = clCreateBuffer(context, CL_MEM_READ_ONLY,
        sizeof(float) * global_size[1] * global_size[2], NULL, &err);
    CHECK_ERROR(err);
    cl_mem memB[2];
    memB[0] = clCreateBuffer(context, CL_MEM_READ_ONLY,
        sizeof(float) * global_size[2] * global_size[0], NULL, &err);
    CHECK_ERROR(err);
    memB[1] = clCreateBuffer(context, CL_MEM_READ_ONLY,
        sizeof(float) * global_size[2] * global_size[0], NULL, &err);
    CHECK_ERROR(err);
    cl_mem memC[2];
    memC[0] = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
        sizeof(float) * global_size[1] * global_size[0], NULL, &err);
    CHECK_ERROR(err);
    memC[1] = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
        sizeof(float) * global_size[1] * global_size[0], NULL, &err);
    CHECK_ERROR(err);

    err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &memC[0]);
    CHECK_ERROR(err);
    err = clSetKernelArg(kernel, 3, sizeof(cl_ulong), &global_size[2]);
    CHECK_ERROR(err);
    err = clSetKernelArg(kernel, 4, sizeof(cl_ulong), &global_size[0]);
    CHECK_ERROR(err);

    float *bufA, *bufB, *bufC;
    bufA = (float*)malloc(sizeof(float) * global_size[1] * global_size[2]);
    bufB = (float*)malloc(sizeof(float) * global_size[2] * global_size[0]);
    bufC = (float*)malloc(sizeof(float) * global_size[1] * global_size[0]);

    int swA = 0, swB = 0, swC = 0;
    cl_uint num_events = 0;
    cl_event event[3];
    int xIO, yIO, xSM, ySM, xHost, yHost;
    xIO = yIO = xSM = ySM = xHost = yHost = -1;
    for (int i = 0; i < dim[1]; i += global_size[1]) {
        for (int k = 0; k < dim[2]; k += global_size[2]) {
            for (int j = 0; j < dim[0]; j += global_size[0]) {
                if (num_events > 0) {
                    err = clWaitForEvents(num_events, event);
                    CHECK_ERROR(err);
                }

                if (xSM != -1) {
                    err = clEnqueueReadBuffer(queueIO, memC[swC], CL_FALSE, 0,
                        sizeof(float) * global_size[0] * global_size[1],
                        bufC, 0, NULL, &event[1]);
                    CHECK_ERROR(err);
                    swC ^= 1;
                    err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &memC[swC]);
                    CHECK_ERROR(err);
                    xHost = xSM; yHost = ySM;
                }

                num_events = 0;
                if (xIO != -1) {
                    err = clEnqueueNDRangeKernel(queueSM, kernel, 2, NULL,
                        global_size, local_size, 0, NULL, &event[num_events++]);
                    CHECK_ERROR(err);
                    xSM = xIO; ySM = yIO;
                }

                if (xHost != -1) {
                    err = clWaitForEvents(1, &event[1]);
                    CHECK_ERROR(err);
                }

                if (j == 0) {
                    in2buf(a, bufA, global_size[1], global_size[2], dim[2], i, k);
                    err = clEnqueueWriteBuffer(queueIO, memA[swA], CL_FALSE, 0,
                        sizeof(float) * global_size[1] * global_size[2],
                        bufA, 0, NULL, &event[num_events++]);
                    CHECK_ERROR(err);
                    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memA[swA]);
                    CHECK_ERROR(err);
                    swA ^= 1;
                }

                in2buf(b, bufB, global_size[2], global_size[0], dim[0], k, j);
                err = clEnqueueWriteBuffer(queueIO, memB[swB], CL_FALSE, 0,
                    sizeof(float) * global_size[2] * global_size[0],
                    bufB, 0, NULL, &event[num_events++]);
                CHECK_ERROR(err);
                err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &memB[swB]);
                CHECK_ERROR(err);
                swB ^= 1;

                xIO = i; yIO = j;

                if (xHost != -1) {
                    for (int x = 0; x < global_size[1]; ++x) {
                        for (int y = 0; y < global_size[0]; ++y) {
                            c[(xHost + x) * dim[0] + (yHost + y)] += bufC[x * global_size[0] + y];
                        }
                    }
                }
            }
        }
    }

    if (num_events > 0) {
        err = clWaitForEvents(num_events, event);
        CHECK_ERROR(err);
    }

    if (xSM != -1) {
        err = clEnqueueReadBuffer(queueIO, memC[swC], CL_FALSE, 0,
            sizeof(float) * global_size[0] * global_size[1],
            bufC, 0, NULL, &event[1]);
        CHECK_ERROR(err);
        swC ^= 1;
        err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &memC[swC]);
        CHECK_ERROR(err);
        xHost = xSM; yHost = ySM;
    }

    num_events = 0;
    if (xIO != -1) {
        err = clEnqueueNDRangeKernel(queueSM, kernel, 2, NULL,
            global_size, local_size, 0, NULL, &event[num_events++]);
        CHECK_ERROR(err);
        xSM = xIO; ySM = yIO;
    }

    if (xHost != -1) {
        err = clWaitForEvents(1, &event[1]);
        CHECK_ERROR(err);
        for (int x = 0; x < global_size[1]; ++x) {
            for (int y = 0; y < global_size[0]; ++y) {
                c[(xHost + x) * dim[0] + (yHost + y)] += bufC[x * global_size[0] + y];
            }
        }
    }

    if (num_events > 0) {
        err = clWaitForEvents(num_events, event);
        CHECK_ERROR(err);
    }

    if (xSM != -1) {
        err = clEnqueueReadBuffer(queueIO, memC[swC], CL_FALSE, 0,
            sizeof(float) * global_size[0] * global_size[1],
            bufC, 0, NULL, &event[1]);
        CHECK_ERROR(err);
        swC ^= 1;
        err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &memC[swC]);
        CHECK_ERROR(err);
        xHost = xSM; yHost = ySM;
    }

    if (xHost != -1) {
        err = clWaitForEvents(1, &event[1]);
        CHECK_ERROR(err);
        for (int x = 0; x < global_size[1]; ++x) {
            for (int y = 0; y < global_size[0]; ++y) {
                c[(xHost + x) * dim[0] + (yHost + y)] += bufC[x * global_size[0] + y];
            }
        }
    }

    free(bufA);
    free(bufB);
    free(bufC);
    clReleaseMemObject(memA[0]);
    clReleaseMemObject(memA[1]);
    clReleaseMemObject(memB[0]);
    clReleaseMemObject(memB[1]);
    clReleaseMemObject(memC[0]);
    clReleaseMemObject(memC[1]);

    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queueIO);
    clReleaseCommandQueue(queueSM);
    clReleaseContext(context);
}
Esempio n. 6
0
void mat_mul_opencl(float *A, float *B, float *C,
                    int ROW_A, int COL_A, int COL_B) {
    cl_int err;

    cl_platform_id platform;
    err = clGetPlatformIDs(1, &platform, NULL);
    CHECK_ERROR(err);

    cl_device_id device;
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    CHECK_ERROR(err);

    cl_context context;
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    CHECK_ERROR(err);

    cl_command_queue queue;
    queue = clCreateCommandQueue(context, device, 0, &err);
    CHECK_ERROR(err);

    const char *source_code;
    size_t source_size;
    source_code = get_source_code("kernel.cl", &source_size);

    cl_program program;
    program = clCreateProgramWithSource(context, 1, &source_code, &source_size, &err);
    CHECK_ERROR(err);

    err = clBuildProgram(program, 1, &device, "", NULL, NULL);
    if (err == CL_BUILD_PROGRAM_FAILURE) {
        char *log;
        size_t log_size;
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        log = (char*)malloc(log_size + 1);
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
        log[log_size] = 0;
        printf("Compile error:\n%s\n", log);
        free(log);
    }
    CHECK_ERROR(err);

    cl_kernel kernel;
    kernel = clCreateKernel(program, "mat_mul", &err);
    CHECK_ERROR(err);

    cl_mem memA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
        sizeof(float) * ROW_A * COL_A, A, &err);
    CHECK_ERROR(err);
    cl_mem memB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
        sizeof(float) * COL_A * COL_B, B, &err);
    CHECK_ERROR(err);
    cl_mem memC = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
        sizeof(float) * ROW_A * COL_B, NULL, &err);
    CHECK_ERROR(err);

    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memA);
    CHECK_ERROR(err);
    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &memB);
    CHECK_ERROR(err);
    err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &memC);
    CHECK_ERROR(err);
    err = clSetKernelArg(kernel, 3, sizeof(cl_int), &ROW_A);
    CHECK_ERROR(err);
    err = clSetKernelArg(kernel, 4, sizeof(cl_int), &COL_A);
    CHECK_ERROR(err);
    err = clSetKernelArg(kernel, 5, sizeof(cl_int), &COL_B);
    CHECK_ERROR(err);

    size_t global_size[2] = {COL_B, ROW_A};
    size_t local_size[2] = {16, 16};
    for (int i = 0; i < 2; ++i)
        global_size[i] = (global_size[i] + local_size[i] - 1) / local_size[i]
            * local_size[i];

    err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size, local_size,
        0, NULL, NULL);
    CHECK_ERROR(err);

    err = clEnqueueReadBuffer(queue, memC, CL_TRUE, 0, sizeof(float) * ROW_A * COL_B,
        C, 0, NULL, NULL);
    CHECK_ERROR(err);

    clReleaseMemObject(memA);
    clReleaseMemObject(memB);
    clReleaseMemObject(memC);

    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
}
Esempio n. 7
0
File: jit.c Progetto: Tetr4/HQ9Plus
int main(int argc, char **argv)
{
    FILE *file;
    char instruction;
    char* filename;
    struct vector instruction_stream;

    if(argc != 2)
    {
        /* wrong number of args */
        fprintf(stderr, "Error: exactly 1 HQ9+ source file as arg required\n");
        exit(EXIT_FAILURE);
    }

    /* open file */
    filename = argv[1];
    file = fopen(filename, "r");
    if (file == NULL)
    {
        /* could not open file */
        perror("Error opening file");
        exit(EXIT_FAILURE);
    }


    /*** prologue ***/
    vector_create(&instruction_stream, 100);
    char prologue [] = {
        0x55, // push %rbp
        0x48, 0x89, 0xE5, // mov %rsp, %rbp

        // backup %r12 (callee saved register)
        0x41, 0x54, // pushq %r12
        // store %rdi content (putchar) in %r12 as callee saved
        0x49, 0x89, 0xFC, // movq %rdi, %r12

        // push accumulator on stack
        0x6a, 0x00, // pushq $0
    };
    vector_push(&instruction_stream, prologue, sizeof(prologue));

    int stack_offset = -0x10; // offset from %rbp
    int offset_accumulator = stack_offset; // accumulator address: -0x10(%rbp)

    // hello world
    write_to_stack(&instruction_stream, "Hello World\n", &stack_offset);
    int offset_hello_world = stack_offset;

    // source code
    char* source_code = get_source_code(filename);
    write_to_stack(&instruction_stream, source_code, &stack_offset);
    free(source_code);
    int offset_source = stack_offset;

    // lyrics
    char* lyrics = get_lyrics(99);
    write_to_stack(&instruction_stream, lyrics, &stack_offset);
    free(lyrics);
    int offset_bottles = stack_offset;

    // everything after accumulator is text bytes
    int text_bytes_on_stack = -(stack_offset - offset_accumulator);


    /*** parse file ***/
    while((instruction = fgetc(file)) != EOF)
    {
        switch (instruction)
        {
            case 'H':
                {
                    // access single chars of int
                    char *hw = (char*) &offset_hello_world;
                    char opcodes [] = {
                        0xB0, 00, // movb $0, %al
                        0x48, 0x8D, 0xBD, hw[0], hw[1], hw[2], hw[3], // leaq -0x<offset>(%rbp),%rdi
                        0x41, 0xFF, 0xD4 // callq *%r12
                    };
                    vector_push(&instruction_stream, opcodes, sizeof(opcodes));
                }
                break;

            case 'Q':
                {
                    // access single chars of int
                    char *s = (char*) &offset_source;
                    char opcodes [] = {
                        0xB0, 00, // movb $0, %al
                        0x48, 0x8D, 0xBD, s[0], s[1], s[2], s[3], // leaq -0x<offset>(%rbp),%rdi
                        0x41, 0xFF, 0xD4 // callq *%r12
                    };
                    vector_push(&instruction_stream, opcodes, sizeof(opcodes));
                }
                break;

            case '9':
                {
                    // access single chars of int
                    char *b = (char*) &offset_bottles;
                    char opcodes [] = {
                        0xB0, 00, // movb $0, %al
                        0x48, 0x8D, 0xBD, b[0], b[1], b[2], b[3], // leaq -0x<offset>(%rbp),%rdi
                        // 0xBF, 0x39, 0x00, 0x00, 0x00, // mov $0x39, %edi
                        0x41, 0xFF, 0xD4 // callq *%r12
                    };
                    vector_push(&instruction_stream, opcodes, sizeof(opcodes));
                }
                break;

            case '+':
                {
                    char *acc = (char*) &offset_accumulator;
                    char opcodes [] = {
                        // increment the accumulator
                        // TODO from variable instead of constant offset
                        0x48, 0xFF, 0x45, 0xF0, // incq -0x10(%rbp)
                    };
                    vector_push(&instruction_stream, opcodes, sizeof(opcodes));
                }
                break;
        }
    }
    if (!feof(file)) {
        perror("Error reading file");
    }
    fclose(file);


    /*** epilogue ***/
    // access single chars of long
    char *t = (char*) &text_bytes_on_stack;
    char epilogue [] = {
        // free strings
        0x48, 0x81, 0xC4, t[0], t[1], t[2], t[3], // addq $<x>, %rsp

        // free accumulator
        0x48, 0x83, 0xC4, 0x08, // addq $8, %rsp

        // restore callee saved register
        0x41, 0x5C, // popq %r12

        0x5d, // pop rbp
        0xC3 // ret
    };
    vector_push(&instruction_stream, epilogue, sizeof(epilogue));


    /*** invoke generated code ***/
    /* allocate and copy instruction stream into executable memory */
    void* mem = mmap(NULL, instruction_stream.size, PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    memcpy(mem, instruction_stream.data, instruction_stream.size);

    /* typecast memory to a function pointer and call the dynamically created executable code */
    void (*hq9p_program) (fn_printf) = mem;
    hq9p_program(printf);

    /* clear up */
    munmap(mem, instruction_stream.size);
    vector_destroy(&instruction_stream);

    exit(EXIT_SUCCESS);
}
void mandelbrot(int m, int n)
{
  cl_platform_id   *platform;
  cl_device_type   dev_type = CL_DEVICE_TYPE_GPU;
  cl_device_id     *devs = NULL;
  cl_context       context;
  cl_command_queue *cmd_queues;
  cl_program       program;
  cl_kernel        *kernels;
  cl_mem           *mem_R;
  cl_mem		   *mem_G;
  cl_mem		   *mem_B;
  cl_int           err;
  cl_uint          num_platforms;
  cl_uint          num_devs = 0;
  cl_event		   *ev_kernels;

	
		
  int count_max = COUNT_MAX;
  int i, j, jhi, jlo;
  char *output_filename = "mandelbrot.ppm";
  FILE *output_unit;
  double wtime;

  float x_max =   1.25;
  float x_min = - 2.25;
//  float x;
//  float x1;
//  float x2;
  float y_max =   1.75;
  float y_min = - 1.75;
  //float y;
  //float y1;
  //float y2;

  size_t size_color;

  size_color = sizeof(int) * m * n;

  int (*r)[n] = (int (*)[n])calloc(m * n, sizeof(int));
  int (*g)[n] = (int (*)[n])calloc(m * n, sizeof(int));
  int (*b)[n] = (int (*)[n])calloc(m * n, sizeof(int));

  printf( "  Sequential C version\n" );
  printf( "\n" );
  printf( "  Create an ASCII PPM image of the Mandelbrot set.\n" );
  printf( "\n" );
  printf( "  For each point C = X + i*Y\n" );
  printf( "  with X range [%g,%g]\n", x_min, x_max );
  printf( "  and  Y range [%g,%g]\n", y_min, y_max );
  printf( "  carry out %d iterations of the map\n", count_max );
  printf( "  Z(n+1) = Z(n)^2 + C.\n" );
  printf( "  If the iterates stay bounded (norm less than 2)\n" );
  printf( "  then C is taken to be a member of the set.\n" );
  printf( "\n" );
  printf( "  An ASCII PPM image of the set is created using\n" );
  printf( "    M = %d pixels in the X direction and\n", m );
  printf( "    N = %d pixels in the Y direction.\n", n );

  timer_init();
  timer_start(0);

  // Platform
  err = clGetPlatformIDs(0, NULL, &num_platforms);
  CHECK_ERROR(err);
  if (num_platforms == 0) {
    fprintf(stderr, "[%s:%d] ERROR: No OpenCL platform\n", __FILE__,__LINE__);
    exit(EXIT_FAILURE);
  }

  printf("Number of platforms: %u\n", num_platforms);
  platform = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
  err = clGetPlatformIDs(num_platforms, platform, NULL);
  CHECK_ERROR(err);
  
  // Device
  for (i = 0; i < num_platforms; i++) {
    err = clGetDeviceIDs(platform[i], dev_type, 0, NULL, &num_devs);
    if (err != CL_DEVICE_NOT_FOUND) CHECK_ERROR(err);
	num_devs = 1; //**
    if (num_devs >= 1)
	{
		devs = (cl_device_id*)malloc(sizeof(cl_device_id) * num_devs);

		err = clGetDeviceIDs(platform[i], dev_type, num_devs, devs, NULL);
		break;
	}
  }
  if ( devs == NULL || num_devs < 1) {
    fprintf(stderr, "[%s:%d] ERROR: No device\n", __FILE__, __LINE__);
    exit(EXIT_FAILURE);
  }

  for( i = 0; i < num_devs; ++i ) {
	printf("dev[%d] : ", i);
  	print_device_name(devs[i]);
  }

  // Context
  context = clCreateContext(NULL, num_devs, devs, NULL, NULL, &err);
  CHECK_ERROR(err);

  // Command queue
  cmd_queues = (cl_command_queue*)malloc(sizeof(cl_command_queue)*num_devs);
  for( i = 0; i < num_devs; ++i) {
	cmd_queues[i] = clCreateCommandQueue(context, devs[i], 0, &err);
  	CHECK_ERROR(err);
  }

  // Create a program.
  size_t source_len;
  char *source_code = get_source_code("./mandelbrot_kernel.cl", &source_len);
  program = clCreateProgramWithSource(context,
                                      1,
                                      (const char **)&source_code,
                                      &source_len,
                                      &err);
  free(source_code);
  CHECK_ERROR(err);

  // Build the program.
  char build_opts[200];
  sprintf(build_opts, "-Dm=%d -Dn=%d -Dnum_devs=%d", m, n, num_devs);
  err = clBuildProgram(program, num_devs, devs, build_opts, NULL, NULL);
  if (err != CL_SUCCESS) {
    print_build_log(program, devs[0]);
    CHECK_ERROR(err);
  }
  
  // Kernel
  kernels = (cl_kernel*)malloc(sizeof(cl_kernel)*num_devs);
  for (i = 0; i < num_devs; i++) {
	  kernels[i] = clCreateKernel(program, "mandelbrot_kernel", NULL);
  }
 
  // Buffers  
  mem_R = (cl_mem*)malloc(sizeof(cl_mem)*num_devs);
  mem_G = (cl_mem*)malloc(sizeof(cl_mem)*num_devs);
  mem_B = (cl_mem*)malloc(sizeof(cl_mem)*num_devs);

  for(i = 0; i < num_devs; i++) {
	  mem_R[i] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                         size_color / num_devs, NULL, NULL);
	  mem_G[i] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                         size_color / num_devs, NULL, NULL);
	  mem_B[i] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                         size_color / num_devs, NULL, NULL);
  }

/*
  // Write to Buffers
  for(i = 0; i < num_devs; i++) {
  	clEnqueueWriteBuffer(cmd_queues[i],
                         mem_CHECK[i], 
                         CL_FALSE, 0,
                         size_CHECK / num_devs,
                         (CHECK + (N / num_devs) * i),
                         0, NULL, NULL);
  }
*/

  // Set the arguments.
  for(i = 0; i < num_devs; i++) {
//	  flag = i * (m * n / num_devs);
  	clSetKernelArg(kernels[i], 0, sizeof(cl_mem), (void*) &mem_R[i]);
	clSetKernelArg(kernels[i], 1, sizeof(cl_mem), (void*) &mem_G[i]);
  	clSetKernelArg(kernels[i], 2, sizeof(cl_mem), (void*) &mem_B[i]);

	clSetKernelArg(kernels[i], 3, sizeof(int), &count_max);
	clSetKernelArg(kernels[i], 4, sizeof(float), &x_max);
	clSetKernelArg(kernels[i], 5, sizeof(float), &x_min);
	clSetKernelArg(kernels[i], 6, sizeof(float), &y_max);
	clSetKernelArg(kernels[i], 7, sizeof(float), &y_min);
  }

  // Enqueue the kernel.
  size_t lws[1] = {256};
  size_t gws[1] = { m * n /num_devs };
  gws[0] = (size_t)ceil((double)m * n / lws[0]) * lws[0];
  ev_kernels = (cl_event*)malloc(sizeof(cl_event)*num_devs);
  for(i = 0; i < num_devs; i++) {
 	 err = clEnqueueNDRangeKernel(cmd_queues[i], kernels[i], 1, NULL, gws, lws, 0, NULL, &ev_kernels[i]);
  	 CHECK_ERROR(err);
  }

  // Read the result.
  for(i = 0; i < num_devs; i++) {
  	err = clEnqueueReadBuffer(cmd_queues[i],
                            mem_R[i],
                            CL_TRUE, 0,
                            size_color / num_devs,
                            r,
                            1, &ev_kernels[i], NULL);
  	err = clEnqueueReadBuffer(cmd_queues[i],
                            mem_G[i],
                            CL_TRUE, 0,
                            size_color / num_devs,
                            g,
							1, &ev_kernels[i], NULL);
   	err = clEnqueueReadBuffer(cmd_queues[i],
                            mem_B[i],
                            CL_TRUE, 0,
							size_color / num_devs,
							b,
                            1, &ev_kernels[i], NULL);
  }

  // Release
  for( i = 0; i < num_devs; ++i ) {
  clFinish(cmd_queues[i]); 
  clReleaseMemObject(mem_R[i]);
  clReleaseMemObject(mem_G[i]);
  clReleaseMemObject(mem_B[i]);
  clReleaseKernel(kernels[i]);
  clReleaseCommandQueue(cmd_queues[i]);
  clReleaseEvent(ev_kernels[i]);
  }
  clReleaseProgram(program);
  clReleaseContext(context);
  free(mem_R);
  free(mem_G);
  free(mem_B);
  free(cmd_queues);
  free(kernels);
  free(devs);
  free(ev_kernels);
  free(platform);

  timer_stop(0);
  wtime = timer_read(0);
  printf( "\n" );
  printf( "  Time = %lf seconds.\n", wtime );

  // Write data to an ASCII PPM file.
  output_unit = fopen( output_filename, "wt" );

  fprintf( output_unit, "P3\n" );
  fprintf( output_unit, "%d  %d\n", n, m );
  fprintf( output_unit, "%d\n", 255 );
  for ( i = 0; i < m; i++ )
  {
    for ( jlo = 0; jlo < n; jlo = jlo + 4 )
    {
      jhi = MIN( jlo + 4, n );
      for ( j = jlo; j < jhi; j++ )
      {
        fprintf( output_unit, "  %d  %d  %d", r[i][j], g[i][j], b[i][j] );
      }
      fprintf( output_unit, "\n" );
    }
  }

  fclose( output_unit );
  printf( "\n" );
  printf( "  Graphics data written to \"%s\".\n\n", output_filename );

  // Terminate.
  free(r);
  free(g);
  free(b);
}