Пример #1
0
static void release_opencl()
{
  DTIMER_START(T_RELEASE);

  free(g_chk);

  clReleaseMemObject(m_u);
  clReleaseMemObject(m_u0);
  clReleaseMemObject(m_u1);
  clReleaseMemObject(m_twiddle);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    clReleaseMemObject(m_ty1);
    clReleaseMemObject(m_ty2);
  }
  clReleaseMemObject(m_chk);

  clReleaseKernel(k_compute_indexmap);
  clReleaseKernel(k_compute_ics);
  clReleaseKernel(k_cffts1);
  clReleaseKernel(k_cffts2);
  clReleaseKernel(k_cffts3);
  clReleaseKernel(k_evolve);
  clReleaseKernel(k_checksum);

  clReleaseProgram(program);
  clReleaseCommandQueue(cmd_queue);
  clReleaseContext(context);

  DTIMER_STOP(T_RELEASE);

#ifdef TIMER_DETAIL
  print_opencl_timers();
#endif
}
Пример #2
0
//---------------------------------------------------------------------
// set the boundary values of dependent variables
//---------------------------------------------------------------------
void setbv()
{
  DTIMER_START(t_setbv);

  cl_int ecode;

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_setbv1,
                                 SETBV1_DIM, NULL,
                                 setbv1_gws,
                                 setbv1_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_setbv2,
                                 SETBV2_DIM, NULL,
                                 setbv2_gws,
                                 setbv2_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_setbv3,
                                 SETBV3_DIM, NULL,
                                 setbv3_gws,
                                 setbv3_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  CHECK_FINISH();

  DTIMER_STOP(t_setbv);
}
Пример #3
0
void release_opencl()
{
  DTIMER_START(T_RELEASE);

  clReleaseMemObject(pgq);
  clReleaseMemObject(pgsx);
  clReleaseMemObject(pgsy);

  clReleaseKernel(kernel);
  clReleaseProgram(program);
  clReleaseCommandQueue(cmd_queue);
  clReleaseContext(context);

  DTIMER_STOP(T_RELEASE);

#ifdef TIMER_DETAIL
  if (timers_enabled) {
    int i;
    double tt;
    double t_opencl = 0.0, t_buffer = 0.0, t_kernel = 0.0;
    unsigned cnt;

    for (i = T_OPENCL_API; i < T_END; i++)
      t_opencl += timer_read(i);

    for (i = T_BUFFER_CREATE; i <= T_BUFFER_WRITE; i++)
      t_buffer += timer_read(i);

    for (i = T_KERNEL_EMBAR; i <= T_KERNEL_EMBAR; i++)
      t_kernel += timer_read(i);

    printf("\nOpenCL timers -\n");
    printf("Kernel    : %9.3f (%.2f%%)\n", 
        t_kernel, t_kernel/t_opencl * 100.0);

    cnt = timer_count(T_KERNEL_EMBAR);
    tt = timer_read(T_KERNEL_EMBAR);
    printf("- embar   : %9.3lf (%u, %.3f, %.2f%%)\n",
        tt, cnt, tt/cnt, tt/t_kernel * 100.0);

    printf("Buffer    : %9.3lf (%.2f%%)\n",
        t_buffer, t_buffer/t_opencl * 100.0);
    printf("- creation: %9.3lf\n", timer_read(T_BUFFER_CREATE));
    printf("- read    : %9.3lf\n", timer_read(T_BUFFER_READ));
    printf("- write   : %9.3lf\n", timer_read(T_BUFFER_WRITE));

    tt = timer_read(T_OPENCL_API);
    printf("API       : %9.3lf (%.2f%%)\n", tt, tt/t_opencl * 100.0);

    tt = timer_read(T_BUILD);
    printf("BUILD     : %9.3lf (%.2f%%)\n", tt, tt/t_opencl * 100.0);

    tt = timer_read(T_RELEASE);
    printf("RELEASE   : %9.3lf (%.2f%%)\n", tt, tt/t_opencl * 100.0);

    printf("Total     : %9.3lf\n", t_opencl);
  }
#endif
}
Пример #4
0
//---------------------------------------------------------------------
// touch all the big data
//---------------------------------------------------------------------
static void init_ui(cl_mem *u0, cl_mem *u1, cl_mem *twiddle,
                    int d1, int d2, int d3)
{
  cl_kernel k_init_ui;
  cl_int ecode;

  DTIMER_START(T_OPENCL_API);
  // Create a kernel
  k_init_ui = clCreateKernel(program, "init_ui", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for init_ui");
  DTIMER_STOP(T_OPENCL_API);

  int n = d3 * d2 * (d1+1);
  ecode  = clSetKernelArg(k_init_ui, 0, sizeof(cl_mem), (void*)u0);
  ecode |= clSetKernelArg(k_init_ui, 1, sizeof(cl_mem), (void*)u1);
  ecode |= clSetKernelArg(k_init_ui, 2, sizeof(cl_mem), (void*)twiddle);
  ecode |= clSetKernelArg(k_init_ui, 3, sizeof(int), (void*)&n);
  clu_CheckError(ecode, "clSetKernelArg() for init_ui");

  size_t local_ws = work_item_sizes[0];
  size_t global_ws = clu_RoundWorkSize((size_t)n, local_ws);
  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_init_ui,
                                 1, NULL,
                                 &global_ws,
                                 &local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel() for init_ui");

  ecode = clFinish(cmd_queue);
  clu_CheckError(ecode, "clFinish()");

  DTIMER_START(T_RELEASE);
  clReleaseKernel(k_init_ui);
  DTIMER_STOP(T_RELEASE);
}
Пример #5
0
void create_seq( double seed, double a )
{
  cl_kernel k_cs;
  cl_int    ecode;
  size_t cs_lws[1], cs_gws[1];

  DTIMER_START(T_OPENCL_API);
  // Create a kernel
  k_cs = clCreateKernel(program, "create_seq", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for create_seq");
  DTIMER_STOP(T_OPENCL_API);

  DTIMER_START(T_KERNEL_CREATE_SEQ);
  // Set kernel arguments
  ecode  = clSetKernelArg(k_cs, 0, sizeof(cl_mem), (void*)&m_key_array);
  ecode |= clSetKernelArg(k_cs, 1, sizeof(cl_double), (void*)&seed);
  ecode |= clSetKernelArg(k_cs, 2, sizeof(cl_double), (void*)&a);
  clu_CheckError(ecode, "clSetKernelArg() for create_seq");

  // Launch the kernel
  cs_lws[0] = CREATE_SEQ_GROUP_SIZE;
  cs_gws[0] = CREATE_SEQ_GLOBAL_SIZE;
  ecode = clEnqueueNDRangeKernel(cmd_queue, k_cs, 1, NULL,
                                 cs_gws, 
                                 cs_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel() for create_seq");

  ecode = clFinish(cmd_queue);
  clu_CheckError(ecode, "clFinish");
  DTIMER_STOP(T_KERNEL_CREATE_SEQ);

  DTIMER_START(T_RELEASE);
  clReleaseKernel(k_cs);
  DTIMER_STOP(T_RELEASE);
}
Пример #6
0
//---------------------------------------------------------------------
// Fill in array u0 with initial conditions from 
// random number generator 
//---------------------------------------------------------------------
static void compute_initial_conditions(cl_mem *u0, int d1, int d2, int d3)
{
  int k;
  double start, an, dummy, starts[NZ];
  size_t local_ws, global_ws, temp;
  cl_mem m_starts;
  cl_int ecode;

  start = SEED;
  //---------------------------------------------------------------------
  // Jump to the starting element for our first plane.
  //---------------------------------------------------------------------
  an = ipow46(A, 0);
  dummy = randlc(&start, an);
  an = ipow46(A, 2*NX*NY);

  starts[0] = start;
  for (k = 1; k < dims[2]; k++) {
    dummy = randlc(&start, an);
    starts[k] = start;
  }

  if (device_type == CL_DEVICE_TYPE_CPU) {
    m_starts = clCreateBuffer(context,
                              CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                              sizeof(double) * NZ,
                              starts, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_starts");

    local_ws  = 1;
    global_ws = clu_RoundWorkSize((size_t)d2, local_ws);
  } else { //GPU
    m_starts = clCreateBuffer(context,
                              CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                              sizeof(double) * NZ,
                              starts,
                              &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_starts");

    temp = d2 / max_compute_units;
    local_ws  = temp == 0 ? 
                1 : ((temp > work_item_sizes[0]) ? work_item_sizes[0] : temp);
    global_ws = clu_RoundWorkSize((size_t)d2, local_ws);
  }

  ecode  = clSetKernelArg(k_compute_ics, 0, sizeof(cl_mem), u0);
  ecode |= clSetKernelArg(k_compute_ics, 1, sizeof(cl_mem), &m_starts);
  clu_CheckError(ecode, "clSetKernelArg() for compute_initial_conditions");

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_compute_ics,
                                 1, NULL,
                                 &global_ws,
                                 &local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");

  ecode = clFinish(cmd_queue);
  clu_CheckError(ecode, "clFinish()");

  DTIMER_START(T_RELEASE);
  clReleaseMemObject(m_starts);
  DTIMER_STOP(T_RELEASE);
}
Пример #7
0
int main(int argc, char *argv[])
{
  int i;
  int iter;
  double total_time, mflops;
  logical verified;
  char Class;

  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
    exit(-1);
  }

  //---------------------------------------------------------------------
  // Run the entire problem once to make sure all data is touched. 
  // This reduces variable startup costs, which is important for such a 
  // short benchmark. The other NPB 2 implementations are similar. 
  //---------------------------------------------------------------------
  for (i = 1; i <= T_max; i++) {
    timer_clear(i);
  }
  setup();
  setup_opencl(argc, argv);
  init_ui(&m_u0, &m_u1, &m_twiddle, dims[0], dims[1], dims[2]);
  compute_indexmap(&m_twiddle, dims[0], dims[1], dims[2]);
  compute_initial_conditions(&m_u1, dims[0], dims[1], dims[2]);
  fft_init(dims[0]);
  fft(1, &m_u1, &m_u0);

  //---------------------------------------------------------------------
  // Start over from the beginning. Note that all operations must
  // be timed, in contrast to other benchmarks. 
  //---------------------------------------------------------------------
  for (i = 1; i <= T_max; i++) {
    timer_clear(i);
  }

  timer_start(T_total);
  if (timers_enabled) timer_start(T_setup);

  DTIMER_START(T_compute_im);
  compute_indexmap(&m_twiddle, dims[0], dims[1], dims[2]);
  DTIMER_STOP(T_compute_im);

  DTIMER_START(T_compute_ics);
  compute_initial_conditions(&m_u1, dims[0], dims[1], dims[2]);
  DTIMER_STOP(T_compute_ics);

  DTIMER_START(T_fft_init);
  fft_init(dims[0]);
  DTIMER_STOP(T_fft_init);

  if (timers_enabled) timer_stop(T_setup);
  if (timers_enabled) timer_start(T_fft);
  fft(1, &m_u1, &m_u0);
  if (timers_enabled) timer_stop(T_fft);

  for (iter = 1; iter <= niter; iter++) {
    if (timers_enabled) timer_start(T_evolve);
    evolve(&m_u0, &m_u1, &m_twiddle, dims[0], dims[1], dims[2]);
    if (timers_enabled) timer_stop(T_evolve);
    if (timers_enabled) timer_start(T_fft);
    fft(-1, &m_u1, &m_u1);
    if (timers_enabled) timer_stop(T_fft);
    if (timers_enabled) timer_start(T_checksum);
    checksum(iter, &m_u1, dims[0], dims[1], dims[2]);
    if (timers_enabled) timer_stop(T_checksum);
  }

  verify(NX, NY, NZ, niter, &verified, &Class);

  timer_stop(T_total);
  total_time = timer_read(T_total);

  if (total_time != 0.0) {
    mflops = 1.0e-6 * (double)NTOTAL *
            (14.8157 + 7.19641 * log((double)NTOTAL)
            + (5.23518 + 7.21113 * log((double)NTOTAL)) * niter)
            / total_time;
  } else {
    mflops = 0.0;
  }
  c_print_results("FT", Class, NX, NY, NZ, niter,
                  total_time, mflops, "          floating point", verified, 
                  NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7,
                  clu_GetDeviceTypeName(device_type),
                  device_name);
  if (timers_enabled) print_timers();

  release_opencl();

  fflush(stdout);

  return 0;
}
Пример #8
0
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//---------------------------------------------------------------------
static void setup_opencl(int argc, char *argv[])
{
  size_t temp;
  cl_int ecode;
  char *source_dir = "FT";
  if (argc > 1) source_dir = argv[1];

#ifdef TIMER_DETAIL
  if (timers_enabled) {
    int i;
    for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i);
  }
#endif

  DTIMER_START(T_OPENCL_API);

  // 1. Find the default device type and get a device for the device type
  device_type = clu_GetDefaultDeviceType();
  device      = clu_GetAvailableDevice(device_type);
  device_name = clu_GetDeviceName(device);

  // Device information
  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_ITEM_SIZES,
                          sizeof(work_item_sizes),
                          &work_item_sizes,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_GROUP_SIZE,
                          sizeof(size_t),
                          &max_work_group_size,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  // FIXME: The below values are experimental.
  if (max_work_group_size > 64) {
    max_work_group_size = 64;
    int i;
    for (i = 0; i < 3; i++) {
      if (work_item_sizes[i] > 64) {
        work_item_sizes[i] = 64;
      }
    }
  }

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_COMPUTE_UNITS,
                          sizeof(cl_uint),
                          &max_compute_units,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  // 2. Create a context for the specified device
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode);
  clu_CheckError(ecode, "clCreateContext()");

  // 3. Create a command queue
  cmd_queue = clCreateCommandQueue(context, device, 0, &ecode);
  clu_CheckError(ecode, "clCreateCommandQueue()");

  DTIMER_STOP(T_OPENCL_API);

  // 4. Build the program
  DTIMER_START(T_BUILD);
  char *source_file;
  char build_option[50];
  if (device_type == CL_DEVICE_TYPE_CPU) {
    source_file = "ft_cpu.cl";
    sprintf(build_option, "-I. -DCLASS=%d -DUSE_CPU", CLASS);

    COMPUTE_IMAP_DIM = COMPUTE_IMAP_DIM_CPU;
    EVOLVE_DIM = EVOLVE_DIM_CPU;
    CFFTS_DIM = CFFTS_DIM_CPU;

  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    char vendor[50];
    ecode = clGetDeviceInfo(device, CL_DEVICE_VENDOR, 50, vendor, NULL);
    clu_CheckError(ecode, "clGetDeviceInfo()");
    if (strncmp(vendor, DEV_VENDOR_NVIDIA, strlen(DEV_VENDOR_NVIDIA)) == 0) {
      source_file = "ft_gpu_nvidia.cl";
      CFFTS_LSIZE = 32;
    } else {
      source_file = "ft_gpu.cl";
      CFFTS_LSIZE = 64;
    }

    sprintf(build_option, "-I. -DCLASS=\'%c\' -DLSIZE=%lu",
            CLASS, CFFTS_LSIZE);

    COMPUTE_IMAP_DIM = COMPUTE_IMAP_DIM_GPU;
    EVOLVE_DIM = EVOLVE_DIM_GPU;
    CFFTS_DIM = CFFTS_DIM_GPU;

  } else {
    fprintf(stderr, "Set the environment variable OPENCL_DEVICE_TYPE!\n");
    exit(EXIT_FAILURE);
  }
  program = clu_MakeProgram(context, device, source_dir, source_file,
                            build_option);
  DTIMER_STOP(T_BUILD);

  // 5. Create buffers
  DTIMER_START(T_BUFFER_CREATE);
  m_u = clCreateBuffer(context,
                       CL_MEM_READ_ONLY,
                       sizeof(dcomplex) * NXP,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u");

  m_u0 = clCreateBuffer(context,
                        CL_MEM_READ_WRITE,
                        sizeof(dcomplex) * NTOTALP,
                        NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u0");

  m_u1 = clCreateBuffer(context,
                        CL_MEM_READ_WRITE,
                        sizeof(dcomplex) * NTOTALP,
                        NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u1");

  m_twiddle = clCreateBuffer(context,
                             CL_MEM_READ_WRITE,
                             sizeof(double) * NTOTALP,
                             NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_twiddle");

  if (device_type == CL_DEVICE_TYPE_CPU) {
    size_t ty1_size, ty2_size;
    if (CFFTS_DIM == 2) {
      ty1_size = sizeof(dcomplex) * NX * NY * NZ;
      ty2_size = sizeof(dcomplex) * NX * NY * NZ;
    } else {
      fprintf(stderr, "Wrong CFFTS_DIM: %u\n", CFFTS_DIM);
      exit(EXIT_FAILURE);
    }

    m_ty1 = clCreateBuffer(context,
                           CL_MEM_READ_WRITE,
                           ty1_size,
                           NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_ty1");

    m_ty2 = clCreateBuffer(context,
                           CL_MEM_READ_WRITE,
                           ty2_size,
                           NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_ty2");
  }

  if (device_type == CL_DEVICE_TYPE_CPU) {
    temp = 1024 / max_compute_units;
    checksum_local_ws  = temp == 0 ? 1 : temp;
    checksum_global_ws = clu_RoundWorkSize((size_t)1024, checksum_local_ws);
  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    checksum_local_ws  = 32;
    checksum_global_ws = clu_RoundWorkSize((size_t)1024, checksum_local_ws);
  }
  checksum_wg_num = checksum_global_ws / checksum_local_ws;
  m_chk = clCreateBuffer(context,
                         CL_MEM_READ_WRITE,
                         sizeof(dcomplex) * checksum_wg_num,
                         NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_chk");
  g_chk = (dcomplex *)malloc(sizeof(dcomplex) * checksum_wg_num);
  DTIMER_STOP(T_BUFFER_CREATE);

  // 6. Create kernels
  DTIMER_START(T_OPENCL_API);
  double ap = -4.0 * ALPHA * PI * PI;
  int d1 = dims[0];
  int d2 = dims[1];
  int d3 = dims[2];

  k_compute_indexmap = clCreateKernel(program, "compute_indexmap", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for compute_indexmap");
  ecode  = clSetKernelArg(k_compute_indexmap, 0, sizeof(cl_mem), &m_twiddle);
  ecode |= clSetKernelArg(k_compute_indexmap, 1, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_compute_indexmap, 2, sizeof(int), &d2);
  ecode |= clSetKernelArg(k_compute_indexmap, 3, sizeof(int), &d3);
  ecode |= clSetKernelArg(k_compute_indexmap, 4, sizeof(double), &ap);
  clu_CheckError(ecode, "clSetKernelArg() for compute_indexmap");
  if (COMPUTE_IMAP_DIM == 3) {
    cimap_lws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0];
    temp = max_work_group_size / cimap_lws[0];
    cimap_lws[1] = d2 < temp ? d2 : temp;
    temp = temp / cimap_lws[1];
    cimap_lws[2] = d3 < temp ? d3 : temp;

    cimap_gws[0] = clu_RoundWorkSize((size_t)d1, cimap_lws[0]);
    cimap_gws[1] = clu_RoundWorkSize((size_t)d2, cimap_lws[1]);
    cimap_gws[2] = clu_RoundWorkSize((size_t)d3, cimap_lws[2]);
  } else if (COMPUTE_IMAP_DIM == 2) {
    cimap_lws[0] = d2 < work_item_sizes[0] ? d2 : work_item_sizes[0];
    temp = max_work_group_size / cimap_lws[0];
    cimap_lws[1] = d3 < temp ? d3 : temp;

    cimap_gws[0] = clu_RoundWorkSize((size_t)d2, cimap_lws[0]);
    cimap_gws[1] = clu_RoundWorkSize((size_t)d3, cimap_lws[1]);
  } else {
    //temp = d3 / max_compute_units;
    temp = 1;
    cimap_lws[0] = temp == 0 ? 1 : temp;
    cimap_gws[0] = clu_RoundWorkSize((size_t)d3, cimap_lws[0]);
  }

  k_compute_ics = clCreateKernel(program,
                                 "compute_initial_conditions", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for compute_initial_conditions");
  ecode  = clSetKernelArg(k_compute_ics, 2, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_compute_ics, 3, sizeof(int), &d2);
  ecode |= clSetKernelArg(k_compute_ics, 4, sizeof(int), &d3);
  clu_CheckError(ecode, "clSetKernelArg() for compute_initial_conditions");

  k_cffts1 = clCreateKernel(program, "cffts1", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for cffts1");
  ecode  = clSetKernelArg(k_cffts1, 2, sizeof(cl_mem), &m_u);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_cffts1, 8, sizeof(cl_mem), &m_ty1);
    ecode |= clSetKernelArg(k_cffts1, 9, sizeof(cl_mem), &m_ty2);
  }
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts1");

  k_cffts2 = clCreateKernel(program, "cffts2", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for cffts2");
  ecode  = clSetKernelArg(k_cffts2, 2, sizeof(cl_mem), &m_u);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_cffts2, 8, sizeof(cl_mem), &m_ty1);
    ecode |= clSetKernelArg(k_cffts2, 9, sizeof(cl_mem), &m_ty2);
  }
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts2");

  k_cffts3 = clCreateKernel(program, "cffts3", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for cffts3");
  ecode  = clSetKernelArg(k_cffts3, 2, sizeof(cl_mem), &m_u);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_cffts3, 8, sizeof(cl_mem), &m_ty1);
    ecode |= clSetKernelArg(k_cffts3, 9, sizeof(cl_mem), &m_ty2);
  }
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts3");

  k_evolve = clCreateKernel(program, "evolve", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for evolve");

  k_checksum = clCreateKernel(program, "checksum", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for checksum");
  ecode  = clSetKernelArg(k_checksum, 1, sizeof(cl_mem), &m_chk);
  ecode |= clSetKernelArg(k_checksum, 2, sizeof(dcomplex)*checksum_local_ws,
                          NULL);
  ecode |= clSetKernelArg(k_checksum, 3, sizeof(int), &dims[0]);
  ecode |= clSetKernelArg(k_checksum, 4, sizeof(int), &dims[1]);
  clu_CheckError(ecode, "clSetKernelArg() for checksum");
  DTIMER_STOP(T_OPENCL_API);
}
Пример #9
0
static void release_opencl()
{
  DTIMER_START(T_RELEASE);

  clReleaseMemObject(m_key_array);
  clReleaseMemObject(m_key_buff1);
  clReleaseMemObject(m_key_buff2);
  clReleaseMemObject(m_index_array);
  clReleaseMemObject(m_rank_array);
  clReleaseMemObject(m_partial_vals);
  clReleaseMemObject(m_passed_verification);
  if (device_type == CL_DEVICE_TYPE_GPU) {
    clReleaseMemObject(m_key_scan);
    clReleaseMemObject(m_sum);
  } else {
    clReleaseMemObject(m_bucket_ptrs);
    clReleaseMemObject(m_bucket_size);
  }

  clReleaseKernel(k_rank0);
  clReleaseKernel(k_rank1);
  clReleaseKernel(k_rank2);
  if (device_type == CL_DEVICE_TYPE_GPU) {
    clReleaseKernel(k_rank3_0);
    clReleaseKernel(k_rank3_1);
    clReleaseKernel(k_rank3_2);
  } else {
    clReleaseKernel(k_rank3);
  }
  clReleaseKernel(k_rank4);

  clReleaseProgram(program);
  clReleaseCommandQueue(cmd_queue);
  clReleaseContext(context);

  DTIMER_STOP(T_RELEASE);

#ifdef TIMER_DETAIL
  if (timer_on) {
    int i;
    double tt;
    double t_opencl = 0.0, t_buffer = 0.0, t_kernel = 0.0;
    unsigned cnt;

    for (i = T_OPENCL_API; i < T_END; i++)
      t_opencl += timer_read(i);

    for (i = T_BUFFER_CREATE; i <= T_BUFFER_WRITE; i++)
      t_buffer += timer_read(i);

    for (i = T_KERNEL_CREATE_SEQ; i <= T_KERNEL_FV2; i++)
      t_kernel += timer_read(i);

    printf("\nOpenCL timers -\n");
    printf("Kernel      : %9.3f (%.2f%%)\n", 
        t_kernel, t_kernel/t_opencl * 100.0);

    cnt = timer_count(T_KERNEL_CREATE_SEQ);
    tt = timer_read(T_KERNEL_CREATE_SEQ);
    printf("- create_seq: %9.3lf (%u, %.3f, %.2f%%)\n",
        tt, cnt, tt/cnt, tt/t_kernel * 100.0);

    cnt = timer_count(T_KERNEL_RANK0);
    tt = timer_read(T_KERNEL_RANK0);
    printf("- rank0     : %9.3lf (%u, %.3f, %.2f%%)\n",
        tt, cnt, tt/cnt, tt/t_kernel * 100.0);

    cnt = timer_count(T_KERNEL_RANK1);
    tt = timer_read(T_KERNEL_RANK1);
    printf("- rank1     : %9.3lf (%u, %.3f, %.2f%%)\n",
        tt, cnt, tt/cnt, tt/t_kernel * 100.0);

    cnt = timer_count(T_KERNEL_RANK2);
    tt = timer_read(T_KERNEL_RANK2);
    printf("- rank2     : %9.3lf (%u, %.3f, %.2f%%)\n",
        tt, cnt, tt/cnt, tt/t_kernel * 100.0);

    cnt = timer_count(T_KERNEL_RANK3);
    tt = timer_read(T_KERNEL_RANK3);
    printf("- rank3     : %9.3lf (%u, %.3f, %.2f%%)\n",
        tt, cnt, tt/cnt, tt/t_kernel * 100.0);

    cnt = timer_count(T_KERNEL_RANK4);
    tt = timer_read(T_KERNEL_RANK4);
    printf("- rank4     : %9.3lf (%u, %.3f, %.2f%%)\n",
        tt, cnt, tt/cnt, tt/t_kernel * 100.0);

    cnt = timer_count(T_KERNEL_FV0);
    tt = timer_read(T_KERNEL_FV0);
    printf("- fv0       : %9.3lf (%u, %.3f, %.2f%%)\n",
        tt, cnt, tt/cnt, tt/t_kernel * 100.0);

    cnt = timer_count(T_KERNEL_FV1);
    tt = timer_read(T_KERNEL_FV1);
    printf("- fv1       : %9.3lf (%u, %.3f, %.2f%%)\n",
        tt, cnt, tt/cnt, tt/t_kernel * 100.0);

    cnt = timer_count(T_KERNEL_FV2);
    tt = timer_read(T_KERNEL_FV2);
    printf("- fv2       : %9.3lf (%u, %.3f, %.2f%%)\n",
        tt, cnt, tt/cnt, tt/t_kernel * 100.0);

    printf("Buffer      : %9.3lf (%.2f%%)\n",
        t_buffer, t_buffer/t_opencl * 100.0);
    printf("- creation  : %9.3lf\n", timer_read(T_BUFFER_CREATE));
    printf("- read      : %9.3lf\n", timer_read(T_BUFFER_READ));
    printf("- write     : %9.3lf\n", timer_read(T_BUFFER_WRITE));

    tt = timer_read(T_OPENCL_API);
    printf("API         : %9.3lf (%.2f%%)\n", tt, tt/t_opencl * 100.0);

    tt = timer_read(T_BUILD);
    printf("BUILD       : %9.3lf (%.2f%%)\n", tt, tt/t_opencl * 100.0);

    tt = timer_read(T_RELEASE);
    printf("RELEASE     : %9.3lf (%.2f%%)\n", tt, tt/t_opencl * 100.0);

    printf("Total       : %9.3lf\n", t_opencl);
  }
#endif
}
Пример #10
0
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//---------------------------------------------------------------------
static void setup_opencl(int argc, char *argv[])
{
  cl_int ecode;
  char *source_dir = "IS";
  if (argc > 1) source_dir = argv[1];

#ifdef TIMER_DETAIL
  if (timer_on) {
    int i;
    for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i);
  }
#endif

  DTIMER_START(T_OPENCL_API);

  // 1. Find the default device type and get a device for the device type
  device_type = clu_GetDefaultDeviceType();
  device      = clu_GetAvailableDevice(device_type);
  device_name = clu_GetDeviceName(device);

  // Device information
  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_ITEM_SIZES,
                          sizeof(work_item_sizes),
                          &work_item_sizes,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_GROUP_SIZE,
                          sizeof(size_t),
                          &max_work_group_size,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  // FIXME: The below values are experimental.
  if (max_work_group_size > 256) {
    max_work_group_size = 256;
    int i;
    for (i = 0; i < 3; i++) {
      if (work_item_sizes[i] > 256) {
        work_item_sizes[i] = 256;
      }
    }
  }

  // 2. Create a context for the specified device
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode);
  clu_CheckError(ecode, "clCreateContext()");

  // 3. Create a command queue
  cmd_queue = clCreateCommandQueue(context, device, 0, &ecode);
  clu_CheckError(ecode, "clCreateCommandQueue()");

  DTIMER_STOP(T_OPENCL_API);

  // 4. Build the program
  DTIMER_START(T_BUILD);
  char *source_file;
  char build_option[30];
  if (device_type == CL_DEVICE_TYPE_CPU) {
    source_file = "is_cpu.cl";
    sprintf(build_option, "-DCLASS=%d -I.", CLASS);

    CREATE_SEQ_GROUP_SIZE = 64;
    CREATE_SEQ_GLOBAL_SIZE = CREATE_SEQ_GROUP_SIZE * 256;
    RANK_GROUP_SIZE = 1;
    RANK_GLOBAL_SIZE = RANK_GROUP_SIZE * 128;
    RANK1_GROUP_SIZE = 1;
    RANK1_GLOBAL_SIZE = RANK1_GROUP_SIZE * RANK_GLOBAL_SIZE;;
    RANK2_GROUP_SIZE = RANK_GROUP_SIZE;
    RANK2_GLOBAL_SIZE = RANK_GLOBAL_SIZE;;
    FV2_GROUP_SIZE = 64;
    FV2_GLOBAL_SIZE = FV2_GROUP_SIZE * 256;
  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    source_file = "is_gpu.cl";
    sprintf(build_option, "-DCLASS=\'%c\' -I.", CLASS);

    CREATE_SEQ_GROUP_SIZE = 64;
    CREATE_SEQ_GLOBAL_SIZE = CREATE_SEQ_GROUP_SIZE * 256;
    RANK1_GROUP_SIZE = work_item_sizes[0];
    RANK1_GLOBAL_SIZE = MAX_KEY;
    RANK2_GROUP_SIZE = work_item_sizes[0];
    RANK2_GLOBAL_SIZE = NUM_KEYS;
    FV2_GROUP_SIZE = work_item_sizes[0];
    FV2_GLOBAL_SIZE = NUM_KEYS;
  } else {
    fprintf(stderr, "%s: not supported.", clu_GetDeviceTypeName(device_type));
    exit(EXIT_FAILURE);
  }
  program = clu_MakeProgram(context, device, source_dir, source_file,
                            build_option);
  DTIMER_STOP(T_BUILD);

  // 5. Create buffers
  DTIMER_START(T_BUFFER_CREATE);
  m_key_array = clCreateBuffer(context,
                               CL_MEM_READ_WRITE,
                               sizeof(INT_TYPE) * SIZE_OF_BUFFERS,
                               NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_key_array");

  m_key_buff1 = clCreateBuffer(context,
                               CL_MEM_READ_WRITE,
                               sizeof(INT_TYPE) * MAX_KEY,
                               NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_key_buff1");

  m_key_buff2 = clCreateBuffer(context,
                               CL_MEM_READ_WRITE,
                               sizeof(INT_TYPE) * SIZE_OF_BUFFERS,
                               NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_key_buff2");

  size_t test_array_size = sizeof(INT_TYPE) * TEST_ARRAY_SIZE;
  m_index_array = clCreateBuffer(context,
                                 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                 test_array_size,
                                 test_index_array, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_index_array");

  m_rank_array = clCreateBuffer(context,
                                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                test_array_size,
                                test_rank_array, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_rank_array");

  m_partial_vals = clCreateBuffer(context,
                                  CL_MEM_WRITE_ONLY,
                                  test_array_size,
                                  NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_partial_vals");

  m_passed_verification = clCreateBuffer(context,
                                         CL_MEM_READ_WRITE,
                                         sizeof(cl_int),
                                         NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_passed_verification");

  if (device_type == CL_DEVICE_TYPE_GPU) {
    m_key_scan = clCreateBuffer(context,
                                CL_MEM_READ_WRITE,
                                sizeof(INT_TYPE) * MAX_KEY,
                                NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_key_buff1_scan");

    m_sum = clCreateBuffer(context,
                           CL_MEM_READ_WRITE,
                           sizeof(INT_TYPE) * work_item_sizes[0],
                           NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_sum");
  } else {
    size_t bs_size = RANK_GLOBAL_SIZE * sizeof(INT_TYPE) * NUM_BUCKETS;
    m_bucket_size = clCreateBuffer(context,
                                   CL_MEM_READ_WRITE,
                                   bs_size,
                                   NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_bucket_size");

    m_bucket_ptrs = clCreateBuffer(context,
                                   CL_MEM_READ_WRITE,
                                   bs_size,
                                   NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_bucket_ptrs");
  }
  DTIMER_STOP(T_BUFFER_CREATE);

  // 6. Create kernels
  DTIMER_START(T_OPENCL_API);
  k_rank0 = clCreateKernel(program, "rank0", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rank0");
  ecode  = clSetKernelArg(k_rank0, 0, sizeof(cl_mem), (void*)&m_key_array);
  ecode |= clSetKernelArg(k_rank0, 1, sizeof(cl_mem), (void*)&m_partial_vals);
  ecode |= clSetKernelArg(k_rank0, 2, sizeof(cl_mem), (void*)&m_index_array);
  clu_CheckError(ecode, "clSetKernelArg() for rank0");

  if (device_type == CL_DEVICE_TYPE_GPU) {
    k_rank1 = clCreateKernel(program, "rank1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank1");
    ecode  = clSetKernelArg(k_rank1, 0, sizeof(cl_mem), (void*)&m_key_buff1);
    clu_CheckError(ecode, "clSetKernelArg() for rank1");

    k_rank2 = clCreateKernel(program, "rank2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank2");
    ecode  = clSetKernelArg(k_rank2, 0, sizeof(cl_mem), (void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank2, 1, sizeof(cl_mem), (void*)&m_key_array);
    clu_CheckError(ecode, "clSetKernelArg() for rank2");

    k_rank3_0 = clCreateKernel(program, "rank3_0", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3_0");
    ecode  = clSetKernelArg(k_rank3_0, 0, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3_0, 1, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3_0, 2, sizeof(cl_mem),(void*)&m_sum);
    ecode |= clSetKernelArg(k_rank3_0, 3, 
                            sizeof(INT_TYPE) * work_item_sizes[0] * 2,
                            NULL);
    clu_CheckError(ecode, "clSetKernelArg() for rank3_0");

    k_rank3_1 = clCreateKernel(program, "rank3_1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3_1");
    ecode  = clSetKernelArg(k_rank3_1, 0, sizeof(cl_mem), (void*)&m_sum);
    ecode  = clSetKernelArg(k_rank3_1, 1, sizeof(cl_mem), (void*)&m_sum);
    ecode |= clSetKernelArg(k_rank3_1, 2, 
                            sizeof(INT_TYPE) * work_item_sizes[0] * 2,
                            NULL);
    clu_CheckError(ecode, "clSetKernelArg() for rank3_1");

    k_rank3_2 = clCreateKernel(program, "rank3_2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3_2");
    ecode  = clSetKernelArg(k_rank3_2, 0, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode  = clSetKernelArg(k_rank3_2, 1, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3_2, 2, sizeof(cl_mem),(void*)&m_sum);
    clu_CheckError(ecode, "clSetKernelArg() for rank3_2");
  } else {
    k_rank1 = clCreateKernel(program, "rank1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank1");
    ecode  = clSetKernelArg(k_rank1, 0, sizeof(cl_mem),(void*)&m_key_array);
    ecode |= clSetKernelArg(k_rank1, 1, sizeof(cl_mem),(void*)&m_bucket_size);
    clu_CheckError(ecode, "clSetKernelArg() for rank1");

    k_rank2 = clCreateKernel(program, "rank2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank2");
    ecode  = clSetKernelArg(k_rank2, 0, sizeof(cl_mem),(void*)&m_key_array);
    ecode |= clSetKernelArg(k_rank2, 1, sizeof(cl_mem),(void*)&m_bucket_size);
    ecode |= clSetKernelArg(k_rank2, 2, sizeof(cl_mem),(void*)&m_bucket_ptrs);
    ecode |= clSetKernelArg(k_rank2, 3, sizeof(cl_mem),(void*)&m_key_buff2);
    clu_CheckError(ecode, "clSetKernelArg() for rank2");

    k_rank3 = clCreateKernel(program, "rank3", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3");
    ecode  = clSetKernelArg(k_rank3, 0, sizeof(cl_mem),(void*)&m_bucket_size);
    ecode |= clSetKernelArg(k_rank3, 1, sizeof(cl_mem),(void*)&m_bucket_ptrs);
    ecode |= clSetKernelArg(k_rank3, 2, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3, 3, sizeof(cl_mem),(void*)&m_key_buff2);
    clu_CheckError(ecode, "clSetKernelArg() for rank3");
  }

  k_rank4 = clCreateKernel(program, "rank4", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rank4");
  ecode  = clSetKernelArg(k_rank4, 0, sizeof(cl_mem), (void*)&m_partial_vals);
  ecode |= clSetKernelArg(k_rank4, 1, sizeof(cl_mem), (void*)&m_key_buff1);
  ecode |= clSetKernelArg(k_rank4, 2, sizeof(cl_mem), (void*)&m_rank_array);
  ecode |= clSetKernelArg(k_rank4, 3, sizeof(cl_mem),
                                      (void*)&m_passed_verification);
  clu_CheckError(ecode, "clSetKernelArg() for rank4");
  DTIMER_STOP(T_OPENCL_API);
}
Пример #11
0
int main( int argc, char **argv )
{

  int             i, iteration;

  double          timecounter;

  FILE            *fp;

  cl_int ecode;

  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
    exit(-1);
  }

  /*  Initialize timers  */
  timer_on = 0;            
  if ((fp = fopen("timer.flag", "r")) != NULL) {
    fclose(fp);
    timer_on = 1;
  }
  timer_clear( 0 );
  if (timer_on) {
    timer_clear( 1 );
    timer_clear( 2 );
    timer_clear( 3 );
  }

  if (timer_on) timer_start( 3 );

  /*  Initialize the verification arrays if a valid class */
  for( i=0; i<TEST_ARRAY_SIZE; i++ )
    switch( CLASS )
    {
      case 'S':
        test_index_array[i] = S_test_index_array[i];
        test_rank_array[i]  = S_test_rank_array[i];
        break;
      case 'A':
        test_index_array[i] = A_test_index_array[i];
        test_rank_array[i]  = A_test_rank_array[i];
        break;
      case 'W':
        test_index_array[i] = W_test_index_array[i];
        test_rank_array[i]  = W_test_rank_array[i];
        break;
      case 'B':
        test_index_array[i] = B_test_index_array[i];
        test_rank_array[i]  = B_test_rank_array[i];
        break;
      case 'C':
        test_index_array[i] = C_test_index_array[i];
        test_rank_array[i]  = C_test_rank_array[i];
        break;
      case 'D':
        test_index_array[i] = D_test_index_array[i];
        test_rank_array[i]  = D_test_rank_array[i];
        break;
    };

  /* set up the OpenCL environment. */
  setup_opencl(argc, argv);

  /*  Printout initial NPB info */
  printf( "\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - IS Benchmark\n\n" );
  printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS, CLASS );
  printf( " Iterations:   %d\n", MAX_ITERATIONS );

  if (timer_on) timer_start( 1 );

  /*  Generate random number sequence and subsequent keys on all procs */
  create_seq( 314159265.00,                    /* Random number gen seed */
              1220703125.00 );                 /* Random number gen mult */
  if (timer_on) timer_stop( 1 );

  /*  Do one interation for free (i.e., untimed) to guarantee initialization of  
      all data and code pages and respective tables */
  rank( 1 );  

  /*  Start verification counter */
  passed_verification = 0;

  DTIMER_START(T_BUFFER_WRITE);
  ecode = clEnqueueWriteBuffer(cmd_queue,
                               m_passed_verification,
                               CL_TRUE,
                               0,
                               sizeof(cl_int),
                               &passed_verification,
                               0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueWriteBuffer() for m_passed_verification");
  DTIMER_STOP(T_BUFFER_WRITE);

  if( CLASS != 'S' ) printf( "\n   iteration\n" );

  /*  Start timer  */             
  timer_start( 0 );


  /*  This is the main iteration */
  for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
  {
    if( CLASS != 'S' ) printf( "        %d\n", iteration );
    rank( iteration );
  }

  DTIMER_START(T_BUFFER_READ);
  ecode = clEnqueueReadBuffer(cmd_queue,
                              m_passed_verification,
                              CL_TRUE,
                              0,
                              sizeof(cl_int),
                              &passed_verification,
                              0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueReadBuffer() for m_passed_verification");
  DTIMER_STOP(T_BUFFER_READ);

  /*  End of timing, obtain maximum time of all processors */
  timer_stop( 0 );
  timecounter = timer_read( 0 );


  /*  This tests that keys are in sequence: sorting of last ranked key seq
      occurs here, but is an untimed operation                             */
  if (timer_on) timer_start( 2 );
  full_verify();
  if (timer_on) timer_stop( 2 );

  if (timer_on) timer_stop( 3 );


  /*  The final printout  */
  if( passed_verification != 5*MAX_ITERATIONS + 1 )
    passed_verification = 0;
  c_print_results( "IS",
                   CLASS,
                   (int)(TOTAL_KEYS/64),
                   64,
                   0,
                   MAX_ITERATIONS,
                   timecounter,
                   ((double) (MAX_ITERATIONS*TOTAL_KEYS))
                              /timecounter/1000000.,
                   "keys ranked", 
                   passed_verification,
                   NPBVERSION,
                   COMPILETIME,
                   CC,
                   CLINK,
                   C_LIB,
                   C_INC,
                   CFLAGS,
                   CLINKFLAGS,
                   "",
                   clu_GetDeviceTypeName(device_type),
                   device_name);

  /*  Print additional timers  */
  if (timer_on) {
    double t_total, t_percent;

    t_total = timer_read( 3 );
    printf("\nAdditional timers -\n");
    printf(" Total execution: %8.3f\n", t_total);
    if (t_total == 0.0) t_total = 1.0;
    timecounter = timer_read(1);
    t_percent = timecounter/t_total * 100.;
    printf(" Initialization : %8.3f (%5.2f%%)\n", timecounter, t_percent);
    timecounter = timer_read(0);
    t_percent = timecounter/t_total * 100.;
    printf(" Benchmarking   : %8.3f (%5.2f%%)\n", timecounter, t_percent);
    timecounter = timer_read(2);
    t_percent = timecounter/t_total * 100.;
    printf(" Sorting        : %8.3f (%5.2f%%)\n", timecounter, t_percent);
  }

  release_opencl();
  
  fflush(stdout);

  return 0;
  /**************************/
} /*  E N D  P R O G R A M  */
Пример #12
0
void rank( int iteration )
{
  size_t r1_lws[1], r1_gws[1];
  size_t r2_lws[1], r2_gws[1];
  size_t r3_lws[1], r3_gws[1];
  cl_int ecode;

  DTIMER_START(T_KERNEL_RANK0);
  // rank0
  ecode = clSetKernelArg(k_rank0, 3, sizeof(cl_int), (void*)&iteration);
  clu_CheckError(ecode, "clSetKernelArg() for rank0: iteration");

  ecode = clEnqueueTask(cmd_queue, k_rank0, 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueTask() for rank0");
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_RANK0);

  DTIMER_START(T_KERNEL_RANK1);
  // rank1
  r1_lws[0] = RANK1_GROUP_SIZE;
  r1_gws[0] = RANK1_GLOBAL_SIZE;
  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_rank1,
                                 1, NULL,
                                 r1_gws, 
                                 r1_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank1");
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_RANK1);

  DTIMER_START(T_KERNEL_RANK2);
  // rank2
  r2_lws[0] = RANK2_GROUP_SIZE;
  r2_gws[0] = RANK2_GLOBAL_SIZE;
  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_rank2,
                                 1, NULL,
                                 r2_gws, 
                                 r2_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank2");
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_RANK2);

  DTIMER_START(T_KERNEL_RANK3);
  // rank3
  if (device_type == CL_DEVICE_TYPE_GPU) {
    r3_lws[0] = work_item_sizes[0];
    r3_gws[0] = work_item_sizes[0] * work_item_sizes[0];
    if (r3_gws[0] > MAX_KEY) r3_gws[0] = MAX_KEY;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_rank3_0,
                                   1, NULL,
                                   r3_gws, 
                                   r3_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3_0");

    r3_lws[0] = work_item_sizes[0];
    r3_gws[0] = work_item_sizes[0];
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_rank3_1,
                                   1, NULL,
                                   r3_gws, 
                                   r3_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3_1");

    r3_lws[0] = work_item_sizes[0];
    r3_gws[0] = work_item_sizes[0] * work_item_sizes[0];
    if (r3_gws[0] > MAX_KEY) r3_gws[0] = MAX_KEY;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_rank3_2,
                                   1, NULL,
                                   r3_gws, 
                                   r3_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3_2");
  } else {
    r3_lws[0] = RANK_GROUP_SIZE;
    r3_gws[0] = RANK_GLOBAL_SIZE;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_rank3,
                                   1, NULL,
                                   r3_gws, 
                                   r3_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3");
  }
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_RANK3);

  // rank4 - partial verification
  DTIMER_START(T_KERNEL_RANK4);
  ecode = clSetKernelArg(k_rank4, 4, sizeof(cl_int), (void*)&iteration);
  clu_CheckError(ecode, "clSetKernelArg() for rank4");

  ecode = clEnqueueTask(cmd_queue, k_rank4, 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueTask() for rank4");

  ecode = clFinish(cmd_queue);
  clu_CheckError(ecode, "clFinish");
  DTIMER_STOP(T_KERNEL_RANK4);
}
Пример #13
0
void full_verify( void )
{
  cl_kernel k_fv1, k_fv2;
  cl_mem    m_j;
  INT_TYPE *g_j;
  INT_TYPE j = 0, i;
  size_t j_size;
  size_t fv1_lws[1], fv1_gws[1];
  size_t fv2_lws[1], fv2_gws[1];
  cl_int ecode;

  DTIMER_START(T_BUFFER_CREATE);
  // Create buffers
  j_size = sizeof(INT_TYPE) * (FV2_GLOBAL_SIZE / FV2_GROUP_SIZE);
  m_j = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       j_size,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer for m_j");
  DTIMER_STOP(T_BUFFER_CREATE);

  DTIMER_START(T_OPENCL_API);
  // Create kernels
  k_fv1 = clCreateKernel(program, "full_verify1", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for full_verify1");

  k_fv2 = clCreateKernel(program, "full_verify2", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for full_verify2");
  DTIMER_STOP(T_OPENCL_API);

  if (device_type == CL_DEVICE_TYPE_GPU) {
    cl_kernel k_fv0;
    size_t fv0_lws[1], fv0_gws[1];

    DTIMER_START(T_OPENCL_API);
    // Create kernels
    k_fv0 = clCreateKernel(program, "full_verify0", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for full_verify0");
    DTIMER_STOP(T_OPENCL_API);

    // Kernel execution
    DTIMER_START(T_KERNEL_FV0);
    ecode  = clSetKernelArg(k_fv0, 0, sizeof(cl_mem), (void*)&m_key_array);
    ecode |= clSetKernelArg(k_fv0, 1, sizeof(cl_mem), (void*)&m_key_buff2);
    clu_CheckError(ecode, "clSetKernelArg() for full_verify0");

    fv0_lws[0] = work_item_sizes[0];
    fv0_gws[0] = NUM_KEYS;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_fv0,
                                   1, NULL,
                                   fv0_gws, 
                                   fv0_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify0");
    CHECK_FINISH();
    DTIMER_STOP(T_KERNEL_FV0);

    DTIMER_START(T_KERNEL_FV1);
    ecode  = clSetKernelArg(k_fv1, 0, sizeof(cl_mem), (void*)&m_key_buff2);
    ecode |= clSetKernelArg(k_fv1, 1, sizeof(cl_mem), (void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_fv1, 2, sizeof(cl_mem), (void*)&m_key_array);
    clu_CheckError(ecode, "clSetKernelArg() for full_verify1");

    fv1_lws[0] = work_item_sizes[0];
    fv1_gws[0] = NUM_KEYS;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_fv1,
                                   1, NULL,
                                   fv1_gws, 
                                   fv1_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify1");
    CHECK_FINISH();
    DTIMER_STOP(T_KERNEL_FV1);

    DTIMER_START(T_KERNEL_FV2);
    ecode  = clSetKernelArg(k_fv2, 0, sizeof(cl_mem), (void*)&m_key_array);
    ecode |= clSetKernelArg(k_fv2, 1, sizeof(cl_mem), (void*)&m_j);
    ecode |= clSetKernelArg(k_fv2, 2, sizeof(INT_TYPE)*FV2_GROUP_SIZE, NULL);
    clu_CheckError(ecode, "clSetKernelArg() for full_verify2");

    fv2_lws[0] = FV2_GROUP_SIZE;
    fv2_gws[0] = FV2_GLOBAL_SIZE;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_fv2,
                                   1, NULL,
                                   fv2_gws, 
                                   fv2_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify2");
    CHECK_FINISH();
    DTIMER_STOP(T_KERNEL_FV2);

    g_j = (INT_TYPE *)malloc(j_size);

    DTIMER_START(T_BUFFER_READ);
    ecode = clEnqueueReadBuffer(cmd_queue,
                                m_j,
                                CL_TRUE,
                                0,
                                j_size,
                                g_j,
                                0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueReadBuffer() for m_j");
    DTIMER_STOP(T_BUFFER_READ);

    // reduction
    for (i = 0; i < j_size/sizeof(INT_TYPE); i++) {
      j += g_j[i];
    }

    DTIMER_START(T_RELEASE);
    clReleaseKernel(k_fv0);
    DTIMER_STOP(T_RELEASE);

  } else {

    // Kernel execution
    DTIMER_START(T_KERNEL_FV1);
    ecode  = clSetKernelArg(k_fv1, 0, sizeof(cl_mem), (void*)&m_bucket_ptrs);
    ecode |= clSetKernelArg(k_fv1, 1, sizeof(cl_mem), (void*)&m_key_buff2);
    ecode |= clSetKernelArg(k_fv1, 2, sizeof(cl_mem), (void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_fv1, 3, sizeof(cl_mem), (void*)&m_key_array);
    clu_CheckError(ecode, "clSetKernelArg() for full_verify1");

    fv1_lws[0] = RANK_GROUP_SIZE;
    fv1_gws[0] = RANK_GLOBAL_SIZE;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_fv1,
                                   1, NULL,
                                   fv1_gws, 
                                   fv1_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify1");
    CHECK_FINISH();
    DTIMER_STOP(T_KERNEL_FV1);

    DTIMER_START(T_KERNEL_FV2);
    ecode  = clSetKernelArg(k_fv2, 0, sizeof(cl_mem), (void*)&m_key_array);
    ecode |= clSetKernelArg(k_fv2, 1, sizeof(cl_mem), (void*)&m_j);
    ecode |= clSetKernelArg(k_fv2, 2, sizeof(INT_TYPE)*FV2_GROUP_SIZE, NULL);
    clu_CheckError(ecode, "clSetKernelArg() for full_verify2");

    fv2_lws[0] = FV2_GROUP_SIZE;
    fv2_gws[0] = FV2_GLOBAL_SIZE;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_fv2,
                                   1, NULL,
                                   fv2_gws, 
                                   fv2_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify2");
    CHECK_FINISH();
    DTIMER_STOP(T_KERNEL_FV2);

    g_j = (INT_TYPE *)malloc(j_size);

    DTIMER_START(T_BUFFER_READ);
    ecode = clEnqueueReadBuffer(cmd_queue,
                                m_j,
                                CL_TRUE,
                                0,
                                j_size,
                                g_j,
                                0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueReadBuffer() for m_j");
    DTIMER_STOP(T_BUFFER_READ);

    // reduction
    for (i = 0; i < j_size/sizeof(INT_TYPE); i++) {
      j += g_j[i];
    }

  }

  DTIMER_START(T_RELEASE);
  free(g_j);
  clReleaseMemObject(m_j);
  clReleaseKernel(k_fv1);
  clReleaseKernel(k_fv2);
  DTIMER_STOP(T_RELEASE);

  if (j != 0)
    printf( "Full_verify: number of keys out of sort: %ld\n", (long)j );
  else
    passed_verification++;
}
Пример #14
0
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//---------------------------------------------------------------------
void setup_opencl(int argc, char *argv[])
{
  cl_int err_code;
  char *source_dir = "EP";
  if (argc > 1) source_dir = argv[1];

#ifdef TIMER_DETAIL
  if (timers_enabled) {
    int i;
    for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i);
  }
#endif

  DTIMER_START(T_OPENCL_API);

  // 1. Find the default device type and get a device for the device type
  device_type = clu_GetDefaultDeviceType();
  device      = clu_GetAvailableDevice(device_type);
  device_name = clu_GetDeviceName(device);

  // 2. Create a context for the specified device
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &err_code);
  clu_CheckError(err_code, "clCreateContext()");

  // 3. Create a command queue
  cmd_queue = clCreateCommandQueue(context, device, 0, &err_code);
  clu_CheckError(err_code, "clCreateCommandQueue()");

  DTIMER_STOP(T_OPENCL_API);

  // 4. Build the program
  DTIMER_START(T_BUILD);
  char *source_file;
  char build_option[30];
  sprintf(build_option, "-DM=%d -I.", M);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    source_file = "ep_cpu.cl";
    GROUP_SIZE = 16;
  } else {
    source_file = "ep_gpu.cl";
    GROUP_SIZE = 64;
  }
  program = clu_MakeProgram(context, device, source_dir, source_file,
                            build_option);
  DTIMER_STOP(T_BUILD);

  // 5. Create buffers
  DTIMER_START(T_BUFFER_CREATE);

  gq_size  = np / GROUP_SIZE * NQ * sizeof(double);
  gsx_size = np / GROUP_SIZE * sizeof(double);
  gsy_size = np / GROUP_SIZE * sizeof(double);

  pgq = clCreateBuffer(context, CL_MEM_READ_WRITE, gq_size, NULL, &err_code);
  clu_CheckError(err_code, "clCreateBuffer() for pgq");

  pgsx = clCreateBuffer(context, CL_MEM_READ_WRITE, gsx_size,NULL, &err_code);
  clu_CheckError(err_code, "clCreateBuffer() for pgsx");

  pgsy = clCreateBuffer(context, CL_MEM_READ_WRITE, gsy_size,NULL, &err_code);
  clu_CheckError(err_code, "clCreateBuffer() for pgsy");

  DTIMER_STOP(T_BUFFER_CREATE);

  // 6. Create a kernel
  DTIMER_START(T_OPENCL_API);
  kernel = clCreateKernel(program, "embar", &err_code);
  clu_CheckError(err_code, "clCreateKernel()");
  DTIMER_STOP(T_OPENCL_API);
}
Пример #15
0
int main(int argc, char *argv[]) 
{
  double Mops, t1, t2;
  double tsx, tsy, tm, an, tt, gc;
  double sx_verify_value, sy_verify_value, sx_err, sy_err;
  int    i, nit;
  int    k_offset, j;
  logical verified;

  char   size[16];

  FILE *fp;

  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
    exit(-1);
  }

  if ((fp = fopen("timer.flag", "r")) == NULL) {
    timers_enabled = false;
  } else {
    timers_enabled = true;
    fclose(fp);
  }

  //--------------------------------------------------------------------
  //  Because the size of the problem is too large to store in a 32-bit
  //  integer for some classes, we put it into a string (for printing).
  //  Have to strip off the decimal point put in there by the floating
  //  point print statement (internal file)
  //--------------------------------------------------------------------

  sprintf(size, "%15.0lf", pow(2.0, M+1));
  j = 14;
  if (size[j] == '.') j--;
  size[j+1] = '\0';
  printf("\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - EP Benchmark\n");
  printf("\n Number of random numbers generated: %15s\n", size);

  verified = false;

  //--------------------------------------------------------------------
  //  Compute the number of "batches" of random number pairs generated 
  //  per processor. Adjust if the number of processors does not evenly 
  //  divide the total number
  //--------------------------------------------------------------------

  np = NN; 

  setup_opencl(argc, argv);

  timer_clear(0);
  timer_start(0);

  //--------------------------------------------------------------------
  //  Compute AN = A ^ (2 * NK) (mod 2^46).
  //--------------------------------------------------------------------

  t1 = A;

  for (i = 0; i < MK + 1; i++) {
    t2 = randlc(&t1, t1);
  }

  an = t1;
  tt = S;

  //--------------------------------------------------------------------
  //  Each instance of this loop may be performed independently. We compute
  //  the k offsets separately to take into account the fact that some nodes
  //  have more numbers to generate than others
  //--------------------------------------------------------------------

  k_offset = -1;

  DTIMER_START(T_KERNEL_EMBAR);

  // Launch the kernel
  int q_size  = GROUP_SIZE * NQ * sizeof(cl_double);
  int sx_size = GROUP_SIZE * sizeof(cl_double);
  int sy_size = GROUP_SIZE * sizeof(cl_double);
  err_code  = clSetKernelArg(kernel, 0, q_size, NULL);
  err_code |= clSetKernelArg(kernel, 1, sx_size, NULL);
  err_code |= clSetKernelArg(kernel, 2, sy_size, NULL);
  err_code |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&pgq);
  err_code |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&pgsx);
  err_code |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&pgsy);
  err_code |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&k_offset);
  err_code |= clSetKernelArg(kernel, 7, sizeof(cl_double), (void*)&an);
  clu_CheckError(err_code, "clSetKernelArg()");
  
  size_t localWorkSize[] = { GROUP_SIZE };
  size_t globalWorkSize[] = { np };
  err_code = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL,
                                    globalWorkSize, 
                                    localWorkSize,
                                    0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueNDRangeKernel()");
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_EMBAR);

  double (*gq)[NQ] = (double (*)[NQ])malloc(gq_size);
  double *gsx = (double*)malloc(gsx_size);
  double *gsy = (double*)malloc(gsy_size);

  gc  = 0.0;
  tsx = 0.0;
  tsy = 0.0;

  for (i = 0; i < NQ; i++) {
    q[i] = 0.0;
  }

  // 9. Get the result
  DTIMER_START(T_BUFFER_READ);
  err_code = clEnqueueReadBuffer(cmd_queue, pgq, CL_FALSE, 0, gq_size, 
                                 gq, 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadbuffer()");

  err_code = clEnqueueReadBuffer(cmd_queue, pgsx, CL_FALSE, 0, gsx_size, 
                                 gsx, 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadbuffer()");

  err_code = clEnqueueReadBuffer(cmd_queue, pgsy, CL_TRUE, 0, gsy_size, 
                                 gsy, 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadbuffer()");
  DTIMER_STOP(T_BUFFER_READ);

  for (i = 0; i < np/localWorkSize[0]; i++) {
    for (j = 0; j < NQ; j++ ){
      q[j] = q[j] + gq[i][j];
    }
    tsx = tsx + gsx[i];
    tsy = tsy + gsy[i];
  }

  for (i = 0; i < NQ; i++) {
    gc = gc + q[i];
  }

  timer_stop(0);
  tm = timer_read(0);

  nit = 0;
  verified = true;
  if (M == 24) {
    sx_verify_value = -3.247834652034740e+3;
    sy_verify_value = -6.958407078382297e+3;
  } else if (M == 25) {
    sx_verify_value = -2.863319731645753e+3;
    sy_verify_value = -6.320053679109499e+3;
  } else if (M == 28) {
    sx_verify_value = -4.295875165629892e+3;
    sy_verify_value = -1.580732573678431e+4;
  } else if (M == 30) {
    sx_verify_value =  4.033815542441498e+4;
    sy_verify_value = -2.660669192809235e+4;
  } else if (M == 32) {
    sx_verify_value =  4.764367927995374e+4;
    sy_verify_value = -8.084072988043731e+4;
  } else if (M == 36) {
    sx_verify_value =  1.982481200946593e+5;
    sy_verify_value = -1.020596636361769e+5;
  } else if (M == 40) {
    sx_verify_value = -5.319717441530e+05;
    sy_verify_value = -3.688834557731e+05;
  } else {
    verified = false;
  }

  if (verified) {
    sx_err = fabs((tsx - sx_verify_value) / sx_verify_value);
    sy_err = fabs((tsy - sy_verify_value) / sy_verify_value);
    verified = ((sx_err <= EPSILON) && (sy_err <= EPSILON));
  }

  Mops = pow(2.0, M+1) / tm / 1000000.0;

  printf("\nEP Benchmark Results:\n\n");
  printf("CPU Time =%10.4lf\n", tm);
  printf("N = 2^%5d\n", M);
  printf("No. Gaussian Pairs = %15.0lf\n", gc);
  printf("Sums = %25.15lE %25.15lE\n", tsx, tsy);
  printf("Counts: \n");
  for (i = 0; i < NQ; i++) {
    printf("%3d%15.0lf\n", i, q[i]);
  }

  c_print_results("EP", CLASS, M+1, 0, 0, nit,
      tm, Mops, 
      "Random numbers generated",
      verified, NPBVERSION, COMPILETIME, 
      CS1, CS2, CS3, CS4, CS5, CS6, CS7,
      clu_GetDeviceTypeName(device_type), device_name);

  if (timers_enabled) {
    if (tm <= 0.0) tm = 1.0;
    tt = timer_read(0);
    printf("\nTotal time:     %9.3lf (%6.2lf)\n", tt, tt*100.0/tm);
  }

  free(gq);
  free(gsx);
  free(gsy);
  release_opencl();

  fflush(stdout);

  return 0;
}