Beispiel #1
0
//---------------------------------------------------------------------
// set the boundary values of dependent variables
//---------------------------------------------------------------------
void setbv()
{
  DTIMER_START(t_setbv);

  cl_int ecode;

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_setbv1,
                                 SETBV1_DIM, NULL,
                                 setbv1_gws,
                                 setbv1_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_setbv2,
                                 SETBV2_DIM, NULL,
                                 setbv2_gws,
                                 setbv2_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_setbv3,
                                 SETBV3_DIM, NULL,
                                 setbv3_gws,
                                 setbv3_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  CHECK_FINISH();

  DTIMER_STOP(t_setbv);
}
Beispiel #2
0
void lhsinit()
{
  int i;
  size_t d0_size, d1_size, d2_size;
  size_t local_ws[3], global_ws[3], temp;

  cl_kernel *k_lhsinit;
  cl_int ecode;

  k_lhsinit = (cl_kernel *)malloc(sizeof(cl_kernel) * num_devices);

  for (i = 0; i < num_devices; i++) {
    d0_size = max_cell_size[i][1];
    d1_size = max_cell_size[i][2];
    d2_size = ncells;

    local_ws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0];
    temp = max_work_group_size / local_ws[0];
    local_ws[1] = d1_size < temp ? d1_size : temp;
    temp = temp / local_ws[1];
    local_ws[2] = d2_size < temp ? d2_size : temp;

    global_ws[0] = clu_RoundWorkSize(d0_size, local_ws[0]);
    global_ws[1] = clu_RoundWorkSize(d1_size, local_ws[1]);
    global_ws[2] = clu_RoundWorkSize(d2_size, local_ws[2]);

    k_lhsinit[i] = clCreateKernel(p_initialize[i], "lhsinit", &ecode);
    clu_CheckError(ecode, "clCreateKernel()");
    
    ecode  = clSetKernelArg(k_lhsinit[i], 0, sizeof(cl_mem), &m_lhsc[i]);
    ecode |= clSetKernelArg(k_lhsinit[i], 1, sizeof(cl_mem), &m_start[i]);
    ecode |= clSetKernelArg(k_lhsinit[i], 2, sizeof(cl_mem), &m_end[i]);
    ecode |= clSetKernelArg(k_lhsinit[i], 3, sizeof(cl_mem),&m_cell_coord[i]);
    ecode |= clSetKernelArg(k_lhsinit[i], 4, sizeof(cl_mem), &m_cell_size[i]);
    ecode |= clSetKernelArg(k_lhsinit[i], 5, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");
    
    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_lhsinit[i],
                                   3, NULL,
                                   global_ws,
                                   local_ws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }

  for (i = 0; i < num_devices; i++) {
    ecode = clFinish(cmd_queue[i]);
    clu_CheckError(ecode, "clFinish()");
  }

  for (i = 0; i < num_devices; i++) {
    clReleaseKernel(k_lhsinit[i]);
  }
  free(k_lhsinit);
}
Beispiel #3
0
static void cffts3(int is, int d1, int d2, int d3, cl_mem *x, cl_mem *xout)
{
  int logd3 = ilog2(d3);
  size_t local_ws[2], global_ws[2], temp;
  cl_int ecode;

  if (timers_enabled) timer_start(T_fftz);

  ecode  = clSetKernelArg(k_cffts3, 0, sizeof(cl_mem), x);
  ecode |= clSetKernelArg(k_cffts3, 1, sizeof(cl_mem), xout);
  ecode |= clSetKernelArg(k_cffts3, 3, sizeof(int), &is);
  ecode |= clSetKernelArg(k_cffts3, 4, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_cffts3, 5, sizeof(int), &d2);
  ecode |= clSetKernelArg(k_cffts3, 6, sizeof(int), &d3);
  ecode |= clSetKernelArg(k_cffts3, 7, sizeof(int), &logd3);
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts3");

  if (device_type == CL_DEVICE_TYPE_CPU) {
    local_ws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0];
    temp = max_work_group_size / local_ws[0];
    local_ws[1] = d2 < temp ? d2 : temp;

    global_ws[0] = clu_RoundWorkSize((size_t)d1, local_ws[0]);
    global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]);
  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    if (CFFTS_DIM == 2) {
      local_ws[0] = CFFTS_LSIZE;
      local_ws[1] = 1;
      global_ws[0] = d1 * local_ws[0];
      global_ws[1] = d2;
    } else {
      local_ws[0] = CFFTS_LSIZE;
      global_ws[0] = d2 * local_ws[0];
    }
  }

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_cffts3,
                                 CFFTS_DIM, NULL,
                                 global_ws,
                                 local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel() for cffts3");
  CHECK_FINISH();

  if (timers_enabled) timer_stop(T_fftz);
}
Beispiel #4
0
//---------------------------------------------------------------------
// evolve u0 -> u1 (t time steps) in fourier space
//---------------------------------------------------------------------
static void evolve(cl_mem *u0, cl_mem *u1, cl_mem *twiddle,
                   int d1, int d2, int d3)
{
  cl_int ecode;
  size_t local_ws[3], global_ws[3];

  ecode  = clSetKernelArg(k_evolve, 0, sizeof(cl_mem), u0);
  ecode |= clSetKernelArg(k_evolve, 1, sizeof(cl_mem), u1);
  ecode |= clSetKernelArg(k_evolve, 2, sizeof(cl_mem), twiddle);
  ecode |= clSetKernelArg(k_evolve, 3, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_evolve, 4, sizeof(int), &d2);
  ecode |= clSetKernelArg(k_evolve, 5, sizeof(int), &d3);
  clu_CheckError(ecode, "clSetKernelArg() for evolve");

  if (EVOLVE_DIM == 3) {
    local_ws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0];
    int temp = max_work_group_size / local_ws[0];
    local_ws[1] = d2 < temp ? d2 : temp;
    temp = temp / local_ws[1];
    local_ws[2] = d3 < temp ? d3 : temp;

    global_ws[0] = clu_RoundWorkSize((size_t)d1, local_ws[0]);
    global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]);
    global_ws[2] = clu_RoundWorkSize((size_t)d3, local_ws[2]);
  } else if (EVOLVE_DIM == 2) {
    local_ws[0] = d2 < work_item_sizes[0] ? d2 : work_item_sizes[0];
    int temp = max_work_group_size / local_ws[0];
    local_ws[1] = d3 < temp ? d3 : temp;

    global_ws[0] = clu_RoundWorkSize((size_t)d2, local_ws[0]);
    global_ws[1] = clu_RoundWorkSize((size_t)d3, local_ws[1]);
  } else {
    int temp = d3 / max_compute_units;
    local_ws[0] = temp == 0 ? 1 : temp;
    global_ws[0] = clu_RoundWorkSize((size_t)d3, local_ws[0]);
  }

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_evolve,
                                 EVOLVE_DIM, NULL,
                                 global_ws,
                                 local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel() for evolve");
  CHECK_FINISH();
}
Beispiel #5
0
//---------------------------------------------------------------------
// compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2 
// for time evolution exponent. 
//---------------------------------------------------------------------
static void compute_indexmap(cl_mem *twiddle, int d1, int d2, int d3)
{
  cl_int ecode;
  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_compute_indexmap,
                                 COMPUTE_IMAP_DIM, NULL,
                                 cimap_gws,
                                 cimap_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel() for compute_indexmap");
  CHECK_FINISH();
}
Beispiel #6
0
void create_seq( double seed, double a )
{
  cl_kernel k_cs;
  cl_int    ecode;
  size_t cs_lws[1], cs_gws[1];

  DTIMER_START(T_OPENCL_API);
  // Create a kernel
  k_cs = clCreateKernel(program, "create_seq", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for create_seq");
  DTIMER_STOP(T_OPENCL_API);

  DTIMER_START(T_KERNEL_CREATE_SEQ);
  // Set kernel arguments
  ecode  = clSetKernelArg(k_cs, 0, sizeof(cl_mem), (void*)&m_key_array);
  ecode |= clSetKernelArg(k_cs, 1, sizeof(cl_double), (void*)&seed);
  ecode |= clSetKernelArg(k_cs, 2, sizeof(cl_double), (void*)&a);
  clu_CheckError(ecode, "clSetKernelArg() for create_seq");

  // Launch the kernel
  cs_lws[0] = CREATE_SEQ_GROUP_SIZE;
  cs_gws[0] = CREATE_SEQ_GLOBAL_SIZE;
  ecode = clEnqueueNDRangeKernel(cmd_queue, k_cs, 1, NULL,
                                 cs_gws, 
                                 cs_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel() for create_seq");

  ecode = clFinish(cmd_queue);
  clu_CheckError(ecode, "clFinish");
  DTIMER_STOP(T_KERNEL_CREATE_SEQ);

  DTIMER_START(T_RELEASE);
  clReleaseKernel(k_cs);
  DTIMER_STOP(T_RELEASE);
}
Beispiel #7
0
//---------------------------------------------------------------------
// touch all the big data
//---------------------------------------------------------------------
static void init_ui(cl_mem *u0, cl_mem *u1, cl_mem *twiddle,
                    int d1, int d2, int d3)
{
  cl_kernel k_init_ui;
  cl_int ecode;

  DTIMER_START(T_OPENCL_API);
  // Create a kernel
  k_init_ui = clCreateKernel(program, "init_ui", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for init_ui");
  DTIMER_STOP(T_OPENCL_API);

  int n = d3 * d2 * (d1+1);
  ecode  = clSetKernelArg(k_init_ui, 0, sizeof(cl_mem), (void*)u0);
  ecode |= clSetKernelArg(k_init_ui, 1, sizeof(cl_mem), (void*)u1);
  ecode |= clSetKernelArg(k_init_ui, 2, sizeof(cl_mem), (void*)twiddle);
  ecode |= clSetKernelArg(k_init_ui, 3, sizeof(int), (void*)&n);
  clu_CheckError(ecode, "clSetKernelArg() for init_ui");

  size_t local_ws = work_item_sizes[0];
  size_t global_ws = clu_RoundWorkSize((size_t)n, local_ws);
  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_init_ui,
                                 1, NULL,
                                 &global_ws,
                                 &local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel() for init_ui");

  ecode = clFinish(cmd_queue);
  clu_CheckError(ecode, "clFinish()");

  DTIMER_START(T_RELEASE);
  clReleaseKernel(k_init_ui);
  DTIMER_STOP(T_RELEASE);
}
Beispiel #8
0
static void checksum(int i, cl_mem *u1, int d1, int d2, int d3)
{
  dcomplex chk = dcmplx(0.0, 0.0);
  int k;
  cl_int ecode;

  ecode = clSetKernelArg(k_checksum, 0, sizeof(cl_mem), u1);
  clu_CheckError(ecode, "clSetKernelArg() for checksum");

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_checksum,
                                 1, NULL,
                                 &checksum_global_ws,
                                 &checksum_local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  CHECK_FINISH();

  ecode = clEnqueueReadBuffer(cmd_queue,
                              m_chk,
                              CL_TRUE,
                              0, checksum_wg_num * sizeof(dcomplex),
                              g_chk,
                              0, NULL, NULL);
  clu_CheckError(ecode, "clReadBuffer()");

  // reduction
  for (k = 0; k < checksum_wg_num; k++) {
    chk = dcmplx_add(chk, g_chk[k]);
  }

  chk = dcmplx_div2(chk, (double)(NTOTAL));

  printf(" T =%5d     Checksum =%22.12E%22.12E\n", i, chk.real, chk.imag);
  sums[i] = chk;
}
Beispiel #9
0
//---------------------------------------------------------------------
// addition of update to the vector u
//---------------------------------------------------------------------
void add()
{
  cl_int ecode;

  if (timeron) timer_start(t_add);

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_add,
                                 ADD_DIM, NULL,
                                 add_gws,
                                 add_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  CHECK_FINISH();

  if (timeron) timer_stop(t_add);
}
Beispiel #10
0
//---------------------------------------------------------------------
// block-diagonal matrix-vector multiplication              
//---------------------------------------------------------------------
void ninvr()
{
  cl_int ecode;

  if (timeron) timer_start(t_ninvr);

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_ninvr,
                                 NINVR_DIM, NULL,
                                 ninvr_gws,
                                 ninvr_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  CHECK_FINISH();

  if (timeron) timer_stop(t_ninvr);
}
Beispiel #11
0
//---------------------------------------------------------------------
// this function performs the solution of the approximate factorization
// step in the z-direction for all five matrix components
// simultaneously. The Thomas algorithm is employed to solve the
// systems for the z-lines. Boundary conditions are non-periodic
//---------------------------------------------------------------------
void z_solve()
{
  cl_int ecode;

  if (timeron) timer_start(t_zsolve);

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_z_solve,
                                 Z_SOLVE_DIM, NULL,
                                 z_solve_gws,
                                 z_solve_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  CHECK_FINISH();

  if (timeron) timer_stop(t_zsolve);

  tzetar();
}
Beispiel #12
0
//---------------------------------------------------------------------
// compute the roots-of-unity array that will be used for subsequent FFTs. 
//---------------------------------------------------------------------
static void fft_init(int n)
{
  int m, nu, ku, i, j, ln;
  double t, ti;

  //---------------------------------------------------------------------
  // Initialize the U array with sines and cosines in a manner that permits
  // stride one access at each FFT iteration.
  //---------------------------------------------------------------------
  nu = n;
  m = ilog2(n);
  u[0] = dcmplx(m, 0.0);
  ku = 2;
  ln = 1;

  for (j = 1; j <= m; j++) {
    t = PI / ln;

    for (i = 0; i <= ln - 1; i++) {
      ti = i * t;
      u[i+ku-1] = dcmplx(cos(ti), sin(ti));
    }

    ku = ku + ln;
    ln = 2 * ln;
  }

  int ecode;
  ecode = clEnqueueWriteBuffer(cmd_queue,
                               m_u,
                               CL_FALSE,
                               0, sizeof(dcomplex) * NXP,
                               u,
                               0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueWriteBuffer() for m_u");
}
Beispiel #13
0
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//---------------------------------------------------------------------
static void setup_opencl(int argc, char *argv[])
{
  size_t temp;
  cl_int ecode;
  char *source_dir = "FT";
  if (argc > 1) source_dir = argv[1];

#ifdef TIMER_DETAIL
  if (timers_enabled) {
    int i;
    for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i);
  }
#endif

  DTIMER_START(T_OPENCL_API);

  // 1. Find the default device type and get a device for the device type
  device_type = clu_GetDefaultDeviceType();
  device      = clu_GetAvailableDevice(device_type);
  device_name = clu_GetDeviceName(device);

  // Device information
  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_ITEM_SIZES,
                          sizeof(work_item_sizes),
                          &work_item_sizes,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_GROUP_SIZE,
                          sizeof(size_t),
                          &max_work_group_size,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  // FIXME: The below values are experimental.
  if (max_work_group_size > 64) {
    max_work_group_size = 64;
    int i;
    for (i = 0; i < 3; i++) {
      if (work_item_sizes[i] > 64) {
        work_item_sizes[i] = 64;
      }
    }
  }

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_COMPUTE_UNITS,
                          sizeof(cl_uint),
                          &max_compute_units,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  // 2. Create a context for the specified device
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode);
  clu_CheckError(ecode, "clCreateContext()");

  // 3. Create a command queue
  cmd_queue = clCreateCommandQueue(context, device, 0, &ecode);
  clu_CheckError(ecode, "clCreateCommandQueue()");

  DTIMER_STOP(T_OPENCL_API);

  // 4. Build the program
  DTIMER_START(T_BUILD);
  char *source_file;
  char build_option[50];
  if (device_type == CL_DEVICE_TYPE_CPU) {
    source_file = "ft_cpu.cl";
    sprintf(build_option, "-I. -DCLASS=%d -DUSE_CPU", CLASS);

    COMPUTE_IMAP_DIM = COMPUTE_IMAP_DIM_CPU;
    EVOLVE_DIM = EVOLVE_DIM_CPU;
    CFFTS_DIM = CFFTS_DIM_CPU;

  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    char vendor[50];
    ecode = clGetDeviceInfo(device, CL_DEVICE_VENDOR, 50, vendor, NULL);
    clu_CheckError(ecode, "clGetDeviceInfo()");
    if (strncmp(vendor, DEV_VENDOR_NVIDIA, strlen(DEV_VENDOR_NVIDIA)) == 0) {
      source_file = "ft_gpu_nvidia.cl";
      CFFTS_LSIZE = 32;
    } else {
      source_file = "ft_gpu.cl";
      CFFTS_LSIZE = 64;
    }

    sprintf(build_option, "-I. -DCLASS=\'%c\' -DLSIZE=%lu",
            CLASS, CFFTS_LSIZE);

    COMPUTE_IMAP_DIM = COMPUTE_IMAP_DIM_GPU;
    EVOLVE_DIM = EVOLVE_DIM_GPU;
    CFFTS_DIM = CFFTS_DIM_GPU;

  } else {
    fprintf(stderr, "Set the environment variable OPENCL_DEVICE_TYPE!\n");
    exit(EXIT_FAILURE);
  }
  program = clu_MakeProgram(context, device, source_dir, source_file,
                            build_option);
  DTIMER_STOP(T_BUILD);

  // 5. Create buffers
  DTIMER_START(T_BUFFER_CREATE);
  m_u = clCreateBuffer(context,
                       CL_MEM_READ_ONLY,
                       sizeof(dcomplex) * NXP,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u");

  m_u0 = clCreateBuffer(context,
                        CL_MEM_READ_WRITE,
                        sizeof(dcomplex) * NTOTALP,
                        NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u0");

  m_u1 = clCreateBuffer(context,
                        CL_MEM_READ_WRITE,
                        sizeof(dcomplex) * NTOTALP,
                        NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u1");

  m_twiddle = clCreateBuffer(context,
                             CL_MEM_READ_WRITE,
                             sizeof(double) * NTOTALP,
                             NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_twiddle");

  if (device_type == CL_DEVICE_TYPE_CPU) {
    size_t ty1_size, ty2_size;
    if (CFFTS_DIM == 2) {
      ty1_size = sizeof(dcomplex) * NX * NY * NZ;
      ty2_size = sizeof(dcomplex) * NX * NY * NZ;
    } else {
      fprintf(stderr, "Wrong CFFTS_DIM: %u\n", CFFTS_DIM);
      exit(EXIT_FAILURE);
    }

    m_ty1 = clCreateBuffer(context,
                           CL_MEM_READ_WRITE,
                           ty1_size,
                           NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_ty1");

    m_ty2 = clCreateBuffer(context,
                           CL_MEM_READ_WRITE,
                           ty2_size,
                           NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_ty2");
  }

  if (device_type == CL_DEVICE_TYPE_CPU) {
    temp = 1024 / max_compute_units;
    checksum_local_ws  = temp == 0 ? 1 : temp;
    checksum_global_ws = clu_RoundWorkSize((size_t)1024, checksum_local_ws);
  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    checksum_local_ws  = 32;
    checksum_global_ws = clu_RoundWorkSize((size_t)1024, checksum_local_ws);
  }
  checksum_wg_num = checksum_global_ws / checksum_local_ws;
  m_chk = clCreateBuffer(context,
                         CL_MEM_READ_WRITE,
                         sizeof(dcomplex) * checksum_wg_num,
                         NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_chk");
  g_chk = (dcomplex *)malloc(sizeof(dcomplex) * checksum_wg_num);
  DTIMER_STOP(T_BUFFER_CREATE);

  // 6. Create kernels
  DTIMER_START(T_OPENCL_API);
  double ap = -4.0 * ALPHA * PI * PI;
  int d1 = dims[0];
  int d2 = dims[1];
  int d3 = dims[2];

  k_compute_indexmap = clCreateKernel(program, "compute_indexmap", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for compute_indexmap");
  ecode  = clSetKernelArg(k_compute_indexmap, 0, sizeof(cl_mem), &m_twiddle);
  ecode |= clSetKernelArg(k_compute_indexmap, 1, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_compute_indexmap, 2, sizeof(int), &d2);
  ecode |= clSetKernelArg(k_compute_indexmap, 3, sizeof(int), &d3);
  ecode |= clSetKernelArg(k_compute_indexmap, 4, sizeof(double), &ap);
  clu_CheckError(ecode, "clSetKernelArg() for compute_indexmap");
  if (COMPUTE_IMAP_DIM == 3) {
    cimap_lws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0];
    temp = max_work_group_size / cimap_lws[0];
    cimap_lws[1] = d2 < temp ? d2 : temp;
    temp = temp / cimap_lws[1];
    cimap_lws[2] = d3 < temp ? d3 : temp;

    cimap_gws[0] = clu_RoundWorkSize((size_t)d1, cimap_lws[0]);
    cimap_gws[1] = clu_RoundWorkSize((size_t)d2, cimap_lws[1]);
    cimap_gws[2] = clu_RoundWorkSize((size_t)d3, cimap_lws[2]);
  } else if (COMPUTE_IMAP_DIM == 2) {
    cimap_lws[0] = d2 < work_item_sizes[0] ? d2 : work_item_sizes[0];
    temp = max_work_group_size / cimap_lws[0];
    cimap_lws[1] = d3 < temp ? d3 : temp;

    cimap_gws[0] = clu_RoundWorkSize((size_t)d2, cimap_lws[0]);
    cimap_gws[1] = clu_RoundWorkSize((size_t)d3, cimap_lws[1]);
  } else {
    //temp = d3 / max_compute_units;
    temp = 1;
    cimap_lws[0] = temp == 0 ? 1 : temp;
    cimap_gws[0] = clu_RoundWorkSize((size_t)d3, cimap_lws[0]);
  }

  k_compute_ics = clCreateKernel(program,
                                 "compute_initial_conditions", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for compute_initial_conditions");
  ecode  = clSetKernelArg(k_compute_ics, 2, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_compute_ics, 3, sizeof(int), &d2);
  ecode |= clSetKernelArg(k_compute_ics, 4, sizeof(int), &d3);
  clu_CheckError(ecode, "clSetKernelArg() for compute_initial_conditions");

  k_cffts1 = clCreateKernel(program, "cffts1", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for cffts1");
  ecode  = clSetKernelArg(k_cffts1, 2, sizeof(cl_mem), &m_u);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_cffts1, 8, sizeof(cl_mem), &m_ty1);
    ecode |= clSetKernelArg(k_cffts1, 9, sizeof(cl_mem), &m_ty2);
  }
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts1");

  k_cffts2 = clCreateKernel(program, "cffts2", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for cffts2");
  ecode  = clSetKernelArg(k_cffts2, 2, sizeof(cl_mem), &m_u);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_cffts2, 8, sizeof(cl_mem), &m_ty1);
    ecode |= clSetKernelArg(k_cffts2, 9, sizeof(cl_mem), &m_ty2);
  }
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts2");

  k_cffts3 = clCreateKernel(program, "cffts3", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for cffts3");
  ecode  = clSetKernelArg(k_cffts3, 2, sizeof(cl_mem), &m_u);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_cffts3, 8, sizeof(cl_mem), &m_ty1);
    ecode |= clSetKernelArg(k_cffts3, 9, sizeof(cl_mem), &m_ty2);
  }
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts3");

  k_evolve = clCreateKernel(program, "evolve", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for evolve");

  k_checksum = clCreateKernel(program, "checksum", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for checksum");
  ecode  = clSetKernelArg(k_checksum, 1, sizeof(cl_mem), &m_chk);
  ecode |= clSetKernelArg(k_checksum, 2, sizeof(dcomplex)*checksum_local_ws,
                          NULL);
  ecode |= clSetKernelArg(k_checksum, 3, sizeof(int), &dims[0]);
  ecode |= clSetKernelArg(k_checksum, 4, sizeof(int), &dims[1]);
  clu_CheckError(ecode, "clSetKernelArg() for checksum");
  DTIMER_STOP(T_OPENCL_API);
}
Beispiel #14
0
//---------------------------------------------------------------------
// this function computes the norm of the difference between the
// computed solution and the exact solution
//---------------------------------------------------------------------
void error_norm(double rms[5])
{
  int i, m, d;

  cl_kernel *k_en;
  cl_mem *m_rms;
  double (*g_rms)[5];
  cl_int ecode;

  g_rms = (double (*)[5])malloc(sizeof(double)*5 * num_devices);
  m_rms = (cl_mem *)malloc(sizeof(cl_mem) * num_devices);
  k_en  = (cl_kernel *)malloc(sizeof(cl_kernel) * num_devices);

  for (i = 0; i < num_devices; i++) {
    m_rms[i] = clCreateBuffer(context,
                              CL_MEM_READ_WRITE,
                              sizeof(double) * 5, 
                              NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer()");

    k_en[i] = clCreateKernel(p_error[i], "error_norm", &ecode);
    clu_CheckError(ecode, "clCreateKernel()");

    ecode  = clSetKernelArg(k_en[i], 0, sizeof(cl_mem), &m_u[i]);
    ecode |= clSetKernelArg(k_en[i], 1, sizeof(cl_mem), &m_ce[i]);
    ecode |= clSetKernelArg(k_en[i], 2, sizeof(cl_mem), &m_rms[i]);
    ecode |= clSetKernelArg(k_en[i], 3, sizeof(cl_mem), &m_cell_low[i]);
    ecode |= clSetKernelArg(k_en[i], 4, sizeof(cl_mem), &m_cell_high[i]);
    ecode |= clSetKernelArg(k_en[i], 5, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");
    
    ecode = clEnqueueTask(cmd_queue[i],
                          k_en[i],
                          0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueTask()");

    clFinish(cmd_queue[i]);

    ecode = clEnqueueReadBuffer(cmd_queue[i],
                                m_rms[i],
                                CL_TRUE,
                                0, sizeof(double)*5,
                                &g_rms[i],
                                0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueReadBuffer()");
  }

  for (m = 0; m < 5; m++) {
    rms[m] = 0.0;
  }

  for (i = 0; i < num_devices; i++) {
    ecode = clFinish(cmd_queue[i]);
    clu_CheckError(ecode, "clFinish()");
  }

  // reduction
  for (i = 0; i < num_devices; i++) {
    for (m = 0; m < 5; m++) {
      rms[m] += g_rms[i][m];
    }
  }
  
  for (m = 0; m < 5; m++) {
    for (d = 0; d < 3; d++) {
      rms[m] = rms[m] / (double)(grid_points[d]-2);
    }
    rms[m] = sqrt(rms[m]);
  }

  for (i = 0; i < num_devices; i++) {
    clReleaseMemObject(m_rms[i]);
    clReleaseKernel(k_en[i]);
  }
  free(g_rms);
  free(m_rms);
  free(k_en);
}
Beispiel #15
0
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//---------------------------------------------------------------------
static void setup_opencl(int argc, char **argv)
{
  int i, c;
//  size_t temp;
  cl_int ecode = 0;
  char *source_dir = ".";  //FIXME
  int num_subs = DEFAULT_NUM_SUBS;
  int num_cus;
  int sqrt_num_command_queues;

  if (argc > 1) source_dir = argv[1];

  devices = (cl_device_id *)malloc(sizeof(cl_device_id) * num_subs);

  if (timeron) {
    timer_clear(TIMER_OPENCL);
    timer_clear(TIMER_BUILD);
    timer_clear(TIMER_BUFFER);
    timer_clear(TIMER_RELEASE);

    timer_start(TIMER_OPENCL);
  }

  // 1. Find the default device type and get a device for the device type
  //    Then, create sub-devices from the parent device.
  //device_type = CL_DEVICE_TYPE_CPU;
  device_type = CL_DEVICE_TYPE_ALL;
  //device_type = CL_DEVICE_TYPE_GPU;
  if(argc <= 2) {
    printf("Device type argument missing!\n");
	exit(-1);
  }
  char *device_type_str = argv[2];
  if(strcmp(device_type_str, "CPU") == 0 || strcmp(device_type_str, "cpu") == 0) {
  	device_type = CL_DEVICE_TYPE_CPU;
  } else if(strcmp(device_type_str, "GPU") == 0 || strcmp(device_type_str, "gpu") == 0) {
  	device_type = CL_DEVICE_TYPE_GPU;
  } else if(strcmp(device_type_str, "ALL") == 0 || strcmp(device_type_str, "all") == 0) {
  	device_type = CL_DEVICE_TYPE_ALL;
  } else {
    printf("Unsupported device type!\n");
	exit(-1);
  }
  cl_uint num_command_queues = 4;
  char *num_command_queues_str = getenv("SNU_NPB_COMMAND_QUEUES");
  if(num_command_queues_str != NULL)
  	num_command_queues = atoi(num_command_queues_str);

  cl_platform_id platform;
  ecode = clGetPlatformIDs(1, &platform, NULL);
  clu_CheckError(ecode, "clGetPlatformIDs()");

  ecode = clGetDeviceIDs(platform, device_type, 0, NULL, &num_devices);
  clu_CheckError(ecode, "clGetDeviceIDs()");

  //num_devices = 2;
  ecode = clGetDeviceIDs(platform, device_type, num_devices, devices, NULL);
  clu_CheckError(ecode, "clGetDeviceIDs()");
  cl_device_id tmp_dev;

  work_item_sizes[0] = work_item_sizes[1] = work_item_sizes[2] = 1024;
  max_work_group_size = 1024;
  max_compute_units = 22;

  sqrt_num_command_queues = (int)(sqrt((double)(num_command_queues) + 0.00001));
  if (num_command_queues != sqrt_num_command_queues * sqrt_num_command_queues) {
    fprintf(stderr, "Number of devices is not a square of some integer\n");
    exit(EXIT_FAILURE);
  }

  ncells = (int)(sqrt((double)(num_command_queues) + 0.00001));
  MAX_CELL_DIM = ((PROBLEM_SIZE/ncells)+1);
  IMAX = MAX_CELL_DIM;
  JMAX = MAX_CELL_DIM;
  KMAX = MAX_CELL_DIM;
  IMAXP = (IMAX/2*2+1);
  JMAXP = (JMAX/2*2+1);
  //---------------------------------------------------------------------
  // +1 at end to avoid zero length arrays for 1 node
  //---------------------------------------------------------------------
  BUF_SIZE = (MAX_CELL_DIM*MAX_CELL_DIM*(MAXCELLS-1)*60*2+1);


  // FIXME
  if (max_work_group_size > 64) {
    max_work_group_size = 64;
    int i;
    for (i = 0; i < 3; i++) {
      if (work_item_sizes[i] > 64) {
        work_item_sizes[i] = 64;
      }
    }
  }

  // 2. Create a context for devices
#ifdef MINIMD_SNUCL_OPTIMIZATIONS
	cl_context_properties props[5] = {
		CL_CONTEXT_PLATFORM,
		(cl_context_properties)platform,
		CL_CONTEXT_SCHEDULER,
		CL_CONTEXT_SCHEDULER_CODE_SEGMENTED_PERF_MODEL,
		//CL_CONTEXT_SCHEDULER_PERF_MODEL,
		//CL_CONTEXT_SCHEDULER_FIRST_EPOCH_BASED_PERF_MODEL,
		//CL_CONTEXT_SCHEDULER_ALL_EPOCH_BASED_PERF_MODEL,
		0 };
  context = clCreateContext(props, 
#elif defined(SOCL_OPTIMIZATIONS)
	cl_context_properties props[5] = {
		CL_CONTEXT_PLATFORM,
		(cl_context_properties)platform,
		CL_CONTEXT_SCHEDULER_SOCL,
		"dmda",
		//"random",
		0 };
  context = clCreateContext(props, 
#else
  context = clCreateContext(NULL, 
#endif
                            num_devices,
                            devices,
                            NULL, NULL, &ecode);
  clu_CheckError(ecode, "clCreateContext()");

  // 3. Create a command queue
  cmd_queue = (cl_command_queue*)malloc(sizeof(cl_command_queue)*num_command_queues*3);
  for (i = 0; i < num_command_queues * 2; i++) {
    //cmd_queue[i] = clCreateCommandQueue(context, devices[(i / 2) % num_devices], 
#ifdef SOCL_OPTIMIZATIONS
    cmd_queue[i] = clCreateCommandQueue(context, NULL, 
#else    
	cmd_queue[i] = clCreateCommandQueue(context, devices[num_devices - 1 - ((i / 2) % num_devices)],
#endif
   // cmd_queue[i] = clCreateCommandQueue(context, devices[0], 
#ifdef MINIMD_SNUCL_OPTIMIZATIONS
	0,
	//		CL_QUEUE_AUTO_DEVICE_SELECTION | 
	//		CL_QUEUE_ITERATIVE, 
			//CL_QUEUE_COMPUTE_INTENSIVE,
#else
	0,
#endif
	&ecode);
    clu_CheckError(ecode, "clCreateCommandQueue()");
  }

  // 4. Build the program
  if (timeron) timer_start(TIMER_BUILD);
  char *source_file = "sp_kernel.cl";
  //p_program = clu_MakeProgram(context, devices, source_dir, source_file, build_option);
  p_program = clu_CreateProgram(context, source_dir, source_file);
  for(i = 0; i < num_devices; i++) {
	  char build_option[200] = {0};
	  cl_device_type cur_device_type;
	  cl_int err = clGetDeviceInfo(devices[i],
			  CL_DEVICE_TYPE,
			  sizeof(cl_device_type),
			  &cur_device_type,
			  NULL);
	  clu_CheckError(err, "clGetDeviceInfo()");
  if (cur_device_type == CL_DEVICE_TYPE_CPU) {
    sprintf(build_option, "-I. -DCLASS=%d -DUSE_CPU -DMAX_CELL_DIM=%d -DIMAX=%d -DJMAX=%d -DKMAX=%d -DIMAXP=%d -DJMAXP=%d", CLASS, MAX_CELL_DIM, IMAX, JMAX, KMAX, IMAXP, JMAXP);
  } else {
    sprintf(build_option, "-I. -DCLASS=%d -DUSE_GPU -DMAX_CELL_DIM=%d -DIMAX=%d -DJMAX=%d -DKMAX=%d -DIMAXP=%d -DJMAXP=%d", CLASS, MAX_CELL_DIM, IMAX, JMAX, KMAX, IMAXP, JMAXP);
  }

  clu_MakeProgram(p_program, 1, &devices[i], source_dir, build_option);
  //clu_MakeProgram(p_program, num_devices, devices, source_dir, build_option);
  }
  num_devices = num_command_queues;
  program = (cl_program *)malloc(sizeof(cl_program) * num_devices);
  for (i = 0; i < num_devices; i++) {
    program[i] = p_program;
  }
  if (timeron) timer_stop(TIMER_BUILD);

  // 5. Create kernels
  size_t asize = sizeof(cl_kernel) * num_devices;
  k_initialize1 = (cl_kernel *)malloc(asize);
  k_initialize2 = (cl_kernel *)malloc(asize);
  k_initialize3 = (cl_kernel *)malloc(asize);
  k_initialize4 = (cl_kernel *)malloc(asize);
  k_initialize5 = (cl_kernel *)malloc(asize);
  k_initialize6 = (cl_kernel *)malloc(asize);
  k_initialize7 = (cl_kernel *)malloc(asize);
  k_initialize8 = (cl_kernel *)malloc(asize);
  k_lhsinit = (cl_kernel *)malloc(asize);
  k_exact_rhs1 = (cl_kernel *)malloc(asize);
  k_exact_rhs2 = (cl_kernel *)malloc(asize);
  k_exact_rhs3 = (cl_kernel *)malloc(asize);
  k_exact_rhs4 = (cl_kernel *)malloc(asize);
  k_exact_rhs5 = (cl_kernel *)malloc(asize);
  k_copy_faces1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_copy_faces2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_copy_faces3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_copy_faces4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_copy_faces5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_copy_faces6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_compute_rhs1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_compute_rhs2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_compute_rhs3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_compute_rhs4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_compute_rhs5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_compute_rhs6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_txinvr = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_lhsx = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_ninvr = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_x_solve1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_x_solve2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_x_solve3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_x_solve4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_x_solve5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_x_solve6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_lhsy = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_pinvr = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_y_solve1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_y_solve2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_y_solve3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_y_solve4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_y_solve5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_y_solve6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_lhsz = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_tzetar = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_z_solve1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_z_solve2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_z_solve3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_z_solve4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_z_solve5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_z_solve6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_add = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_error_norm = (cl_kernel *)malloc(asize);
  k_rhs_norm = (cl_kernel *)malloc(asize);

  for (i = 0; i < num_devices; i++) {
    k_initialize1[i] = clCreateKernel(program[i], "initialize1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize1");

    k_initialize2[i] = clCreateKernel(program[i], "initialize2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize2");

    k_initialize3[i] = clCreateKernel(program[i], "initialize3", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize3");

    k_initialize4[i] = clCreateKernel(program[i], "initialize4", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize4");

    k_initialize5[i] = clCreateKernel(program[i], "initialize5", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize5");

    k_initialize6[i] = clCreateKernel(program[i], "initialize6", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize6");

    k_initialize7[i] = clCreateKernel(program[i], "initialize7", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize7");

    k_initialize8[i] = clCreateKernel(program[i], "initialize8", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize8");

    k_lhsinit[i] = clCreateKernel(program[i], "lhsinit", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for lhsinit");

    k_exact_rhs1[i] = clCreateKernel(program[i], "exact_rhs1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for exact_rhs1");

    k_exact_rhs2[i] = clCreateKernel(program[i], "exact_rhs2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for exact_rhs2");

    k_exact_rhs3[i] = clCreateKernel(program[i], "exact_rhs3", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for exact_rhs3");

    k_exact_rhs4[i] = clCreateKernel(program[i], "exact_rhs4", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for exact_rhs4");

    k_exact_rhs5[i] = clCreateKernel(program[i], "exact_rhs5", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for exact_rhs5");

    for (c = 0; c < MAXCELLS; c++) {
      k_copy_faces1[i][c] = clCreateKernel(program[i], "copy_faces1", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for copy_faces1");

      k_copy_faces2[i][c] = clCreateKernel(program[i], "copy_faces2", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for copy_faces2");

      k_copy_faces3[i][c] = clCreateKernel(program[i], "copy_faces3", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for copy_faces3");

      k_copy_faces4[i][c] = clCreateKernel(program[i], "copy_faces4", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for copy_faces4");

      k_copy_faces5[i][c] = clCreateKernel(program[i], "copy_faces5", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for copy_faces5");

      k_copy_faces6[i][c] = clCreateKernel(program[i], "copy_faces6", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for copy_faces6");

      k_compute_rhs1[i][c] = clCreateKernel(program[i], "compute_rhs1", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for compute_rhs1");

      k_compute_rhs2[i][c] = clCreateKernel(program[i], "compute_rhs2", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for compute_rhs2");

      k_compute_rhs3[i][c] = clCreateKernel(program[i], "compute_rhs3", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for compute_rhs3");

      k_compute_rhs4[i][c] = clCreateKernel(program[i], "compute_rhs4", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for compute_rhs4");

      k_compute_rhs5[i][c] = clCreateKernel(program[i], "compute_rhs5", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for compute_rhs5");

      k_compute_rhs6[i][c] = clCreateKernel(program[i], "compute_rhs6", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for compute_rhs6");

      k_txinvr[i][c] = clCreateKernel(program[i], "txinvr", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for txinvr");

      k_lhsx[i][c] = clCreateKernel(program[i], "lhsx", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for lhsx");

      k_ninvr[i][c] = clCreateKernel(program[i], "ninvr", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for ninvr");

      k_x_solve1[i][c] = clCreateKernel(program[i], "x_solve1", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for x_solve1");

      k_x_solve2[i][c] = clCreateKernel(program[i], "x_solve2", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for x_solve2");

      k_x_solve3[i][c] = clCreateKernel(program[i], "x_solve3", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for x_solve3");

      k_x_solve4[i][c] = clCreateKernel(program[i], "x_solve4", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for x_solve4");

      k_x_solve5[i][c] = clCreateKernel(program[i], "x_solve5", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for x_solve5");

      k_x_solve6[i][c] = clCreateKernel(program[i], "x_solve6", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for x_solve6");

      k_lhsy[i][c] = clCreateKernel(program[i], "lhsy", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for lhsy");

      k_pinvr[i][c] = clCreateKernel(program[i], "pinvr", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for pinvr");

      k_y_solve1[i][c] = clCreateKernel(program[i], "y_solve1", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for y_solve1");

      k_y_solve2[i][c] = clCreateKernel(program[i], "y_solve2", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for y_solve2");

      k_y_solve3[i][c] = clCreateKernel(program[i], "y_solve3", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for y_solve3");

      k_y_solve4[i][c] = clCreateKernel(program[i], "y_solve4", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for y_solve4");

      k_y_solve5[i][c] = clCreateKernel(program[i], "y_solve5", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for y_solve5");

      k_y_solve6[i][c] = clCreateKernel(program[i], "y_solve6", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for y_solve6");

      k_lhsz[i][c] = clCreateKernel(program[i], "lhsz", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for lhsz");

      k_tzetar[i][c] = clCreateKernel(program[i], "tzetar", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for tzetar");

      k_z_solve1[i][c] = clCreateKernel(program[i], "z_solve1", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for z_solve1");

      k_z_solve2[i][c] = clCreateKernel(program[i], "z_solve2", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for z_solve2");

      k_z_solve3[i][c] = clCreateKernel(program[i], "z_solve3", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for z_solve3");

      k_z_solve4[i][c] = clCreateKernel(program[i], "z_solve4", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for z_solve4");

      k_z_solve5[i][c] = clCreateKernel(program[i], "z_solve5", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for z_solve5");

      k_z_solve6[i][c] = clCreateKernel(program[i], "z_solve6", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for z_solve6");

      k_add[i][c] = clCreateKernel(program[i], "add", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for add");
    }

    k_error_norm[i] = clCreateKernel(program[i], "error_norm", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for error_norm");

    k_rhs_norm[i] = clCreateKernel(program[i], "rhs_norm", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rhs_norm");
  }

  // 6. Create buffers
  if (timeron) timer_start(TIMER_BUFFER);

  asize = sizeof(cl_mem) * num_devices;

  m_u = (cl_mem *)malloc(asize);
  m_us = (cl_mem *)malloc(asize);
  m_vs = (cl_mem *)malloc(asize);
  m_ws = (cl_mem *)malloc(asize);
  m_qs = (cl_mem *)malloc(asize);
  m_ainv = (cl_mem *)malloc(asize);
  m_rho_i = (cl_mem *)malloc(asize);
  m_speed = (cl_mem *)malloc(asize);
  m_square = (cl_mem *)malloc(asize);
  m_rhs = (cl_mem *)malloc(asize);
  m_forcing = (cl_mem *)malloc(asize);
  m_lhs = (cl_mem *)malloc(asize);
  m_in_buffer = (cl_mem *)malloc(asize);
  m_out_buffer = (cl_mem *)malloc(asize);

  m_ce = (cl_mem *)malloc(asize);

  for (i = 0; i < num_devices; i++) {
    m_u[i] = clCreateBuffer(context,
                            CL_MEM_READ_WRITE,
                            sizeof(double)*MAXCELLS*(KMAX+4)*(JMAXP+4)*(IMAXP+4)*5,
                            NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_u");

    m_us[i] = clCreateBuffer(context,
                             CL_MEM_READ_WRITE,
                             sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                             NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_us");

    m_vs[i] = clCreateBuffer(context,
                             CL_MEM_READ_WRITE,
                             sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                             NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_vs");

    m_ws[i] = clCreateBuffer(context,
                             CL_MEM_READ_WRITE,
                             sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                             NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_ws");

    m_qs[i] = clCreateBuffer(context,
                             CL_MEM_READ_WRITE,
                             sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                             NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_qs");

    m_ainv[i] = clCreateBuffer(context,
                               CL_MEM_READ_WRITE,
                               sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                               NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_ainv");

    m_rho_i[i] = clCreateBuffer(context,
                                CL_MEM_READ_WRITE,
                                sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                                NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_rho_i");

    m_speed[i] = clCreateBuffer(context,
                                CL_MEM_READ_WRITE,
                                sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                                NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_speed");

    m_square[i] = clCreateBuffer(context,
                                 CL_MEM_READ_WRITE,
                                 sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                                 NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_square");

    m_rhs[i] = clCreateBuffer(context,
                              CL_MEM_READ_WRITE,
                              sizeof(double)*MAXCELLS*KMAX*JMAXP*IMAXP*5,
                              NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_rhs");

    m_forcing[i] = clCreateBuffer(context,
                                  CL_MEM_READ_WRITE,
                                  sizeof(double)*MAXCELLS*KMAX*JMAXP*IMAXP*5,
                                  NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_forcing");

    m_lhs[i] = clCreateBuffer(context,
                              CL_MEM_READ_WRITE,
                              sizeof(double)*MAXCELLS*KMAX*JMAXP*IMAXP*15,
                              NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_lhs");

    m_in_buffer[i] = clCreateBuffer(context,
                                    CL_MEM_READ_WRITE,
                                    sizeof(double)*BUF_SIZE,
                                    NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_in_buffer");

    m_out_buffer[i] = clCreateBuffer(context,
                                     CL_MEM_READ_WRITE,
                                     sizeof(double)*BUF_SIZE,
                                     NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_out_buffer");

    m_ce[i] = clCreateBuffer(context,
                             CL_MEM_READ_ONLY,
                             sizeof(double)*5*13,
                             NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_ce");
  }

  if (timeron) timer_stop(TIMER_BUFFER);

  if (timeron) timer_stop(TIMER_OPENCL);
}
Beispiel #16
0
//---------------------------------------------------------------------
// This subroutine initializes the field variable u using 
// tri-linear transfinite interpolation of the boundary values     
//---------------------------------------------------------------------
void initialize()
{
  int i;
  size_t d0_size, d1_size, d2_size;
  size_t local_ws[3], global_ws[3], temp;

  cl_kernel *k_initialize1;
  cl_kernel *k_initialize2;
  cl_kernel *k_initialize3;
  cl_kernel *k_initialize4;
  cl_kernel *k_initialize5;
  cl_kernel *k_initialize6;
  cl_kernel *k_initialize7;
  cl_kernel *k_initialize8;
  cl_int ecode;

  k_initialize1 = (cl_kernel *)malloc(sizeof(cl_kernel) * num_devices);
  k_initialize2 = (cl_kernel *)malloc(sizeof(cl_kernel) * num_devices);
  k_initialize3 = (cl_kernel *)malloc(sizeof(cl_kernel) * num_devices);
  k_initialize4 = (cl_kernel *)malloc(sizeof(cl_kernel) * num_devices);
  k_initialize5 = (cl_kernel *)malloc(sizeof(cl_kernel) * num_devices);
  k_initialize6 = (cl_kernel *)malloc(sizeof(cl_kernel) * num_devices);
  k_initialize7 = (cl_kernel *)malloc(sizeof(cl_kernel) * num_devices);
  k_initialize8 = (cl_kernel *)malloc(sizeof(cl_kernel) * num_devices);

  //-----------------------------------------------------------------------
  d0_size = JMAX+2;
  d1_size = KMAX+2;
  d2_size = ncells;

  local_ws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0];
  temp = max_work_group_size / local_ws[0];
  local_ws[1] = d1_size < temp ? d1_size : temp;
  temp = temp / local_ws[1];
  local_ws[2] = d2_size < temp ? d2_size : temp;

  global_ws[0] = clu_RoundWorkSize(d0_size, local_ws[0]);
  global_ws[1] = clu_RoundWorkSize(d1_size, local_ws[1]);
  global_ws[2] = clu_RoundWorkSize(d2_size, local_ws[2]);

  for (i = 0; i < num_devices; i++) {
    k_initialize1[i] = clCreateKernel(p_initialize[i], "initialize1", &ecode);
    clu_CheckError(ecode, "clCreateKernel()");
    
    ecode  = clSetKernelArg(k_initialize1[i], 0, sizeof(cl_mem), &m_u[i]);
    ecode |= clSetKernelArg(k_initialize1[i], 1, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_initialize1[i],
                                   3, NULL,
                                   global_ws,
                                   local_ws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
    ecode = clFinish(cmd_queue[i]);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-----------------------------------------------------------------------

  //---------------------------------------------------------------------
  // first store the "interpolated" values everywhere on the grid    
  //---------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    d0_size = max_cell_size[i][1];
    d1_size = max_cell_size[i][2];
    d2_size = ncells;

    local_ws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0];
    temp = max_work_group_size / local_ws[0];
    local_ws[1] = d1_size < temp ? d1_size : temp;
    temp = temp / local_ws[1];
    local_ws[2] = d2_size < temp ? d2_size : temp;
    global_ws[0] = clu_RoundWorkSize(d0_size, local_ws[0]);
    global_ws[1] = clu_RoundWorkSize(d1_size, local_ws[1]);
    global_ws[2] = clu_RoundWorkSize(d2_size, local_ws[2]);

    k_initialize2[i] = clCreateKernel(p_initialize[i], "initialize2", &ecode);
    clu_CheckError(ecode, "clCreateKernel()");
    
    ecode  = clSetKernelArg(k_initialize2[i], 0, sizeof(cl_mem), &m_u[i]);
    ecode |= clSetKernelArg(k_initialize2[i], 1, sizeof(cl_mem),
                                                 &m_cell_low[i]);
    ecode |= clSetKernelArg(k_initialize2[i], 2, sizeof(cl_mem),
                                                 &m_cell_high[i]);
    ecode |= clSetKernelArg(k_initialize2[i], 3, sizeof(cl_mem),
                                                 &m_ce[i]);
    ecode |= clSetKernelArg(k_initialize2[i], 4, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");
    
    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_initialize2[i],
                                   3, NULL,
                                   global_ws,
                                   local_ws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-----------------------------------------------------------------------

  //---------------------------------------------------------------------
  // now store the exact values on the boundaries        
  //---------------------------------------------------------------------
  //---------------------------------------------------------------------
  // west face                                                  
  //---------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    d0_size = max_cell_size[i][1];
    d1_size = max_cell_size[i][2];

    local_ws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0];
    temp = max_work_group_size / local_ws[0];
    local_ws[1] = d1_size < temp ? d1_size : temp;
    global_ws[0] = clu_RoundWorkSize(d0_size, local_ws[0]);
    global_ws[1] = clu_RoundWorkSize(d1_size, local_ws[1]);

    k_initialize3[i] = clCreateKernel(p_initialize[i], "initialize3", &ecode);
    clu_CheckError(ecode, "clCreateKernel()");
    
    ecode  = clSetKernelArg(k_initialize3[i], 0, sizeof(cl_mem), &m_u[i]);
    ecode |= clSetKernelArg(k_initialize3[i], 1, sizeof(cl_mem),
                                                 &m_cell_low[i]);
    ecode |= clSetKernelArg(k_initialize3[i], 2, sizeof(cl_mem),
                                                 &m_cell_high[i]);
    ecode |= clSetKernelArg(k_initialize3[i], 3, sizeof(cl_mem), &m_slice[i]);
    ecode |= clSetKernelArg(k_initialize3[i], 4, sizeof(cl_mem), &m_ce[i]);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_initialize3[i],
                                   2, NULL,
                                   global_ws,
                                   local_ws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-----------------------------------------------------------------------

  //---------------------------------------------------------------------
  // east face                                                      
  //---------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    d0_size = max_cell_size[i][1];
    d1_size = max_cell_size[i][2];

    local_ws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0];
    temp = max_work_group_size / local_ws[0];
    local_ws[1] = d1_size < temp ? d1_size : temp;
    global_ws[0] = clu_RoundWorkSize(d0_size, local_ws[0]);
    global_ws[1] = clu_RoundWorkSize(d1_size, local_ws[1]);

    k_initialize4[i] = clCreateKernel(p_initialize[i], "initialize4", &ecode);
    clu_CheckError(ecode, "clCreateKernel()");
    
    ecode  = clSetKernelArg(k_initialize4[i], 0, sizeof(cl_mem), &m_u[i]);
    ecode |= clSetKernelArg(k_initialize4[i], 1, sizeof(cl_mem),
                                                 &m_cell_low[i]);
    ecode |= clSetKernelArg(k_initialize4[i], 2, sizeof(cl_mem),
                                                 &m_cell_high[i]);
    ecode |= clSetKernelArg(k_initialize4[i], 3, sizeof(cl_mem),
                                                 &m_cell_size[i]);
    ecode |= clSetKernelArg(k_initialize4[i], 4, sizeof(cl_mem), &m_slice[i]);
    ecode  = clSetKernelArg(k_initialize4[i], 5, sizeof(cl_mem), &m_ce[i]);
    ecode |= clSetKernelArg(k_initialize4[i], 6, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_initialize4[i],
                                   2, NULL,
                                   global_ws,
                                   local_ws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-----------------------------------------------------------------------

  //---------------------------------------------------------------------
  // south face                                                 
  //---------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    d0_size = max_cell_size[i][0];
    d1_size = max_cell_size[i][2];

    local_ws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0];
    temp = max_work_group_size / local_ws[0];
    local_ws[1] = d1_size < temp ? d1_size : temp;
    global_ws[0] = clu_RoundWorkSize(d0_size, local_ws[0]);
    global_ws[1] = clu_RoundWorkSize(d1_size, local_ws[1]);

    k_initialize5[i] = clCreateKernel(p_initialize[i], "initialize5", &ecode);
    clu_CheckError(ecode, "clCreateKernel()");
    
    ecode  = clSetKernelArg(k_initialize5[i], 0, sizeof(cl_mem), &m_u[i]);
    ecode |= clSetKernelArg(k_initialize5[i], 1, sizeof(cl_mem),
                                                 &m_cell_low[i]);
    ecode |= clSetKernelArg(k_initialize5[i], 2, sizeof(cl_mem),
                                                 &m_cell_high[i]);
    ecode |= clSetKernelArg(k_initialize5[i], 3, sizeof(cl_mem), &m_slice[i]);
    ecode |= clSetKernelArg(k_initialize5[i], 4, sizeof(cl_mem), &m_ce[i]);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_initialize5[i],
                                   2, NULL,
                                   global_ws,
                                   local_ws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-----------------------------------------------------------------------

  //---------------------------------------------------------------------
  // north face                                    
  //-----------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    d0_size = max_cell_size[i][0];
    d1_size = max_cell_size[i][2];

    local_ws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0];
    temp = max_work_group_size / local_ws[0];
    local_ws[1] = d1_size < temp ? d1_size : temp;
    global_ws[0] = clu_RoundWorkSize(d0_size, local_ws[0]);
    global_ws[1] = clu_RoundWorkSize(d1_size, local_ws[1]);

    k_initialize6[i] = clCreateKernel(p_initialize[i], "initialize6", &ecode);
    clu_CheckError(ecode, "clCreateKernel()");
    
    ecode  = clSetKernelArg(k_initialize6[i], 0, sizeof(cl_mem), &m_u[i]);
    ecode |= clSetKernelArg(k_initialize6[i], 1, sizeof(cl_mem),
                                                 &m_cell_low[i]);
    ecode |= clSetKernelArg(k_initialize6[i], 2, sizeof(cl_mem),
                                                 &m_cell_high[i]);
    ecode |= clSetKernelArg(k_initialize6[i], 3, sizeof(cl_mem),
                                                 &m_cell_size[i]);
    ecode |= clSetKernelArg(k_initialize6[i], 4, sizeof(cl_mem), &m_slice[i]);
    ecode  = clSetKernelArg(k_initialize6[i], 5, sizeof(cl_mem), &m_ce[i]);
    ecode |= clSetKernelArg(k_initialize6[i], 6, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_initialize6[i],
                                   2, NULL,
                                   global_ws,
                                   local_ws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-----------------------------------------------------------------------

  //---------------------------------------------------------------------
  // bottom face                                       
  //-----------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    d0_size = max_cell_size[i][0];
    d1_size = max_cell_size[i][1];

    local_ws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0];
    temp = max_work_group_size / local_ws[0];
    local_ws[1] = d1_size < temp ? d1_size : temp;
    global_ws[0] = clu_RoundWorkSize(d0_size, local_ws[0]);
    global_ws[1] = clu_RoundWorkSize(d1_size, local_ws[1]);

    k_initialize7[i] = clCreateKernel(p_initialize[i], "initialize7", &ecode);
    clu_CheckError(ecode, "clCreateKernel()");
    
    ecode  = clSetKernelArg(k_initialize7[i], 0, sizeof(cl_mem), &m_u[i]);
    ecode |= clSetKernelArg(k_initialize7[i], 1, sizeof(cl_mem),
                                                 &m_cell_low[i]);
    ecode |= clSetKernelArg(k_initialize7[i], 2, sizeof(cl_mem),
                                                 &m_cell_high[i]);
    ecode |= clSetKernelArg(k_initialize7[i], 3, sizeof(cl_mem), &m_slice[i]);
    ecode |= clSetKernelArg(k_initialize7[i], 4, sizeof(cl_mem), &m_ce[i]);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_initialize7[i],
                                   2, NULL,
                                   global_ws,
                                   local_ws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-----------------------------------------------------------------------

  //---------------------------------------------------------------------
  // top face     
  //-----------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    d0_size = max_cell_size[i][0];
    d1_size = max_cell_size[i][1];

    local_ws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0];
    temp = max_work_group_size / local_ws[0];
    local_ws[1] = d1_size < temp ? d1_size : temp;
    global_ws[0] = clu_RoundWorkSize(d0_size, local_ws[0]);
    global_ws[1] = clu_RoundWorkSize(d1_size, local_ws[1]);

    k_initialize8[i] = clCreateKernel(p_initialize[i], "initialize8", &ecode);
    clu_CheckError(ecode, "clCreateKernel()");
    
    ecode  = clSetKernelArg(k_initialize8[i], 0, sizeof(cl_mem), &m_u[i]);
    ecode |= clSetKernelArg(k_initialize8[i], 1, sizeof(cl_mem),
                                                 &m_cell_low[i]);
    ecode |= clSetKernelArg(k_initialize8[i], 2, sizeof(cl_mem),
                                                 &m_cell_high[i]);
    ecode |= clSetKernelArg(k_initialize8[i], 3, sizeof(cl_mem),
                                                 &m_cell_size[i]);
    ecode |= clSetKernelArg(k_initialize8[i], 4, sizeof(cl_mem), &m_slice[i]);
    ecode  = clSetKernelArg(k_initialize8[i], 5, sizeof(cl_mem), &m_ce[i]);
    ecode |= clSetKernelArg(k_initialize8[i], 6, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_initialize8[i],
                                   2, NULL,
                                   global_ws,
                                   local_ws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-----------------------------------------------------------------------

  for (i = 0; i < num_devices; i++) {
    ecode = clFinish(cmd_queue[i]);
    clu_CheckError(ecode, "clFinish()");
  }

  for (i = 0; i < num_devices; i++) {
    clReleaseKernel(k_initialize1[i]);
    clReleaseKernel(k_initialize2[i]);
    clReleaseKernel(k_initialize3[i]);
    clReleaseKernel(k_initialize4[i]);
    clReleaseKernel(k_initialize5[i]);
    clReleaseKernel(k_initialize6[i]);
    clReleaseKernel(k_initialize7[i]);
    clReleaseKernel(k_initialize8[i]);
  }

  free(k_initialize1);
  free(k_initialize2);
  free(k_initialize3);
  free(k_initialize4);
  free(k_initialize5);
  free(k_initialize6);
  free(k_initialize7);
  free(k_initialize8);
}
Beispiel #17
0
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//---------------------------------------------------------------------
static void setup_opencl(int argc, char *argv[])
{
  int i;
  size_t temp, wg_num;
  cl_int ecode;
  char *source_dir = "LU";

  if (timeron) {
    timer_clear(TIMER_OPENCL);
    timer_clear(TIMER_BUILD);
    timer_clear(TIMER_BUFFER);
    timer_clear(TIMER_RELEASE);

    timer_start(TIMER_OPENCL);
  }

  if (argc > 1) source_dir = argv[1];

  //-----------------------------------------------------------------------
  // 1. Find the default device type and get a device for the device type
  //-----------------------------------------------------------------------
  device_type = clu_GetDefaultDeviceType();
  device      = clu_GetAvailableDevice(device_type);
  device_name = clu_GetDeviceName(device);

  // Device information
  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_ITEM_SIZES,
                          sizeof(work_item_sizes),
                          &work_item_sizes,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_GROUP_SIZE,
                          sizeof(size_t),
                          &max_work_group_size,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_COMPUTE_UNITS,
                          sizeof(cl_uint),
                          &max_compute_units,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  ////////////////////////////////////////////////////////////////////////
  // FIXME: The below values are experimental.
  size_t default_wg_size = 64;
  if (device_type == CL_DEVICE_TYPE_CPU) {
    if (CLASS == 'B') default_wg_size = 128;
  } else {
    if (CLASS == 'B') default_wg_size = 32;
  }
  if (max_work_group_size > default_wg_size) {
    max_work_group_size = default_wg_size;
    int i;
    for (i = 0; i < 3; i++) {
      if (work_item_sizes[i] > default_wg_size) {
        work_item_sizes[i] = default_wg_size;
      }
    }
  }
  if (device_type == CL_DEVICE_TYPE_CPU) {
    SETBV1_DIM = SETBV1_DIM_CPU;
    SETBV2_DIM = SETBV2_DIM_CPU;
    SETBV3_DIM = SETBV3_DIM_CPU;
    SETIV_DIM = SETIV_DIM_CPU;
    ERHS1_DIM = ERHS1_DIM_CPU;
    ERHS2_DIM = ERHS2_DIM_CPU;
    ERHS3_DIM = ERHS3_DIM_CPU;
    ERHS4_DIM = ERHS4_DIM_CPU;
    PINTGR1_DIM = PINTGR1_DIM_CPU;
    PINTGR2_DIM = PINTGR2_DIM_CPU;
    PINTGR3_DIM = PINTGR3_DIM_CPU;
    RHS_DIM  = RHS_DIM_CPU;
    RHSX_DIM = RHSX_DIM_CPU;
    RHSY_DIM = RHSY_DIM_CPU;
    RHSZ_DIM = RHSZ_DIM_CPU;
    SSOR2_DIM = SSOR2_DIM_CPU;
    SSOR3_DIM = SSOR3_DIM_CPU;
  } else {
    SETBV1_DIM = SETBV1_DIM_GPU;
    SETBV2_DIM = SETBV2_DIM_GPU;
    SETBV3_DIM = SETBV3_DIM_GPU;
    SETIV_DIM = SETIV_DIM_GPU;
    ERHS1_DIM = ERHS1_DIM_GPU;
    ERHS2_DIM = ERHS2_DIM_GPU;
    ERHS3_DIM = ERHS3_DIM_GPU;
    ERHS4_DIM = ERHS4_DIM_GPU;
    PINTGR1_DIM = PINTGR1_DIM_GPU;
    PINTGR2_DIM = PINTGR2_DIM_GPU;
    PINTGR3_DIM = PINTGR3_DIM_GPU;
    RHS_DIM  = RHS_DIM_GPU;
    RHSX_DIM = RHSX_DIM_GPU;
    RHSY_DIM = RHSY_DIM_GPU;
    RHSZ_DIM = RHSZ_DIM_GPU;
    SSOR2_DIM = SSOR2_DIM_GPU;
    SSOR3_DIM = SSOR3_DIM_GPU;
  }
  ////////////////////////////////////////////////////////////////////////

  //-----------------------------------------------------------------------
  // 2. Create a context for the specified device
  //-----------------------------------------------------------------------
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode);
  clu_CheckError(ecode, "clCreateContext()");

  //-----------------------------------------------------------------------
  // 3. Create command queues
  //-----------------------------------------------------------------------
  cmd_queue = clCreateCommandQueue(context, device, 0, &ecode);
  clu_CheckError(ecode, "clCreateCommandQueue()");

  max_pipeline = (jend-jst) < max_compute_units ? (jend-jst) : max_compute_units;
  pipe_queue = (cl_command_queue *)malloc(sizeof(cl_command_queue) * max_pipeline);
  for (i = 0; i < max_pipeline; i++) {
    pipe_queue[i] = clCreateCommandQueue(context, device, 0, &ecode);
    clu_CheckError(ecode, "clCreateCommandQueue()");
  }

  //-----------------------------------------------------------------------
  // 4. Build programs
  //-----------------------------------------------------------------------
  if (timeron) timer_start(TIMER_BUILD);
  char build_option[100];

  if (device_type == CL_DEVICE_TYPE_CPU) {
    sprintf(build_option, "-I. -DCLASS=%d -DUSE_CPU", CLASS);
  } else {
    sprintf(build_option, "-I. -DCLASS=\'%c\'", CLASS);
  }

  p_pre = clu_MakeProgram(context, device, source_dir,
                          "kernel_pre.cl",
                          build_option);

  p_main = clu_MakeProgram(context, device, source_dir,
                          (device_type == CL_DEVICE_TYPE_CPU ? "kernel_main_cpu.cl" : "kernel_main_gpu.cl"),
                          build_option);

  p_post = clu_MakeProgram(context, device, source_dir,
                          "kernel_post.cl",
                          build_option);
  if (timeron) timer_stop(TIMER_BUILD);

  //-----------------------------------------------------------------------
  // 5. Create buffers
  //-----------------------------------------------------------------------
  if (timeron) timer_start(TIMER_BUFFER);
  m_ce = clCreateBuffer(context,
                        CL_MEM_READ_ONLY,
                        sizeof(double)*5*13,
                        NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_ce");

  m_u = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u");

  m_rsd = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_rsd");

  m_frct = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_frct");

  m_qs = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1),
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_qs");

  m_rho_i = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1),
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_rho_i");

  // workspace for work-items
  size_t max_work_items;
  if (ERHS2_DIM == 1 && ERHS3_DIM == 1 && ERHS4_DIM == 1) {
    max_work_items = ISIZ3;
  } else {
    max_work_items = ISIZ3*ISIZ2;
  }
  m_flux = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*ISIZ1*5 * max_work_items,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_flux");

  if (RHSZ_DIM == 1) {
    max_work_items = ISIZ2;
  } else {
    max_work_items = ISIZ2*ISIZ1;
  }

  if (device_type == CL_DEVICE_TYPE_CPU) {
    m_utmp = clCreateBuffer(context,
                         CL_MEM_READ_WRITE,
                         sizeof(double)*ISIZ3*6 * max_work_items,
                         NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_utmp");

    m_rtmp = clCreateBuffer(context,
                         CL_MEM_READ_WRITE,
                         sizeof(double)*ISIZ3*5 * max_work_items,
                         NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_rtmp");
  }

  temp = (nz0-2) / max_compute_units;
  l2norm_lws[0] = temp == 0 ? 1 : temp;
  l2norm_gws[0] = clu_RoundWorkSize((size_t)(nz0-2), l2norm_lws[0]);
  wg_num = l2norm_gws[0] / l2norm_lws[0];
  sum_size = sizeof(double) * 5 * wg_num;
  m_sum = clCreateBuffer(context,
                         CL_MEM_READ_WRITE,
                         sum_size, 
                         NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer()");

  if (timeron) timer_stop(TIMER_BUFFER);

  //-----------------------------------------------------------------------
  // 6. Create kernels
  //-----------------------------------------------------------------------
  k_setbv1 = clCreateKernel(p_pre, "setbv1", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for setbv1");
  ecode  = clSetKernelArg(k_setbv1, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_setbv1, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_setbv1, 2, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_setbv1, 3, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_setbv1, 4, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SETBV1_DIM == 3) {
    setbv1_lws[0] = 5;
    temp = max_work_group_size / setbv1_lws[0];
    setbv1_lws[1] = nx < temp ? nx : temp;
    temp = temp / setbv1_lws[1];
    setbv1_lws[2] = ny < temp ? ny : temp;
    setbv1_gws[0] = clu_RoundWorkSize((size_t)5, setbv1_lws[0]);
    setbv1_gws[1] = clu_RoundWorkSize((size_t)nx, setbv1_lws[1]);
    setbv1_gws[2] = clu_RoundWorkSize((size_t)ny, setbv1_lws[2]);
  } else if (SETBV1_DIM == 2) {
    setbv1_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0];
    temp = max_work_group_size / setbv1_lws[0];
    setbv1_lws[1] = ny < temp ? ny : temp;
    setbv1_gws[0] = clu_RoundWorkSize((size_t)nx, setbv1_lws[0]);
    setbv1_gws[1] = clu_RoundWorkSize((size_t)ny, setbv1_lws[1]);
  } else {
    temp = ny / max_compute_units;
    setbv1_lws[0] = temp == 0 ? 1 : temp;
    setbv1_gws[0] = clu_RoundWorkSize((size_t)ny, setbv1_lws[0]);
  }

  k_setbv2 = clCreateKernel(p_pre, "setbv2", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for setbv2");
  ecode  = clSetKernelArg(k_setbv2, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_setbv2, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_setbv2, 2, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_setbv2, 3, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_setbv2, 4, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SETBV2_DIM == 3) {
    setbv2_lws[0] = 5;
    temp = max_work_group_size / setbv2_lws[0];
    setbv2_lws[1] = nx < temp ? nx : temp;
    temp = temp / setbv2_lws[1];
    setbv2_lws[2] = nz < temp ? nz : temp;
    setbv2_gws[0] = clu_RoundWorkSize((size_t)5, setbv2_lws[0]);
    setbv2_gws[1] = clu_RoundWorkSize((size_t)nx, setbv2_lws[1]);
    setbv2_gws[2] = clu_RoundWorkSize((size_t)nz, setbv2_lws[2]);
  } else if (SETBV2_DIM == 2) {
    setbv2_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0];
    temp = max_work_group_size / setbv2_lws[0];
    setbv2_lws[1] = nz < temp ? nz : temp;
    setbv2_gws[0] = clu_RoundWorkSize((size_t)nx, setbv2_lws[0]);
    setbv2_gws[1] = clu_RoundWorkSize((size_t)nz, setbv2_lws[1]);
  } else {
    temp = nz / max_compute_units;
    setbv2_lws[0] = temp == 0 ? 1 : temp;
    setbv2_gws[0] = clu_RoundWorkSize((size_t)nz, setbv2_lws[0]);
  }

  k_setbv3 = clCreateKernel(p_pre, "setbv3", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for setbv3");
  ecode  = clSetKernelArg(k_setbv3, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_setbv3, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_setbv3, 2, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_setbv3, 3, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_setbv3, 4, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SETBV3_DIM == 3) {
    setbv3_lws[0] = 5;
    temp = max_work_group_size / setbv3_lws[0];
    setbv3_lws[1] = ny < temp ? ny : temp;
    temp = temp / setbv3_lws[1];
    setbv3_lws[2] = nz < temp ? nz : temp;
    setbv3_gws[0] = clu_RoundWorkSize((size_t)5, setbv3_lws[0]);
    setbv3_gws[1] = clu_RoundWorkSize((size_t)ny, setbv3_lws[1]);
    setbv3_gws[2] = clu_RoundWorkSize((size_t)nz, setbv3_lws[2]);
  } else if (SETBV3_DIM == 2) {
    setbv3_lws[0] = ny < work_item_sizes[0] ? ny : work_item_sizes[0];
    temp = max_work_group_size / setbv3_lws[0];
    setbv3_lws[1] = nz < temp ? nz : temp;
    setbv3_gws[0] = clu_RoundWorkSize((size_t)ny, setbv3_lws[0]);
    setbv3_gws[1] = clu_RoundWorkSize((size_t)nz, setbv3_lws[1]);
  } else {
    temp = nz / max_compute_units;
    setbv3_lws[0] = temp == 0 ? 1 : temp;
    setbv3_gws[0] = clu_RoundWorkSize((size_t)nz, setbv3_lws[0]);
  }

  k_setiv = clCreateKernel(p_pre, "setiv", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for setiv");
  ecode  = clSetKernelArg(k_setiv, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_setiv, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_setiv, 2, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_setiv, 3, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_setiv, 4, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SETIV_DIM == 3) {
    setiv_lws[0] = (nx-2) < work_item_sizes[0] ? (nx-2) : work_item_sizes[0];
    temp = max_work_group_size / setiv_lws[0];
    setiv_lws[1] = (ny-2) < temp ? (ny-2) : temp;
    temp = temp / setiv_lws[1];
    setiv_lws[2] = (nz-2) < temp ? (nz-2) : temp;
    setiv_gws[0] = clu_RoundWorkSize((size_t)(nx-2), setiv_lws[0]);
    setiv_gws[1] = clu_RoundWorkSize((size_t)(ny-2), setiv_lws[1]);
    setiv_gws[2] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[2]);
  } else if (SETIV_DIM == 2) {
    setiv_lws[0] = (ny-2) < work_item_sizes[0] ? (ny-2) : work_item_sizes[0];
    temp = max_work_group_size / setiv_lws[0];
    setiv_lws[1] = (nz-2) < temp ? (nz-2) : temp;
    setiv_gws[0] = clu_RoundWorkSize((size_t)(ny-2), setiv_lws[0]);
    setiv_gws[1] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[1]);
  } else {
    temp = (nz-2) / max_compute_units;
    setiv_lws[0] = temp == 0 ? 1 : temp;
    setiv_gws[0] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[0]);
  }

  k_l2norm = clCreateKernel(p_main, "l2norm", &ecode);
  clu_CheckError(ecode, "clCreateKernel()");
  ecode  = clSetKernelArg(k_l2norm, 1, sizeof(cl_mem), &m_sum);
  ecode |= clSetKernelArg(k_l2norm, 2, sizeof(double)*5*l2norm_lws[0], NULL);
  clu_CheckError(ecode, "clSetKernelArg()");

  k_rhs = clCreateKernel(p_main, "rhs", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rhs");
  ecode  = clSetKernelArg(k_rhs, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_rhs, 1, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_rhs, 2, sizeof(cl_mem), &m_frct);
  ecode |= clSetKernelArg(k_rhs, 3, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_rhs, 4, sizeof(cl_mem), &m_rho_i);
  ecode |= clSetKernelArg(k_rhs, 5, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_rhs, 6, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_rhs, 7, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (RHS_DIM == 3) {
    rhs_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0];
    temp = max_work_group_size / rhs_lws[0];
    rhs_lws[1] = ny < temp ? ny : temp;
    temp = temp / rhs_lws[1];
    rhs_lws[2] = nz < temp ? nz : temp;
    rhs_gws[0] = clu_RoundWorkSize((size_t)nx, rhs_lws[0]);
    rhs_gws[1] = clu_RoundWorkSize((size_t)ny, rhs_lws[1]);
    rhs_gws[2] = clu_RoundWorkSize((size_t)nz, rhs_lws[2]);
  } else if (RHS_DIM == 2) {
    rhs_lws[0] = ny < work_item_sizes[0] ? ny : work_item_sizes[0];
    temp = max_work_group_size / rhs_lws[0];
    rhs_lws[1] = nz < temp ? nz : temp;
    rhs_gws[0] = clu_RoundWorkSize((size_t)ny, rhs_lws[0]);
    rhs_gws[1] = clu_RoundWorkSize((size_t)nz, rhs_lws[1]);
  } else {
    //temp = nz / max_compute_units;
    temp = 1;
    rhs_lws[0] = temp == 0 ? 1 : temp;
    rhs_gws[0] = clu_RoundWorkSize((size_t)nz, rhs_lws[0]);
  }

  k_rhsx = clCreateKernel(p_main, "rhsx", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rhsx");
  ecode  = clSetKernelArg(k_rhsx, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_rhsx, 1, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_rhsx, 2, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_rhsx, 3, sizeof(cl_mem), &m_rho_i);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_rhsx, 4, sizeof(cl_mem), &m_flux);
    ecode |= clSetKernelArg(k_rhsx, 5, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsx, 6, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsx, 7, sizeof(int), &nz);
  } else {
    ecode |= clSetKernelArg(k_rhsx, 4, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsx, 5, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsx, 6, sizeof(int), &nz);
  }
  clu_CheckError(ecode, "clSetKernelArg()");
  if (RHSX_DIM == 2) {
    rhsx_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
    temp = max_work_group_size / rhsx_lws[0];
    rhsx_lws[1] = (nz-2) < temp ? (nz-2) : temp;
    rhsx_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), rhsx_lws[0]);
    rhsx_gws[1] = clu_RoundWorkSize((size_t)(nz-2), rhsx_lws[1]);
  } else {
    //temp = (nz-2) / max_compute_units;
    temp = 1;
    rhsx_lws[0] = temp == 0 ? 1 : temp;
    rhsx_gws[0] = clu_RoundWorkSize((size_t)(nz-2), rhsx_lws[0]);
  }

  k_rhsy = clCreateKernel(p_main, "rhsy", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rhsy");
  ecode  = clSetKernelArg(k_rhsy, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_rhsy, 1, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_rhsy, 2, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_rhsy, 3, sizeof(cl_mem), &m_rho_i);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_rhsy, 4, sizeof(cl_mem), &m_flux);
    ecode |= clSetKernelArg(k_rhsy, 5, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsy, 6, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsy, 7, sizeof(int), &nz);
  } else {
    ecode |= clSetKernelArg(k_rhsy, 4, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsy, 5, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsy, 6, sizeof(int), &nz);
  }
  clu_CheckError(ecode, "clSetKernelArg()");
  if (RHSY_DIM == 2) {
    rhsy_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0];
    temp = max_work_group_size / rhsy_lws[0];
    rhsy_lws[1] = (nz-2) < temp ? (nz-2) : temp;
    rhsy_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), rhsy_lws[0]);
    rhsy_gws[1] = clu_RoundWorkSize((size_t)(nz-2), rhsy_lws[1]);
  } else {
    //temp = (nz-2) / max_compute_units;
    temp = 1;
    rhsy_lws[0] = temp == 0 ? 1 : temp;
    rhsy_gws[0] = clu_RoundWorkSize((size_t)(nz-2), rhsy_lws[0]);
  }

  k_rhsz = clCreateKernel(p_main, "rhsz", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rhsz");
  ecode  = clSetKernelArg(k_rhsz, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_rhsz, 1, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_rhsz, 2, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_rhsz, 3, sizeof(cl_mem), &m_rho_i);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_rhsz, 4, sizeof(cl_mem), &m_flux);
    ecode |= clSetKernelArg(k_rhsz, 5, sizeof(cl_mem), &m_utmp);
    ecode |= clSetKernelArg(k_rhsz, 6, sizeof(cl_mem), &m_rtmp);
    ecode |= clSetKernelArg(k_rhsz, 7, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsz, 8, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsz, 9, sizeof(int), &nz);
  } else {
    ecode |= clSetKernelArg(k_rhsz, 4, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsz, 5, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsz, 6, sizeof(int), &nz);
  }
  clu_CheckError(ecode, "clSetKernelArg()");
  if (RHSZ_DIM == 2) {
    rhsz_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0];
    temp = max_work_group_size / rhsz_lws[0];
    rhsz_lws[1] = (jend-jst) < temp ? (jend-jst) : temp;
    rhsz_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), rhsz_lws[0]);
    rhsz_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), rhsz_lws[1]);
  } else {
    //temp = (jend-jst) / max_compute_units;
    temp = 1;
    rhsz_lws[0] = temp == 0 ? 1 : temp;
    rhsz_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), rhsz_lws[0]);
  }

  k_ssor2 = clCreateKernel(p_main, "ssor2", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for ssor2");
  ecode  = clSetKernelArg(k_ssor2, 0, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_ssor2, 2, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_ssor2, 3, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_ssor2, 4, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SSOR2_DIM == 3) {
    ssor2_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0];
    temp = max_work_group_size / ssor2_lws[0];
    ssor2_lws[1] = (jend-jst) < temp ? (jend-jst) : temp;
    temp = temp / ssor2_lws[1];
    ssor2_lws[2] = (nz-2) < temp ? (nz-2) : temp;
    ssor2_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), ssor2_lws[0]);
    ssor2_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), ssor2_lws[1]);
    ssor2_gws[2] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[2]);
  } else if (SSOR2_DIM == 2) {
    ssor2_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
    temp = max_work_group_size / ssor2_lws[0];
    ssor2_lws[1] = (nz-2) < temp ? (nz-2) : temp;
    ssor2_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), ssor2_lws[0]);
    ssor2_gws[1] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[1]);
  } else {
    //temp = (nz-2) / max_compute_units;
    temp = 1;
    ssor2_lws[0] = temp == 0 ? 1 : temp;
    ssor2_gws[0] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[0]);
  }

  k_ssor3 = clCreateKernel(p_main, "ssor3", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for ssor3");
  ecode  = clSetKernelArg(k_ssor3, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_ssor3, 1, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_ssor3, 3, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_ssor3, 4, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_ssor3, 5, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SSOR3_DIM == 3) {
    ssor3_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0];
    temp = max_work_group_size / ssor3_lws[0];
    ssor3_lws[1] = (jend-jst) < temp ? (jend-jst) : temp;
    temp = temp / ssor3_lws[1];
    ssor3_lws[2] = (nz-2) < temp ? (nz-2) : temp;
    ssor3_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), ssor3_lws[0]);
    ssor3_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), ssor3_lws[1]);
    ssor3_gws[2] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[2]);
  } else if (SSOR3_DIM == 2) {
    ssor3_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
    temp = max_work_group_size / ssor3_lws[0];
    ssor3_lws[1] = (nz-2) < temp ? (nz-2) : temp;
    ssor3_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), ssor3_lws[0]);
    ssor3_gws[1] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[1]);
  } else {
    //temp = (nz-2) / max_compute_units;
    temp = 1;
    ssor3_lws[0] = temp == 0 ? 1 : temp;
    ssor3_gws[0] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[0]);
  }

  k_blts = clCreateKernel(p_main, "blts", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for blts");
  ecode  = clSetKernelArg(k_blts, 0, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_blts, 1, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_blts, 2, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_blts, 3, sizeof(cl_mem), &m_rho_i);
  ecode |= clSetKernelArg(k_blts, 4, sizeof(int), &nz);
  ecode |= clSetKernelArg(k_blts, 5, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_blts, 6, sizeof(int), &nx);
  clu_CheckError(ecode, "clSetKernelArg()");
  blts_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
  temp = max_work_group_size / blts_lws[0];
  blts_lws[1] = (nz-2) < temp ? (nz-2) : temp;
  blts_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), blts_lws[0]);
  blts_gws[1] = clu_RoundWorkSize((size_t)(nz-2), blts_lws[1]);

  k_buts = clCreateKernel(p_main, "buts", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for buts");
  ecode  = clSetKernelArg(k_buts, 0, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_buts, 1, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_buts, 2, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_buts, 3, sizeof(cl_mem), &m_rho_i);
  ecode |= clSetKernelArg(k_buts, 4, sizeof(int), &nz);
  ecode |= clSetKernelArg(k_buts, 5, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_buts, 6, sizeof(int), &nx);
  clu_CheckError(ecode, "clSetKernelArg()");
  buts_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
  temp = max_work_group_size / buts_lws[0];
  buts_lws[1] = (nz-2) < temp ? (nz-2) : temp;
  buts_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), buts_lws[0]);
  buts_gws[1] = clu_RoundWorkSize((size_t)(nz-2), buts_lws[1]);

  if (timeron) timer_stop(TIMER_OPENCL);
}
Beispiel #18
0
//---------------------------------------------------------------------
// Fill in array u0 with initial conditions from 
// random number generator 
//---------------------------------------------------------------------
static void compute_initial_conditions(cl_mem *u0, int d1, int d2, int d3)
{
  int k;
  double start, an, dummy, starts[NZ];
  size_t local_ws, global_ws, temp;
  cl_mem m_starts;
  cl_int ecode;

  start = SEED;
  //---------------------------------------------------------------------
  // Jump to the starting element for our first plane.
  //---------------------------------------------------------------------
  an = ipow46(A, 0);
  dummy = randlc(&start, an);
  an = ipow46(A, 2*NX*NY);

  starts[0] = start;
  for (k = 1; k < dims[2]; k++) {
    dummy = randlc(&start, an);
    starts[k] = start;
  }

  if (device_type == CL_DEVICE_TYPE_CPU) {
    m_starts = clCreateBuffer(context,
                              CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                              sizeof(double) * NZ,
                              starts, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_starts");

    local_ws  = 1;
    global_ws = clu_RoundWorkSize((size_t)d2, local_ws);
  } else { //GPU
    m_starts = clCreateBuffer(context,
                              CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                              sizeof(double) * NZ,
                              starts,
                              &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_starts");

    temp = d2 / max_compute_units;
    local_ws  = temp == 0 ? 
                1 : ((temp > work_item_sizes[0]) ? work_item_sizes[0] : temp);
    global_ws = clu_RoundWorkSize((size_t)d2, local_ws);
  }

  ecode  = clSetKernelArg(k_compute_ics, 0, sizeof(cl_mem), u0);
  ecode |= clSetKernelArg(k_compute_ics, 1, sizeof(cl_mem), &m_starts);
  clu_CheckError(ecode, "clSetKernelArg() for compute_initial_conditions");

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_compute_ics,
                                 1, NULL,
                                 &global_ws,
                                 &local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");

  ecode = clFinish(cmd_queue);
  clu_CheckError(ecode, "clFinish()");

  DTIMER_START(T_RELEASE);
  clReleaseMemObject(m_starts);
  DTIMER_STOP(T_RELEASE);
}
Beispiel #19
0
//---------------------------------------------------------------------
// compute the right hand side based on exact solution
//---------------------------------------------------------------------
void exact_rhs()
{
  int c, i;
  int range_1, range_0;
  size_t d[3], local_ws[3], global_ws[3];
  cl_int ecode = 0;

  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      ecode  = clSetKernelArg(k_exact_rhs1[i], 0, sizeof(cl_mem), &m_forcing[i]);
      ecode |= clSetKernelArg(k_exact_rhs1[i], 1, sizeof(int), &c);
      ecode |= clSetKernelArg(k_exact_rhs1[i], 2, sizeof(int), &cell_size[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs1[i], 3, sizeof(int), &cell_size[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs1[i], 4, sizeof(int), &cell_size[i][c][0]);
      clu_CheckError(ecode, "clSetKernelArg() for exact_rhs1");

      d[0] = cell_size[i][c][1];
      d[1] = cell_size[i][c][2];
      compute_ws_dim_2(d, local_ws, global_ws);

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_exact_rhs1[i],
                                     2, NULL,
                                     global_ws,
                                     local_ws,
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for exact_rhs1");
    }

    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }

  }

  for (i = 0; i < num_devices; i++) {
    ecode  = clSetKernelArg(k_exact_rhs2[i], 0, sizeof(cl_mem), &m_forcing[i]);
    ecode |= clSetKernelArg(k_exact_rhs2[i], 1, sizeof(cl_mem), &m_ce[i]);
    clu_CheckError(ecode, "clSetKernelArg() for exact_rhs2");
  }

  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      range_1 = cell_size[i][c][2] - cell_end[i][c][2];
      range_0 = cell_size[i][c][1] - cell_end[i][c][1];

      ecode  = clSetKernelArg(k_exact_rhs2[i], 2, sizeof(int), &c);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 3, sizeof(int), &cell_start[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 4, sizeof(int), &range_1);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 5, sizeof(int), &cell_start[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 6, sizeof(int), &range_0);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 7, sizeof(int), &cell_size[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 8, sizeof(int), &cell_start[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 9, sizeof(int), &cell_end[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 10, sizeof(int), &cell_low[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 11, sizeof(int), &cell_low[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 12, sizeof(int), &cell_low[i][c][0]);
      clu_CheckError(ecode, "clSetKernelArg() for exact_rhs2");

      d[0] = cell_size[i][c][1] - cell_start[i][c][1] - cell_end[i][c][1];
      d[1] = cell_size[i][c][2] - cell_start[i][c][2] - cell_end[i][c][2];
      compute_ws_dim_2(d, local_ws, global_ws);

      if (c == 0) CHECK_FINISH(i * 2);

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_exact_rhs2[i],
                                     2, NULL,
                                     global_ws,
                                     local_ws,
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for exact_rhs2");
    }
    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }
  }

  for (i = 0; i < num_devices; i++) {
    ecode  = clSetKernelArg(k_exact_rhs3[i], 0, sizeof(cl_mem), &m_forcing[i]);
    ecode |= clSetKernelArg(k_exact_rhs3[i], 1, sizeof(cl_mem), &m_ce[i]);
    clu_CheckError(ecode, "clSetKernelArg() for exact_rhs3");
  }

  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      range_1 = cell_size[i][c][2] - cell_end[i][c][2];
      range_0 = cell_size[i][c][0] - cell_end[i][c][0];

      ecode  = clSetKernelArg(k_exact_rhs3[i], 2, sizeof(int), &c);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 3, sizeof(int), &cell_start[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 4, sizeof(int), &range_1);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 5, sizeof(int), &cell_start[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 6, sizeof(int), &range_0);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 7, sizeof(int), &cell_size[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 8, sizeof(int), &cell_start[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 9, sizeof(int), &cell_end[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 10, sizeof(int), &cell_low[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 11, sizeof(int), &cell_low[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 12, sizeof(int), &cell_low[i][c][0]);
      clu_CheckError(ecode, "clSetKernelArg() for exact_rhs3");

      d[0] = cell_size[i][c][0] - cell_start[i][c][0] - cell_end[i][c][0];
      d[1] = cell_size[i][c][2] - cell_start[i][c][2] - cell_end[i][c][2];
      compute_ws_dim_2(d, local_ws, global_ws);

      if (c == 0) CHECK_FINISH(i * 2);

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_exact_rhs3[i],
                                     2, NULL,
                                     global_ws,
                                     local_ws,
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for exact_rhs3");
    }
    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }
  }

  for (i = 0; i < num_devices; i++) {
    ecode  = clSetKernelArg(k_exact_rhs4[i], 0, sizeof(cl_mem), &m_forcing[i]);
    ecode |= clSetKernelArg(k_exact_rhs4[i], 1, sizeof(cl_mem), &m_ce[i]);
    clu_CheckError(ecode, "clSetKernelArg() for exact_rhs4");
  }

  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      range_1 = cell_size[i][c][1] - cell_end[i][c][1];
      range_0 = cell_size[i][c][0] - cell_end[i][c][0];

      ecode  = clSetKernelArg(k_exact_rhs4[i], 2, sizeof(int), &c);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 3, sizeof(int), &cell_start[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 4, sizeof(int), &range_1);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 5, sizeof(int), &cell_start[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 6, sizeof(int), &range_0);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 7, sizeof(int), &cell_size[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 8, sizeof(int), &cell_start[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 9, sizeof(int), &cell_end[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 10, sizeof(int), &cell_low[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 11, sizeof(int), &cell_low[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 12, sizeof(int), &cell_low[i][c][0]);
      clu_CheckError(ecode, "clSetKernelArg() for exact_rhs4");

      d[0] = cell_size[i][c][0] - cell_start[i][c][0] - cell_end[i][c][0];
      d[1] = cell_size[i][c][1] - cell_start[i][c][1] - cell_end[i][c][1];
      compute_ws_dim_2(d, local_ws, global_ws);

      if (c == 0) CHECK_FINISH(i * 2);

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_exact_rhs4[i],
                                     2, NULL,
                                     global_ws,
                                     local_ws,
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for exact_rhs4");
    }
    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }
  }

  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      range_1 = cell_size[i][c][2] - cell_end[i][c][2];
      range_0 = cell_size[i][c][1] - cell_end[i][c][1];

      ecode  = clSetKernelArg(k_exact_rhs5[i], 0, sizeof(cl_mem), &m_forcing[i]);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 1, sizeof(int), &c);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 2, sizeof(int), &cell_start[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 3, sizeof(int), &range_1);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 4, sizeof(int), &cell_start[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 5, sizeof(int), &range_0);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 6, sizeof(int), &cell_size[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 7, sizeof(int), &cell_start[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 8, sizeof(int), &cell_end[i][c][0]);
      clu_CheckError(ecode, "clSetKernelArg() for exact_rhs5");

      d[0] = cell_size[i][c][1] - cell_start[i][c][1] - cell_end[i][c][1];
      d[1] = cell_size[i][c][2] - cell_start[i][c][2] - cell_end[i][c][2];
      compute_ws_dim_2(d, local_ws, global_ws);

      if (c == 0) CHECK_FINISH(i * 2);

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_exact_rhs5[i],
                                     2, NULL,
                                     global_ws,
                                     local_ws,
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for exact_rhs5");
    }
    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }
  }

  for (i = 0; i < num_devices; i++)
    CHECK_FINISH(i * 2);
}
Beispiel #20
0
void compute_rhs()
{
  int i;
  size_t d0_size, d1_size, d2_size;
  cl_int ecode;

  if (timeron) timer_start(t_rhs);

  //-------------------------------------------------------------------------
  // compute the reciprocal of density, and the kinetic energy, 
  // and the speed of sound. 
  //-------------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    size_t compute_rhs1_lws[3], compute_rhs1_gws[3], temp;

    if (COMPUTE_RHS1_DIM == 3) {
      d0_size = max_cell_size[i][1] + 2;
      d1_size = max_cell_size[i][2] + 2;
      d2_size = ncells;

      compute_rhs1_lws[0] = d0_size < work_item_sizes[0] ?
                            d0_size : work_item_sizes[0];
      temp = max_work_group_size / compute_rhs1_lws[0];
      compute_rhs1_lws[1] = d1_size < temp ? d1_size : temp;
      temp = temp / compute_rhs1_lws[1];
      compute_rhs1_lws[2] = d2_size < temp ? d2_size : temp;
      compute_rhs1_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs1_lws[0]);
      compute_rhs1_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs1_lws[1]);
      compute_rhs1_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs1_lws[2]);
    } else {
      d0_size = max_cell_size[i][2] + 2;
      d1_size = ncells;

      compute_rhs1_lws[0] = d0_size < work_item_sizes[0] ?
                            d0_size : work_item_sizes[0];
      temp = max_work_group_size / compute_rhs1_lws[0];
      compute_rhs1_lws[1] = d1_size < temp ? d1_size : temp;
      compute_rhs1_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs1_lws[0]);
      compute_rhs1_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs1_lws[1]);
    }

    ecode  = clSetKernelArg(k_compute_rhs1[i], 0, sizeof(cl_mem), &m_u[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 1, sizeof(cl_mem), &m_us[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 2, sizeof(cl_mem), &m_vs[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 3, sizeof(cl_mem), &m_ws[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 4, sizeof(cl_mem), &m_qs[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 5, sizeof(cl_mem),
                                                  &m_rho_i[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 6, sizeof(cl_mem),
                                                  &m_square[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 7, sizeof(cl_mem),
                                                  &m_cell_size[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 8, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_compute_rhs1[i],
                                   COMPUTE_RHS1_DIM, NULL,
                                   compute_rhs1_gws,
                                   compute_rhs1_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-------------------------------------------------------------------------

  //-------------------------------------------------------------------------
  // copy the exact forcing term to the right hand side;  because 
  // this forcing term is known, we can store it on the whole of every 
  // cell,  including the boundary                   
  //-------------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    size_t compute_rhs2_lws[3], compute_rhs2_gws[3], temp;

    if (COMPUTE_RHS2_DIM == 3) {
      d0_size = max_cell_size[i][1];
      d1_size = max_cell_size[i][2];
      d2_size = ncells;

      compute_rhs2_lws[0] = d0_size < work_item_sizes[0] ?
                            d0_size : work_item_sizes[0];
      temp = max_work_group_size / compute_rhs2_lws[0];
      compute_rhs2_lws[1] = d1_size < temp ? d1_size : temp;
      temp = temp / compute_rhs2_lws[1];
      compute_rhs2_lws[2] = d2_size < temp ? d2_size : temp;
      compute_rhs2_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs2_lws[0]);
      compute_rhs2_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs2_lws[1]);
      compute_rhs2_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs2_lws[2]);
    } else {
      d0_size = max_cell_size[i][2];
      d1_size = ncells;

      compute_rhs2_lws[0] = d0_size < work_item_sizes[0] ?
                            d0_size : work_item_sizes[0];
      temp = max_work_group_size / compute_rhs2_lws[0];
      compute_rhs2_lws[1] = d1_size < temp ? d1_size : temp;
      compute_rhs2_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs2_lws[0]);
      compute_rhs2_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs2_lws[1]);
    }

    ecode  = clSetKernelArg(k_compute_rhs2[i], 0, sizeof(cl_mem),
                                                  &m_forcing[i]);
    ecode |= clSetKernelArg(k_compute_rhs2[i], 1, sizeof(cl_mem), &m_rhs[i]);
    ecode |= clSetKernelArg(k_compute_rhs2[i], 2, sizeof(cl_mem),
                                                  &m_cell_size[i]);
    ecode |= clSetKernelArg(k_compute_rhs2[i], 3, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_compute_rhs2[i],
                                   COMPUTE_RHS2_DIM, NULL,
                                   compute_rhs2_gws,
                                   compute_rhs2_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-------------------------------------------------------------------------

  //-------------------------------------------------------------------------
  // compute xi-direction fluxes 
  //-------------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    size_t compute_rhs3_lws[3], compute_rhs3_gws[3], temp;

    d0_size = max_cell_size[i][1];
    d1_size = max_cell_size[i][2];
    d2_size = ncells;

    compute_rhs3_lws[0] = d0_size < work_item_sizes[0] ?
                          d0_size : work_item_sizes[0];
    temp = max_work_group_size / compute_rhs3_lws[0];
    compute_rhs3_lws[1] = d1_size < temp ? d1_size : temp;
    temp = temp / compute_rhs3_lws[1];
    compute_rhs3_lws[2] = d2_size < temp ? d2_size : temp;
    compute_rhs3_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs3_lws[0]);
    compute_rhs3_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs3_lws[1]);
    compute_rhs3_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs3_lws[2]);

    ecode  = clSetKernelArg(k_compute_rhs3[i], 0, sizeof(cl_mem), &m_u[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 1, sizeof(cl_mem), &m_us[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 2, sizeof(cl_mem), &m_vs[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 3, sizeof(cl_mem), &m_ws[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 4, sizeof(cl_mem), &m_qs[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 5, sizeof(cl_mem),
                                                  &m_rho_i[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 6, sizeof(cl_mem),
                                                  &m_square[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 7, sizeof(cl_mem), &m_rhs[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 8, sizeof(cl_mem),
                                                  &m_cell_size[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 9, sizeof(cl_mem),&m_start[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 10, sizeof(cl_mem), &m_end[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 11, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_compute_rhs3[i],
                                   3, NULL,
                                   compute_rhs3_gws,
                                   compute_rhs3_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-------------------------------------------------------------------------

  //-------------------------------------------------------------------------
  // compute eta-direction fluxes 
  //-------------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    size_t compute_rhs4_lws[3], compute_rhs4_gws[3], temp;

    d0_size = max_cell_size[i][0];
    d1_size = max_cell_size[i][2];
    d2_size = ncells;

    compute_rhs4_lws[0] = d0_size < work_item_sizes[0] ?
                          d0_size : work_item_sizes[0];
    temp = max_work_group_size / compute_rhs4_lws[0];
    compute_rhs4_lws[1] = d1_size < temp ? d1_size : temp;
    temp = temp / compute_rhs4_lws[1];
    compute_rhs4_lws[2] = d2_size < temp ? d2_size : temp;
    compute_rhs4_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs4_lws[0]);
    compute_rhs4_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs4_lws[1]);
    compute_rhs4_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs4_lws[2]);

    ecode  = clSetKernelArg(k_compute_rhs4[i], 0, sizeof(cl_mem), &m_u[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 1, sizeof(cl_mem), &m_us[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 2, sizeof(cl_mem), &m_vs[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 3, sizeof(cl_mem), &m_ws[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 4, sizeof(cl_mem), &m_qs[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 5, sizeof(cl_mem),
                                                  &m_rho_i[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 6, sizeof(cl_mem),
                                                  &m_square[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 7, sizeof(cl_mem), &m_rhs[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 8, sizeof(cl_mem),
                                                  &m_cell_size[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 9, sizeof(cl_mem),&m_start[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 10, sizeof(cl_mem), &m_end[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 11, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_compute_rhs4[i],
                                   3, NULL,
                                   compute_rhs4_gws,
                                   compute_rhs4_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-------------------------------------------------------------------------

  //-------------------------------------------------------------------------
  // compute zeta-direction fluxes 
  //-------------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    size_t compute_rhs5_lws[3], compute_rhs5_gws[3], temp;

    d0_size = max_cell_size[i][0];
    d1_size = max_cell_size[i][1];
    d2_size = ncells;

    compute_rhs5_lws[0] = d0_size < work_item_sizes[0] ?
                          d0_size : work_item_sizes[0];
    temp = max_work_group_size / compute_rhs5_lws[0];
    compute_rhs5_lws[1] = d1_size < temp ? d1_size : temp;
    temp = temp / compute_rhs5_lws[1];
    compute_rhs5_lws[2] = d2_size < temp ? d2_size : temp;
    compute_rhs5_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs5_lws[0]);
    compute_rhs5_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs5_lws[1]);
    compute_rhs5_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs5_lws[2]);

    ecode  = clSetKernelArg(k_compute_rhs5[i], 0, sizeof(cl_mem), &m_u[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 1, sizeof(cl_mem), &m_us[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 2, sizeof(cl_mem), &m_vs[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 3, sizeof(cl_mem), &m_ws[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 4, sizeof(cl_mem), &m_qs[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 5, sizeof(cl_mem),
                                                  &m_rho_i[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 6, sizeof(cl_mem),
                                                  &m_square[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 7, sizeof(cl_mem), &m_rhs[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 8, sizeof(cl_mem),
                                                  &m_cell_size[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 9, sizeof(cl_mem),&m_start[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 10, sizeof(cl_mem), &m_end[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 11, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_compute_rhs5[i],
                                   3, NULL,
                                   compute_rhs5_gws,
                                   compute_rhs5_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-------------------------------------------------------------------------

  //-------------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    size_t compute_rhs6_lws[3], compute_rhs6_gws[3], temp;

    if (COMPUTE_RHS6_DIM == 3) {
      d0_size = max_cell_size[i][1];
      d1_size = max_cell_size[i][2];
      d2_size = ncells;

      compute_rhs6_lws[0] = d0_size < work_item_sizes[0] ?
                            d0_size : work_item_sizes[0];
      temp = max_work_group_size / compute_rhs6_lws[0];
      compute_rhs6_lws[1] = d1_size < temp ? d1_size : temp;
      temp = temp / compute_rhs6_lws[1];
      compute_rhs6_lws[2] = d2_size < temp ? d2_size : temp;
      compute_rhs6_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs6_lws[0]);
      compute_rhs6_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs6_lws[1]);
      compute_rhs6_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs6_lws[2]);
    } else {
      d0_size = max_cell_size[i][2];
      d1_size = ncells;

      compute_rhs6_lws[0] = d0_size < work_item_sizes[0] ?
                            d0_size : work_item_sizes[0];
      temp = max_work_group_size / compute_rhs6_lws[0];
      compute_rhs6_lws[1] = d1_size < temp ? d1_size : temp;
      compute_rhs6_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs6_lws[0]);
      compute_rhs6_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs6_lws[1]);
    }

    ecode  = clSetKernelArg(k_compute_rhs6[i], 0, sizeof(cl_mem), &m_rhs[i]);
    ecode |= clSetKernelArg(k_compute_rhs6[i], 1, sizeof(cl_mem),
                                                  &m_cell_size[i]);
    ecode |= clSetKernelArg(k_compute_rhs6[i], 2, sizeof(cl_mem),&m_start[i]);
    ecode |= clSetKernelArg(k_compute_rhs6[i], 3, sizeof(cl_mem), &m_end[i]);
    ecode |= clSetKernelArg(k_compute_rhs6[i], 4, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_compute_rhs6[i],
                                   COMPUTE_RHS6_DIM, NULL,
                                   compute_rhs6_gws,
                                   compute_rhs6_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-------------------------------------------------------------------------

  CHECK_FINISH();
  if (timeron) timer_stop(t_rhs);
}
Beispiel #21
0
//---------------------------------------------------------------------
// this function copies the face values of a variable defined on a set 
// of cells to the overlap locations of the adjacent sets of cells. 
// Because a set of cells interfaces in each direction with exactly one 
// other set, we only need to fill six different buffers. We could try to 
// overlap communication with computation, by computing
// some internal values while communicating boundary values, but this
// adds so much overhead that it's not clearly useful. 
//---------------------------------------------------------------------
void copy_faces()
{
  int c, i;
  cl_int ecode = 0;

  //---------------------------------------------------------------------
  // exit immediately if there are no faces to be copied           
  //---------------------------------------------------------------------
  if (num_devices == 1) {
    compute_rhs();
    return;
  }

  //---------------------------------------------------------------------
  // because the difference stencil for the diagonalized scheme is 
  // orthogonal, we do not have to perform the staged copying of faces, 
  // but can send all face information simultaneously to the neighboring 
  // cells in all directions          
  //---------------------------------------------------------------------
  if (timeron) timer_start(t_bpack);

  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_copy_faces1[i][c],
                                     COPY_FACES1_DIM, NULL,
                                     copy_faces1_gw[i][c],
                                     copy_faces1_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces1");
    }

    for (i = 0; i < num_devices; i++) {
      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_copy_faces2[i][c],
                                     COPY_FACES2_DIM, NULL,
                                     copy_faces2_gw[i][c],
                                     copy_faces2_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces2");

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_copy_faces3[i][c],
                                     COPY_FACES3_DIM, NULL,
                                     copy_faces3_gw[i][c],
                                     copy_faces3_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces3");
    }

    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }

  }

  if (timeron) timer_stop(t_bpack);

  if (timeron) timer_start(t_exch);
  for (i = 0; i < num_devices; i++) {
    CHECK_FINISH(i * 2);

    ecode = clEnqueueCopyBuffer(cmd_queue[successor[i][0] * 2 + 1],
                                m_out_buffer[i],
                                m_in_buffer[successor[i][0]],
                                start_send_east[i]*sizeof(double),
                                start_recv_west[successor[i][0]]*sizeof(double),
                                east_size[i]*sizeof(double),
                                0, NULL, NULL);
        CHECK_FINISH(successor[i][0] * 2 + 1);
  }

  for (i = 0; i < num_devices; i++) {
    ecode = clEnqueueCopyBuffer(cmd_queue[predecessor[i][0] * 2 + 1],
                                m_out_buffer[i],
                                m_in_buffer[predecessor[i][0]],
                                start_send_west[i]*sizeof(double),
                                start_recv_east[predecessor[i][0]]*sizeof(double),
                                west_size[i]*sizeof(double),
                                0, NULL, NULL);

        CHECK_FINISH(predecessor[i][0] * 2 + 1);
    ecode = clEnqueueCopyBuffer(cmd_queue[successor[i][1] * 2 + 1],
                                m_out_buffer[i],
                                m_in_buffer[successor[i][1]],
                                start_send_north[i]*sizeof(double),
                                start_recv_south[successor[i][1]]*sizeof(double),
                                north_size[i]*sizeof(double),
                                0, NULL, NULL);

        CHECK_FINISH(successor[i][1] * 2 + 1);
    ecode = clEnqueueCopyBuffer(cmd_queue[predecessor[i][1] * 2 + 1],
                                m_out_buffer[i],
                                m_in_buffer[predecessor[i][1]],
                                start_send_south[i]*sizeof(double),
                                start_recv_north[predecessor[i][1]]*sizeof(double),
                                south_size[i]*sizeof(double),
                                0, NULL, NULL);

        CHECK_FINISH(predecessor[i][1] * 2 + 1);
    ecode = clEnqueueCopyBuffer(cmd_queue[successor[i][2] * 2 + 1],
                                m_out_buffer[i],
                                m_in_buffer[successor[i][2]],
                                start_send_top[i]*sizeof(double),
                                start_recv_bottom[successor[i][2]]*sizeof(double),
                                top_size[i]*sizeof(double),
                                0, NULL, NULL);

        CHECK_FINISH(successor[i][2] * 2 + 1);
    ecode = clEnqueueCopyBuffer(cmd_queue[predecessor[i][2] * 2 + 1],
                                m_out_buffer[i],
                                m_in_buffer[predecessor[i][2]],
                                start_send_bottom[i]*sizeof(double),
                                start_recv_top[predecessor[i][2]]*sizeof(double),
                                bottom_size[i]*sizeof(double),
                                0, NULL, NULL);
        CHECK_FINISH(predecessor[i][2] * 2 + 1);
  }
  if (timeron) timer_stop(t_exch);

  //---------------------------------------------------------------------
  // unpack the data that has just been received;             
  //---------------------------------------------------------------------
  if (timeron) timer_start(t_bpack);

  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      if (c == 0) CHECK_FINISH(i * 2 + 1);

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_copy_faces4[i][c],
                                     COPY_FACES4_DIM, NULL,
                                     copy_faces4_gw[i][c],
                                     copy_faces4_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces4");
    }

    for (i = 0; i < num_devices; i++) {
      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_copy_faces5[i][c],
                                     COPY_FACES5_DIM, NULL,
                                     copy_faces5_gw[i][c],
                                     copy_faces5_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces5");

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_copy_faces6[i][c],
                                     COPY_FACES6_DIM, NULL,
                                     copy_faces6_gw[i][c],
                                     copy_faces6_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces6");
    }

    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }
  }

  if (timeron) timer_stop(t_bpack);

  for (i = 0; i < num_devices; i++)
    CHECK_FINISH(i * 2);

  //---------------------------------------------------------------------
  // now that we have all the data, compute the rhs
  //---------------------------------------------------------------------
  compute_rhs();
}
Beispiel #22
0
void opencl_info() {
  cl_int           err_code;

  cl_platform_id  *platforms;
  cl_device_type   device_type;
  cl_uint          num_devices;
  cl_device_id    *devices;

  // Get OpenCL platforms
  // - Get the number of available platforms
  cl_uint num_platforms;
  err_code = clGetPlatformIDs(0, NULL, &num_platforms);
  clu_CheckError(err_code, "clGetPlatformIDs() for num_platforms");
  if (num_platforms == 0) {
    fprintf(stderr, "No OpenCL platform!\n");
    exit(EXIT_FAILURE);
  }
  // - Get platform IDs
  platforms = (cl_platform_id *)malloc(num_platforms*sizeof(cl_platform_id));
  err_code = clGetPlatformIDs(num_platforms, platforms, NULL);
  clu_CheckError(err_code, "clGetPlatformIDs()");

  // Get platform informations
  printf("\nNumber of platforms: %u\n\n", num_platforms);
  char tmp_buf[1024];
  for (cl_uint i = 0; i < num_platforms; i++) {
    printf("platform: %u\n", i);

    err_code = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 1024,
                                 &tmp_buf, NULL);
    clu_CheckError(err_code, "clGetPlatformInfo() for CL_PLATFORM_NAME");
    printf("- CL_PLATFORM_NAME      : %s\n", tmp_buf);

    err_code = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 1024,
                                 &tmp_buf, NULL);
    clu_CheckError(err_code, "clGetPlatformInfo() for CL_PLATFORM_VENDOR");
    printf("- CL_PLATFORM_VENDOR    : %s\n", tmp_buf);

    err_code = clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 1024,
                                 &tmp_buf, NULL);
    clu_CheckError(err_code, "clGetPlatformInfo() for CL_PLATFORM_PROFILE");
    printf("- CL_PLATFORM_PROFILE   : %s\n", tmp_buf);

    err_code = clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 1024,
                                 &tmp_buf, NULL);
    clu_CheckError(err_code, "clGetPlatformInfo() for CL_PLATFORM_VERSION");
    printf("- CL_PLATFORM_VERSION   : %s\n", tmp_buf);

    err_code = clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 1024,
                                 &tmp_buf, NULL);
    clu_CheckError(err_code,"clGetPlatformInfo() for CL_PLATFORM_EXTENSIONS");
    printf("- CL_PLATFORM_EXTENSIONS: %s\n", tmp_buf);
    printf("\n");


    // Get the number of devices
    err_code = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL,
                              &num_devices);
    clu_CheckError(err_code, "clGetDeviceIDs for num_devices");
    if (num_devices == 0) {
      fprintf(stderr, "No OpenCL device in this platform!\n");
      exit(EXIT_FAILURE);
    }
    printf("Number of devices: %u\n", num_devices);

    // Get the default device
    cl_device_id default_device;
    cl_uint num_defaults;
    err_code = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_DEFAULT, 
                              1, &default_device, &num_defaults);
    clu_CheckError(err_code, "clGetDeviceIDs() for CL_DEVICE_TYPE_DEFAULT");
    if (num_defaults != 1) {
      printf("- # of default devices: %u\n", num_defaults);
    }

    // Get device IDs
    devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
    err_code = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices,
                              devices, NULL);
    clu_CheckError(err_code, "clGetDeviceIDs()");
    for (cl_uint k = 0; k < num_devices; k++) {
      printf("device: %u (", k);
      err_code = clGetDeviceInfo(devices[k], CL_DEVICE_TYPE, 
                                 sizeof(cl_device_type), &device_type, NULL);
      if (device_type & CL_DEVICE_TYPE_CPU)
        printf("CL_DEVICE_TYPE_CPU");
      if (device_type & CL_DEVICE_TYPE_GPU)
        printf("CL_DEVICE_TYPE_GPU");
      if (device_type & CL_DEVICE_TYPE_ACCELERATOR)
        printf("CL_DEVICE_TYPE_ACCELERATOR");
      if (device_type & CL_DEVICE_TYPE_DEFAULT)
        printf("CL_DEVICE_TYPE_DEFAULT");
      printf(")");
      if (default_device == devices[k]) printf(" default");
      printf("\n");

      err_code = clGetDeviceInfo(devices[k], CL_DEVICE_NAME,
                                 1024, tmp_buf, NULL);
      printf(" - CL_DEVICE_NAME                         : %s\n", tmp_buf);

      err_code = clGetDeviceInfo(devices[k], CL_DEVICE_VENDOR,
                                 1024, tmp_buf, NULL);
      printf(" - CL_DEVICE_VENDOR                       : %s\n", tmp_buf);

      err_code = clGetDeviceInfo(devices[k], CL_DRIVER_VERSION,
                                 1024, tmp_buf, NULL);
      printf(" - CL_DRIVER_VERSION                      : %s\n", tmp_buf);

      err_code = clGetDeviceInfo(devices[k], CL_DEVICE_PROFILE,
                                 1024, tmp_buf, NULL);
      printf(" - CL_DEVICE_PROFILE                      : %s\n", tmp_buf);

      err_code = clGetDeviceInfo(devices[k], CL_DEVICE_VERSION,
                                 1024, tmp_buf, NULL);
      printf(" - CL_DEVICE_VERSION                      : %s\n", tmp_buf);

      err_code = clGetDeviceInfo(devices[k], CL_DEVICE_EXTENSIONS,
                                 1024, tmp_buf, NULL);

	  //CL_DEVICE_MAX_COMPUTE_UNITS
	  //CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
	  //CL_DEVICE_MAX_WORK_GROUP_SIZE
	  //CL_DEVICE_MAX_WORK_ITEM_SIZES
	  //
	  cl_uint usize;
      err_code = clGetDeviceInfo(devices[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(usize), &usize, NULL);
      printf(" - CL_DEVICE_MAX_COMPUTE_UNITS                      : %d\n", usize);

      err_code = clGetDeviceInfo(devices[k], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,  sizeof(usize), &usize, NULL);
      printf(" - CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS                      : %d\n", usize);

	  size_t size;
      err_code = clGetDeviceInfo(devices[k], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size), &size, NULL);
      printf(" - CL_DEVICE_MAX_WORK_GROUP_SIZE                      : %d\n",size);

      err_code = clGetDeviceInfo(devices[k], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size), &size, NULL);
      printf(" - CL_DEVICE_MAX_WORK_ITEM_SIZES                      : %d\n", size);
      printf("\n");
    }
    free(devices);

    printf("\n");
  }

  free(platforms);
}
Beispiel #23
0
int main( int argc, char **argv )
{

  int             i, iteration;

  double          timecounter;

  FILE            *fp;

  cl_int ecode;

  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
    exit(-1);
  }

  /*  Initialize timers  */
  timer_on = 0;            
  if ((fp = fopen("timer.flag", "r")) != NULL) {
    fclose(fp);
    timer_on = 1;
  }
  timer_clear( 0 );
  if (timer_on) {
    timer_clear( 1 );
    timer_clear( 2 );
    timer_clear( 3 );
  }

  if (timer_on) timer_start( 3 );

  /*  Initialize the verification arrays if a valid class */
  for( i=0; i<TEST_ARRAY_SIZE; i++ )
    switch( CLASS )
    {
      case 'S':
        test_index_array[i] = S_test_index_array[i];
        test_rank_array[i]  = S_test_rank_array[i];
        break;
      case 'A':
        test_index_array[i] = A_test_index_array[i];
        test_rank_array[i]  = A_test_rank_array[i];
        break;
      case 'W':
        test_index_array[i] = W_test_index_array[i];
        test_rank_array[i]  = W_test_rank_array[i];
        break;
      case 'B':
        test_index_array[i] = B_test_index_array[i];
        test_rank_array[i]  = B_test_rank_array[i];
        break;
      case 'C':
        test_index_array[i] = C_test_index_array[i];
        test_rank_array[i]  = C_test_rank_array[i];
        break;
      case 'D':
        test_index_array[i] = D_test_index_array[i];
        test_rank_array[i]  = D_test_rank_array[i];
        break;
    };

  /* set up the OpenCL environment. */
  setup_opencl(argc, argv);

  /*  Printout initial NPB info */
  printf( "\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - IS Benchmark\n\n" );
  printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS, CLASS );
  printf( " Iterations:   %d\n", MAX_ITERATIONS );

  if (timer_on) timer_start( 1 );

  /*  Generate random number sequence and subsequent keys on all procs */
  create_seq( 314159265.00,                    /* Random number gen seed */
              1220703125.00 );                 /* Random number gen mult */
  if (timer_on) timer_stop( 1 );

  /*  Do one interation for free (i.e., untimed) to guarantee initialization of  
      all data and code pages and respective tables */
  rank( 1 );  

  /*  Start verification counter */
  passed_verification = 0;

  DTIMER_START(T_BUFFER_WRITE);
  ecode = clEnqueueWriteBuffer(cmd_queue,
                               m_passed_verification,
                               CL_TRUE,
                               0,
                               sizeof(cl_int),
                               &passed_verification,
                               0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueWriteBuffer() for m_passed_verification");
  DTIMER_STOP(T_BUFFER_WRITE);

  if( CLASS != 'S' ) printf( "\n   iteration\n" );

  /*  Start timer  */             
  timer_start( 0 );


  /*  This is the main iteration */
  for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
  {
    if( CLASS != 'S' ) printf( "        %d\n", iteration );
    rank( iteration );
  }

  DTIMER_START(T_BUFFER_READ);
  ecode = clEnqueueReadBuffer(cmd_queue,
                              m_passed_verification,
                              CL_TRUE,
                              0,
                              sizeof(cl_int),
                              &passed_verification,
                              0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueReadBuffer() for m_passed_verification");
  DTIMER_STOP(T_BUFFER_READ);

  /*  End of timing, obtain maximum time of all processors */
  timer_stop( 0 );
  timecounter = timer_read( 0 );


  /*  This tests that keys are in sequence: sorting of last ranked key seq
      occurs here, but is an untimed operation                             */
  if (timer_on) timer_start( 2 );
  full_verify();
  if (timer_on) timer_stop( 2 );

  if (timer_on) timer_stop( 3 );


  /*  The final printout  */
  if( passed_verification != 5*MAX_ITERATIONS + 1 )
    passed_verification = 0;
  c_print_results( "IS",
                   CLASS,
                   (int)(TOTAL_KEYS/64),
                   64,
                   0,
                   MAX_ITERATIONS,
                   timecounter,
                   ((double) (MAX_ITERATIONS*TOTAL_KEYS))
                              /timecounter/1000000.,
                   "keys ranked", 
                   passed_verification,
                   NPBVERSION,
                   COMPILETIME,
                   CC,
                   CLINK,
                   C_LIB,
                   C_INC,
                   CFLAGS,
                   CLINKFLAGS,
                   "",
                   clu_GetDeviceTypeName(device_type),
                   device_name);

  /*  Print additional timers  */
  if (timer_on) {
    double t_total, t_percent;

    t_total = timer_read( 3 );
    printf("\nAdditional timers -\n");
    printf(" Total execution: %8.3f\n", t_total);
    if (t_total == 0.0) t_total = 1.0;
    timecounter = timer_read(1);
    t_percent = timecounter/t_total * 100.;
    printf(" Initialization : %8.3f (%5.2f%%)\n", timecounter, t_percent);
    timecounter = timer_read(0);
    t_percent = timecounter/t_total * 100.;
    printf(" Benchmarking   : %8.3f (%5.2f%%)\n", timecounter, t_percent);
    timecounter = timer_read(2);
    t_percent = timecounter/t_total * 100.;
    printf(" Sorting        : %8.3f (%5.2f%%)\n", timecounter, t_percent);
  }

  release_opencl();
  
  fflush(stdout);

  return 0;
  /**************************/
} /*  E N D  P R O G R A M  */
Beispiel #24
0
void full_verify( void )
{
  cl_kernel k_fv1, k_fv2;
  cl_mem    m_j;
  INT_TYPE *g_j;
  INT_TYPE j = 0, i;
  size_t j_size;
  size_t fv1_lws[1], fv1_gws[1];
  size_t fv2_lws[1], fv2_gws[1];
  cl_int ecode;

  DTIMER_START(T_BUFFER_CREATE);
  // Create buffers
  j_size = sizeof(INT_TYPE) * (FV2_GLOBAL_SIZE / FV2_GROUP_SIZE);
  m_j = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       j_size,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer for m_j");
  DTIMER_STOP(T_BUFFER_CREATE);

  DTIMER_START(T_OPENCL_API);
  // Create kernels
  k_fv1 = clCreateKernel(program, "full_verify1", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for full_verify1");

  k_fv2 = clCreateKernel(program, "full_verify2", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for full_verify2");
  DTIMER_STOP(T_OPENCL_API);

  if (device_type == CL_DEVICE_TYPE_GPU) {
    cl_kernel k_fv0;
    size_t fv0_lws[1], fv0_gws[1];

    DTIMER_START(T_OPENCL_API);
    // Create kernels
    k_fv0 = clCreateKernel(program, "full_verify0", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for full_verify0");
    DTIMER_STOP(T_OPENCL_API);

    // Kernel execution
    DTIMER_START(T_KERNEL_FV0);
    ecode  = clSetKernelArg(k_fv0, 0, sizeof(cl_mem), (void*)&m_key_array);
    ecode |= clSetKernelArg(k_fv0, 1, sizeof(cl_mem), (void*)&m_key_buff2);
    clu_CheckError(ecode, "clSetKernelArg() for full_verify0");

    fv0_lws[0] = work_item_sizes[0];
    fv0_gws[0] = NUM_KEYS;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_fv0,
                                   1, NULL,
                                   fv0_gws, 
                                   fv0_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify0");
    CHECK_FINISH();
    DTIMER_STOP(T_KERNEL_FV0);

    DTIMER_START(T_KERNEL_FV1);
    ecode  = clSetKernelArg(k_fv1, 0, sizeof(cl_mem), (void*)&m_key_buff2);
    ecode |= clSetKernelArg(k_fv1, 1, sizeof(cl_mem), (void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_fv1, 2, sizeof(cl_mem), (void*)&m_key_array);
    clu_CheckError(ecode, "clSetKernelArg() for full_verify1");

    fv1_lws[0] = work_item_sizes[0];
    fv1_gws[0] = NUM_KEYS;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_fv1,
                                   1, NULL,
                                   fv1_gws, 
                                   fv1_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify1");
    CHECK_FINISH();
    DTIMER_STOP(T_KERNEL_FV1);

    DTIMER_START(T_KERNEL_FV2);
    ecode  = clSetKernelArg(k_fv2, 0, sizeof(cl_mem), (void*)&m_key_array);
    ecode |= clSetKernelArg(k_fv2, 1, sizeof(cl_mem), (void*)&m_j);
    ecode |= clSetKernelArg(k_fv2, 2, sizeof(INT_TYPE)*FV2_GROUP_SIZE, NULL);
    clu_CheckError(ecode, "clSetKernelArg() for full_verify2");

    fv2_lws[0] = FV2_GROUP_SIZE;
    fv2_gws[0] = FV2_GLOBAL_SIZE;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_fv2,
                                   1, NULL,
                                   fv2_gws, 
                                   fv2_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify2");
    CHECK_FINISH();
    DTIMER_STOP(T_KERNEL_FV2);

    g_j = (INT_TYPE *)malloc(j_size);

    DTIMER_START(T_BUFFER_READ);
    ecode = clEnqueueReadBuffer(cmd_queue,
                                m_j,
                                CL_TRUE,
                                0,
                                j_size,
                                g_j,
                                0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueReadBuffer() for m_j");
    DTIMER_STOP(T_BUFFER_READ);

    // reduction
    for (i = 0; i < j_size/sizeof(INT_TYPE); i++) {
      j += g_j[i];
    }

    DTIMER_START(T_RELEASE);
    clReleaseKernel(k_fv0);
    DTIMER_STOP(T_RELEASE);

  } else {

    // Kernel execution
    DTIMER_START(T_KERNEL_FV1);
    ecode  = clSetKernelArg(k_fv1, 0, sizeof(cl_mem), (void*)&m_bucket_ptrs);
    ecode |= clSetKernelArg(k_fv1, 1, sizeof(cl_mem), (void*)&m_key_buff2);
    ecode |= clSetKernelArg(k_fv1, 2, sizeof(cl_mem), (void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_fv1, 3, sizeof(cl_mem), (void*)&m_key_array);
    clu_CheckError(ecode, "clSetKernelArg() for full_verify1");

    fv1_lws[0] = RANK_GROUP_SIZE;
    fv1_gws[0] = RANK_GLOBAL_SIZE;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_fv1,
                                   1, NULL,
                                   fv1_gws, 
                                   fv1_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify1");
    CHECK_FINISH();
    DTIMER_STOP(T_KERNEL_FV1);

    DTIMER_START(T_KERNEL_FV2);
    ecode  = clSetKernelArg(k_fv2, 0, sizeof(cl_mem), (void*)&m_key_array);
    ecode |= clSetKernelArg(k_fv2, 1, sizeof(cl_mem), (void*)&m_j);
    ecode |= clSetKernelArg(k_fv2, 2, sizeof(INT_TYPE)*FV2_GROUP_SIZE, NULL);
    clu_CheckError(ecode, "clSetKernelArg() for full_verify2");

    fv2_lws[0] = FV2_GROUP_SIZE;
    fv2_gws[0] = FV2_GLOBAL_SIZE;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_fv2,
                                   1, NULL,
                                   fv2_gws, 
                                   fv2_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify2");
    CHECK_FINISH();
    DTIMER_STOP(T_KERNEL_FV2);

    g_j = (INT_TYPE *)malloc(j_size);

    DTIMER_START(T_BUFFER_READ);
    ecode = clEnqueueReadBuffer(cmd_queue,
                                m_j,
                                CL_TRUE,
                                0,
                                j_size,
                                g_j,
                                0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueReadBuffer() for m_j");
    DTIMER_STOP(T_BUFFER_READ);

    // reduction
    for (i = 0; i < j_size/sizeof(INT_TYPE); i++) {
      j += g_j[i];
    }

  }

  DTIMER_START(T_RELEASE);
  free(g_j);
  clReleaseMemObject(m_j);
  clReleaseKernel(k_fv1);
  clReleaseKernel(k_fv2);
  DTIMER_STOP(T_RELEASE);

  if (j != 0)
    printf( "Full_verify: number of keys out of sort: %ld\n", (long)j );
  else
    passed_verification++;
}
Beispiel #25
0
void rank( int iteration )
{
  size_t r1_lws[1], r1_gws[1];
  size_t r2_lws[1], r2_gws[1];
  size_t r3_lws[1], r3_gws[1];
  cl_int ecode;

  DTIMER_START(T_KERNEL_RANK0);
  // rank0
  ecode = clSetKernelArg(k_rank0, 3, sizeof(cl_int), (void*)&iteration);
  clu_CheckError(ecode, "clSetKernelArg() for rank0: iteration");

  ecode = clEnqueueTask(cmd_queue, k_rank0, 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueTask() for rank0");
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_RANK0);

  DTIMER_START(T_KERNEL_RANK1);
  // rank1
  r1_lws[0] = RANK1_GROUP_SIZE;
  r1_gws[0] = RANK1_GLOBAL_SIZE;
  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_rank1,
                                 1, NULL,
                                 r1_gws, 
                                 r1_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank1");
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_RANK1);

  DTIMER_START(T_KERNEL_RANK2);
  // rank2
  r2_lws[0] = RANK2_GROUP_SIZE;
  r2_gws[0] = RANK2_GLOBAL_SIZE;
  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_rank2,
                                 1, NULL,
                                 r2_gws, 
                                 r2_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank2");
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_RANK2);

  DTIMER_START(T_KERNEL_RANK3);
  // rank3
  if (device_type == CL_DEVICE_TYPE_GPU) {
    r3_lws[0] = work_item_sizes[0];
    r3_gws[0] = work_item_sizes[0] * work_item_sizes[0];
    if (r3_gws[0] > MAX_KEY) r3_gws[0] = MAX_KEY;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_rank3_0,
                                   1, NULL,
                                   r3_gws, 
                                   r3_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3_0");

    r3_lws[0] = work_item_sizes[0];
    r3_gws[0] = work_item_sizes[0];
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_rank3_1,
                                   1, NULL,
                                   r3_gws, 
                                   r3_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3_1");

    r3_lws[0] = work_item_sizes[0];
    r3_gws[0] = work_item_sizes[0] * work_item_sizes[0];
    if (r3_gws[0] > MAX_KEY) r3_gws[0] = MAX_KEY;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_rank3_2,
                                   1, NULL,
                                   r3_gws, 
                                   r3_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3_2");
  } else {
    r3_lws[0] = RANK_GROUP_SIZE;
    r3_gws[0] = RANK_GLOBAL_SIZE;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_rank3,
                                   1, NULL,
                                   r3_gws, 
                                   r3_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3");
  }
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_RANK3);

  // rank4 - partial verification
  DTIMER_START(T_KERNEL_RANK4);
  ecode = clSetKernelArg(k_rank4, 4, sizeof(cl_int), (void*)&iteration);
  clu_CheckError(ecode, "clSetKernelArg() for rank4");

  ecode = clEnqueueTask(cmd_queue, k_rank4, 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueTask() for rank4");

  ecode = clFinish(cmd_queue);
  clu_CheckError(ecode, "clFinish");
  DTIMER_STOP(T_KERNEL_RANK4);
}
Beispiel #26
0
//---------------------------------------------------------------------
// this function computes the norm of the difference between the
// computed solution and the exact solution
//---------------------------------------------------------------------
void error_norm(double rms[5])
{
  int c, i, m, d;
  int k, j, kk, jj;
  double rms_work[5];
  size_t one = 1;
  cl_mem m_rms[MAX_DEVICE_NUM];
  cl_int ecode = 0;

  for (m = 0; m < 5; m++) {
    rms[m] = 0.0;
    rms_work[m] = 0.0;
  }

  for (i = 0; i < num_devices; i++) {
    m_rms[i] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                              sizeof(double)*5, NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_rms");

    ecode = clEnqueueWriteBuffer(cmd_queue[i * 2], m_rms[i], CL_TRUE,
                                 0, sizeof(double)*5, rms_work,
                                 0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueWriteBuffer() for m_rms");
  }

  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      kk = 2;
      for (k = cell_low[i][c][2]; k <= cell_high[i][c][2]; k++) {
        jj = 2;
        for (j = cell_low[i][c][1]; j <= cell_high[i][c][1]; j++) {
          ecode  = clSetKernelArg(k_error_norm[i], 0, sizeof(cl_mem), &m_u[i]);
          ecode |= clSetKernelArg(k_error_norm[i], 1, sizeof(cl_mem), &m_rms[i]);
          ecode |= clSetKernelArg(k_error_norm[i], 2, sizeof(cl_mem), &m_ce[i]);
          ecode |= clSetKernelArg(k_error_norm[i], 3, sizeof(int), &c);
          ecode |= clSetKernelArg(k_error_norm[i], 4, sizeof(int), &k);
          ecode |= clSetKernelArg(k_error_norm[i], 5, sizeof(int), &kk);
          ecode |= clSetKernelArg(k_error_norm[i], 6, sizeof(int), &j);
          ecode |= clSetKernelArg(k_error_norm[i], 7, sizeof(int), &jj);
          ecode |= clSetKernelArg(k_error_norm[i], 8, sizeof(int), &cell_low[i][c][0]);
          ecode |= clSetKernelArg(k_error_norm[i], 9, sizeof(int), &cell_high[i][c][0]);
          clu_CheckError(ecode, "clSetKernelArg() for error_norm");

          ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                         k_error_norm[i],
                                         1, NULL, &one, &one,
                                         0, NULL, NULL);
          clu_CheckError(ecode, "clEnqueueNDRangeKernel() for error_norm");
          jj = jj + 1;

          CHECK_FINISH(i * 2);
        }
        kk = kk + 1;
      }
    }
  }

  for (i = 0; i < num_devices; i++) {
    ecode = clEnqueueReadBuffer(cmd_queue[i * 2], m_rms[i], CL_TRUE,
                                0, sizeof(double)*5, rms_work,
                                0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueReadBuffer() for m_rms");

    for (m = 0; m < 5; m++)
      rms[m] += rms_work[m];

    clReleaseMemObject(m_rms[i]);
  }

  for (m = 0; m < 5; m++) {
    for (d = 0; d < 3; d++) {
      rms[m] = rms[m] / (double)(grid_points[d]-2);
    }
    rms[m] = sqrt(rms[m]);
  }
}
Beispiel #27
0
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//---------------------------------------------------------------------
static void setup_opencl(int argc, char *argv[])
{
  cl_int ecode;
  char *source_dir = "IS";
  if (argc > 1) source_dir = argv[1];

#ifdef TIMER_DETAIL
  if (timer_on) {
    int i;
    for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i);
  }
#endif

  DTIMER_START(T_OPENCL_API);

  // 1. Find the default device type and get a device for the device type
  device_type = clu_GetDefaultDeviceType();
  device      = clu_GetAvailableDevice(device_type);
  device_name = clu_GetDeviceName(device);

  // Device information
  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_ITEM_SIZES,
                          sizeof(work_item_sizes),
                          &work_item_sizes,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_GROUP_SIZE,
                          sizeof(size_t),
                          &max_work_group_size,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  // FIXME: The below values are experimental.
  if (max_work_group_size > 256) {
    max_work_group_size = 256;
    int i;
    for (i = 0; i < 3; i++) {
      if (work_item_sizes[i] > 256) {
        work_item_sizes[i] = 256;
      }
    }
  }

  // 2. Create a context for the specified device
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode);
  clu_CheckError(ecode, "clCreateContext()");

  // 3. Create a command queue
  cmd_queue = clCreateCommandQueue(context, device, 0, &ecode);
  clu_CheckError(ecode, "clCreateCommandQueue()");

  DTIMER_STOP(T_OPENCL_API);

  // 4. Build the program
  DTIMER_START(T_BUILD);
  char *source_file;
  char build_option[30];
  if (device_type == CL_DEVICE_TYPE_CPU) {
    source_file = "is_cpu.cl";
    sprintf(build_option, "-DCLASS=%d -I.", CLASS);

    CREATE_SEQ_GROUP_SIZE = 64;
    CREATE_SEQ_GLOBAL_SIZE = CREATE_SEQ_GROUP_SIZE * 256;
    RANK_GROUP_SIZE = 1;
    RANK_GLOBAL_SIZE = RANK_GROUP_SIZE * 128;
    RANK1_GROUP_SIZE = 1;
    RANK1_GLOBAL_SIZE = RANK1_GROUP_SIZE * RANK_GLOBAL_SIZE;;
    RANK2_GROUP_SIZE = RANK_GROUP_SIZE;
    RANK2_GLOBAL_SIZE = RANK_GLOBAL_SIZE;;
    FV2_GROUP_SIZE = 64;
    FV2_GLOBAL_SIZE = FV2_GROUP_SIZE * 256;
  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    source_file = "is_gpu.cl";
    sprintf(build_option, "-DCLASS=\'%c\' -I.", CLASS);

    CREATE_SEQ_GROUP_SIZE = 64;
    CREATE_SEQ_GLOBAL_SIZE = CREATE_SEQ_GROUP_SIZE * 256;
    RANK1_GROUP_SIZE = work_item_sizes[0];
    RANK1_GLOBAL_SIZE = MAX_KEY;
    RANK2_GROUP_SIZE = work_item_sizes[0];
    RANK2_GLOBAL_SIZE = NUM_KEYS;
    FV2_GROUP_SIZE = work_item_sizes[0];
    FV2_GLOBAL_SIZE = NUM_KEYS;
  } else {
    fprintf(stderr, "%s: not supported.", clu_GetDeviceTypeName(device_type));
    exit(EXIT_FAILURE);
  }
  program = clu_MakeProgram(context, device, source_dir, source_file,
                            build_option);
  DTIMER_STOP(T_BUILD);

  // 5. Create buffers
  DTIMER_START(T_BUFFER_CREATE);
  m_key_array = clCreateBuffer(context,
                               CL_MEM_READ_WRITE,
                               sizeof(INT_TYPE) * SIZE_OF_BUFFERS,
                               NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_key_array");

  m_key_buff1 = clCreateBuffer(context,
                               CL_MEM_READ_WRITE,
                               sizeof(INT_TYPE) * MAX_KEY,
                               NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_key_buff1");

  m_key_buff2 = clCreateBuffer(context,
                               CL_MEM_READ_WRITE,
                               sizeof(INT_TYPE) * SIZE_OF_BUFFERS,
                               NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_key_buff2");

  size_t test_array_size = sizeof(INT_TYPE) * TEST_ARRAY_SIZE;
  m_index_array = clCreateBuffer(context,
                                 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                 test_array_size,
                                 test_index_array, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_index_array");

  m_rank_array = clCreateBuffer(context,
                                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                test_array_size,
                                test_rank_array, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_rank_array");

  m_partial_vals = clCreateBuffer(context,
                                  CL_MEM_WRITE_ONLY,
                                  test_array_size,
                                  NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_partial_vals");

  m_passed_verification = clCreateBuffer(context,
                                         CL_MEM_READ_WRITE,
                                         sizeof(cl_int),
                                         NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_passed_verification");

  if (device_type == CL_DEVICE_TYPE_GPU) {
    m_key_scan = clCreateBuffer(context,
                                CL_MEM_READ_WRITE,
                                sizeof(INT_TYPE) * MAX_KEY,
                                NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_key_buff1_scan");

    m_sum = clCreateBuffer(context,
                           CL_MEM_READ_WRITE,
                           sizeof(INT_TYPE) * work_item_sizes[0],
                           NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_sum");
  } else {
    size_t bs_size = RANK_GLOBAL_SIZE * sizeof(INT_TYPE) * NUM_BUCKETS;
    m_bucket_size = clCreateBuffer(context,
                                   CL_MEM_READ_WRITE,
                                   bs_size,
                                   NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_bucket_size");

    m_bucket_ptrs = clCreateBuffer(context,
                                   CL_MEM_READ_WRITE,
                                   bs_size,
                                   NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_bucket_ptrs");
  }
  DTIMER_STOP(T_BUFFER_CREATE);

  // 6. Create kernels
  DTIMER_START(T_OPENCL_API);
  k_rank0 = clCreateKernel(program, "rank0", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rank0");
  ecode  = clSetKernelArg(k_rank0, 0, sizeof(cl_mem), (void*)&m_key_array);
  ecode |= clSetKernelArg(k_rank0, 1, sizeof(cl_mem), (void*)&m_partial_vals);
  ecode |= clSetKernelArg(k_rank0, 2, sizeof(cl_mem), (void*)&m_index_array);
  clu_CheckError(ecode, "clSetKernelArg() for rank0");

  if (device_type == CL_DEVICE_TYPE_GPU) {
    k_rank1 = clCreateKernel(program, "rank1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank1");
    ecode  = clSetKernelArg(k_rank1, 0, sizeof(cl_mem), (void*)&m_key_buff1);
    clu_CheckError(ecode, "clSetKernelArg() for rank1");

    k_rank2 = clCreateKernel(program, "rank2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank2");
    ecode  = clSetKernelArg(k_rank2, 0, sizeof(cl_mem), (void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank2, 1, sizeof(cl_mem), (void*)&m_key_array);
    clu_CheckError(ecode, "clSetKernelArg() for rank2");

    k_rank3_0 = clCreateKernel(program, "rank3_0", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3_0");
    ecode  = clSetKernelArg(k_rank3_0, 0, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3_0, 1, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3_0, 2, sizeof(cl_mem),(void*)&m_sum);
    ecode |= clSetKernelArg(k_rank3_0, 3, 
                            sizeof(INT_TYPE) * work_item_sizes[0] * 2,
                            NULL);
    clu_CheckError(ecode, "clSetKernelArg() for rank3_0");

    k_rank3_1 = clCreateKernel(program, "rank3_1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3_1");
    ecode  = clSetKernelArg(k_rank3_1, 0, sizeof(cl_mem), (void*)&m_sum);
    ecode  = clSetKernelArg(k_rank3_1, 1, sizeof(cl_mem), (void*)&m_sum);
    ecode |= clSetKernelArg(k_rank3_1, 2, 
                            sizeof(INT_TYPE) * work_item_sizes[0] * 2,
                            NULL);
    clu_CheckError(ecode, "clSetKernelArg() for rank3_1");

    k_rank3_2 = clCreateKernel(program, "rank3_2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3_2");
    ecode  = clSetKernelArg(k_rank3_2, 0, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode  = clSetKernelArg(k_rank3_2, 1, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3_2, 2, sizeof(cl_mem),(void*)&m_sum);
    clu_CheckError(ecode, "clSetKernelArg() for rank3_2");
  } else {
    k_rank1 = clCreateKernel(program, "rank1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank1");
    ecode  = clSetKernelArg(k_rank1, 0, sizeof(cl_mem),(void*)&m_key_array);
    ecode |= clSetKernelArg(k_rank1, 1, sizeof(cl_mem),(void*)&m_bucket_size);
    clu_CheckError(ecode, "clSetKernelArg() for rank1");

    k_rank2 = clCreateKernel(program, "rank2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank2");
    ecode  = clSetKernelArg(k_rank2, 0, sizeof(cl_mem),(void*)&m_key_array);
    ecode |= clSetKernelArg(k_rank2, 1, sizeof(cl_mem),(void*)&m_bucket_size);
    ecode |= clSetKernelArg(k_rank2, 2, sizeof(cl_mem),(void*)&m_bucket_ptrs);
    ecode |= clSetKernelArg(k_rank2, 3, sizeof(cl_mem),(void*)&m_key_buff2);
    clu_CheckError(ecode, "clSetKernelArg() for rank2");

    k_rank3 = clCreateKernel(program, "rank3", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3");
    ecode  = clSetKernelArg(k_rank3, 0, sizeof(cl_mem),(void*)&m_bucket_size);
    ecode |= clSetKernelArg(k_rank3, 1, sizeof(cl_mem),(void*)&m_bucket_ptrs);
    ecode |= clSetKernelArg(k_rank3, 2, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3, 3, sizeof(cl_mem),(void*)&m_key_buff2);
    clu_CheckError(ecode, "clSetKernelArg() for rank3");
  }

  k_rank4 = clCreateKernel(program, "rank4", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rank4");
  ecode  = clSetKernelArg(k_rank4, 0, sizeof(cl_mem), (void*)&m_partial_vals);
  ecode |= clSetKernelArg(k_rank4, 1, sizeof(cl_mem), (void*)&m_key_buff1);
  ecode |= clSetKernelArg(k_rank4, 2, sizeof(cl_mem), (void*)&m_rank_array);
  ecode |= clSetKernelArg(k_rank4, 3, sizeof(cl_mem),
                                      (void*)&m_passed_verification);
  clu_CheckError(ecode, "clSetKernelArg() for rank4");
  DTIMER_STOP(T_OPENCL_API);
}
Beispiel #28
0
void set_constants()
{
  ce[0][0]  = 2.0;
  ce[0][1]  = 0.0;
  ce[0][2]  = 0.0;
  ce[0][3]  = 4.0;
  ce[0][4]  = 5.0;
  ce[0][5]  = 3.0;
  ce[0][6]  = 0.5;
  ce[0][7]  = 0.02;
  ce[0][8]  = 0.01;
  ce[0][9]  = 0.03;
  ce[0][10] = 0.5;
  ce[0][11] = 0.4;
  ce[0][12] = 0.3;

  ce[1][0]  = 1.0;
  ce[1][1]  = 0.0;
  ce[1][2]  = 0.0;
  ce[1][3]  = 0.0;
  ce[1][4]  = 1.0;
  ce[1][5]  = 2.0;
  ce[1][6]  = 3.0;
  ce[1][7]  = 0.01;
  ce[1][8]  = 0.03;
  ce[1][9]  = 0.02;
  ce[1][10] = 0.4;
  ce[1][11] = 0.3;
  ce[1][12] = 0.5;

  ce[2][0]  = 2.0;
  ce[2][1]  = 2.0;
  ce[2][2]  = 0.0;
  ce[2][3]  = 0.0;
  ce[2][4]  = 0.0;
  ce[2][5]  = 2.0;
  ce[2][6]  = 3.0;
  ce[2][7]  = 0.04;
  ce[2][8]  = 0.03;
  ce[2][9]  = 0.05;
  ce[2][10] = 0.3;
  ce[2][11] = 0.5;
  ce[2][12] = 0.4;

  ce[3][0]  = 2.0;
  ce[3][1]  = 2.0;
  ce[3][2]  = 0.0;
  ce[3][3]  = 0.0;
  ce[3][4]  = 0.0;
  ce[3][5]  = 2.0;
  ce[3][6]  = 3.0;
  ce[3][7]  = 0.03;
  ce[3][8]  = 0.05;
  ce[3][9] = 0.04;
  ce[3][10] = 0.2;
  ce[3][11] = 0.1;
  ce[3][12] = 0.3;

  ce[4][0]  = 5.0;
  ce[4][1]  = 4.0;
  ce[4][2]  = 3.0;
  ce[4][3]  = 2.0;
  ce[4][4]  = 0.1;
  ce[4][5]  = 0.4;
  ce[4][6]  = 0.3;
  ce[4][7]  = 0.05;
  ce[4][8]  = 0.04;
  ce[4][9] = 0.03;
  ce[4][10] = 0.1;
  ce[4][11] = 0.3;
  ce[4][12] = 0.2;

  c1 = 1.4;
  c2 = 0.4;
  c3 = 0.1;
  c4 = 1.0;
  c5 = 1.4;

  dnxm1 = 1.0 / (double)(grid_points[0]-1);
  dnym1 = 1.0 / (double)(grid_points[1]-1);
  dnzm1 = 1.0 / (double)(grid_points[2]-1);

  c1c2 = c1 * c2;
  c1c5 = c1 * c5;
  c3c4 = c3 * c4;
  c1345 = c1c5 * c3c4;

  conz1 = (1.0-c1c5);

  tx1 = 1.0 / (dnxm1 * dnxm1);
  tx2 = 1.0 / (2.0 * dnxm1);
  tx3 = 1.0 / dnxm1;

  ty1 = 1.0 / (dnym1 * dnym1);
  ty2 = 1.0 / (2.0 * dnym1);
  ty3 = 1.0 / dnym1;

  tz1 = 1.0 / (dnzm1 * dnzm1);
  tz2 = 1.0 / (2.0 * dnzm1);
  tz3 = 1.0 / dnzm1;

  dx1 = 0.75;
  dx2 = 0.75;
  dx3 = 0.75;
  dx4 = 0.75;
  dx5 = 0.75;

  dy1 = 0.75;
  dy2 = 0.75;
  dy3 = 0.75;
  dy4 = 0.75;
  dy5 = 0.75;

  dz1 = 1.0;
  dz2 = 1.0;
  dz3 = 1.0;
  dz4 = 1.0;
  dz5 = 1.0;

  dxmax = max(dx3, dx4);
  dymax = max(dy2, dy4);
  dzmax = max(dz2, dz3);

  dssp = 0.25 * max(dx1, max(dy1, dz1) );

  c4dssp = 4.0 * dssp;
  c5dssp = 5.0 * dssp;

  dttx1 = dt*tx1;
  dttx2 = dt*tx2;
  dtty1 = dt*ty1;
  dtty2 = dt*ty2;
  dttz1 = dt*tz1;
  dttz2 = dt*tz2;

  c2dttx1 = 2.0*dttx1;
  c2dtty1 = 2.0*dtty1;
  c2dttz1 = 2.0*dttz1;

  dtdssp = dt*dssp;

  comz1  = dtdssp;
  comz4  = 4.0*dtdssp;
  comz5  = 5.0*dtdssp;
  comz6  = 6.0*dtdssp;

  c3c4tx3 = c3c4*tx3;
  c3c4ty3 = c3c4*ty3;
  c3c4tz3 = c3c4*tz3;

  dx1tx1 = dx1*tx1;
  dx2tx1 = dx2*tx1;
  dx3tx1 = dx3*tx1;
  dx4tx1 = dx4*tx1;
  dx5tx1 = dx5*tx1;

  dy1ty1 = dy1*ty1;
  dy2ty1 = dy2*ty1;
  dy3ty1 = dy3*ty1;
  dy4ty1 = dy4*ty1;
  dy5ty1 = dy5*ty1;

  dz1tz1 = dz1*tz1;
  dz2tz1 = dz2*tz1;
  dz3tz1 = dz3*tz1;
  dz4tz1 = dz4*tz1;
  dz5tz1 = dz5*tz1;

  c2iv  = 2.5;
  con43 = 4.0/3.0;
  con16 = 1.0/6.0;

  xxcon1 = c3c4tx3*con43*tx3;
  xxcon2 = c3c4tx3*tx3;
  xxcon3 = c3c4tx3*conz1*tx3;
  xxcon4 = c3c4tx3*con16*tx3;
  xxcon5 = c3c4tx3*c1c5*tx3;

  yycon1 = c3c4ty3*con43*ty3;
  yycon2 = c3c4ty3*ty3;
  yycon3 = c3c4ty3*conz1*ty3;
  yycon4 = c3c4ty3*con16*ty3;
  yycon5 = c3c4ty3*c1c5*ty3;

  zzcon1 = c3c4tz3*con43*tz3;
  zzcon2 = c3c4tz3*tz3;
  zzcon3 = c3c4tz3*conz1*tz3;
  zzcon4 = c3c4tz3*con16*tz3;
  zzcon5 = c3c4tz3*c1c5*tz3;

  //------------------------------------------------------------------------
  cl_int ecode;
  int i;
  for (i = 0; i < num_devices; i++) {
  ecode = clEnqueueWriteBuffer(cmd_queue[i],
                               m_ce[i],
                               CL_TRUE,
                               0, sizeof(double)*5*13,
                               ce,
                               0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueWriteBuffer() for m_ce");
  }
  //------------------------------------------------------------------------
}
Beispiel #29
0
void compute_rhs()
{
  int c, i;
  cl_int ecode = 0;

  if (timeron) timer_start(t_rhs);
  //---------------------------------------------------------------------
  // loop over all cells owned by this node                           
  //---------------------------------------------------------------------
  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_compute_rhs1[i][c],
                                     COMPUTE_RHS1_DIM, NULL,
                                     compute_rhs1_gw[i][c],
                                     compute_rhs1_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs1");
    }

    for (i = 0; i < num_devices; i++) {
      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_compute_rhs2[i][c],
                                     COMPUTE_RHS2_DIM, NULL,
                                     compute_rhs2_gw[i][c],
                                     compute_rhs2_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs2");

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_compute_rhs3[i][c],
                                     2, NULL,
                                     compute_rhs3_gw[i][c],
                                     compute_rhs3_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs3");

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_compute_rhs4[i][c],
                                     2, NULL,
                                     compute_rhs4_gw[i][c],
                                     compute_rhs4_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs4");

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_compute_rhs5[i][c],
                                     2, NULL,
                                     compute_rhs5_gw[i][c],
                                     compute_rhs5_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs5");

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_compute_rhs6[i][c],
                                     COMPUTE_RHS6_DIM, NULL,
                                     compute_rhs6_gw[i][c],
                                     compute_rhs6_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs6");
    }
    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }
  }

  for (i = 0; i < num_devices; i++)
    CHECK_FINISH(i * 2);

  if (timeron) timer_stop(t_rhs);
}
Beispiel #30
0
//---------------------------------------------------------------------
// This subroutine initializes the field variable u using 
// tri-linear transfinite interpolation of the boundary values     
//---------------------------------------------------------------------
void initialize()
{
  cl_kernel k_initialize1;
  cl_kernel k_initialize2;
  cl_kernel k_initialize3;
  cl_kernel k_initialize4;
  cl_kernel k_initialize5;

  size_t local_ws[3], global_ws[3], temp;
  cl_int ecode;

  int d0 = grid_points[0];
  int d1 = grid_points[1];
  int d2 = grid_points[2];

  //-----------------------------------------------------------------------
  k_initialize1 = clCreateKernel(p_initialize, "initialize1", &ecode);
  clu_CheckError(ecode, "clCreateKernel()");
  
  ecode  = clSetKernelArg(k_initialize1, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_initialize1, 1, sizeof(int), &d0);
  ecode |= clSetKernelArg(k_initialize1, 2, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_initialize1, 3, sizeof(int), &d2);
  clu_CheckError(ecode, "clSetKernelArg()");

  local_ws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0];
  temp = max_work_group_size / local_ws[0];
  local_ws[1] = d2 < temp ? d2 : temp;

  global_ws[0] = clu_RoundWorkSize((size_t)d1, local_ws[0]);
  global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]);

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_initialize1,
                                 2, NULL,
                                 global_ws,
                                 local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  //-----------------------------------------------------------------------

  //---------------------------------------------------------------------
  // first store the "interpolated" values everywhere on the grid    
  //---------------------------------------------------------------------
  k_initialize2 = clCreateKernel(p_initialize, "initialize2", &ecode);
  clu_CheckError(ecode, "clCreateKernel()");
  
  ecode  = clSetKernelArg(k_initialize2, 0, sizeof(cl_mem), &m_u);
  ecode  = clSetKernelArg(k_initialize2, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_initialize2, 2, sizeof(int), &d0);
  ecode |= clSetKernelArg(k_initialize2, 3, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_initialize2, 4, sizeof(int), &d2);
  clu_CheckError(ecode, "clSetKernelArg()");

  if (INITIALIZE2_DIM == 3) {
    local_ws[0] = d0 < work_item_sizes[0] ? d0 : work_item_sizes[0];
    temp = max_work_group_size / local_ws[0];
    local_ws[1] = d1 < temp ? d1 : temp;
    temp = temp / local_ws[1];
    local_ws[2] = d2 < temp ? d2 : temp;

    global_ws[0] = clu_RoundWorkSize((size_t)d0, local_ws[0]);
    global_ws[1] = clu_RoundWorkSize((size_t)d1, local_ws[1]);
    global_ws[2] = clu_RoundWorkSize((size_t)d2, local_ws[2]);
  } else if (INITIALIZE2_DIM == 2) {
    local_ws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0];
    temp = max_work_group_size / local_ws[0];
    local_ws[1] = d2 < temp ? d2 : temp;

    global_ws[0] = clu_RoundWorkSize((size_t)d1, local_ws[0]);
    global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]);
  }
  
  CHECK_FINISH();
  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_initialize2,
                                 INITIALIZE2_DIM, NULL,
                                 global_ws,
                                 local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  //-----------------------------------------------------------------------

  //---------------------------------------------------------------------
  // now store the exact values on the boundaries        
  //---------------------------------------------------------------------
  k_initialize3 = clCreateKernel(p_initialize, "initialize3", &ecode);
  clu_CheckError(ecode, "clCreateKernel()");
  
  ecode  = clSetKernelArg(k_initialize3, 0, sizeof(cl_mem), &m_u);
  ecode  = clSetKernelArg(k_initialize3, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_initialize3, 2, sizeof(int), &d0);
  ecode |= clSetKernelArg(k_initialize3, 3, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_initialize3, 4, sizeof(int), &d2);
  clu_CheckError(ecode, "clSetKernelArg()");

  local_ws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0];
  temp = max_work_group_size / local_ws[0];
  local_ws[1] = d2 < temp ? d2 : temp;

  global_ws[0] = clu_RoundWorkSize((size_t)d1, local_ws[0]);
  global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]);

  CHECK_FINISH();
  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_initialize3,
                                 2, NULL,
                                 global_ws,
                                 local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  //-----------------------------------------------------------------------

  k_initialize4 = clCreateKernel(p_initialize, "initialize4", &ecode);
  clu_CheckError(ecode, "clCreateKernel()");
  
  ecode  = clSetKernelArg(k_initialize4, 0, sizeof(cl_mem), &m_u);
  ecode  = clSetKernelArg(k_initialize4, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_initialize4, 2, sizeof(int), &d0);
  ecode |= clSetKernelArg(k_initialize4, 3, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_initialize4, 4, sizeof(int), &d2);
  clu_CheckError(ecode, "clSetKernelArg()");

  local_ws[0] = d0 < work_item_sizes[0] ? d0 : work_item_sizes[0];
  temp = max_work_group_size / local_ws[0];
  local_ws[1] = d2 < temp ? d2 : temp;

  global_ws[0] = clu_RoundWorkSize((size_t)d0, local_ws[0]);
  global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]);

  CHECK_FINISH();
  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_initialize4,
                                 2, NULL,
                                 global_ws,
                                 local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  //-----------------------------------------------------------------------

  k_initialize5 = clCreateKernel(p_initialize, "initialize5", &ecode);
  clu_CheckError(ecode, "clCreateKernel()");
  
  ecode  = clSetKernelArg(k_initialize5, 0, sizeof(cl_mem), &m_u);
  ecode  = clSetKernelArg(k_initialize5, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_initialize5, 2, sizeof(int), &d0);
  ecode |= clSetKernelArg(k_initialize5, 3, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_initialize5, 4, sizeof(int), &d2);
  clu_CheckError(ecode, "clSetKernelArg()");

  local_ws[0] = d0 < work_item_sizes[0] ? d0 : work_item_sizes[0];
  temp = max_work_group_size / local_ws[0];
  local_ws[1] = d1 < temp ? d1 : temp;

  global_ws[0] = clu_RoundWorkSize((size_t)d0, local_ws[0]);
  global_ws[1] = clu_RoundWorkSize((size_t)d1, local_ws[1]);

  CHECK_FINISH();
  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_initialize5,
                                 2, NULL,
                                 global_ws,
                                 local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  //-----------------------------------------------------------------------

  clReleaseKernel(k_initialize1);
  clReleaseKernel(k_initialize2);
  clReleaseKernel(k_initialize3);
  clReleaseKernel(k_initialize4);
  CHECK_FINISH();
  clReleaseKernel(k_initialize5);
}