Example #1
0
//---------------------------------------------------------------------
// set the boundary values of dependent variables
//---------------------------------------------------------------------
void setbv()
{
  DTIMER_START(t_setbv);

  cl_int ecode;

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_setbv1,
                                 SETBV1_DIM, NULL,
                                 setbv1_gws,
                                 setbv1_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_setbv2,
                                 SETBV2_DIM, NULL,
                                 setbv2_gws,
                                 setbv2_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_setbv3,
                                 SETBV3_DIM, NULL,
                                 setbv3_gws,
                                 setbv3_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  CHECK_FINISH();

  DTIMER_STOP(t_setbv);
}
Example #2
0
//---------------------------------------------------------------------
// compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2 
// for time evolution exponent. 
//---------------------------------------------------------------------
static void compute_indexmap(cl_mem *twiddle, int d1, int d2, int d3)
{
  cl_int ecode;
  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_compute_indexmap,
                                 COMPUTE_IMAP_DIM, NULL,
                                 cimap_gws,
                                 cimap_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel() for compute_indexmap");
  CHECK_FINISH();
}
Example #3
0
static void cffts3(int is, int d1, int d2, int d3, cl_mem *x, cl_mem *xout)
{
  int logd3 = ilog2(d3);
  size_t local_ws[2], global_ws[2], temp;
  cl_int ecode;

  if (timers_enabled) timer_start(T_fftz);

  ecode  = clSetKernelArg(k_cffts3, 0, sizeof(cl_mem), x);
  ecode |= clSetKernelArg(k_cffts3, 1, sizeof(cl_mem), xout);
  ecode |= clSetKernelArg(k_cffts3, 3, sizeof(int), &is);
  ecode |= clSetKernelArg(k_cffts3, 4, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_cffts3, 5, sizeof(int), &d2);
  ecode |= clSetKernelArg(k_cffts3, 6, sizeof(int), &d3);
  ecode |= clSetKernelArg(k_cffts3, 7, sizeof(int), &logd3);
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts3");

  if (device_type == CL_DEVICE_TYPE_CPU) {
    local_ws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0];
    temp = max_work_group_size / local_ws[0];
    local_ws[1] = d2 < temp ? d2 : temp;

    global_ws[0] = clu_RoundWorkSize((size_t)d1, local_ws[0]);
    global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]);
  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    if (CFFTS_DIM == 2) {
      local_ws[0] = CFFTS_LSIZE;
      local_ws[1] = 1;
      global_ws[0] = d1 * local_ws[0];
      global_ws[1] = d2;
    } else {
      local_ws[0] = CFFTS_LSIZE;
      global_ws[0] = d2 * local_ws[0];
    }
  }

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_cffts3,
                                 CFFTS_DIM, NULL,
                                 global_ws,
                                 local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel() for cffts3");
  CHECK_FINISH();

  if (timers_enabled) timer_stop(T_fftz);
}
Example #4
0
//---------------------------------------------------------------------
// evolve u0 -> u1 (t time steps) in fourier space
//---------------------------------------------------------------------
static void evolve(cl_mem *u0, cl_mem *u1, cl_mem *twiddle,
                   int d1, int d2, int d3)
{
  cl_int ecode;
  size_t local_ws[3], global_ws[3];

  ecode  = clSetKernelArg(k_evolve, 0, sizeof(cl_mem), u0);
  ecode |= clSetKernelArg(k_evolve, 1, sizeof(cl_mem), u1);
  ecode |= clSetKernelArg(k_evolve, 2, sizeof(cl_mem), twiddle);
  ecode |= clSetKernelArg(k_evolve, 3, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_evolve, 4, sizeof(int), &d2);
  ecode |= clSetKernelArg(k_evolve, 5, sizeof(int), &d3);
  clu_CheckError(ecode, "clSetKernelArg() for evolve");

  if (EVOLVE_DIM == 3) {
    local_ws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0];
    int temp = max_work_group_size / local_ws[0];
    local_ws[1] = d2 < temp ? d2 : temp;
    temp = temp / local_ws[1];
    local_ws[2] = d3 < temp ? d3 : temp;

    global_ws[0] = clu_RoundWorkSize((size_t)d1, local_ws[0]);
    global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]);
    global_ws[2] = clu_RoundWorkSize((size_t)d3, local_ws[2]);
  } else if (EVOLVE_DIM == 2) {
    local_ws[0] = d2 < work_item_sizes[0] ? d2 : work_item_sizes[0];
    int temp = max_work_group_size / local_ws[0];
    local_ws[1] = d3 < temp ? d3 : temp;

    global_ws[0] = clu_RoundWorkSize((size_t)d2, local_ws[0]);
    global_ws[1] = clu_RoundWorkSize((size_t)d3, local_ws[1]);
  } else {
    int temp = d3 / max_compute_units;
    local_ws[0] = temp == 0 ? 1 : temp;
    global_ws[0] = clu_RoundWorkSize((size_t)d3, local_ws[0]);
  }

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_evolve,
                                 EVOLVE_DIM, NULL,
                                 global_ws,
                                 local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel() for evolve");
  CHECK_FINISH();
}
Example #5
0
//---------------------------------------------------------------------
// block-diagonal matrix-vector multiplication              
//---------------------------------------------------------------------
void ninvr()
{
  cl_int ecode;

  if (timeron) timer_start(t_ninvr);

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_ninvr,
                                 NINVR_DIM, NULL,
                                 ninvr_gws,
                                 ninvr_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  CHECK_FINISH();

  if (timeron) timer_stop(t_ninvr);
}
Example #6
0
//---------------------------------------------------------------------
// addition of update to the vector u
//---------------------------------------------------------------------
void add()
{
  cl_int ecode;

  if (timeron) timer_start(t_add);

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_add,
                                 ADD_DIM, NULL,
                                 add_gws,
                                 add_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  CHECK_FINISH();

  if (timeron) timer_stop(t_add);
}
Example #7
0
//---------------------------------------------------------------------
// this function performs the solution of the approximate factorization
// step in the z-direction for all five matrix components
// simultaneously. The Thomas algorithm is employed to solve the
// systems for the z-lines. Boundary conditions are non-periodic
//---------------------------------------------------------------------
void z_solve()
{
  cl_int ecode;

  if (timeron) timer_start(t_zsolve);

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_z_solve,
                                 Z_SOLVE_DIM, NULL,
                                 z_solve_gws,
                                 z_solve_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  CHECK_FINISH();

  if (timeron) timer_stop(t_zsolve);

  tzetar();
}
Example #8
0
static void checksum(int i, cl_mem *u1, int d1, int d2, int d3)
{
  dcomplex chk = dcmplx(0.0, 0.0);
  int k;
  cl_int ecode;

  ecode = clSetKernelArg(k_checksum, 0, sizeof(cl_mem), u1);
  clu_CheckError(ecode, "clSetKernelArg() for checksum");

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_checksum,
                                 1, NULL,
                                 &checksum_global_ws,
                                 &checksum_local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  CHECK_FINISH();

  ecode = clEnqueueReadBuffer(cmd_queue,
                              m_chk,
                              CL_TRUE,
                              0, checksum_wg_num * sizeof(dcomplex),
                              g_chk,
                              0, NULL, NULL);
  clu_CheckError(ecode, "clReadBuffer()");

  // reduction
  for (k = 0; k < checksum_wg_num; k++) {
    chk = dcmplx_add(chk, g_chk[k]);
  }

  chk = dcmplx_div2(chk, (double)(NTOTAL));

  printf(" T =%5d     Checksum =%22.12E%22.12E\n", i, chk.real, chk.imag);
  sums[i] = chk;
}
Example #9
0
//---------------------------------------------------------------------
// this function computes the norm of the difference between the
// computed solution and the exact solution
//---------------------------------------------------------------------
void error_norm(double rms[5])
{
  int c, i, m, d;
  int k, j, kk, jj;
  double rms_work[5];
  size_t one = 1;
  cl_mem m_rms[MAX_DEVICE_NUM];
  cl_int ecode = 0;

  for (m = 0; m < 5; m++) {
    rms[m] = 0.0;
    rms_work[m] = 0.0;
  }

  for (i = 0; i < num_devices; i++) {
    m_rms[i] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                              sizeof(double)*5, NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_rms");

    ecode = clEnqueueWriteBuffer(cmd_queue[i * 2], m_rms[i], CL_TRUE,
                                 0, sizeof(double)*5, rms_work,
                                 0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueWriteBuffer() for m_rms");
  }

  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      kk = 2;
      for (k = cell_low[i][c][2]; k <= cell_high[i][c][2]; k++) {
        jj = 2;
        for (j = cell_low[i][c][1]; j <= cell_high[i][c][1]; j++) {
          ecode  = clSetKernelArg(k_error_norm[i], 0, sizeof(cl_mem), &m_u[i]);
          ecode |= clSetKernelArg(k_error_norm[i], 1, sizeof(cl_mem), &m_rms[i]);
          ecode |= clSetKernelArg(k_error_norm[i], 2, sizeof(cl_mem), &m_ce[i]);
          ecode |= clSetKernelArg(k_error_norm[i], 3, sizeof(int), &c);
          ecode |= clSetKernelArg(k_error_norm[i], 4, sizeof(int), &k);
          ecode |= clSetKernelArg(k_error_norm[i], 5, sizeof(int), &kk);
          ecode |= clSetKernelArg(k_error_norm[i], 6, sizeof(int), &j);
          ecode |= clSetKernelArg(k_error_norm[i], 7, sizeof(int), &jj);
          ecode |= clSetKernelArg(k_error_norm[i], 8, sizeof(int), &cell_low[i][c][0]);
          ecode |= clSetKernelArg(k_error_norm[i], 9, sizeof(int), &cell_high[i][c][0]);
          clu_CheckError(ecode, "clSetKernelArg() for error_norm");

          ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                         k_error_norm[i],
                                         1, NULL, &one, &one,
                                         0, NULL, NULL);
          clu_CheckError(ecode, "clEnqueueNDRangeKernel() for error_norm");
          jj = jj + 1;

          CHECK_FINISH(i * 2);
        }
        kk = kk + 1;
      }
    }
  }

  for (i = 0; i < num_devices; i++) {
    ecode = clEnqueueReadBuffer(cmd_queue[i * 2], m_rms[i], CL_TRUE,
                                0, sizeof(double)*5, rms_work,
                                0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueReadBuffer() for m_rms");

    for (m = 0; m < 5; m++)
      rms[m] += rms_work[m];

    clReleaseMemObject(m_rms[i]);
  }

  for (m = 0; m < 5; m++) {
    for (d = 0; d < 3; d++) {
      rms[m] = rms[m] / (double)(grid_points[d]-2);
    }
    rms[m] = sqrt(rms[m]);
  }
}
Example #10
0
//---------------------------------------------------------------------
// This subroutine initializes the field variable u using 
// tri-linear transfinite interpolation of the boundary values     
//---------------------------------------------------------------------
void initialize()
{
  cl_kernel k_initialize1;
  cl_kernel k_initialize2;
  cl_kernel k_initialize3;
  cl_kernel k_initialize4;
  cl_kernel k_initialize5;

  size_t local_ws[3], global_ws[3], temp;
  cl_int ecode;

  int d0 = grid_points[0];
  int d1 = grid_points[1];
  int d2 = grid_points[2];

  //-----------------------------------------------------------------------
  k_initialize1 = clCreateKernel(p_initialize, "initialize1", &ecode);
  clu_CheckError(ecode, "clCreateKernel()");
  
  ecode  = clSetKernelArg(k_initialize1, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_initialize1, 1, sizeof(int), &d0);
  ecode |= clSetKernelArg(k_initialize1, 2, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_initialize1, 3, sizeof(int), &d2);
  clu_CheckError(ecode, "clSetKernelArg()");

  local_ws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0];
  temp = max_work_group_size / local_ws[0];
  local_ws[1] = d2 < temp ? d2 : temp;

  global_ws[0] = clu_RoundWorkSize((size_t)d1, local_ws[0]);
  global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]);

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_initialize1,
                                 2, NULL,
                                 global_ws,
                                 local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  //-----------------------------------------------------------------------

  //---------------------------------------------------------------------
  // first store the "interpolated" values everywhere on the grid    
  //---------------------------------------------------------------------
  k_initialize2 = clCreateKernel(p_initialize, "initialize2", &ecode);
  clu_CheckError(ecode, "clCreateKernel()");
  
  ecode  = clSetKernelArg(k_initialize2, 0, sizeof(cl_mem), &m_u);
  ecode  = clSetKernelArg(k_initialize2, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_initialize2, 2, sizeof(int), &d0);
  ecode |= clSetKernelArg(k_initialize2, 3, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_initialize2, 4, sizeof(int), &d2);
  clu_CheckError(ecode, "clSetKernelArg()");

  if (INITIALIZE2_DIM == 3) {
    local_ws[0] = d0 < work_item_sizes[0] ? d0 : work_item_sizes[0];
    temp = max_work_group_size / local_ws[0];
    local_ws[1] = d1 < temp ? d1 : temp;
    temp = temp / local_ws[1];
    local_ws[2] = d2 < temp ? d2 : temp;

    global_ws[0] = clu_RoundWorkSize((size_t)d0, local_ws[0]);
    global_ws[1] = clu_RoundWorkSize((size_t)d1, local_ws[1]);
    global_ws[2] = clu_RoundWorkSize((size_t)d2, local_ws[2]);
  } else if (INITIALIZE2_DIM == 2) {
    local_ws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0];
    temp = max_work_group_size / local_ws[0];
    local_ws[1] = d2 < temp ? d2 : temp;

    global_ws[0] = clu_RoundWorkSize((size_t)d1, local_ws[0]);
    global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]);
  }
  
  CHECK_FINISH();
  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_initialize2,
                                 INITIALIZE2_DIM, NULL,
                                 global_ws,
                                 local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  //-----------------------------------------------------------------------

  //---------------------------------------------------------------------
  // now store the exact values on the boundaries        
  //---------------------------------------------------------------------
  k_initialize3 = clCreateKernel(p_initialize, "initialize3", &ecode);
  clu_CheckError(ecode, "clCreateKernel()");
  
  ecode  = clSetKernelArg(k_initialize3, 0, sizeof(cl_mem), &m_u);
  ecode  = clSetKernelArg(k_initialize3, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_initialize3, 2, sizeof(int), &d0);
  ecode |= clSetKernelArg(k_initialize3, 3, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_initialize3, 4, sizeof(int), &d2);
  clu_CheckError(ecode, "clSetKernelArg()");

  local_ws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0];
  temp = max_work_group_size / local_ws[0];
  local_ws[1] = d2 < temp ? d2 : temp;

  global_ws[0] = clu_RoundWorkSize((size_t)d1, local_ws[0]);
  global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]);

  CHECK_FINISH();
  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_initialize3,
                                 2, NULL,
                                 global_ws,
                                 local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  //-----------------------------------------------------------------------

  k_initialize4 = clCreateKernel(p_initialize, "initialize4", &ecode);
  clu_CheckError(ecode, "clCreateKernel()");
  
  ecode  = clSetKernelArg(k_initialize4, 0, sizeof(cl_mem), &m_u);
  ecode  = clSetKernelArg(k_initialize4, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_initialize4, 2, sizeof(int), &d0);
  ecode |= clSetKernelArg(k_initialize4, 3, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_initialize4, 4, sizeof(int), &d2);
  clu_CheckError(ecode, "clSetKernelArg()");

  local_ws[0] = d0 < work_item_sizes[0] ? d0 : work_item_sizes[0];
  temp = max_work_group_size / local_ws[0];
  local_ws[1] = d2 < temp ? d2 : temp;

  global_ws[0] = clu_RoundWorkSize((size_t)d0, local_ws[0]);
  global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]);

  CHECK_FINISH();
  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_initialize4,
                                 2, NULL,
                                 global_ws,
                                 local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  //-----------------------------------------------------------------------

  k_initialize5 = clCreateKernel(p_initialize, "initialize5", &ecode);
  clu_CheckError(ecode, "clCreateKernel()");
  
  ecode  = clSetKernelArg(k_initialize5, 0, sizeof(cl_mem), &m_u);
  ecode  = clSetKernelArg(k_initialize5, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_initialize5, 2, sizeof(int), &d0);
  ecode |= clSetKernelArg(k_initialize5, 3, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_initialize5, 4, sizeof(int), &d2);
  clu_CheckError(ecode, "clSetKernelArg()");

  local_ws[0] = d0 < work_item_sizes[0] ? d0 : work_item_sizes[0];
  temp = max_work_group_size / local_ws[0];
  local_ws[1] = d1 < temp ? d1 : temp;

  global_ws[0] = clu_RoundWorkSize((size_t)d0, local_ws[0]);
  global_ws[1] = clu_RoundWorkSize((size_t)d1, local_ws[1]);

  CHECK_FINISH();
  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_initialize5,
                                 2, NULL,
                                 global_ws,
                                 local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  //-----------------------------------------------------------------------

  clReleaseKernel(k_initialize1);
  clReleaseKernel(k_initialize2);
  clReleaseKernel(k_initialize3);
  clReleaseKernel(k_initialize4);
  CHECK_FINISH();
  clReleaseKernel(k_initialize5);
}
Example #11
0
int main(int argc, char *argv[]) 
{
  double Mops, t1, t2;
  double tsx, tsy, tm, an, tt, gc;
  double sx_verify_value, sy_verify_value, sx_err, sy_err;
  int    i, nit;
  int    k_offset, j;
  logical verified;

  char   size[16];

  FILE *fp;

  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
    exit(-1);
  }

  if ((fp = fopen("timer.flag", "r")) == NULL) {
    timers_enabled = false;
  } else {
    timers_enabled = true;
    fclose(fp);
  }

  //--------------------------------------------------------------------
  //  Because the size of the problem is too large to store in a 32-bit
  //  integer for some classes, we put it into a string (for printing).
  //  Have to strip off the decimal point put in there by the floating
  //  point print statement (internal file)
  //--------------------------------------------------------------------

  sprintf(size, "%15.0lf", pow(2.0, M+1));
  j = 14;
  if (size[j] == '.') j--;
  size[j+1] = '\0';
  printf("\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - EP Benchmark\n");
  printf("\n Number of random numbers generated: %15s\n", size);

  verified = false;

  //--------------------------------------------------------------------
  //  Compute the number of "batches" of random number pairs generated 
  //  per processor. Adjust if the number of processors does not evenly 
  //  divide the total number
  //--------------------------------------------------------------------

  np = NN; 

  setup_opencl(argc, argv);

  timer_clear(0);
  timer_start(0);

  //--------------------------------------------------------------------
  //  Compute AN = A ^ (2 * NK) (mod 2^46).
  //--------------------------------------------------------------------

  t1 = A;

  for (i = 0; i < MK + 1; i++) {
    t2 = randlc(&t1, t1);
  }

  an = t1;
  tt = S;

  //--------------------------------------------------------------------
  //  Each instance of this loop may be performed independently. We compute
  //  the k offsets separately to take into account the fact that some nodes
  //  have more numbers to generate than others
  //--------------------------------------------------------------------

  k_offset = -1;

  DTIMER_START(T_KERNEL_EMBAR);

  // Launch the kernel
  int q_size  = GROUP_SIZE * NQ * sizeof(cl_double);
  int sx_size = GROUP_SIZE * sizeof(cl_double);
  int sy_size = GROUP_SIZE * sizeof(cl_double);
  err_code  = clSetKernelArg(kernel, 0, q_size, NULL);
  err_code |= clSetKernelArg(kernel, 1, sx_size, NULL);
  err_code |= clSetKernelArg(kernel, 2, sy_size, NULL);
  err_code |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&pgq);
  err_code |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&pgsx);
  err_code |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&pgsy);
  err_code |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&k_offset);
  err_code |= clSetKernelArg(kernel, 7, sizeof(cl_double), (void*)&an);
  clu_CheckError(err_code, "clSetKernelArg()");
  
  size_t localWorkSize[] = { GROUP_SIZE };
  size_t globalWorkSize[] = { np };
  err_code = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL,
                                    globalWorkSize, 
                                    localWorkSize,
                                    0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueNDRangeKernel()");
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_EMBAR);

  double (*gq)[NQ] = (double (*)[NQ])malloc(gq_size);
  double *gsx = (double*)malloc(gsx_size);
  double *gsy = (double*)malloc(gsy_size);

  gc  = 0.0;
  tsx = 0.0;
  tsy = 0.0;

  for (i = 0; i < NQ; i++) {
    q[i] = 0.0;
  }

  // 9. Get the result
  DTIMER_START(T_BUFFER_READ);
  err_code = clEnqueueReadBuffer(cmd_queue, pgq, CL_FALSE, 0, gq_size, 
                                 gq, 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadbuffer()");

  err_code = clEnqueueReadBuffer(cmd_queue, pgsx, CL_FALSE, 0, gsx_size, 
                                 gsx, 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadbuffer()");

  err_code = clEnqueueReadBuffer(cmd_queue, pgsy, CL_TRUE, 0, gsy_size, 
                                 gsy, 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadbuffer()");
  DTIMER_STOP(T_BUFFER_READ);

  for (i = 0; i < np/localWorkSize[0]; i++) {
    for (j = 0; j < NQ; j++ ){
      q[j] = q[j] + gq[i][j];
    }
    tsx = tsx + gsx[i];
    tsy = tsy + gsy[i];
  }

  for (i = 0; i < NQ; i++) {
    gc = gc + q[i];
  }

  timer_stop(0);
  tm = timer_read(0);

  nit = 0;
  verified = true;
  if (M == 24) {
    sx_verify_value = -3.247834652034740e+3;
    sy_verify_value = -6.958407078382297e+3;
  } else if (M == 25) {
    sx_verify_value = -2.863319731645753e+3;
    sy_verify_value = -6.320053679109499e+3;
  } else if (M == 28) {
    sx_verify_value = -4.295875165629892e+3;
    sy_verify_value = -1.580732573678431e+4;
  } else if (M == 30) {
    sx_verify_value =  4.033815542441498e+4;
    sy_verify_value = -2.660669192809235e+4;
  } else if (M == 32) {
    sx_verify_value =  4.764367927995374e+4;
    sy_verify_value = -8.084072988043731e+4;
  } else if (M == 36) {
    sx_verify_value =  1.982481200946593e+5;
    sy_verify_value = -1.020596636361769e+5;
  } else if (M == 40) {
    sx_verify_value = -5.319717441530e+05;
    sy_verify_value = -3.688834557731e+05;
  } else {
    verified = false;
  }

  if (verified) {
    sx_err = fabs((tsx - sx_verify_value) / sx_verify_value);
    sy_err = fabs((tsy - sy_verify_value) / sy_verify_value);
    verified = ((sx_err <= EPSILON) && (sy_err <= EPSILON));
  }

  Mops = pow(2.0, M+1) / tm / 1000000.0;

  printf("\nEP Benchmark Results:\n\n");
  printf("CPU Time =%10.4lf\n", tm);
  printf("N = 2^%5d\n", M);
  printf("No. Gaussian Pairs = %15.0lf\n", gc);
  printf("Sums = %25.15lE %25.15lE\n", tsx, tsy);
  printf("Counts: \n");
  for (i = 0; i < NQ; i++) {
    printf("%3d%15.0lf\n", i, q[i]);
  }

  c_print_results("EP", CLASS, M+1, 0, 0, nit,
      tm, Mops, 
      "Random numbers generated",
      verified, NPBVERSION, COMPILETIME, 
      CS1, CS2, CS3, CS4, CS5, CS6, CS7,
      clu_GetDeviceTypeName(device_type), device_name);

  if (timers_enabled) {
    if (tm <= 0.0) tm = 1.0;
    tt = timer_read(0);
    printf("\nTotal time:     %9.3lf (%6.2lf)\n", tt, tt*100.0/tm);
  }

  free(gq);
  free(gsx);
  free(gsy);
  release_opencl();

  fflush(stdout);

  return 0;
}
Example #12
0
//---------------------------------------------------------------------
// compute the right hand side based on exact solution
//---------------------------------------------------------------------
void exact_rhs()
{
  int c, i;
  int range_1, range_0;
  size_t d[3], local_ws[3], global_ws[3];
  cl_int ecode = 0;

  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      ecode  = clSetKernelArg(k_exact_rhs1[i], 0, sizeof(cl_mem), &m_forcing[i]);
      ecode |= clSetKernelArg(k_exact_rhs1[i], 1, sizeof(int), &c);
      ecode |= clSetKernelArg(k_exact_rhs1[i], 2, sizeof(int), &cell_size[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs1[i], 3, sizeof(int), &cell_size[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs1[i], 4, sizeof(int), &cell_size[i][c][0]);
      clu_CheckError(ecode, "clSetKernelArg() for exact_rhs1");

      d[0] = cell_size[i][c][1];
      d[1] = cell_size[i][c][2];
      compute_ws_dim_2(d, local_ws, global_ws);

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_exact_rhs1[i],
                                     2, NULL,
                                     global_ws,
                                     local_ws,
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for exact_rhs1");
    }

    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }

  }

  for (i = 0; i < num_devices; i++) {
    ecode  = clSetKernelArg(k_exact_rhs2[i], 0, sizeof(cl_mem), &m_forcing[i]);
    ecode |= clSetKernelArg(k_exact_rhs2[i], 1, sizeof(cl_mem), &m_ce[i]);
    clu_CheckError(ecode, "clSetKernelArg() for exact_rhs2");
  }

  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      range_1 = cell_size[i][c][2] - cell_end[i][c][2];
      range_0 = cell_size[i][c][1] - cell_end[i][c][1];

      ecode  = clSetKernelArg(k_exact_rhs2[i], 2, sizeof(int), &c);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 3, sizeof(int), &cell_start[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 4, sizeof(int), &range_1);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 5, sizeof(int), &cell_start[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 6, sizeof(int), &range_0);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 7, sizeof(int), &cell_size[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 8, sizeof(int), &cell_start[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 9, sizeof(int), &cell_end[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 10, sizeof(int), &cell_low[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 11, sizeof(int), &cell_low[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 12, sizeof(int), &cell_low[i][c][0]);
      clu_CheckError(ecode, "clSetKernelArg() for exact_rhs2");

      d[0] = cell_size[i][c][1] - cell_start[i][c][1] - cell_end[i][c][1];
      d[1] = cell_size[i][c][2] - cell_start[i][c][2] - cell_end[i][c][2];
      compute_ws_dim_2(d, local_ws, global_ws);

      if (c == 0) CHECK_FINISH(i * 2);

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_exact_rhs2[i],
                                     2, NULL,
                                     global_ws,
                                     local_ws,
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for exact_rhs2");
    }
    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }
  }

  for (i = 0; i < num_devices; i++) {
    ecode  = clSetKernelArg(k_exact_rhs3[i], 0, sizeof(cl_mem), &m_forcing[i]);
    ecode |= clSetKernelArg(k_exact_rhs3[i], 1, sizeof(cl_mem), &m_ce[i]);
    clu_CheckError(ecode, "clSetKernelArg() for exact_rhs3");
  }

  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      range_1 = cell_size[i][c][2] - cell_end[i][c][2];
      range_0 = cell_size[i][c][0] - cell_end[i][c][0];

      ecode  = clSetKernelArg(k_exact_rhs3[i], 2, sizeof(int), &c);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 3, sizeof(int), &cell_start[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 4, sizeof(int), &range_1);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 5, sizeof(int), &cell_start[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 6, sizeof(int), &range_0);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 7, sizeof(int), &cell_size[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 8, sizeof(int), &cell_start[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 9, sizeof(int), &cell_end[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 10, sizeof(int), &cell_low[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 11, sizeof(int), &cell_low[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 12, sizeof(int), &cell_low[i][c][0]);
      clu_CheckError(ecode, "clSetKernelArg() for exact_rhs3");

      d[0] = cell_size[i][c][0] - cell_start[i][c][0] - cell_end[i][c][0];
      d[1] = cell_size[i][c][2] - cell_start[i][c][2] - cell_end[i][c][2];
      compute_ws_dim_2(d, local_ws, global_ws);

      if (c == 0) CHECK_FINISH(i * 2);

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_exact_rhs3[i],
                                     2, NULL,
                                     global_ws,
                                     local_ws,
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for exact_rhs3");
    }
    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }
  }

  for (i = 0; i < num_devices; i++) {
    ecode  = clSetKernelArg(k_exact_rhs4[i], 0, sizeof(cl_mem), &m_forcing[i]);
    ecode |= clSetKernelArg(k_exact_rhs4[i], 1, sizeof(cl_mem), &m_ce[i]);
    clu_CheckError(ecode, "clSetKernelArg() for exact_rhs4");
  }

  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      range_1 = cell_size[i][c][1] - cell_end[i][c][1];
      range_0 = cell_size[i][c][0] - cell_end[i][c][0];

      ecode  = clSetKernelArg(k_exact_rhs4[i], 2, sizeof(int), &c);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 3, sizeof(int), &cell_start[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 4, sizeof(int), &range_1);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 5, sizeof(int), &cell_start[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 6, sizeof(int), &range_0);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 7, sizeof(int), &cell_size[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 8, sizeof(int), &cell_start[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 9, sizeof(int), &cell_end[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 10, sizeof(int), &cell_low[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 11, sizeof(int), &cell_low[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 12, sizeof(int), &cell_low[i][c][0]);
      clu_CheckError(ecode, "clSetKernelArg() for exact_rhs4");

      d[0] = cell_size[i][c][0] - cell_start[i][c][0] - cell_end[i][c][0];
      d[1] = cell_size[i][c][1] - cell_start[i][c][1] - cell_end[i][c][1];
      compute_ws_dim_2(d, local_ws, global_ws);

      if (c == 0) CHECK_FINISH(i * 2);

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_exact_rhs4[i],
                                     2, NULL,
                                     global_ws,
                                     local_ws,
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for exact_rhs4");
    }
    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }
  }

  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      range_1 = cell_size[i][c][2] - cell_end[i][c][2];
      range_0 = cell_size[i][c][1] - cell_end[i][c][1];

      ecode  = clSetKernelArg(k_exact_rhs5[i], 0, sizeof(cl_mem), &m_forcing[i]);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 1, sizeof(int), &c);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 2, sizeof(int), &cell_start[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 3, sizeof(int), &range_1);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 4, sizeof(int), &cell_start[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 5, sizeof(int), &range_0);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 6, sizeof(int), &cell_size[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 7, sizeof(int), &cell_start[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 8, sizeof(int), &cell_end[i][c][0]);
      clu_CheckError(ecode, "clSetKernelArg() for exact_rhs5");

      d[0] = cell_size[i][c][1] - cell_start[i][c][1] - cell_end[i][c][1];
      d[1] = cell_size[i][c][2] - cell_start[i][c][2] - cell_end[i][c][2];
      compute_ws_dim_2(d, local_ws, global_ws);

      if (c == 0) CHECK_FINISH(i * 2);

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_exact_rhs5[i],
                                     2, NULL,
                                     global_ws,
                                     local_ws,
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for exact_rhs5");
    }
    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }
  }

  for (i = 0; i < num_devices; i++)
    CHECK_FINISH(i * 2);
}
Example #13
0
void compute_rhs()
{
  int i;
  size_t d0_size, d1_size, d2_size;
  cl_int ecode;

  if (timeron) timer_start(t_rhs);

  //-------------------------------------------------------------------------
  // compute the reciprocal of density, and the kinetic energy, 
  // and the speed of sound. 
  //-------------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    size_t compute_rhs1_lws[3], compute_rhs1_gws[3], temp;

    if (COMPUTE_RHS1_DIM == 3) {
      d0_size = max_cell_size[i][1] + 2;
      d1_size = max_cell_size[i][2] + 2;
      d2_size = ncells;

      compute_rhs1_lws[0] = d0_size < work_item_sizes[0] ?
                            d0_size : work_item_sizes[0];
      temp = max_work_group_size / compute_rhs1_lws[0];
      compute_rhs1_lws[1] = d1_size < temp ? d1_size : temp;
      temp = temp / compute_rhs1_lws[1];
      compute_rhs1_lws[2] = d2_size < temp ? d2_size : temp;
      compute_rhs1_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs1_lws[0]);
      compute_rhs1_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs1_lws[1]);
      compute_rhs1_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs1_lws[2]);
    } else {
      d0_size = max_cell_size[i][2] + 2;
      d1_size = ncells;

      compute_rhs1_lws[0] = d0_size < work_item_sizes[0] ?
                            d0_size : work_item_sizes[0];
      temp = max_work_group_size / compute_rhs1_lws[0];
      compute_rhs1_lws[1] = d1_size < temp ? d1_size : temp;
      compute_rhs1_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs1_lws[0]);
      compute_rhs1_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs1_lws[1]);
    }

    ecode  = clSetKernelArg(k_compute_rhs1[i], 0, sizeof(cl_mem), &m_u[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 1, sizeof(cl_mem), &m_us[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 2, sizeof(cl_mem), &m_vs[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 3, sizeof(cl_mem), &m_ws[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 4, sizeof(cl_mem), &m_qs[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 5, sizeof(cl_mem),
                                                  &m_rho_i[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 6, sizeof(cl_mem),
                                                  &m_square[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 7, sizeof(cl_mem),
                                                  &m_cell_size[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 8, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_compute_rhs1[i],
                                   COMPUTE_RHS1_DIM, NULL,
                                   compute_rhs1_gws,
                                   compute_rhs1_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-------------------------------------------------------------------------

  //-------------------------------------------------------------------------
  // copy the exact forcing term to the right hand side;  because 
  // this forcing term is known, we can store it on the whole of every 
  // cell,  including the boundary                   
  //-------------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    size_t compute_rhs2_lws[3], compute_rhs2_gws[3], temp;

    if (COMPUTE_RHS2_DIM == 3) {
      d0_size = max_cell_size[i][1];
      d1_size = max_cell_size[i][2];
      d2_size = ncells;

      compute_rhs2_lws[0] = d0_size < work_item_sizes[0] ?
                            d0_size : work_item_sizes[0];
      temp = max_work_group_size / compute_rhs2_lws[0];
      compute_rhs2_lws[1] = d1_size < temp ? d1_size : temp;
      temp = temp / compute_rhs2_lws[1];
      compute_rhs2_lws[2] = d2_size < temp ? d2_size : temp;
      compute_rhs2_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs2_lws[0]);
      compute_rhs2_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs2_lws[1]);
      compute_rhs2_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs2_lws[2]);
    } else {
      d0_size = max_cell_size[i][2];
      d1_size = ncells;

      compute_rhs2_lws[0] = d0_size < work_item_sizes[0] ?
                            d0_size : work_item_sizes[0];
      temp = max_work_group_size / compute_rhs2_lws[0];
      compute_rhs2_lws[1] = d1_size < temp ? d1_size : temp;
      compute_rhs2_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs2_lws[0]);
      compute_rhs2_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs2_lws[1]);
    }

    ecode  = clSetKernelArg(k_compute_rhs2[i], 0, sizeof(cl_mem),
                                                  &m_forcing[i]);
    ecode |= clSetKernelArg(k_compute_rhs2[i], 1, sizeof(cl_mem), &m_rhs[i]);
    ecode |= clSetKernelArg(k_compute_rhs2[i], 2, sizeof(cl_mem),
                                                  &m_cell_size[i]);
    ecode |= clSetKernelArg(k_compute_rhs2[i], 3, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_compute_rhs2[i],
                                   COMPUTE_RHS2_DIM, NULL,
                                   compute_rhs2_gws,
                                   compute_rhs2_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-------------------------------------------------------------------------

  //-------------------------------------------------------------------------
  // compute xi-direction fluxes 
  //-------------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    size_t compute_rhs3_lws[3], compute_rhs3_gws[3], temp;

    d0_size = max_cell_size[i][1];
    d1_size = max_cell_size[i][2];
    d2_size = ncells;

    compute_rhs3_lws[0] = d0_size < work_item_sizes[0] ?
                          d0_size : work_item_sizes[0];
    temp = max_work_group_size / compute_rhs3_lws[0];
    compute_rhs3_lws[1] = d1_size < temp ? d1_size : temp;
    temp = temp / compute_rhs3_lws[1];
    compute_rhs3_lws[2] = d2_size < temp ? d2_size : temp;
    compute_rhs3_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs3_lws[0]);
    compute_rhs3_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs3_lws[1]);
    compute_rhs3_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs3_lws[2]);

    ecode  = clSetKernelArg(k_compute_rhs3[i], 0, sizeof(cl_mem), &m_u[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 1, sizeof(cl_mem), &m_us[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 2, sizeof(cl_mem), &m_vs[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 3, sizeof(cl_mem), &m_ws[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 4, sizeof(cl_mem), &m_qs[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 5, sizeof(cl_mem),
                                                  &m_rho_i[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 6, sizeof(cl_mem),
                                                  &m_square[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 7, sizeof(cl_mem), &m_rhs[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 8, sizeof(cl_mem),
                                                  &m_cell_size[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 9, sizeof(cl_mem),&m_start[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 10, sizeof(cl_mem), &m_end[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 11, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_compute_rhs3[i],
                                   3, NULL,
                                   compute_rhs3_gws,
                                   compute_rhs3_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-------------------------------------------------------------------------

  //-------------------------------------------------------------------------
  // compute eta-direction fluxes 
  //-------------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    size_t compute_rhs4_lws[3], compute_rhs4_gws[3], temp;

    d0_size = max_cell_size[i][0];
    d1_size = max_cell_size[i][2];
    d2_size = ncells;

    compute_rhs4_lws[0] = d0_size < work_item_sizes[0] ?
                          d0_size : work_item_sizes[0];
    temp = max_work_group_size / compute_rhs4_lws[0];
    compute_rhs4_lws[1] = d1_size < temp ? d1_size : temp;
    temp = temp / compute_rhs4_lws[1];
    compute_rhs4_lws[2] = d2_size < temp ? d2_size : temp;
    compute_rhs4_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs4_lws[0]);
    compute_rhs4_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs4_lws[1]);
    compute_rhs4_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs4_lws[2]);

    ecode  = clSetKernelArg(k_compute_rhs4[i], 0, sizeof(cl_mem), &m_u[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 1, sizeof(cl_mem), &m_us[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 2, sizeof(cl_mem), &m_vs[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 3, sizeof(cl_mem), &m_ws[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 4, sizeof(cl_mem), &m_qs[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 5, sizeof(cl_mem),
                                                  &m_rho_i[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 6, sizeof(cl_mem),
                                                  &m_square[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 7, sizeof(cl_mem), &m_rhs[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 8, sizeof(cl_mem),
                                                  &m_cell_size[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 9, sizeof(cl_mem),&m_start[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 10, sizeof(cl_mem), &m_end[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 11, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_compute_rhs4[i],
                                   3, NULL,
                                   compute_rhs4_gws,
                                   compute_rhs4_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-------------------------------------------------------------------------

  //-------------------------------------------------------------------------
  // compute zeta-direction fluxes 
  //-------------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    size_t compute_rhs5_lws[3], compute_rhs5_gws[3], temp;

    d0_size = max_cell_size[i][0];
    d1_size = max_cell_size[i][1];
    d2_size = ncells;

    compute_rhs5_lws[0] = d0_size < work_item_sizes[0] ?
                          d0_size : work_item_sizes[0];
    temp = max_work_group_size / compute_rhs5_lws[0];
    compute_rhs5_lws[1] = d1_size < temp ? d1_size : temp;
    temp = temp / compute_rhs5_lws[1];
    compute_rhs5_lws[2] = d2_size < temp ? d2_size : temp;
    compute_rhs5_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs5_lws[0]);
    compute_rhs5_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs5_lws[1]);
    compute_rhs5_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs5_lws[2]);

    ecode  = clSetKernelArg(k_compute_rhs5[i], 0, sizeof(cl_mem), &m_u[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 1, sizeof(cl_mem), &m_us[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 2, sizeof(cl_mem), &m_vs[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 3, sizeof(cl_mem), &m_ws[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 4, sizeof(cl_mem), &m_qs[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 5, sizeof(cl_mem),
                                                  &m_rho_i[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 6, sizeof(cl_mem),
                                                  &m_square[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 7, sizeof(cl_mem), &m_rhs[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 8, sizeof(cl_mem),
                                                  &m_cell_size[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 9, sizeof(cl_mem),&m_start[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 10, sizeof(cl_mem), &m_end[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 11, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_compute_rhs5[i],
                                   3, NULL,
                                   compute_rhs5_gws,
                                   compute_rhs5_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-------------------------------------------------------------------------

  //-------------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    size_t compute_rhs6_lws[3], compute_rhs6_gws[3], temp;

    if (COMPUTE_RHS6_DIM == 3) {
      d0_size = max_cell_size[i][1];
      d1_size = max_cell_size[i][2];
      d2_size = ncells;

      compute_rhs6_lws[0] = d0_size < work_item_sizes[0] ?
                            d0_size : work_item_sizes[0];
      temp = max_work_group_size / compute_rhs6_lws[0];
      compute_rhs6_lws[1] = d1_size < temp ? d1_size : temp;
      temp = temp / compute_rhs6_lws[1];
      compute_rhs6_lws[2] = d2_size < temp ? d2_size : temp;
      compute_rhs6_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs6_lws[0]);
      compute_rhs6_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs6_lws[1]);
      compute_rhs6_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs6_lws[2]);
    } else {
      d0_size = max_cell_size[i][2];
      d1_size = ncells;

      compute_rhs6_lws[0] = d0_size < work_item_sizes[0] ?
                            d0_size : work_item_sizes[0];
      temp = max_work_group_size / compute_rhs6_lws[0];
      compute_rhs6_lws[1] = d1_size < temp ? d1_size : temp;
      compute_rhs6_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs6_lws[0]);
      compute_rhs6_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs6_lws[1]);
    }

    ecode  = clSetKernelArg(k_compute_rhs6[i], 0, sizeof(cl_mem), &m_rhs[i]);
    ecode |= clSetKernelArg(k_compute_rhs6[i], 1, sizeof(cl_mem),
                                                  &m_cell_size[i]);
    ecode |= clSetKernelArg(k_compute_rhs6[i], 2, sizeof(cl_mem),&m_start[i]);
    ecode |= clSetKernelArg(k_compute_rhs6[i], 3, sizeof(cl_mem), &m_end[i]);
    ecode |= clSetKernelArg(k_compute_rhs6[i], 4, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_compute_rhs6[i],
                                   COMPUTE_RHS6_DIM, NULL,
                                   compute_rhs6_gws,
                                   compute_rhs6_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-------------------------------------------------------------------------

  CHECK_FINISH();
  if (timeron) timer_stop(t_rhs);
}
Example #14
0
//---------------------------------------------------------------------
// this function copies the face values of a variable defined on a set 
// of cells to the overlap locations of the adjacent sets of cells. 
// Because a set of cells interfaces in each direction with exactly one 
// other set, we only need to fill six different buffers. We could try to 
// overlap communication with computation, by computing
// some internal values while communicating boundary values, but this
// adds so much overhead that it's not clearly useful. 
//---------------------------------------------------------------------
void copy_faces()
{
  int c, i;
  cl_int ecode = 0;

  //---------------------------------------------------------------------
  // exit immediately if there are no faces to be copied           
  //---------------------------------------------------------------------
  if (num_devices == 1) {
    compute_rhs();
    return;
  }

  //---------------------------------------------------------------------
  // because the difference stencil for the diagonalized scheme is 
  // orthogonal, we do not have to perform the staged copying of faces, 
  // but can send all face information simultaneously to the neighboring 
  // cells in all directions          
  //---------------------------------------------------------------------
  if (timeron) timer_start(t_bpack);

  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_copy_faces1[i][c],
                                     COPY_FACES1_DIM, NULL,
                                     copy_faces1_gw[i][c],
                                     copy_faces1_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces1");
    }

    for (i = 0; i < num_devices; i++) {
      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_copy_faces2[i][c],
                                     COPY_FACES2_DIM, NULL,
                                     copy_faces2_gw[i][c],
                                     copy_faces2_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces2");

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_copy_faces3[i][c],
                                     COPY_FACES3_DIM, NULL,
                                     copy_faces3_gw[i][c],
                                     copy_faces3_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces3");
    }

    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }

  }

  if (timeron) timer_stop(t_bpack);

  if (timeron) timer_start(t_exch);
  for (i = 0; i < num_devices; i++) {
    CHECK_FINISH(i * 2);

    ecode = clEnqueueCopyBuffer(cmd_queue[successor[i][0] * 2 + 1],
                                m_out_buffer[i],
                                m_in_buffer[successor[i][0]],
                                start_send_east[i]*sizeof(double),
                                start_recv_west[successor[i][0]]*sizeof(double),
                                east_size[i]*sizeof(double),
                                0, NULL, NULL);
        CHECK_FINISH(successor[i][0] * 2 + 1);
  }

  for (i = 0; i < num_devices; i++) {
    ecode = clEnqueueCopyBuffer(cmd_queue[predecessor[i][0] * 2 + 1],
                                m_out_buffer[i],
                                m_in_buffer[predecessor[i][0]],
                                start_send_west[i]*sizeof(double),
                                start_recv_east[predecessor[i][0]]*sizeof(double),
                                west_size[i]*sizeof(double),
                                0, NULL, NULL);

        CHECK_FINISH(predecessor[i][0] * 2 + 1);
    ecode = clEnqueueCopyBuffer(cmd_queue[successor[i][1] * 2 + 1],
                                m_out_buffer[i],
                                m_in_buffer[successor[i][1]],
                                start_send_north[i]*sizeof(double),
                                start_recv_south[successor[i][1]]*sizeof(double),
                                north_size[i]*sizeof(double),
                                0, NULL, NULL);

        CHECK_FINISH(successor[i][1] * 2 + 1);
    ecode = clEnqueueCopyBuffer(cmd_queue[predecessor[i][1] * 2 + 1],
                                m_out_buffer[i],
                                m_in_buffer[predecessor[i][1]],
                                start_send_south[i]*sizeof(double),
                                start_recv_north[predecessor[i][1]]*sizeof(double),
                                south_size[i]*sizeof(double),
                                0, NULL, NULL);

        CHECK_FINISH(predecessor[i][1] * 2 + 1);
    ecode = clEnqueueCopyBuffer(cmd_queue[successor[i][2] * 2 + 1],
                                m_out_buffer[i],
                                m_in_buffer[successor[i][2]],
                                start_send_top[i]*sizeof(double),
                                start_recv_bottom[successor[i][2]]*sizeof(double),
                                top_size[i]*sizeof(double),
                                0, NULL, NULL);

        CHECK_FINISH(successor[i][2] * 2 + 1);
    ecode = clEnqueueCopyBuffer(cmd_queue[predecessor[i][2] * 2 + 1],
                                m_out_buffer[i],
                                m_in_buffer[predecessor[i][2]],
                                start_send_bottom[i]*sizeof(double),
                                start_recv_top[predecessor[i][2]]*sizeof(double),
                                bottom_size[i]*sizeof(double),
                                0, NULL, NULL);
        CHECK_FINISH(predecessor[i][2] * 2 + 1);
  }
  if (timeron) timer_stop(t_exch);

  //---------------------------------------------------------------------
  // unpack the data that has just been received;             
  //---------------------------------------------------------------------
  if (timeron) timer_start(t_bpack);

  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      if (c == 0) CHECK_FINISH(i * 2 + 1);

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_copy_faces4[i][c],
                                     COPY_FACES4_DIM, NULL,
                                     copy_faces4_gw[i][c],
                                     copy_faces4_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces4");
    }

    for (i = 0; i < num_devices; i++) {
      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_copy_faces5[i][c],
                                     COPY_FACES5_DIM, NULL,
                                     copy_faces5_gw[i][c],
                                     copy_faces5_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces5");

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_copy_faces6[i][c],
                                     COPY_FACES6_DIM, NULL,
                                     copy_faces6_gw[i][c],
                                     copy_faces6_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces6");
    }

    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }
  }

  if (timeron) timer_stop(t_bpack);

  for (i = 0; i < num_devices; i++)
    CHECK_FINISH(i * 2);

  //---------------------------------------------------------------------
  // now that we have all the data, compute the rhs
  //---------------------------------------------------------------------
  compute_rhs();
}
Example #15
0
void rank( int iteration )
{
  size_t r1_lws[1], r1_gws[1];
  size_t r2_lws[1], r2_gws[1];
  size_t r3_lws[1], r3_gws[1];
  cl_int ecode;

  DTIMER_START(T_KERNEL_RANK0);
  // rank0
  ecode = clSetKernelArg(k_rank0, 3, sizeof(cl_int), (void*)&iteration);
  clu_CheckError(ecode, "clSetKernelArg() for rank0: iteration");

  ecode = clEnqueueTask(cmd_queue, k_rank0, 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueTask() for rank0");
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_RANK0);

  DTIMER_START(T_KERNEL_RANK1);
  // rank1
  r1_lws[0] = RANK1_GROUP_SIZE;
  r1_gws[0] = RANK1_GLOBAL_SIZE;
  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_rank1,
                                 1, NULL,
                                 r1_gws, 
                                 r1_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank1");
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_RANK1);

  DTIMER_START(T_KERNEL_RANK2);
  // rank2
  r2_lws[0] = RANK2_GROUP_SIZE;
  r2_gws[0] = RANK2_GLOBAL_SIZE;
  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_rank2,
                                 1, NULL,
                                 r2_gws, 
                                 r2_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank2");
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_RANK2);

  DTIMER_START(T_KERNEL_RANK3);
  // rank3
  if (device_type == CL_DEVICE_TYPE_GPU) {
    r3_lws[0] = work_item_sizes[0];
    r3_gws[0] = work_item_sizes[0] * work_item_sizes[0];
    if (r3_gws[0] > MAX_KEY) r3_gws[0] = MAX_KEY;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_rank3_0,
                                   1, NULL,
                                   r3_gws, 
                                   r3_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3_0");

    r3_lws[0] = work_item_sizes[0];
    r3_gws[0] = work_item_sizes[0];
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_rank3_1,
                                   1, NULL,
                                   r3_gws, 
                                   r3_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3_1");

    r3_lws[0] = work_item_sizes[0];
    r3_gws[0] = work_item_sizes[0] * work_item_sizes[0];
    if (r3_gws[0] > MAX_KEY) r3_gws[0] = MAX_KEY;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_rank3_2,
                                   1, NULL,
                                   r3_gws, 
                                   r3_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3_2");
  } else {
    r3_lws[0] = RANK_GROUP_SIZE;
    r3_gws[0] = RANK_GLOBAL_SIZE;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_rank3,
                                   1, NULL,
                                   r3_gws, 
                                   r3_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3");
  }
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_RANK3);

  // rank4 - partial verification
  DTIMER_START(T_KERNEL_RANK4);
  ecode = clSetKernelArg(k_rank4, 4, sizeof(cl_int), (void*)&iteration);
  clu_CheckError(ecode, "clSetKernelArg() for rank4");

  ecode = clEnqueueTask(cmd_queue, k_rank4, 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueTask() for rank4");

  ecode = clFinish(cmd_queue);
  clu_CheckError(ecode, "clFinish");
  DTIMER_STOP(T_KERNEL_RANK4);
}
Example #16
0
void full_verify( void )
{
  cl_kernel k_fv1, k_fv2;
  cl_mem    m_j;
  INT_TYPE *g_j;
  INT_TYPE j = 0, i;
  size_t j_size;
  size_t fv1_lws[1], fv1_gws[1];
  size_t fv2_lws[1], fv2_gws[1];
  cl_int ecode;

  DTIMER_START(T_BUFFER_CREATE);
  // Create buffers
  j_size = sizeof(INT_TYPE) * (FV2_GLOBAL_SIZE / FV2_GROUP_SIZE);
  m_j = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       j_size,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer for m_j");
  DTIMER_STOP(T_BUFFER_CREATE);

  DTIMER_START(T_OPENCL_API);
  // Create kernels
  k_fv1 = clCreateKernel(program, "full_verify1", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for full_verify1");

  k_fv2 = clCreateKernel(program, "full_verify2", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for full_verify2");
  DTIMER_STOP(T_OPENCL_API);

  if (device_type == CL_DEVICE_TYPE_GPU) {
    cl_kernel k_fv0;
    size_t fv0_lws[1], fv0_gws[1];

    DTIMER_START(T_OPENCL_API);
    // Create kernels
    k_fv0 = clCreateKernel(program, "full_verify0", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for full_verify0");
    DTIMER_STOP(T_OPENCL_API);

    // Kernel execution
    DTIMER_START(T_KERNEL_FV0);
    ecode  = clSetKernelArg(k_fv0, 0, sizeof(cl_mem), (void*)&m_key_array);
    ecode |= clSetKernelArg(k_fv0, 1, sizeof(cl_mem), (void*)&m_key_buff2);
    clu_CheckError(ecode, "clSetKernelArg() for full_verify0");

    fv0_lws[0] = work_item_sizes[0];
    fv0_gws[0] = NUM_KEYS;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_fv0,
                                   1, NULL,
                                   fv0_gws, 
                                   fv0_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify0");
    CHECK_FINISH();
    DTIMER_STOP(T_KERNEL_FV0);

    DTIMER_START(T_KERNEL_FV1);
    ecode  = clSetKernelArg(k_fv1, 0, sizeof(cl_mem), (void*)&m_key_buff2);
    ecode |= clSetKernelArg(k_fv1, 1, sizeof(cl_mem), (void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_fv1, 2, sizeof(cl_mem), (void*)&m_key_array);
    clu_CheckError(ecode, "clSetKernelArg() for full_verify1");

    fv1_lws[0] = work_item_sizes[0];
    fv1_gws[0] = NUM_KEYS;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_fv1,
                                   1, NULL,
                                   fv1_gws, 
                                   fv1_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify1");
    CHECK_FINISH();
    DTIMER_STOP(T_KERNEL_FV1);

    DTIMER_START(T_KERNEL_FV2);
    ecode  = clSetKernelArg(k_fv2, 0, sizeof(cl_mem), (void*)&m_key_array);
    ecode |= clSetKernelArg(k_fv2, 1, sizeof(cl_mem), (void*)&m_j);
    ecode |= clSetKernelArg(k_fv2, 2, sizeof(INT_TYPE)*FV2_GROUP_SIZE, NULL);
    clu_CheckError(ecode, "clSetKernelArg() for full_verify2");

    fv2_lws[0] = FV2_GROUP_SIZE;
    fv2_gws[0] = FV2_GLOBAL_SIZE;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_fv2,
                                   1, NULL,
                                   fv2_gws, 
                                   fv2_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify2");
    CHECK_FINISH();
    DTIMER_STOP(T_KERNEL_FV2);

    g_j = (INT_TYPE *)malloc(j_size);

    DTIMER_START(T_BUFFER_READ);
    ecode = clEnqueueReadBuffer(cmd_queue,
                                m_j,
                                CL_TRUE,
                                0,
                                j_size,
                                g_j,
                                0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueReadBuffer() for m_j");
    DTIMER_STOP(T_BUFFER_READ);

    // reduction
    for (i = 0; i < j_size/sizeof(INT_TYPE); i++) {
      j += g_j[i];
    }

    DTIMER_START(T_RELEASE);
    clReleaseKernel(k_fv0);
    DTIMER_STOP(T_RELEASE);

  } else {

    // Kernel execution
    DTIMER_START(T_KERNEL_FV1);
    ecode  = clSetKernelArg(k_fv1, 0, sizeof(cl_mem), (void*)&m_bucket_ptrs);
    ecode |= clSetKernelArg(k_fv1, 1, sizeof(cl_mem), (void*)&m_key_buff2);
    ecode |= clSetKernelArg(k_fv1, 2, sizeof(cl_mem), (void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_fv1, 3, sizeof(cl_mem), (void*)&m_key_array);
    clu_CheckError(ecode, "clSetKernelArg() for full_verify1");

    fv1_lws[0] = RANK_GROUP_SIZE;
    fv1_gws[0] = RANK_GLOBAL_SIZE;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_fv1,
                                   1, NULL,
                                   fv1_gws, 
                                   fv1_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify1");
    CHECK_FINISH();
    DTIMER_STOP(T_KERNEL_FV1);

    DTIMER_START(T_KERNEL_FV2);
    ecode  = clSetKernelArg(k_fv2, 0, sizeof(cl_mem), (void*)&m_key_array);
    ecode |= clSetKernelArg(k_fv2, 1, sizeof(cl_mem), (void*)&m_j);
    ecode |= clSetKernelArg(k_fv2, 2, sizeof(INT_TYPE)*FV2_GROUP_SIZE, NULL);
    clu_CheckError(ecode, "clSetKernelArg() for full_verify2");

    fv2_lws[0] = FV2_GROUP_SIZE;
    fv2_gws[0] = FV2_GLOBAL_SIZE;
    ecode = clEnqueueNDRangeKernel(cmd_queue,
                                   k_fv2,
                                   1, NULL,
                                   fv2_gws, 
                                   fv2_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify2");
    CHECK_FINISH();
    DTIMER_STOP(T_KERNEL_FV2);

    g_j = (INT_TYPE *)malloc(j_size);

    DTIMER_START(T_BUFFER_READ);
    ecode = clEnqueueReadBuffer(cmd_queue,
                                m_j,
                                CL_TRUE,
                                0,
                                j_size,
                                g_j,
                                0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueReadBuffer() for m_j");
    DTIMER_STOP(T_BUFFER_READ);

    // reduction
    for (i = 0; i < j_size/sizeof(INT_TYPE); i++) {
      j += g_j[i];
    }

  }

  DTIMER_START(T_RELEASE);
  free(g_j);
  clReleaseMemObject(m_j);
  clReleaseKernel(k_fv1);
  clReleaseKernel(k_fv2);
  DTIMER_STOP(T_RELEASE);

  if (j != 0)
    printf( "Full_verify: number of keys out of sort: %ld\n", (long)j );
  else
    passed_verification++;
}
Example #17
0
void compute_rhs()
{
  int c, i;
  cl_int ecode = 0;

  if (timeron) timer_start(t_rhs);
  //---------------------------------------------------------------------
  // loop over all cells owned by this node                           
  //---------------------------------------------------------------------
  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_compute_rhs1[i][c],
                                     COMPUTE_RHS1_DIM, NULL,
                                     compute_rhs1_gw[i][c],
                                     compute_rhs1_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs1");
    }

    for (i = 0; i < num_devices; i++) {
      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_compute_rhs2[i][c],
                                     COMPUTE_RHS2_DIM, NULL,
                                     compute_rhs2_gw[i][c],
                                     compute_rhs2_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs2");

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_compute_rhs3[i][c],
                                     2, NULL,
                                     compute_rhs3_gw[i][c],
                                     compute_rhs3_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs3");

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_compute_rhs4[i][c],
                                     2, NULL,
                                     compute_rhs4_gw[i][c],
                                     compute_rhs4_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs4");

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_compute_rhs5[i][c],
                                     2, NULL,
                                     compute_rhs5_gw[i][c],
                                     compute_rhs5_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs5");

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_compute_rhs6[i][c],
                                     COMPUTE_RHS6_DIM, NULL,
                                     compute_rhs6_gw[i][c],
                                     compute_rhs6_lw[i][c],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs6");
    }
    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }
  }

  for (i = 0; i < num_devices; i++)
    CHECK_FINISH(i * 2);

  if (timeron) timer_stop(t_rhs);
}
Example #18
0
//---------------------------------------------------------------------
// this function performs the solution of the approximate factorization
// step in the x-direction for all five matrix components
// simultaneously. The Thomas algorithm is employed to solve the
// systems for the x-lines. Boundary conditions are non-periodic
//---------------------------------------------------------------------
void x_solve()
{
  int stage, i, c, buffer_size;
  cl_int ecode = 0;

  //---------------------------------------------------------------------
  // OK, now we know that there are multiple processors
  //---------------------------------------------------------------------

  //---------------------------------------------------------------------
  // now do a sweep on a layer-by-layer basis, i.e. sweeping through cells
  // on this node in the direction of increasing i for the forward sweep,
  // and after that reversing the direction for the backsubstitution.
  //---------------------------------------------------------------------

  if (timeron) timer_start(t_xsolve);
  //---------------------------------------------------------------------
  // FORWARD ELIMINATION  
  //---------------------------------------------------------------------
  for (stage = 1; stage <= ncells; stage++) {
    if (stage != 1) {
      //---------------------------------------------------------------------
      // communication has already been started. 
      // compute the left hand side while waiting for the msg
      //---------------------------------------------------------------------
      for (i = 0; i < num_devices; i++) {
        ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                       k_lhsx[i][slice[i][stage-1][0]],
                                       2, NULL,
                                       lhsx_gw[i][slice[i][stage-1][0]],
                                       lhsx_lw[i][slice[i][stage-1][0]],
                                       0, NULL, NULL);
        clu_CheckError(ecode, "clEnqueueNDRange() for lhsx");
      }
      for (i = 0; i < num_devices; i++) {
        CHECK_FINISH(i * 2);
      }

      //---------------------------------------------------------------------
      // wait for pending communication to complete
      // This waits on the current receive and on the send
      // from the previous stage. They always come in pairs. 
      //---------------------------------------------------------------------
      for (i = 0; i < num_devices; i++) {
        CHECK_FINISH(i * 2 + 1);
        ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                       k_x_solve1[i][slice[i][stage-1][0]],
                                       2, NULL,
                                       x_solve1_gw[i][slice[i][stage-1][0]],
                                       x_solve1_lw[i][slice[i][stage-1][0]],
                                       0, NULL, NULL);
        clu_CheckError(ecode, "clEnqueueNDRange() for x_solve1");
      }
      for (i = 0; i < num_devices; i++) {
        CHECK_FINISH(i * 2);
      }
    } else {            
      //---------------------------------------------------------------------
      // if this IS the first cell, we still compute the lhs
      //---------------------------------------------------------------------
      for (i = 0; i < num_devices; i++) {
        ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                       k_lhsx[i][slice[i][stage-1][0]],
                                       2, NULL,
                                       lhsx_gw[i][slice[i][stage-1][0]],
                                       lhsx_lw[i][slice[i][stage-1][0]],
                                       0, NULL, NULL);
        clu_CheckError(ecode, "clEnqueueNDRange() for lhsx");
      }
      for (i = 0; i < num_devices; i++) {
        CHECK_FINISH(i * 2);
      }
    }

    for (i = 0; i < num_devices; i++) {
      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_x_solve2[i][slice[i][stage-1][0]],
                                     2, NULL,
                                     x_solve2_gw[i][slice[i][stage-1][0]],
                                     x_solve2_lw[i][slice[i][stage-1][0]],
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for x_solve2");
    }
    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }

    //---------------------------------------------------------------------
    // send information to the next processor, except when this
    // is the last grid block
    //---------------------------------------------------------------------
    if (stage != ncells) {
      for (i = 0; i < num_devices; i++) {
        ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                       k_x_solve3[i][slice[i][stage-1][0]],
                                       2, NULL,
                                       x_solve3_gw[i][slice[i][stage-1][0]],
                                       x_solve3_lw[i][slice[i][stage-1][0]],
                                       0, NULL, NULL);
        clu_CheckError(ecode, "clEnqueueNDRange() for x_solve3");
      }
      for (i = 0; i < num_devices; i++) {
        CHECK_FINISH(i * 2);
      }

      //---------------------------------------------------------------------
      // send data to next phase
      // can't receive data yet because buffer size will be wrong 
      //---------------------------------------------------------------------
      for (i = 0; i < num_devices; i++) {
        CHECK_FINISH(i * 2);

        c      = slice[i][stage-1][0];
        buffer_size = (cell_size[i][c][1]-cell_start[i][c][1]-cell_end[i][c][1]) * 
                      (cell_size[i][c][2]-cell_start[i][c][2]-cell_end[i][c][2]);

        ecode = clEnqueueCopyBuffer(cmd_queue[successor[i][0] * 2 + 1],
                                    m_out_buffer[i],
                                    m_in_buffer[successor[i][0]],
                                    0, 0, 22*buffer_size*sizeof(double),
                                    0, NULL, NULL);
      }
    }
  }

  //---------------------------------------------------------------------
  // now go in the reverse direction                      
  //---------------------------------------------------------------------

  //---------------------------------------------------------------------
  // BACKSUBSTITUTION 
  //---------------------------------------------------------------------
  for (stage = ncells; stage >= 1; stage--) {
    if (stage != ncells) {
      //---------------------------------------------------------------------
      // communication has already been started
      // while waiting, do the block-diagonal inversion for the 
      // cell that was just finished                
      //---------------------------------------------------------------------
      for (i = 0; i < num_devices; i++) {
        ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                       k_ninvr[i][slice[i][stage][0]],
                                       NINVR_DIM, NULL,
                                       ninvr_gw[i][slice[i][stage][0]],
                                       ninvr_lw[i][slice[i][stage][0]],
                                       0, NULL, NULL);
        clu_CheckError(ecode, "clEnqueueNDRange() for ninvr");
      }
      for (i = 0; i < num_devices; i++) {
        CHECK_FINISH(i * 2);
      }


      //---------------------------------------------------------------------
      // wait for pending communication to complete
      //---------------------------------------------------------------------
      for (i = 0; i < num_devices; i++) {
        CHECK_FINISH(i * 2 + 1);
        ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                       k_x_solve4[i][slice[i][stage-1][0]],
                                       2, NULL,
                                       x_solve4_gw[i][slice[i][stage-1][0]],
                                       x_solve4_lw[i][slice[i][stage-1][0]],
                                       0, NULL, NULL);
        clu_CheckError(ecode, "clEnqueueNDRange() for x_solve4");
      }
      for (i = 0; i < num_devices; i++) {
        CHECK_FINISH(i * 2);
      }

    } else {
      for (i = 0; i < num_devices; i++) {
        ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                       k_x_solve5[i][slice[i][stage-1][0]],
                                       2, NULL,
                                       x_solve5_gw[i][slice[i][stage-1][0]],
                                       x_solve5_lw[i][slice[i][stage-1][0]],
                                       0, NULL, NULL);
        clu_CheckError(ecode, "clEnqueueNDRange() for x_solve5");
      }
      for (i = 0; i < num_devices; i++) {
        CHECK_FINISH(i * 2);
      }
    }

    //---------------------------------------------------------------------
    // send on information to the previous processor, if needed
    //---------------------------------------------------------------------
    if (stage != 1) {
      for (i = 0; i < num_devices; i++) {
        ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                       k_x_solve6[i][slice[i][stage-1][0]],
                                       X_SOLVE6_DIM, NULL,
                                       x_solve6_gw[i][slice[i][stage-1][0]],
                                       x_solve6_lw[i][slice[i][stage-1][0]],
                                       0, NULL, NULL);
        clu_CheckError(ecode, "clEnqueueNDRange() for x_solve6");
      }
      for (i = 0; i < num_devices; i++) {
        CHECK_FINISH(i * 2);
      }
      //---------------------------------------------------------------------
      // pack and send the buffer
      //---------------------------------------------------------------------
      for (i = 0; i < num_devices; i++) {
        CHECK_FINISH(i * 2);

        c      = slice[i][stage-1][0];
        buffer_size = (cell_size[i][c][1]-cell_start[i][c][1]-cell_end[i][c][1]) * 
                      (cell_size[i][c][2]-cell_start[i][c][2]-cell_end[i][c][2]);

        ecode = clEnqueueCopyBuffer(cmd_queue[predecessor[i][0] * 2 + 1],
                                    m_out_buffer[i],
                                    m_in_buffer[predecessor[i][0]],
                                    0, 0, 10*buffer_size*sizeof(double),
                                    0, NULL, NULL);
      }
    }

    //---------------------------------------------------------------------
    // If this was the last stage, do the block-diagonal inversion
    //---------------------------------------------------------------------
    if (stage == 1) {
      for (i = 0; i < num_devices; i++) {
        ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                       k_ninvr[i][slice[i][stage-1][0]],
                                       NINVR_DIM, NULL,
                                       ninvr_gw[i][slice[i][stage-1][0]],
                                       ninvr_lw[i][slice[i][stage-1][0]],
                                       0, NULL, NULL);
        clu_CheckError(ecode, "clEnqueueNDRange() for ninvr");
      }
      for (i = 0; i < num_devices; i++) {
        CHECK_FINISH(i * 2);
      }
    }
  }

  for (i = 0; i < num_devices; i++)
    CHECK_FINISH(i * 2);

  if (timeron) timer_stop(t_xsolve);
}