Code example #1
File: ft.c  Project: ashwinma/multicl
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//---------------------------------------------------------------------
static void setup_opencl(int argc, char *argv[])
{
  size_t temp;
  cl_int ecode;
  char *source_dir = "FT";
  if (argc > 1) source_dir = argv[1];

#ifdef TIMER_DETAIL
  if (timers_enabled) {
    int i;
    for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i);
  }
#endif

  DTIMER_START(T_OPENCL_API);

  // 1. Find the default device type and get a device for the device type
  device_type = clu_GetDefaultDeviceType();
  device      = clu_GetAvailableDevice(device_type);
  device_name = clu_GetDeviceName(device);

  // Device information
  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_ITEM_SIZES,
                          sizeof(work_item_sizes),
                          &work_item_sizes,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_GROUP_SIZE,
                          sizeof(size_t),
                          &max_work_group_size,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  // FIXME: The below values are experimental.
  if (max_work_group_size > 64) {
    max_work_group_size = 64;
    int i;
    for (i = 0; i < 3; i++) {
      if (work_item_sizes[i] > 64) {
        work_item_sizes[i] = 64;
      }
    }
  }

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_COMPUTE_UNITS,
                          sizeof(cl_uint),
                          &max_compute_units,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  // 2. Create a context for the specified device
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode);
  clu_CheckError(ecode, "clCreateContext()");

  // 3. Create a command queue
  cmd_queue = clCreateCommandQueue(context, device, 0, &ecode);
  clu_CheckError(ecode, "clCreateCommandQueue()");

  DTIMER_STOP(T_OPENCL_API);

  // 4. Build the program
  DTIMER_START(T_BUILD);
  char *source_file;
  char build_option[50];
  if (device_type == CL_DEVICE_TYPE_CPU) {
    source_file = "ft_cpu.cl";
    sprintf(build_option, "-I. -DCLASS=%d -DUSE_CPU", CLASS);

    COMPUTE_IMAP_DIM = COMPUTE_IMAP_DIM_CPU;
    EVOLVE_DIM = EVOLVE_DIM_CPU;
    CFFTS_DIM = CFFTS_DIM_CPU;

  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    char vendor[50];
    ecode = clGetDeviceInfo(device, CL_DEVICE_VENDOR, 50, vendor, NULL);
    clu_CheckError(ecode, "clGetDeviceInfo()");
    if (strncmp(vendor, DEV_VENDOR_NVIDIA, strlen(DEV_VENDOR_NVIDIA)) == 0) {
      source_file = "ft_gpu_nvidia.cl";
      CFFTS_LSIZE = 32;
    } else {
      source_file = "ft_gpu.cl";
      CFFTS_LSIZE = 64;
    }

    sprintf(build_option, "-I. -DCLASS=\'%c\' -DLSIZE=%lu",
            CLASS, CFFTS_LSIZE);

    COMPUTE_IMAP_DIM = COMPUTE_IMAP_DIM_GPU;
    EVOLVE_DIM = EVOLVE_DIM_GPU;
    CFFTS_DIM = CFFTS_DIM_GPU;

  } else {
    fprintf(stderr, "Set the environment variable OPENCL_DEVICE_TYPE!\n");
    exit(EXIT_FAILURE);
  }
  program = clu_MakeProgram(context, device, source_dir, source_file,
                            build_option);
  DTIMER_STOP(T_BUILD);

  // 5. Create buffers
  DTIMER_START(T_BUFFER_CREATE);
  m_u = clCreateBuffer(context,
                       CL_MEM_READ_ONLY,
                       sizeof(dcomplex) * NXP,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u");

  m_u0 = clCreateBuffer(context,
                        CL_MEM_READ_WRITE,
                        sizeof(dcomplex) * NTOTALP,
                        NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u0");

  m_u1 = clCreateBuffer(context,
                        CL_MEM_READ_WRITE,
                        sizeof(dcomplex) * NTOTALP,
                        NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u1");

  m_twiddle = clCreateBuffer(context,
                             CL_MEM_READ_WRITE,
                             sizeof(double) * NTOTALP,
                             NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_twiddle");

  if (device_type == CL_DEVICE_TYPE_CPU) {
    size_t ty1_size, ty2_size;
    if (CFFTS_DIM == 2) {
      ty1_size = sizeof(dcomplex) * NX * NY * NZ;
      ty2_size = sizeof(dcomplex) * NX * NY * NZ;
    } else {
      fprintf(stderr, "Wrong CFFTS_DIM: %u\n", CFFTS_DIM);
      exit(EXIT_FAILURE);
    }

    m_ty1 = clCreateBuffer(context,
                           CL_MEM_READ_WRITE,
                           ty1_size,
                           NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_ty1");

    m_ty2 = clCreateBuffer(context,
                           CL_MEM_READ_WRITE,
                           ty2_size,
                           NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_ty2");
  }

  if (device_type == CL_DEVICE_TYPE_CPU) {
    temp = 1024 / max_compute_units;
    checksum_local_ws  = temp == 0 ? 1 : temp;
    checksum_global_ws = clu_RoundWorkSize((size_t)1024, checksum_local_ws);
  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    checksum_local_ws  = 32;
    checksum_global_ws = clu_RoundWorkSize((size_t)1024, checksum_local_ws);
  }
  checksum_wg_num = checksum_global_ws / checksum_local_ws;
  m_chk = clCreateBuffer(context,
                         CL_MEM_READ_WRITE,
                         sizeof(dcomplex) * checksum_wg_num,
                         NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_chk");
  g_chk = (dcomplex *)malloc(sizeof(dcomplex) * checksum_wg_num);
  DTIMER_STOP(T_BUFFER_CREATE);

  // 6. Create kernels
  DTIMER_START(T_OPENCL_API);
  double ap = -4.0 * ALPHA * PI * PI;
  int d1 = dims[0];
  int d2 = dims[1];
  int d3 = dims[2];

  k_compute_indexmap = clCreateKernel(program, "compute_indexmap", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for compute_indexmap");
  ecode  = clSetKernelArg(k_compute_indexmap, 0, sizeof(cl_mem), &m_twiddle);
  ecode |= clSetKernelArg(k_compute_indexmap, 1, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_compute_indexmap, 2, sizeof(int), &d2);
  ecode |= clSetKernelArg(k_compute_indexmap, 3, sizeof(int), &d3);
  ecode |= clSetKernelArg(k_compute_indexmap, 4, sizeof(double), &ap);
  clu_CheckError(ecode, "clSetKernelArg() for compute_indexmap");
  if (COMPUTE_IMAP_DIM == 3) {
    cimap_lws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0];
    temp = max_work_group_size / cimap_lws[0];
    cimap_lws[1] = d2 < temp ? d2 : temp;
    temp = temp / cimap_lws[1];
    cimap_lws[2] = d3 < temp ? d3 : temp;

    cimap_gws[0] = clu_RoundWorkSize((size_t)d1, cimap_lws[0]);
    cimap_gws[1] = clu_RoundWorkSize((size_t)d2, cimap_lws[1]);
    cimap_gws[2] = clu_RoundWorkSize((size_t)d3, cimap_lws[2]);
  } else if (COMPUTE_IMAP_DIM == 2) {
    cimap_lws[0] = d2 < work_item_sizes[0] ? d2 : work_item_sizes[0];
    temp = max_work_group_size / cimap_lws[0];
    cimap_lws[1] = d3 < temp ? d3 : temp;

    cimap_gws[0] = clu_RoundWorkSize((size_t)d2, cimap_lws[0]);
    cimap_gws[1] = clu_RoundWorkSize((size_t)d3, cimap_lws[1]);
  } else {
    //temp = d3 / max_compute_units;
    temp = 1;
    cimap_lws[0] = temp == 0 ? 1 : temp;
    cimap_gws[0] = clu_RoundWorkSize((size_t)d3, cimap_lws[0]);
  }

  k_compute_ics = clCreateKernel(program,
                                 "compute_initial_conditions", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for compute_initial_conditions");
  ecode  = clSetKernelArg(k_compute_ics, 2, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_compute_ics, 3, sizeof(int), &d2);
  ecode |= clSetKernelArg(k_compute_ics, 4, sizeof(int), &d3);
  clu_CheckError(ecode, "clSetKernelArg() for compute_initial_conditions");

  k_cffts1 = clCreateKernel(program, "cffts1", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for cffts1");
  ecode  = clSetKernelArg(k_cffts1, 2, sizeof(cl_mem), &m_u);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_cffts1, 8, sizeof(cl_mem), &m_ty1);
    ecode |= clSetKernelArg(k_cffts1, 9, sizeof(cl_mem), &m_ty2);
  }
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts1");

  k_cffts2 = clCreateKernel(program, "cffts2", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for cffts2");
  ecode  = clSetKernelArg(k_cffts2, 2, sizeof(cl_mem), &m_u);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_cffts2, 8, sizeof(cl_mem), &m_ty1);
    ecode |= clSetKernelArg(k_cffts2, 9, sizeof(cl_mem), &m_ty2);
  }
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts2");

  k_cffts3 = clCreateKernel(program, "cffts3", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for cffts3");
  ecode  = clSetKernelArg(k_cffts3, 2, sizeof(cl_mem), &m_u);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_cffts3, 8, sizeof(cl_mem), &m_ty1);
    ecode |= clSetKernelArg(k_cffts3, 9, sizeof(cl_mem), &m_ty2);
  }
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts3");

  k_evolve = clCreateKernel(program, "evolve", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for evolve");

  k_checksum = clCreateKernel(program, "checksum", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for checksum");
  ecode  = clSetKernelArg(k_checksum, 1, sizeof(cl_mem), &m_chk);
  ecode |= clSetKernelArg(k_checksum, 2, sizeof(dcomplex)*checksum_local_ws,
                          NULL);
  ecode |= clSetKernelArg(k_checksum, 3, sizeof(int), &dims[0]);
  ecode |= clSetKernelArg(k_checksum, 4, sizeof(int), &dims[1]);
  clu_CheckError(ecode, "clSetKernelArg() for checksum");
  DTIMER_STOP(T_OPENCL_API);
}
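
The clu_* helpers above (clu_GetDefaultDeviceType, clu_GetAvailableDevice, clu_CheckError, clu_MakeProgram) belong to the benchmark's own OpenCL utility layer and are not part of this listing. As a rough sketch of what steps 1 to 3 amount to with the plain OpenCL API, assuming a single platform and taking the first device of the requested type (the helper name below is illustrative, not from the project):

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>

/* Sketch only: pick the first device of the requested type on the first
 * platform, then create a context and an in-order command queue for it. */
static cl_command_queue create_queue(cl_device_type type,
                                     cl_device_id *out_device,
                                     cl_context *out_context)
{
  cl_platform_id platform;
  cl_device_id device;
  cl_int ecode;

  ecode = clGetPlatformIDs(1, &platform, NULL);
  if (ecode != CL_SUCCESS) { fprintf(stderr, "no OpenCL platform\n"); exit(1); }

  ecode = clGetDeviceIDs(platform, type, 1, &device, NULL);
  if (ecode != CL_SUCCESS) { fprintf(stderr, "no matching device\n"); exit(1); }

  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode);
  if (ecode != CL_SUCCESS) { fprintf(stderr, "clCreateContext() failed\n"); exit(1); }

  cl_command_queue queue = clCreateCommandQueue(context, device, 0, &ecode);
  if (ecode != CL_SUCCESS) { fprintf(stderr, "clCreateCommandQueue() failed\n"); exit(1); }

  *out_device = device;
  *out_context = context;
  return queue;
}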
Code example #2
File: is.c  Project: NatTuck/cakemark
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//---------------------------------------------------------------------
static void setup_opencl(int argc, char *argv[])
{
  cl_int ecode;
  char *source_dir = "IS";
  if (argc > 1) source_dir = argv[1];

#ifdef TIMER_DETAIL
  if (timer_on) {
    int i;
    for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i);
  }
#endif

  DTIMER_START(T_OPENCL_API);

  // 1. Find the default device type and get a device for the device type
  device_type = clu_GetDefaultDeviceType();
  device      = clu_GetAvailableDevice(device_type);
  device_name = clu_GetDeviceName(device);

  // Device information
  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_ITEM_SIZES,
                          sizeof(work_item_sizes),
                          &work_item_sizes,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_GROUP_SIZE,
                          sizeof(size_t),
                          &max_work_group_size,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  // FIXME: The below values are experimental.
  if (max_work_group_size > 256) {
    max_work_group_size = 256;
    int i;
    for (i = 0; i < 3; i++) {
      if (work_item_sizes[i] > 256) {
        work_item_sizes[i] = 256;
      }
    }
  }

  // 2. Create a context for the specified device
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode);
  clu_CheckError(ecode, "clCreateContext()");

  // 3. Create a command queue
  cmd_queue = clCreateCommandQueue(context, device, 0, &ecode);
  clu_CheckError(ecode, "clCreateCommandQueue()");

  DTIMER_STOP(T_OPENCL_API);

  // 4. Build the program
  DTIMER_START(T_BUILD);
  char *source_file;
  char build_option[30];
  if (device_type == CL_DEVICE_TYPE_CPU) {
    source_file = "is_cpu.cl";
    sprintf(build_option, "-DCLASS=%d -I.", CLASS);

    CREATE_SEQ_GROUP_SIZE = 64;
    CREATE_SEQ_GLOBAL_SIZE = CREATE_SEQ_GROUP_SIZE * 256;
    RANK_GROUP_SIZE = 1;
    RANK_GLOBAL_SIZE = RANK_GROUP_SIZE * 128;
    RANK1_GROUP_SIZE = 1;
    RANK1_GLOBAL_SIZE = RANK1_GROUP_SIZE * RANK_GLOBAL_SIZE;
    RANK2_GROUP_SIZE = RANK_GROUP_SIZE;
    RANK2_GLOBAL_SIZE = RANK_GLOBAL_SIZE;
    FV2_GROUP_SIZE = 64;
    FV2_GLOBAL_SIZE = FV2_GROUP_SIZE * 256;
  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    source_file = "is_gpu.cl";
    sprintf(build_option, "-DCLASS=\'%c\' -I.", CLASS);

    CREATE_SEQ_GROUP_SIZE = 64;
    CREATE_SEQ_GLOBAL_SIZE = CREATE_SEQ_GROUP_SIZE * 256;
    RANK1_GROUP_SIZE = work_item_sizes[0];
    RANK1_GLOBAL_SIZE = MAX_KEY;
    RANK2_GROUP_SIZE = work_item_sizes[0];
    RANK2_GLOBAL_SIZE = NUM_KEYS;
    FV2_GROUP_SIZE = work_item_sizes[0];
    FV2_GLOBAL_SIZE = NUM_KEYS;
  } else {
    fprintf(stderr, "%s: not supported.", clu_GetDeviceTypeName(device_type));
    exit(EXIT_FAILURE);
  }
  program = clu_MakeProgram(context, device, source_dir, source_file,
                            build_option);
  DTIMER_STOP(T_BUILD);

  // 5. Create buffers
  DTIMER_START(T_BUFFER_CREATE);
  m_key_array = clCreateBuffer(context,
                               CL_MEM_READ_WRITE,
                               sizeof(INT_TYPE) * SIZE_OF_BUFFERS,
                               NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_key_array");

  m_key_buff1 = clCreateBuffer(context,
                               CL_MEM_READ_WRITE,
                               sizeof(INT_TYPE) * MAX_KEY,
                               NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_key_buff1");

  m_key_buff2 = clCreateBuffer(context,
                               CL_MEM_READ_WRITE,
                               sizeof(INT_TYPE) * SIZE_OF_BUFFERS,
                               NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_key_buff2");

  size_t test_array_size = sizeof(INT_TYPE) * TEST_ARRAY_SIZE;
  m_index_array = clCreateBuffer(context,
                                 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                 test_array_size,
                                 test_index_array, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_index_array");

  m_rank_array = clCreateBuffer(context,
                                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                test_array_size,
                                test_rank_array, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_rank_array");

  m_partial_vals = clCreateBuffer(context,
                                  CL_MEM_WRITE_ONLY,
                                  test_array_size,
                                  NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_partial_vals");

  m_passed_verification = clCreateBuffer(context,
                                         CL_MEM_READ_WRITE,
                                         sizeof(cl_int),
                                         NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_passed_verification");

  if (device_type == CL_DEVICE_TYPE_GPU) {
    m_key_scan = clCreateBuffer(context,
                                CL_MEM_READ_WRITE,
                                sizeof(INT_TYPE) * MAX_KEY,
                                NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_key_buff1_scan");

    m_sum = clCreateBuffer(context,
                           CL_MEM_READ_WRITE,
                           sizeof(INT_TYPE) * work_item_sizes[0],
                           NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_sum");
  } else {
    size_t bs_size = RANK_GLOBAL_SIZE * sizeof(INT_TYPE) * NUM_BUCKETS;
    m_bucket_size = clCreateBuffer(context,
                                   CL_MEM_READ_WRITE,
                                   bs_size,
                                   NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_bucket_size");

    m_bucket_ptrs = clCreateBuffer(context,
                                   CL_MEM_READ_WRITE,
                                   bs_size,
                                   NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_bucket_ptrs");
  }
  DTIMER_STOP(T_BUFFER_CREATE);

  // 6. Create kernels
  DTIMER_START(T_OPENCL_API);
  k_rank0 = clCreateKernel(program, "rank0", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rank0");
  ecode  = clSetKernelArg(k_rank0, 0, sizeof(cl_mem), (void*)&m_key_array);
  ecode |= clSetKernelArg(k_rank0, 1, sizeof(cl_mem), (void*)&m_partial_vals);
  ecode |= clSetKernelArg(k_rank0, 2, sizeof(cl_mem), (void*)&m_index_array);
  clu_CheckError(ecode, "clSetKernelArg() for rank0");

  if (device_type == CL_DEVICE_TYPE_GPU) {
    k_rank1 = clCreateKernel(program, "rank1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank1");
    ecode  = clSetKernelArg(k_rank1, 0, sizeof(cl_mem), (void*)&m_key_buff1);
    clu_CheckError(ecode, "clSetKernelArg() for rank1");

    k_rank2 = clCreateKernel(program, "rank2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank2");
    ecode  = clSetKernelArg(k_rank2, 0, sizeof(cl_mem), (void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank2, 1, sizeof(cl_mem), (void*)&m_key_array);
    clu_CheckError(ecode, "clSetKernelArg() for rank2");

    k_rank3_0 = clCreateKernel(program, "rank3_0", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3_0");
    ecode  = clSetKernelArg(k_rank3_0, 0, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3_0, 1, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3_0, 2, sizeof(cl_mem),(void*)&m_sum);
    ecode |= clSetKernelArg(k_rank3_0, 3, 
                            sizeof(INT_TYPE) * work_item_sizes[0] * 2,
                            NULL);
    clu_CheckError(ecode, "clSetKernelArg() for rank3_0");

    k_rank3_1 = clCreateKernel(program, "rank3_1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3_1");
    ecode  = clSetKernelArg(k_rank3_1, 0, sizeof(cl_mem), (void*)&m_sum);
    ecode |= clSetKernelArg(k_rank3_1, 1, sizeof(cl_mem), (void*)&m_sum);
    ecode |= clSetKernelArg(k_rank3_1, 2, 
                            sizeof(INT_TYPE) * work_item_sizes[0] * 2,
                            NULL);
    clu_CheckError(ecode, "clSetKernelArg() for rank3_1");

    k_rank3_2 = clCreateKernel(program, "rank3_2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3_2");
    ecode  = clSetKernelArg(k_rank3_2, 0, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3_2, 1, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3_2, 2, sizeof(cl_mem),(void*)&m_sum);
    clu_CheckError(ecode, "clSetKernelArg() for rank3_2");
  } else {
    k_rank1 = clCreateKernel(program, "rank1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank1");
    ecode  = clSetKernelArg(k_rank1, 0, sizeof(cl_mem),(void*)&m_key_array);
    ecode |= clSetKernelArg(k_rank1, 1, sizeof(cl_mem),(void*)&m_bucket_size);
    clu_CheckError(ecode, "clSetKernelArg() for rank1");

    k_rank2 = clCreateKernel(program, "rank2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank2");
    ecode  = clSetKernelArg(k_rank2, 0, sizeof(cl_mem),(void*)&m_key_array);
    ecode |= clSetKernelArg(k_rank2, 1, sizeof(cl_mem),(void*)&m_bucket_size);
    ecode |= clSetKernelArg(k_rank2, 2, sizeof(cl_mem),(void*)&m_bucket_ptrs);
    ecode |= clSetKernelArg(k_rank2, 3, sizeof(cl_mem),(void*)&m_key_buff2);
    clu_CheckError(ecode, "clSetKernelArg() for rank2");

    k_rank3 = clCreateKernel(program, "rank3", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3");
    ecode  = clSetKernelArg(k_rank3, 0, sizeof(cl_mem),(void*)&m_bucket_size);
    ecode |= clSetKernelArg(k_rank3, 1, sizeof(cl_mem),(void*)&m_bucket_ptrs);
    ecode |= clSetKernelArg(k_rank3, 2, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3, 3, sizeof(cl_mem),(void*)&m_key_buff2);
    clu_CheckError(ecode, "clSetKernelArg() for rank3");
  }

  k_rank4 = clCreateKernel(program, "rank4", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rank4");
  ecode  = clSetKernelArg(k_rank4, 0, sizeof(cl_mem), (void*)&m_partial_vals);
  ecode |= clSetKernelArg(k_rank4, 1, sizeof(cl_mem), (void*)&m_key_buff1);
  ecode |= clSetKernelArg(k_rank4, 2, sizeof(cl_mem), (void*)&m_rank_array);
  ecode |= clSetKernelArg(k_rank4, 3, sizeof(cl_mem),
                                      (void*)&m_passed_verification);
  clu_CheckError(ecode, "clSetKernelArg() for rank4");
  DTIMER_STOP(T_OPENCL_API);
}
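
Nothing in this listing releases the objects created above. Below is a minimal teardown sketch, assuming the same globals as this IS example; the function name is hypothetical, and the device-type-specific objects (m_key_scan, m_sum, m_bucket_size, m_bucket_ptrs, the k_rank3* kernels) would be released in the matching branch.

/* Hypothetical counterpart to setup_opencl(): release the common IS
 * objects created above, roughly in reverse order of creation. */
static void release_opencl(void)
{
  clReleaseKernel(k_rank4);
  clReleaseKernel(k_rank2);
  clReleaseKernel(k_rank1);
  clReleaseKernel(k_rank0);
  clReleaseMemObject(m_passed_verification);
  clReleaseMemObject(m_partial_vals);
  clReleaseMemObject(m_rank_array);
  clReleaseMemObject(m_index_array);
  clReleaseMemObject(m_key_buff2);
  clReleaseMemObject(m_key_buff1);
  clReleaseMemObject(m_key_array);
  clReleaseProgram(program);
  clReleaseCommandQueue(cmd_queue);
  clReleaseContext(context);
}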
Code example #3
File: lu.c  Project: ashwinma/multicl
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//---------------------------------------------------------------------
static void setup_opencl(int argc, char *argv[])
{
  int i;
  size_t temp, wg_num;
  cl_int ecode;
  char *source_dir = "LU";

  if (timeron) {
    timer_clear(TIMER_OPENCL);
    timer_clear(TIMER_BUILD);
    timer_clear(TIMER_BUFFER);
    timer_clear(TIMER_RELEASE);

    timer_start(TIMER_OPENCL);
  }

  if (argc > 1) source_dir = argv[1];

  //-----------------------------------------------------------------------
  // 1. Find the default device type and get a device for the device type
  //-----------------------------------------------------------------------
  device_type = clu_GetDefaultDeviceType();
  device      = clu_GetAvailableDevice(device_type);
  device_name = clu_GetDeviceName(device);

  // Device information
  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_ITEM_SIZES,
                          sizeof(work_item_sizes),
                          &work_item_sizes,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_GROUP_SIZE,
                          sizeof(size_t),
                          &max_work_group_size,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_COMPUTE_UNITS,
                          sizeof(cl_uint),
                          &max_compute_units,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  ////////////////////////////////////////////////////////////////////////
  // FIXME: The below values are experimental.
  size_t default_wg_size = 64;
  if (device_type == CL_DEVICE_TYPE_CPU) {
    if (CLASS == 'B') default_wg_size = 128;
  } else {
    if (CLASS == 'B') default_wg_size = 32;
  }
  if (max_work_group_size > default_wg_size) {
    max_work_group_size = default_wg_size;
    int i;
    for (i = 0; i < 3; i++) {
      if (work_item_sizes[i] > default_wg_size) {
        work_item_sizes[i] = default_wg_size;
      }
    }
  }
  if (device_type == CL_DEVICE_TYPE_CPU) {
    SETBV1_DIM = SETBV1_DIM_CPU;
    SETBV2_DIM = SETBV2_DIM_CPU;
    SETBV3_DIM = SETBV3_DIM_CPU;
    SETIV_DIM = SETIV_DIM_CPU;
    ERHS1_DIM = ERHS1_DIM_CPU;
    ERHS2_DIM = ERHS2_DIM_CPU;
    ERHS3_DIM = ERHS3_DIM_CPU;
    ERHS4_DIM = ERHS4_DIM_CPU;
    PINTGR1_DIM = PINTGR1_DIM_CPU;
    PINTGR2_DIM = PINTGR2_DIM_CPU;
    PINTGR3_DIM = PINTGR3_DIM_CPU;
    RHS_DIM  = RHS_DIM_CPU;
    RHSX_DIM = RHSX_DIM_CPU;
    RHSY_DIM = RHSY_DIM_CPU;
    RHSZ_DIM = RHSZ_DIM_CPU;
    SSOR2_DIM = SSOR2_DIM_CPU;
    SSOR3_DIM = SSOR3_DIM_CPU;
  } else {
    SETBV1_DIM = SETBV1_DIM_GPU;
    SETBV2_DIM = SETBV2_DIM_GPU;
    SETBV3_DIM = SETBV3_DIM_GPU;
    SETIV_DIM = SETIV_DIM_GPU;
    ERHS1_DIM = ERHS1_DIM_GPU;
    ERHS2_DIM = ERHS2_DIM_GPU;
    ERHS3_DIM = ERHS3_DIM_GPU;
    ERHS4_DIM = ERHS4_DIM_GPU;
    PINTGR1_DIM = PINTGR1_DIM_GPU;
    PINTGR2_DIM = PINTGR2_DIM_GPU;
    PINTGR3_DIM = PINTGR3_DIM_GPU;
    RHS_DIM  = RHS_DIM_GPU;
    RHSX_DIM = RHSX_DIM_GPU;
    RHSY_DIM = RHSY_DIM_GPU;
    RHSZ_DIM = RHSZ_DIM_GPU;
    SSOR2_DIM = SSOR2_DIM_GPU;
    SSOR3_DIM = SSOR3_DIM_GPU;
  }
  ////////////////////////////////////////////////////////////////////////

  //-----------------------------------------------------------------------
  // 2. Create a context for the specified device
  //-----------------------------------------------------------------------
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode);
  clu_CheckError(ecode, "clCreateContext()");

  //-----------------------------------------------------------------------
  // 3. Create command queues
  //-----------------------------------------------------------------------
  cmd_queue = clCreateCommandQueue(context, device, 0, &ecode);
  clu_CheckError(ecode, "clCreateCommandQueue()");

  max_pipeline = (jend-jst) < max_compute_units ? (jend-jst) : max_compute_units;
  pipe_queue = (cl_command_queue *)malloc(sizeof(cl_command_queue) * max_pipeline);
  for (i = 0; i < max_pipeline; i++) {
    pipe_queue[i] = clCreateCommandQueue(context, device, 0, &ecode);
    clu_CheckError(ecode, "clCreateCommandQueue()");
  }

  //-----------------------------------------------------------------------
  // 4. Build programs
  //-----------------------------------------------------------------------
  if (timeron) timer_start(TIMER_BUILD);
  char build_option[100];

  if (device_type == CL_DEVICE_TYPE_CPU) {
    sprintf(build_option, "-I. -DCLASS=%d -DUSE_CPU", CLASS);
  } else {
    sprintf(build_option, "-I. -DCLASS=\'%c\'", CLASS);
  }

  p_pre = clu_MakeProgram(context, device, source_dir,
                          "kernel_pre.cl",
                          build_option);

  p_main = clu_MakeProgram(context, device, source_dir,
                          (device_type == CL_DEVICE_TYPE_CPU ? "kernel_main_cpu.cl" : "kernel_main_gpu.cl"),
                          build_option);

  p_post = clu_MakeProgram(context, device, source_dir,
                          "kernel_post.cl",
                          build_option);
  if (timeron) timer_stop(TIMER_BUILD);

  //-----------------------------------------------------------------------
  // 5. Create buffers
  //-----------------------------------------------------------------------
  if (timeron) timer_start(TIMER_BUFFER);
  m_ce = clCreateBuffer(context,
                        CL_MEM_READ_ONLY,
                        sizeof(double)*5*13,
                        NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_ce");

  m_u = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u");

  m_rsd = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_rsd");

  m_frct = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_frct");

  m_qs = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1),
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_qs");

  m_rho_i = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1),
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_rho_i");

  // workspace for work-items
  size_t max_work_items;
  if (ERHS2_DIM == 1 && ERHS3_DIM == 1 && ERHS4_DIM == 1) {
    max_work_items = ISIZ3;
  } else {
    max_work_items = ISIZ3*ISIZ2;
  }
  m_flux = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*ISIZ1*5 * max_work_items,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_flux");

  if (RHSZ_DIM == 1) {
    max_work_items = ISIZ2;
  } else {
    max_work_items = ISIZ2*ISIZ1;
  }

  if (device_type == CL_DEVICE_TYPE_CPU) {
    m_utmp = clCreateBuffer(context,
                         CL_MEM_READ_WRITE,
                         sizeof(double)*ISIZ3*6 * max_work_items,
                         NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_utmp");

    m_rtmp = clCreateBuffer(context,
                         CL_MEM_READ_WRITE,
                         sizeof(double)*ISIZ3*5 * max_work_items,
                         NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_rtmp");
  }

  temp = (nz0-2) / max_compute_units;
  l2norm_lws[0] = temp == 0 ? 1 : temp;
  l2norm_gws[0] = clu_RoundWorkSize((size_t)(nz0-2), l2norm_lws[0]);
  wg_num = l2norm_gws[0] / l2norm_lws[0];
  sum_size = sizeof(double) * 5 * wg_num;
  m_sum = clCreateBuffer(context,
                         CL_MEM_READ_WRITE,
                         sum_size, 
                         NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer()");

  if (timeron) timer_stop(TIMER_BUFFER);

  //-----------------------------------------------------------------------
  // 6. Create kernels
  //-----------------------------------------------------------------------
  k_setbv1 = clCreateKernel(p_pre, "setbv1", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for setbv1");
  ecode  = clSetKernelArg(k_setbv1, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_setbv1, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_setbv1, 2, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_setbv1, 3, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_setbv1, 4, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SETBV1_DIM == 3) {
    setbv1_lws[0] = 5;
    temp = max_work_group_size / setbv1_lws[0];
    setbv1_lws[1] = nx < temp ? nx : temp;
    temp = temp / setbv1_lws[1];
    setbv1_lws[2] = ny < temp ? ny : temp;
    setbv1_gws[0] = clu_RoundWorkSize((size_t)5, setbv1_lws[0]);
    setbv1_gws[1] = clu_RoundWorkSize((size_t)nx, setbv1_lws[1]);
    setbv1_gws[2] = clu_RoundWorkSize((size_t)ny, setbv1_lws[2]);
  } else if (SETBV1_DIM == 2) {
    setbv1_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0];
    temp = max_work_group_size / setbv1_lws[0];
    setbv1_lws[1] = ny < temp ? ny : temp;
    setbv1_gws[0] = clu_RoundWorkSize((size_t)nx, setbv1_lws[0]);
    setbv1_gws[1] = clu_RoundWorkSize((size_t)ny, setbv1_lws[1]);
  } else {
    temp = ny / max_compute_units;
    setbv1_lws[0] = temp == 0 ? 1 : temp;
    setbv1_gws[0] = clu_RoundWorkSize((size_t)ny, setbv1_lws[0]);
  }

  k_setbv2 = clCreateKernel(p_pre, "setbv2", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for setbv2");
  ecode  = clSetKernelArg(k_setbv2, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_setbv2, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_setbv2, 2, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_setbv2, 3, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_setbv2, 4, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SETBV2_DIM == 3) {
    setbv2_lws[0] = 5;
    temp = max_work_group_size / setbv2_lws[0];
    setbv2_lws[1] = nx < temp ? nx : temp;
    temp = temp / setbv2_lws[1];
    setbv2_lws[2] = nz < temp ? nz : temp;
    setbv2_gws[0] = clu_RoundWorkSize((size_t)5, setbv2_lws[0]);
    setbv2_gws[1] = clu_RoundWorkSize((size_t)nx, setbv2_lws[1]);
    setbv2_gws[2] = clu_RoundWorkSize((size_t)nz, setbv2_lws[2]);
  } else if (SETBV2_DIM == 2) {
    setbv2_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0];
    temp = max_work_group_size / setbv2_lws[0];
    setbv2_lws[1] = nz < temp ? nz : temp;
    setbv2_gws[0] = clu_RoundWorkSize((size_t)nx, setbv2_lws[0]);
    setbv2_gws[1] = clu_RoundWorkSize((size_t)nz, setbv2_lws[1]);
  } else {
    temp = nz / max_compute_units;
    setbv2_lws[0] = temp == 0 ? 1 : temp;
    setbv2_gws[0] = clu_RoundWorkSize((size_t)nz, setbv2_lws[0]);
  }

  k_setbv3 = clCreateKernel(p_pre, "setbv3", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for setbv3");
  ecode  = clSetKernelArg(k_setbv3, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_setbv3, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_setbv3, 2, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_setbv3, 3, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_setbv3, 4, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SETBV3_DIM == 3) {
    setbv3_lws[0] = 5;
    temp = max_work_group_size / setbv3_lws[0];
    setbv3_lws[1] = ny < temp ? ny : temp;
    temp = temp / setbv3_lws[1];
    setbv3_lws[2] = nz < temp ? nz : temp;
    setbv3_gws[0] = clu_RoundWorkSize((size_t)5, setbv3_lws[0]);
    setbv3_gws[1] = clu_RoundWorkSize((size_t)ny, setbv3_lws[1]);
    setbv3_gws[2] = clu_RoundWorkSize((size_t)nz, setbv3_lws[2]);
  } else if (SETBV3_DIM == 2) {
    setbv3_lws[0] = ny < work_item_sizes[0] ? ny : work_item_sizes[0];
    temp = max_work_group_size / setbv3_lws[0];
    setbv3_lws[1] = nz < temp ? nz : temp;
    setbv3_gws[0] = clu_RoundWorkSize((size_t)ny, setbv3_lws[0]);
    setbv3_gws[1] = clu_RoundWorkSize((size_t)nz, setbv3_lws[1]);
  } else {
    temp = nz / max_compute_units;
    setbv3_lws[0] = temp == 0 ? 1 : temp;
    setbv3_gws[0] = clu_RoundWorkSize((size_t)nz, setbv3_lws[0]);
  }

  k_setiv = clCreateKernel(p_pre, "setiv", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for setiv");
  ecode  = clSetKernelArg(k_setiv, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_setiv, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_setiv, 2, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_setiv, 3, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_setiv, 4, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SETIV_DIM == 3) {
    setiv_lws[0] = (nx-2) < work_item_sizes[0] ? (nx-2) : work_item_sizes[0];
    temp = max_work_group_size / setiv_lws[0];
    setiv_lws[1] = (ny-2) < temp ? (ny-2) : temp;
    temp = temp / setiv_lws[1];
    setiv_lws[2] = (nz-2) < temp ? (nz-2) : temp;
    setiv_gws[0] = clu_RoundWorkSize((size_t)(nx-2), setiv_lws[0]);
    setiv_gws[1] = clu_RoundWorkSize((size_t)(ny-2), setiv_lws[1]);
    setiv_gws[2] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[2]);
  } else if (SETIV_DIM == 2) {
    setiv_lws[0] = (ny-2) < work_item_sizes[0] ? (ny-2) : work_item_sizes[0];
    temp = max_work_group_size / setiv_lws[0];
    setiv_lws[1] = (nz-2) < temp ? (nz-2) : temp;
    setiv_gws[0] = clu_RoundWorkSize((size_t)(ny-2), setiv_lws[0]);
    setiv_gws[1] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[1]);
  } else {
    temp = (nz-2) / max_compute_units;
    setiv_lws[0] = temp == 0 ? 1 : temp;
    setiv_gws[0] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[0]);
  }

  k_l2norm = clCreateKernel(p_main, "l2norm", &ecode);
  clu_CheckError(ecode, "clCreateKernel()");
  ecode  = clSetKernelArg(k_l2norm, 1, sizeof(cl_mem), &m_sum);
  ecode |= clSetKernelArg(k_l2norm, 2, sizeof(double)*5*l2norm_lws[0], NULL);
  clu_CheckError(ecode, "clSetKernelArg()");

  k_rhs = clCreateKernel(p_main, "rhs", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rhs");
  ecode  = clSetKernelArg(k_rhs, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_rhs, 1, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_rhs, 2, sizeof(cl_mem), &m_frct);
  ecode |= clSetKernelArg(k_rhs, 3, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_rhs, 4, sizeof(cl_mem), &m_rho_i);
  ecode |= clSetKernelArg(k_rhs, 5, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_rhs, 6, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_rhs, 7, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (RHS_DIM == 3) {
    rhs_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0];
    temp = max_work_group_size / rhs_lws[0];
    rhs_lws[1] = ny < temp ? ny : temp;
    temp = temp / rhs_lws[1];
    rhs_lws[2] = nz < temp ? nz : temp;
    rhs_gws[0] = clu_RoundWorkSize((size_t)nx, rhs_lws[0]);
    rhs_gws[1] = clu_RoundWorkSize((size_t)ny, rhs_lws[1]);
    rhs_gws[2] = clu_RoundWorkSize((size_t)nz, rhs_lws[2]);
  } else if (RHS_DIM == 2) {
    rhs_lws[0] = ny < work_item_sizes[0] ? ny : work_item_sizes[0];
    temp = max_work_group_size / rhs_lws[0];
    rhs_lws[1] = nz < temp ? nz : temp;
    rhs_gws[0] = clu_RoundWorkSize((size_t)ny, rhs_lws[0]);
    rhs_gws[1] = clu_RoundWorkSize((size_t)nz, rhs_lws[1]);
  } else {
    //temp = nz / max_compute_units;
    temp = 1;
    rhs_lws[0] = temp == 0 ? 1 : temp;
    rhs_gws[0] = clu_RoundWorkSize((size_t)nz, rhs_lws[0]);
  }

  k_rhsx = clCreateKernel(p_main, "rhsx", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rhsx");
  ecode  = clSetKernelArg(k_rhsx, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_rhsx, 1, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_rhsx, 2, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_rhsx, 3, sizeof(cl_mem), &m_rho_i);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_rhsx, 4, sizeof(cl_mem), &m_flux);
    ecode |= clSetKernelArg(k_rhsx, 5, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsx, 6, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsx, 7, sizeof(int), &nz);
  } else {
    ecode |= clSetKernelArg(k_rhsx, 4, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsx, 5, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsx, 6, sizeof(int), &nz);
  }
  clu_CheckError(ecode, "clSetKernelArg()");
  if (RHSX_DIM == 2) {
    rhsx_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
    temp = max_work_group_size / rhsx_lws[0];
    rhsx_lws[1] = (nz-2) < temp ? (nz-2) : temp;
    rhsx_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), rhsx_lws[0]);
    rhsx_gws[1] = clu_RoundWorkSize((size_t)(nz-2), rhsx_lws[1]);
  } else {
    //temp = (nz-2) / max_compute_units;
    temp = 1;
    rhsx_lws[0] = temp == 0 ? 1 : temp;
    rhsx_gws[0] = clu_RoundWorkSize((size_t)(nz-2), rhsx_lws[0]);
  }

  k_rhsy = clCreateKernel(p_main, "rhsy", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rhsy");
  ecode  = clSetKernelArg(k_rhsy, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_rhsy, 1, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_rhsy, 2, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_rhsy, 3, sizeof(cl_mem), &m_rho_i);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_rhsy, 4, sizeof(cl_mem), &m_flux);
    ecode |= clSetKernelArg(k_rhsy, 5, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsy, 6, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsy, 7, sizeof(int), &nz);
  } else {
    ecode |= clSetKernelArg(k_rhsy, 4, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsy, 5, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsy, 6, sizeof(int), &nz);
  }
  clu_CheckError(ecode, "clSetKernelArg()");
  if (RHSY_DIM == 2) {
    rhsy_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0];
    temp = max_work_group_size / rhsy_lws[0];
    rhsy_lws[1] = (nz-2) < temp ? (nz-2) : temp;
    rhsy_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), rhsy_lws[0]);
    rhsy_gws[1] = clu_RoundWorkSize((size_t)(nz-2), rhsy_lws[1]);
  } else {
    //temp = (nz-2) / max_compute_units;
    temp = 1;
    rhsy_lws[0] = temp == 0 ? 1 : temp;
    rhsy_gws[0] = clu_RoundWorkSize((size_t)(nz-2), rhsy_lws[0]);
  }

  k_rhsz = clCreateKernel(p_main, "rhsz", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rhsz");
  ecode  = clSetKernelArg(k_rhsz, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_rhsz, 1, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_rhsz, 2, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_rhsz, 3, sizeof(cl_mem), &m_rho_i);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_rhsz, 4, sizeof(cl_mem), &m_flux);
    ecode |= clSetKernelArg(k_rhsz, 5, sizeof(cl_mem), &m_utmp);
    ecode |= clSetKernelArg(k_rhsz, 6, sizeof(cl_mem), &m_rtmp);
    ecode |= clSetKernelArg(k_rhsz, 7, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsz, 8, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsz, 9, sizeof(int), &nz);
  } else {
    ecode |= clSetKernelArg(k_rhsz, 4, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsz, 5, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsz, 6, sizeof(int), &nz);
  }
  clu_CheckError(ecode, "clSetKernelArg()");
  if (RHSZ_DIM == 2) {
    rhsz_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0];
    temp = max_work_group_size / rhsz_lws[0];
    rhsz_lws[1] = (jend-jst) < temp ? (jend-jst) : temp;
    rhsz_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), rhsz_lws[0]);
    rhsz_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), rhsz_lws[1]);
  } else {
    //temp = (jend-jst) / max_compute_units;
    temp = 1;
    rhsz_lws[0] = temp == 0 ? 1 : temp;
    rhsz_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), rhsz_lws[0]);
  }

  k_ssor2 = clCreateKernel(p_main, "ssor2", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for ssor2");
  ecode  = clSetKernelArg(k_ssor2, 0, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_ssor2, 2, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_ssor2, 3, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_ssor2, 4, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SSOR2_DIM == 3) {
    ssor2_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0];
    temp = max_work_group_size / ssor2_lws[0];
    ssor2_lws[1] = (jend-jst) < temp ? (jend-jst) : temp;
    temp = temp / ssor2_lws[1];
    ssor2_lws[2] = (nz-2) < temp ? (nz-2) : temp;
    ssor2_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), ssor2_lws[0]);
    ssor2_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), ssor2_lws[1]);
    ssor2_gws[2] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[2]);
  } else if (SSOR2_DIM == 2) {
    ssor2_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
    temp = max_work_group_size / ssor2_lws[0];
    ssor2_lws[1] = (nz-2) < temp ? (nz-2) : temp;
    ssor2_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), ssor2_lws[0]);
    ssor2_gws[1] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[1]);
  } else {
    //temp = (nz-2) / max_compute_units;
    temp = 1;
    ssor2_lws[0] = temp == 0 ? 1 : temp;
    ssor2_gws[0] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[0]);
  }

  k_ssor3 = clCreateKernel(p_main, "ssor3", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for ssor3");
  ecode  = clSetKernelArg(k_ssor3, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_ssor3, 1, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_ssor3, 3, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_ssor3, 4, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_ssor3, 5, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SSOR3_DIM == 3) {
    ssor3_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0];
    temp = max_work_group_size / ssor3_lws[0];
    ssor3_lws[1] = (jend-jst) < temp ? (jend-jst) : temp;
    temp = temp / ssor3_lws[1];
    ssor3_lws[2] = (nz-2) < temp ? (nz-2) : temp;
    ssor3_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), ssor3_lws[0]);
    ssor3_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), ssor3_lws[1]);
    ssor3_gws[2] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[2]);
  } else if (SSOR3_DIM == 2) {
    ssor3_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
    temp = max_work_group_size / ssor3_lws[0];
    ssor3_lws[1] = (nz-2) < temp ? (nz-2) : temp;
    ssor3_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), ssor3_lws[0]);
    ssor3_gws[1] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[1]);
  } else {
    //temp = (nz-2) / max_compute_units;
    temp = 1;
    ssor3_lws[0] = temp == 0 ? 1 : temp;
    ssor3_gws[0] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[0]);
  }

  k_blts = clCreateKernel(p_main, "blts", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for blts");
  ecode  = clSetKernelArg(k_blts, 0, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_blts, 1, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_blts, 2, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_blts, 3, sizeof(cl_mem), &m_rho_i);
  ecode |= clSetKernelArg(k_blts, 4, sizeof(int), &nz);
  ecode |= clSetKernelArg(k_blts, 5, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_blts, 6, sizeof(int), &nx);
  clu_CheckError(ecode, "clSetKernelArg()");
  blts_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
  temp = max_work_group_size / blts_lws[0];
  blts_lws[1] = (nz-2) < temp ? (nz-2) : temp;
  blts_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), blts_lws[0]);
  blts_gws[1] = clu_RoundWorkSize((size_t)(nz-2), blts_lws[1]);

  k_buts = clCreateKernel(p_main, "buts", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for buts");
  ecode  = clSetKernelArg(k_buts, 0, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_buts, 1, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_buts, 2, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_buts, 3, sizeof(cl_mem), &m_rho_i);
  ecode |= clSetKernelArg(k_buts, 4, sizeof(int), &nz);
  ecode |= clSetKernelArg(k_buts, 5, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_buts, 6, sizeof(int), &nx);
  clu_CheckError(ecode, "clSetKernelArg()");
  buts_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
  temp = max_work_group_size / buts_lws[0];
  buts_lws[1] = (nz-2) < temp ? (nz-2) : temp;
  buts_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), buts_lws[0]);
  buts_gws[1] = clu_RoundWorkSize((size_t)(nz-2), buts_lws[1]);

  if (timeron) timer_stop(TIMER_OPENCL);
}
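
All of the work-size calculations above lean on clu_RoundWorkSize(), whose definition is outside this listing. The intent is that every global size handed to clEnqueueNDRangeKernel is a multiple of the corresponding local size; a sketch of the assumed rounding behaviour:

/* Assumed behaviour of clu_RoundWorkSize(): round work_size up to the
 * nearest multiple of group_size (not the project's actual source). */
static size_t round_work_size(size_t work_size, size_t group_size)
{
  size_t rem = work_size % group_size;
  return (rem == 0) ? work_size : work_size + (group_size - rem);
}

With this rounding, the kernels themselves have to guard against the padded work-items whose global id falls outside the real problem range.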
Code example #4
File: ep.c  Project: NatTuck/cakemark
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//---------------------------------------------------------------------
void setup_opencl(int argc, char *argv[])
{
  cl_int err_code;
  char *source_dir = "EP";
  if (argc > 1) source_dir = argv[1];

#ifdef TIMER_DETAIL
  if (timers_enabled) {
    int i;
    for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i);
  }
#endif

  DTIMER_START(T_OPENCL_API);

  // 1. Find the default device type and get a device for the device type
  device_type = clu_GetDefaultDeviceType();
  device      = clu_GetAvailableDevice(device_type);
  device_name = clu_GetDeviceName(device);

  // 2. Create a context for the specified device
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &err_code);
  clu_CheckError(err_code, "clCreateContext()");

  // 3. Create a command queue
  cmd_queue = clCreateCommandQueue(context, device, 0, &err_code);
  clu_CheckError(err_code, "clCreateCommandQueue()");

  DTIMER_STOP(T_OPENCL_API);

  // 4. Build the program
  DTIMER_START(T_BUILD);
  char *source_file;
  char build_option[30];
  sprintf(build_option, "-DM=%d -I.", M);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    source_file = "ep_cpu.cl";
    GROUP_SIZE = 16;
  } else {
    source_file = "ep_gpu.cl";
    GROUP_SIZE = 64;
  }
  program = clu_MakeProgram(context, device, source_dir, source_file,
                            build_option);
  DTIMER_STOP(T_BUILD);

  // 5. Create buffers
  DTIMER_START(T_BUFFER_CREATE);

  gq_size  = np / GROUP_SIZE * NQ * sizeof(double);
  gsx_size = np / GROUP_SIZE * sizeof(double);
  gsy_size = np / GROUP_SIZE * sizeof(double);

  pgq = clCreateBuffer(context, CL_MEM_READ_WRITE, gq_size, NULL, &err_code);
  clu_CheckError(err_code, "clCreateBuffer() for pgq");

  pgsx = clCreateBuffer(context, CL_MEM_READ_WRITE, gsx_size,NULL, &err_code);
  clu_CheckError(err_code, "clCreateBuffer() for pgsx");

  pgsy = clCreateBuffer(context, CL_MEM_READ_WRITE, gsy_size,NULL, &err_code);
  clu_CheckError(err_code, "clCreateBuffer() for pgsy");

  DTIMER_STOP(T_BUFFER_CREATE);

  // 6. Create a kernel
  DTIMER_START(T_OPENCL_API);
  kernel = clCreateKernel(program, "embar", &err_code);
  clu_CheckError(err_code, "clCreateKernel()");
  DTIMER_STOP(T_OPENCL_API);
}
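
The listing stops after kernel creation. Below is a hedged sketch of how the "embar" kernel created above might be launched and its per-group partial results read back. The argument indices, the function name, and the assumption that np is a multiple of GROUP_SIZE are illustrative only, since the kernel's real signature is not part of this listing.

/* Illustrative launch of the "embar" kernel using the globals set up above.
 * Argument indices are assumptions; the real kernel may take more arguments. */
static void launch_embar(void)
{
  cl_int ecode;
  size_t local_ws  = GROUP_SIZE;
  size_t global_ws = np;   /* one work-item per block, GROUP_SIZE per group */

  ecode  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &pgq);
  ecode |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &pgsx);
  ecode |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &pgsy);
  clu_CheckError(ecode, "clSetKernelArg()");

  ecode = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL,
                                 &global_ws, &local_ws, 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");

  /* Blocking read of the per-group partial sums; the host then finishes
   * the reduction over the np / GROUP_SIZE groups. */
  double *gq = (double *)malloc(gq_size);
  ecode = clEnqueueReadBuffer(cmd_queue, pgq, CL_TRUE, 0, gq_size, gq,
                              0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueReadBuffer()");
  free(gq);
}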