Пример #1
0
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//
// Picks the default device, queries its limits, creates the context
// and the command queue, builds the device-specific program, creates
// the device buffers and finally creates the kernels, binding the
// kernel arguments that are already known here.
//
// argc/argv: program arguments; when argc > 1, argv[1] replaces the
//            default kernel source directory "FT".
//
// Every OpenCL status code is handed to clu_CheckError(). An
// unsupported device type, an over-long build option string or a
// failed host allocation terminates the program with EXIT_FAILURE.
//---------------------------------------------------------------------
static void setup_opencl(int argc, char *argv[])
{
  size_t temp;
  cl_int ecode;
  char *source_dir = "FT";
  if (argc > 1) source_dir = argv[1];

#ifdef TIMER_DETAIL
  if (timers_enabled) {
    int i;
    for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i);
  }
#endif

  DTIMER_START(T_OPENCL_API);

  // 1. Find the default device type and get a device for the device type
  device_type = clu_GetDefaultDeviceType();
  device      = clu_GetAvailableDevice(device_type);
  device_name = clu_GetDeviceName(device);

  // Device information
  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_ITEM_SIZES,
                          sizeof(work_item_sizes),
                          &work_item_sizes,
                          NULL);
  clu_CheckError(ecode, "clGetDeviceInfo()");

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_GROUP_SIZE,
                          sizeof(size_t),
                          &max_work_group_size,
                          NULL);
  clu_CheckError(ecode, "clGetDeviceInfo()");

  // FIXME: The below values are experimental.
  // Cap the work-group size and every per-dimension work-item size at 64.
  if (max_work_group_size > 64) {
    max_work_group_size = 64;
    int i;
    for (i = 0; i < 3; i++) {
      if (work_item_sizes[i] > 64) {
        work_item_sizes[i] = 64;
      }
    }
  }

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_COMPUTE_UNITS,
                          sizeof(cl_uint),
                          &max_compute_units,
                          NULL);
  clu_CheckError(ecode, "clGetDeviceInfo()");

  // 2. Create a context for the specified device
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode);
  clu_CheckError(ecode, "clCreateContext()");

  // 3. Create a command queue
  cmd_queue = clCreateCommandQueue(context, device, 0, &ecode);
  clu_CheckError(ecode, "clCreateCommandQueue()");

  DTIMER_STOP(T_OPENCL_API);

  // 4. Build the program
  DTIMER_START(T_BUILD);
  char *source_file;
  char build_option[50];
  int option_len;
  if (device_type == CL_DEVICE_TYPE_CPU) {
    source_file = "ft_cpu.cl";
    option_len = snprintf(build_option, sizeof(build_option),
                          "-I. -DCLASS=%d -DUSE_CPU", CLASS);

    COMPUTE_IMAP_DIM = COMPUTE_IMAP_DIM_CPU;
    EVOLVE_DIM = EVOLVE_DIM_CPU;
    CFFTS_DIM = CFFTS_DIM_CPU;

  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    // The kernel source and the FFT local size depend on the GPU vendor.
    char vendor[50];
    ecode = clGetDeviceInfo(device, CL_DEVICE_VENDOR, 50, vendor, NULL);
    clu_CheckError(ecode, "clGetDeviceInfo()");
    if (strncmp(vendor, DEV_VENDOR_NVIDIA, strlen(DEV_VENDOR_NVIDIA)) == 0) {
      source_file = "ft_gpu_nvidia.cl";
      CFFTS_LSIZE = 32;
    } else {
      source_file = "ft_gpu.cl";
      CFFTS_LSIZE = 64;
    }

    option_len = snprintf(build_option, sizeof(build_option),
                          "-I. -DCLASS=\'%c\' -DLSIZE=%lu",
                          CLASS, CFFTS_LSIZE);

    COMPUTE_IMAP_DIM = COMPUTE_IMAP_DIM_GPU;
    EVOLVE_DIM = EVOLVE_DIM_GPU;
    CFFTS_DIM = CFFTS_DIM_GPU;

  } else {
    fprintf(stderr, "Set the environment variable OPENCL_DEVICE_TYPE!\n");
    exit(EXIT_FAILURE);
  }
  // A truncated option string would silently build the wrong program.
  if (option_len < 0 || (size_t)option_len >= sizeof(build_option)) {
    fprintf(stderr, "Build option string does not fit its buffer!\n");
    exit(EXIT_FAILURE);
  }
  program = clu_MakeProgram(context, device, source_dir, source_file,
                            build_option);
  DTIMER_STOP(T_BUILD);

  // 5. Create buffers
  DTIMER_START(T_BUFFER_CREATE);
  m_u = clCreateBuffer(context,
                       CL_MEM_READ_ONLY,
                       sizeof(dcomplex) * NXP,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u");

  m_u0 = clCreateBuffer(context,
                        CL_MEM_READ_WRITE,
                        sizeof(dcomplex) * NTOTALP,
                        NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u0");

  m_u1 = clCreateBuffer(context,
                        CL_MEM_READ_WRITE,
                        sizeof(dcomplex) * NTOTALP,
                        NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u1");

  m_twiddle = clCreateBuffer(context,
                             CL_MEM_READ_WRITE,
                             sizeof(double) * NTOTALP,
                             NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_twiddle");

  // The two scratch buffers m_ty1/m_ty2 are only created for CPU devices.
  if (device_type == CL_DEVICE_TYPE_CPU) {
    size_t ty1_size, ty2_size;
    if (CFFTS_DIM == 2) {
      ty1_size = sizeof(dcomplex) * NX * NY * NZ;
      ty2_size = sizeof(dcomplex) * NX * NY * NZ;
    } else {
      fprintf(stderr, "Wrong CFFTS_DIM: %u\n", CFFTS_DIM);
      exit(EXIT_FAILURE);
    }

    m_ty1 = clCreateBuffer(context,
                           CL_MEM_READ_WRITE,
                           ty1_size,
                           NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_ty1");

    m_ty2 = clCreateBuffer(context,
                           CL_MEM_READ_WRITE,
                           ty2_size,
                           NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_ty2");
  }

  // Work sizes of the checksum kernel: 1024 work-items in total, split
  // across the compute units on a CPU or into groups of 32 on a GPU.
  if (device_type == CL_DEVICE_TYPE_CPU) {
    temp = 1024 / max_compute_units;
    checksum_local_ws  = temp == 0 ? 1 : temp;
    checksum_global_ws = clu_RoundWorkSize((size_t)1024, checksum_local_ws);
  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    checksum_local_ws  = 32;
    checksum_global_ws = clu_RoundWorkSize((size_t)1024, checksum_local_ws);
  }
  // One dcomplex slot per work-group, on the device (m_chk) and on the
  // host (g_chk).
  checksum_wg_num = checksum_global_ws / checksum_local_ws;
  m_chk = clCreateBuffer(context,
                         CL_MEM_READ_WRITE,
                         sizeof(dcomplex) * checksum_wg_num,
                         NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_chk");
  g_chk = (dcomplex *)malloc(sizeof(dcomplex) * checksum_wg_num);
  if (g_chk == NULL) {
    fprintf(stderr, "malloc() failed for g_chk\n");
    exit(EXIT_FAILURE);
  }
  DTIMER_STOP(T_BUFFER_CREATE);

  // 6. Create kernels
  DTIMER_START(T_OPENCL_API);
  double ap = -4.0 * ALPHA * PI * PI;
  int d1 = dims[0];
  int d2 = dims[1];
  int d3 = dims[2];

  k_compute_indexmap = clCreateKernel(program, "compute_indexmap", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for compute_indexmap");
  ecode  = clSetKernelArg(k_compute_indexmap, 0, sizeof(cl_mem), &m_twiddle);
  ecode |= clSetKernelArg(k_compute_indexmap, 1, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_compute_indexmap, 2, sizeof(int), &d2);
  ecode |= clSetKernelArg(k_compute_indexmap, 3, sizeof(int), &d3);
  ecode |= clSetKernelArg(k_compute_indexmap, 4, sizeof(double), &ap);
  clu_CheckError(ecode, "clSetKernelArg() for compute_indexmap");
  // Local/global work sizes of compute_indexmap for a 3-D, 2-D or 1-D
  // NDRange; `temp` carries the work-group budget left for the next
  // dimension.
  if (COMPUTE_IMAP_DIM == 3) {
    cimap_lws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0];
    temp = max_work_group_size / cimap_lws[0];
    cimap_lws[1] = d2 < temp ? d2 : temp;
    temp = temp / cimap_lws[1];
    cimap_lws[2] = d3 < temp ? d3 : temp;

    cimap_gws[0] = clu_RoundWorkSize((size_t)d1, cimap_lws[0]);
    cimap_gws[1] = clu_RoundWorkSize((size_t)d2, cimap_lws[1]);
    cimap_gws[2] = clu_RoundWorkSize((size_t)d3, cimap_lws[2]);
  } else if (COMPUTE_IMAP_DIM == 2) {
    cimap_lws[0] = d2 < work_item_sizes[0] ? d2 : work_item_sizes[0];
    temp = max_work_group_size / cimap_lws[0];
    cimap_lws[1] = d3 < temp ? d3 : temp;

    cimap_gws[0] = clu_RoundWorkSize((size_t)d2, cimap_lws[0]);
    cimap_gws[1] = clu_RoundWorkSize((size_t)d3, cimap_lws[1]);
  } else {
    //temp = d3 / max_compute_units;
    temp = 1;
    cimap_lws[0] = temp == 0 ? 1 : temp;
    cimap_gws[0] = clu_RoundWorkSize((size_t)d3, cimap_lws[0]);
  }

  // Only arguments 2..4 are bound here.
  // NOTE(review): the remaining arguments are presumably set by the
  // caller before each launch -- confirm.
  k_compute_ics = clCreateKernel(program,
                                 "compute_initial_conditions", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for compute_initial_conditions");
  ecode  = clSetKernelArg(k_compute_ics, 2, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_compute_ics, 3, sizeof(int), &d2);
  ecode |= clSetKernelArg(k_compute_ics, 4, sizeof(int), &d3);
  clu_CheckError(ecode, "clSetKernelArg() for compute_initial_conditions");

  // The three cffts kernels get m_u as argument 2 and, on CPU devices
  // only, the scratch buffers as arguments 8 and 9.
  k_cffts1 = clCreateKernel(program, "cffts1", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for cffts1");
  ecode  = clSetKernelArg(k_cffts1, 2, sizeof(cl_mem), &m_u);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_cffts1, 8, sizeof(cl_mem), &m_ty1);
    ecode |= clSetKernelArg(k_cffts1, 9, sizeof(cl_mem), &m_ty2);
  }
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts1");

  k_cffts2 = clCreateKernel(program, "cffts2", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for cffts2");
  ecode  = clSetKernelArg(k_cffts2, 2, sizeof(cl_mem), &m_u);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_cffts2, 8, sizeof(cl_mem), &m_ty1);
    ecode |= clSetKernelArg(k_cffts2, 9, sizeof(cl_mem), &m_ty2);
  }
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts2");

  k_cffts3 = clCreateKernel(program, "cffts3", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for cffts3");
  ecode  = clSetKernelArg(k_cffts3, 2, sizeof(cl_mem), &m_u);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_cffts3, 8, sizeof(cl_mem), &m_ty1);
    ecode |= clSetKernelArg(k_cffts3, 9, sizeof(cl_mem), &m_ty2);
  }
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts3");

  k_evolve = clCreateKernel(program, "evolve", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for evolve");

  // Argument 2 of checksum is local memory: one dcomplex per work-item
  // of a work-group (NULL pointer, size only).
  k_checksum = clCreateKernel(program, "checksum", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for checksum");
  ecode  = clSetKernelArg(k_checksum, 1, sizeof(cl_mem), &m_chk);
  ecode |= clSetKernelArg(k_checksum, 2, sizeof(dcomplex)*checksum_local_ws,
                          NULL);
  ecode |= clSetKernelArg(k_checksum, 3, sizeof(int), &dims[0]);
  ecode |= clSetKernelArg(k_checksum, 4, sizeof(int), &dims[1]);
  clu_CheckError(ecode, "clSetKernelArg() for checksum");
  DTIMER_STOP(T_OPENCL_API);
}
Пример #2
0
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//---------------------------------------------------------------------
static void setup_opencl(int argc, char **argv)
{
  int i, c;
//  size_t temp;
  cl_int ecode = 0;
  char *source_dir = ".";  //FIXME
  int num_subs = DEFAULT_NUM_SUBS;
  int num_cus;
  int sqrt_num_command_queues;

  if (argc > 1) source_dir = argv[1];

  devices = (cl_device_id *)malloc(sizeof(cl_device_id) * num_subs);

  if (timeron) {
    timer_clear(TIMER_OPENCL);
    timer_clear(TIMER_BUILD);
    timer_clear(TIMER_BUFFER);
    timer_clear(TIMER_RELEASE);

    timer_start(TIMER_OPENCL);
  }

  // 1. Find the default device type and get a device for the device type
  //    Then, create sub-devices from the parent device.
  //device_type = CL_DEVICE_TYPE_CPU;
  device_type = CL_DEVICE_TYPE_ALL;
  //device_type = CL_DEVICE_TYPE_GPU;
  if(argc <= 2) {
    printf("Device type argument missing!\n");
	exit(-1);
  }
  char *device_type_str = argv[2];
  if(strcmp(device_type_str, "CPU") == 0 || strcmp(device_type_str, "cpu") == 0) {
  	device_type = CL_DEVICE_TYPE_CPU;
  } else if(strcmp(device_type_str, "GPU") == 0 || strcmp(device_type_str, "gpu") == 0) {
  	device_type = CL_DEVICE_TYPE_GPU;
  } else if(strcmp(device_type_str, "ALL") == 0 || strcmp(device_type_str, "all") == 0) {
  	device_type = CL_DEVICE_TYPE_ALL;
  } else {
    printf("Unsupported device type!\n");
	exit(-1);
  }
  cl_uint num_command_queues = 4;
  char *num_command_queues_str = getenv("SNU_NPB_COMMAND_QUEUES");
  if(num_command_queues_str != NULL)
  	num_command_queues = atoi(num_command_queues_str);

  cl_platform_id platform;
  ecode = clGetPlatformIDs(1, &platform, NULL);
  clu_CheckError(ecode, "clGetPlatformIDs()");

  ecode = clGetDeviceIDs(platform, device_type, 0, NULL, &num_devices);
  clu_CheckError(ecode, "clGetDeviceIDs()");

  //num_devices = 2;
  ecode = clGetDeviceIDs(platform, device_type, num_devices, devices, NULL);
  clu_CheckError(ecode, "clGetDeviceIDs()");
  cl_device_id tmp_dev;

  work_item_sizes[0] = work_item_sizes[1] = work_item_sizes[2] = 1024;
  max_work_group_size = 1024;
  max_compute_units = 22;

  sqrt_num_command_queues = (int)(sqrt((double)(num_command_queues) + 0.00001));
  if (num_command_queues != sqrt_num_command_queues * sqrt_num_command_queues) {
    fprintf(stderr, "Number of devices is not a square of some integer\n");
    exit(EXIT_FAILURE);
  }

  ncells = (int)(sqrt((double)(num_command_queues) + 0.00001));
  MAX_CELL_DIM = ((PROBLEM_SIZE/ncells)+1);
  IMAX = MAX_CELL_DIM;
  JMAX = MAX_CELL_DIM;
  KMAX = MAX_CELL_DIM;
  IMAXP = (IMAX/2*2+1);
  JMAXP = (JMAX/2*2+1);
  //---------------------------------------------------------------------
  // +1 at end to avoid zero length arrays for 1 node
  //---------------------------------------------------------------------
  BUF_SIZE = (MAX_CELL_DIM*MAX_CELL_DIM*(MAXCELLS-1)*60*2+1);


  // FIXME
  if (max_work_group_size > 64) {
    max_work_group_size = 64;
    int i;
    for (i = 0; i < 3; i++) {
      if (work_item_sizes[i] > 64) {
        work_item_sizes[i] = 64;
      }
    }
  }

  // 2. Create a context for devices
#ifdef MINIMD_SNUCL_OPTIMIZATIONS
	cl_context_properties props[5] = {
		CL_CONTEXT_PLATFORM,
		(cl_context_properties)platform,
		CL_CONTEXT_SCHEDULER,
		CL_CONTEXT_SCHEDULER_CODE_SEGMENTED_PERF_MODEL,
		//CL_CONTEXT_SCHEDULER_PERF_MODEL,
		//CL_CONTEXT_SCHEDULER_FIRST_EPOCH_BASED_PERF_MODEL,
		//CL_CONTEXT_SCHEDULER_ALL_EPOCH_BASED_PERF_MODEL,
		0 };
  context = clCreateContext(props, 
#elif defined(SOCL_OPTIMIZATIONS)
	cl_context_properties props[5] = {
		CL_CONTEXT_PLATFORM,
		(cl_context_properties)platform,
		CL_CONTEXT_SCHEDULER_SOCL,
		"dmda",
		//"random",
		0 };
  context = clCreateContext(props, 
#else
  context = clCreateContext(NULL, 
#endif
                            num_devices,
                            devices,
                            NULL, NULL, &ecode);
  clu_CheckError(ecode, "clCreateContext()");

  // 3. Create a command queue
  cmd_queue = (cl_command_queue*)malloc(sizeof(cl_command_queue)*num_command_queues*3);
  for (i = 0; i < num_command_queues * 2; i++) {
    //cmd_queue[i] = clCreateCommandQueue(context, devices[(i / 2) % num_devices], 
#ifdef SOCL_OPTIMIZATIONS
    cmd_queue[i] = clCreateCommandQueue(context, NULL, 
#else    
	cmd_queue[i] = clCreateCommandQueue(context, devices[num_devices - 1 - ((i / 2) % num_devices)],
#endif
   // cmd_queue[i] = clCreateCommandQueue(context, devices[0], 
#ifdef MINIMD_SNUCL_OPTIMIZATIONS
	0,
	//		CL_QUEUE_AUTO_DEVICE_SELECTION | 
	//		CL_QUEUE_ITERATIVE, 
			//CL_QUEUE_COMPUTE_INTENSIVE,
#else
	0,
#endif
	&ecode);
    clu_CheckError(ecode, "clCreateCommandQueue()");
  }

  // 4. Build the program
  if (timeron) timer_start(TIMER_BUILD);
  char *source_file = "sp_kernel.cl";
  //p_program = clu_MakeProgram(context, devices, source_dir, source_file, build_option);
  p_program = clu_CreateProgram(context, source_dir, source_file);
  for(i = 0; i < num_devices; i++) {
	  char build_option[200] = {0};
	  cl_device_type cur_device_type;
	  cl_int err = clGetDeviceInfo(devices[i],
			  CL_DEVICE_TYPE,
			  sizeof(cl_device_type),
			  &cur_device_type,
			  NULL);
	  clu_CheckError(err, "clGetDeviceInfo()");
  if (cur_device_type == CL_DEVICE_TYPE_CPU) {
    sprintf(build_option, "-I. -DCLASS=%d -DUSE_CPU -DMAX_CELL_DIM=%d -DIMAX=%d -DJMAX=%d -DKMAX=%d -DIMAXP=%d -DJMAXP=%d", CLASS, MAX_CELL_DIM, IMAX, JMAX, KMAX, IMAXP, JMAXP);
  } else {
    sprintf(build_option, "-I. -DCLASS=%d -DUSE_GPU -DMAX_CELL_DIM=%d -DIMAX=%d -DJMAX=%d -DKMAX=%d -DIMAXP=%d -DJMAXP=%d", CLASS, MAX_CELL_DIM, IMAX, JMAX, KMAX, IMAXP, JMAXP);
  }

  clu_MakeProgram(p_program, 1, &devices[i], source_dir, build_option);
  //clu_MakeProgram(p_program, num_devices, devices, source_dir, build_option);
  }
  num_devices = num_command_queues;
  program = (cl_program *)malloc(sizeof(cl_program) * num_devices);
  for (i = 0; i < num_devices; i++) {
    program[i] = p_program;
  }
  if (timeron) timer_stop(TIMER_BUILD);

  // 5. Create kernels
  size_t asize = sizeof(cl_kernel) * num_devices;
  k_initialize1 = (cl_kernel *)malloc(asize);
  k_initialize2 = (cl_kernel *)malloc(asize);
  k_initialize3 = (cl_kernel *)malloc(asize);
  k_initialize4 = (cl_kernel *)malloc(asize);
  k_initialize5 = (cl_kernel *)malloc(asize);
  k_initialize6 = (cl_kernel *)malloc(asize);
  k_initialize7 = (cl_kernel *)malloc(asize);
  k_initialize8 = (cl_kernel *)malloc(asize);
  k_lhsinit = (cl_kernel *)malloc(asize);
  k_exact_rhs1 = (cl_kernel *)malloc(asize);
  k_exact_rhs2 = (cl_kernel *)malloc(asize);
  k_exact_rhs3 = (cl_kernel *)malloc(asize);
  k_exact_rhs4 = (cl_kernel *)malloc(asize);
  k_exact_rhs5 = (cl_kernel *)malloc(asize);
  k_copy_faces1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_copy_faces2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_copy_faces3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_copy_faces4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_copy_faces5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_copy_faces6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_compute_rhs1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_compute_rhs2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_compute_rhs3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_compute_rhs4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_compute_rhs5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_compute_rhs6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_txinvr = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_lhsx = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_ninvr = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_x_solve1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_x_solve2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_x_solve3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_x_solve4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_x_solve5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_x_solve6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_lhsy = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_pinvr = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_y_solve1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_y_solve2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_y_solve3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_y_solve4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_y_solve5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_y_solve6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_lhsz = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_tzetar = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_z_solve1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_z_solve2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_z_solve3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_z_solve4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_z_solve5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_z_solve6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_add = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_error_norm = (cl_kernel *)malloc(asize);
  k_rhs_norm = (cl_kernel *)malloc(asize);

  for (i = 0; i < num_devices; i++) {
    k_initialize1[i] = clCreateKernel(program[i], "initialize1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize1");

    k_initialize2[i] = clCreateKernel(program[i], "initialize2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize2");

    k_initialize3[i] = clCreateKernel(program[i], "initialize3", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize3");

    k_initialize4[i] = clCreateKernel(program[i], "initialize4", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize4");

    k_initialize5[i] = clCreateKernel(program[i], "initialize5", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize5");

    k_initialize6[i] = clCreateKernel(program[i], "initialize6", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize6");

    k_initialize7[i] = clCreateKernel(program[i], "initialize7", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize7");

    k_initialize8[i] = clCreateKernel(program[i], "initialize8", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize8");

    k_lhsinit[i] = clCreateKernel(program[i], "lhsinit", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for lhsinit");

    k_exact_rhs1[i] = clCreateKernel(program[i], "exact_rhs1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for exact_rhs1");

    k_exact_rhs2[i] = clCreateKernel(program[i], "exact_rhs2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for exact_rhs2");

    k_exact_rhs3[i] = clCreateKernel(program[i], "exact_rhs3", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for exact_rhs3");

    k_exact_rhs4[i] = clCreateKernel(program[i], "exact_rhs4", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for exact_rhs4");

    k_exact_rhs5[i] = clCreateKernel(program[i], "exact_rhs5", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for exact_rhs5");

    for (c = 0; c < MAXCELLS; c++) {
      k_copy_faces1[i][c] = clCreateKernel(program[i], "copy_faces1", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for copy_faces1");

      k_copy_faces2[i][c] = clCreateKernel(program[i], "copy_faces2", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for copy_faces2");

      k_copy_faces3[i][c] = clCreateKernel(program[i], "copy_faces3", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for copy_faces3");

      k_copy_faces4[i][c] = clCreateKernel(program[i], "copy_faces4", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for copy_faces4");

      k_copy_faces5[i][c] = clCreateKernel(program[i], "copy_faces5", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for copy_faces5");

      k_copy_faces6[i][c] = clCreateKernel(program[i], "copy_faces6", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for copy_faces6");

      k_compute_rhs1[i][c] = clCreateKernel(program[i], "compute_rhs1", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for compute_rhs1");

      k_compute_rhs2[i][c] = clCreateKernel(program[i], "compute_rhs2", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for compute_rhs2");

      k_compute_rhs3[i][c] = clCreateKernel(program[i], "compute_rhs3", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for compute_rhs3");

      k_compute_rhs4[i][c] = clCreateKernel(program[i], "compute_rhs4", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for compute_rhs4");

      k_compute_rhs5[i][c] = clCreateKernel(program[i], "compute_rhs5", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for compute_rhs5");

      k_compute_rhs6[i][c] = clCreateKernel(program[i], "compute_rhs6", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for compute_rhs6");

      k_txinvr[i][c] = clCreateKernel(program[i], "txinvr", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for txinvr");

      k_lhsx[i][c] = clCreateKernel(program[i], "lhsx", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for lhsx");

      k_ninvr[i][c] = clCreateKernel(program[i], "ninvr", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for ninvr");

      k_x_solve1[i][c] = clCreateKernel(program[i], "x_solve1", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for x_solve1");

      k_x_solve2[i][c] = clCreateKernel(program[i], "x_solve2", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for x_solve2");

      k_x_solve3[i][c] = clCreateKernel(program[i], "x_solve3", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for x_solve3");

      k_x_solve4[i][c] = clCreateKernel(program[i], "x_solve4", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for x_solve4");

      k_x_solve5[i][c] = clCreateKernel(program[i], "x_solve5", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for x_solve5");

      k_x_solve6[i][c] = clCreateKernel(program[i], "x_solve6", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for x_solve6");

      k_lhsy[i][c] = clCreateKernel(program[i], "lhsy", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for lhsy");

      k_pinvr[i][c] = clCreateKernel(program[i], "pinvr", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for pinvr");

      k_y_solve1[i][c] = clCreateKernel(program[i], "y_solve1", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for y_solve1");

      k_y_solve2[i][c] = clCreateKernel(program[i], "y_solve2", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for y_solve2");

      k_y_solve3[i][c] = clCreateKernel(program[i], "y_solve3", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for y_solve3");

      k_y_solve4[i][c] = clCreateKernel(program[i], "y_solve4", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for y_solve4");

      k_y_solve5[i][c] = clCreateKernel(program[i], "y_solve5", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for y_solve5");

      k_y_solve6[i][c] = clCreateKernel(program[i], "y_solve6", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for y_solve6");

      k_lhsz[i][c] = clCreateKernel(program[i], "lhsz", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for lhsz");

      k_tzetar[i][c] = clCreateKernel(program[i], "tzetar", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for tzetar");

      k_z_solve1[i][c] = clCreateKernel(program[i], "z_solve1", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for z_solve1");

      k_z_solve2[i][c] = clCreateKernel(program[i], "z_solve2", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for z_solve2");

      k_z_solve3[i][c] = clCreateKernel(program[i], "z_solve3", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for z_solve3");

      k_z_solve4[i][c] = clCreateKernel(program[i], "z_solve4", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for z_solve4");

      k_z_solve5[i][c] = clCreateKernel(program[i], "z_solve5", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for z_solve5");

      k_z_solve6[i][c] = clCreateKernel(program[i], "z_solve6", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for z_solve6");

      k_add[i][c] = clCreateKernel(program[i], "add", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for add");
    }

    k_error_norm[i] = clCreateKernel(program[i], "error_norm", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for error_norm");

    k_rhs_norm[i] = clCreateKernel(program[i], "rhs_norm", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rhs_norm");
  }

  // 6. Create buffers
  if (timeron) timer_start(TIMER_BUFFER);

  asize = sizeof(cl_mem) * num_devices;

  m_u = (cl_mem *)malloc(asize);
  m_us = (cl_mem *)malloc(asize);
  m_vs = (cl_mem *)malloc(asize);
  m_ws = (cl_mem *)malloc(asize);
  m_qs = (cl_mem *)malloc(asize);
  m_ainv = (cl_mem *)malloc(asize);
  m_rho_i = (cl_mem *)malloc(asize);
  m_speed = (cl_mem *)malloc(asize);
  m_square = (cl_mem *)malloc(asize);
  m_rhs = (cl_mem *)malloc(asize);
  m_forcing = (cl_mem *)malloc(asize);
  m_lhs = (cl_mem *)malloc(asize);
  m_in_buffer = (cl_mem *)malloc(asize);
  m_out_buffer = (cl_mem *)malloc(asize);

  m_ce = (cl_mem *)malloc(asize);

  for (i = 0; i < num_devices; i++) {
    m_u[i] = clCreateBuffer(context,
                            CL_MEM_READ_WRITE,
                            sizeof(double)*MAXCELLS*(KMAX+4)*(JMAXP+4)*(IMAXP+4)*5,
                            NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_u");

    m_us[i] = clCreateBuffer(context,
                             CL_MEM_READ_WRITE,
                             sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                             NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_us");

    m_vs[i] = clCreateBuffer(context,
                             CL_MEM_READ_WRITE,
                             sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                             NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_vs");

    m_ws[i] = clCreateBuffer(context,
                             CL_MEM_READ_WRITE,
                             sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                             NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_ws");

    m_qs[i] = clCreateBuffer(context,
                             CL_MEM_READ_WRITE,
                             sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                             NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_qs");

    m_ainv[i] = clCreateBuffer(context,
                               CL_MEM_READ_WRITE,
                               sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                               NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_ainv");

    m_rho_i[i] = clCreateBuffer(context,
                                CL_MEM_READ_WRITE,
                                sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                                NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_rho_i");

    m_speed[i] = clCreateBuffer(context,
                                CL_MEM_READ_WRITE,
                                sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                                NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_speed");

    m_square[i] = clCreateBuffer(context,
                                 CL_MEM_READ_WRITE,
                                 sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                                 NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_square");

    m_rhs[i] = clCreateBuffer(context,
                              CL_MEM_READ_WRITE,
                              sizeof(double)*MAXCELLS*KMAX*JMAXP*IMAXP*5,
                              NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_rhs");

    m_forcing[i] = clCreateBuffer(context,
                                  CL_MEM_READ_WRITE,
                                  sizeof(double)*MAXCELLS*KMAX*JMAXP*IMAXP*5,
                                  NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_forcing");

    m_lhs[i] = clCreateBuffer(context,
                              CL_MEM_READ_WRITE,
                              sizeof(double)*MAXCELLS*KMAX*JMAXP*IMAXP*15,
                              NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_lhs");

    m_in_buffer[i] = clCreateBuffer(context,
                                    CL_MEM_READ_WRITE,
                                    sizeof(double)*BUF_SIZE,
                                    NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_in_buffer");

    m_out_buffer[i] = clCreateBuffer(context,
                                     CL_MEM_READ_WRITE,
                                     sizeof(double)*BUF_SIZE,
                                     NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_out_buffer");

    m_ce[i] = clCreateBuffer(context,
                             CL_MEM_READ_ONLY,
                             sizeof(double)*5*13,
                             NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_ce");
  }

  if (timeron) timer_stop(TIMER_BUFFER);

  if (timeron) timer_stop(TIMER_OPENCL);
}
Пример #3
0
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//
// Steps performed, in order:
//   1. pick the default device and query its work-item / work-group limits
//   2. create a context for that device
//   3. create the command queue
//   4. build the device-specific program (is_cpu.cl or is_gpu.cl) and
//      choose the group/global sizes used by the kernels
//   5. create the device buffers
//   6. create the rank kernels and bind their fixed arguments
//
// argc, argv: when argc > 1, argv[1] replaces the default kernel source
//             directory "IS".
//
// Every OpenCL status code is handed to clu_CheckError(). The process
// exits with EXIT_FAILURE when the device type is neither CPU nor GPU.
//---------------------------------------------------------------------
static void setup_opencl(int argc, char *argv[])
{
  cl_int ecode;
  char *source_dir = "IS";
  if (argc > 1) source_dir = argv[1];

#ifdef TIMER_DETAIL
  if (timer_on) {
    int i;
    for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i);
  }
#endif

  DTIMER_START(T_OPENCL_API);

  // 1. Find the default device type and get a device for the device type
  device_type = clu_GetDefaultDeviceType();
  device      = clu_GetAvailableDevice(device_type);
  device_name = clu_GetDeviceName(device);

  // Device information
  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_ITEM_SIZES,
                          sizeof(work_item_sizes),
                          &work_item_sizes,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_GROUP_SIZE,
                          sizeof(size_t),
                          &max_work_group_size,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  // FIXME: The below values are experimental.
  // Cap the work-group size and every per-dimension work-item size at 256.
  if (max_work_group_size > 256) {
    max_work_group_size = 256;
    int i;
    for (i = 0; i < 3; i++) {
      if (work_item_sizes[i] > 256) {
        work_item_sizes[i] = 256;
      }
    }
  }

  // 2. Create a context for the specified device
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode);
  clu_CheckError(ecode, "clCreateContext()");

  // 3. Create a command queue
  cmd_queue = clCreateCommandQueue(context, device, 0, &ecode);
  clu_CheckError(ecode, "clCreateCommandQueue()");

  DTIMER_STOP(T_OPENCL_API);

  // 4. Build the program
  DTIMER_START(T_BUILD);
  char *source_file;
  char build_option[30];
  if (device_type == CL_DEVICE_TYPE_CPU) {
    source_file = "is_cpu.cl";
    // The CPU build receives CLASS as its integer value.
    snprintf(build_option, sizeof(build_option), "-DCLASS=%d -I.", CLASS);

    CREATE_SEQ_GROUP_SIZE = 64;
    CREATE_SEQ_GLOBAL_SIZE = CREATE_SEQ_GROUP_SIZE * 256;
    RANK_GROUP_SIZE = 1;
    RANK_GLOBAL_SIZE = RANK_GROUP_SIZE * 128;
    RANK1_GROUP_SIZE = 1;
    RANK1_GLOBAL_SIZE = RANK1_GROUP_SIZE * RANK_GLOBAL_SIZE;
    RANK2_GROUP_SIZE = RANK_GROUP_SIZE;
    RANK2_GLOBAL_SIZE = RANK_GLOBAL_SIZE;
    FV2_GROUP_SIZE = 64;
    FV2_GLOBAL_SIZE = FV2_GROUP_SIZE * 256;
  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    source_file = "is_gpu.cl";
    // The GPU build receives CLASS as a quoted character literal.
    snprintf(build_option, sizeof(build_option), "-DCLASS=\'%c\' -I.", CLASS);

    CREATE_SEQ_GROUP_SIZE = 64;
    CREATE_SEQ_GLOBAL_SIZE = CREATE_SEQ_GROUP_SIZE * 256;
    RANK1_GROUP_SIZE = work_item_sizes[0];
    RANK1_GLOBAL_SIZE = MAX_KEY;
    RANK2_GROUP_SIZE = work_item_sizes[0];
    RANK2_GLOBAL_SIZE = NUM_KEYS;
    FV2_GROUP_SIZE = work_item_sizes[0];
    FV2_GLOBAL_SIZE = NUM_KEYS;
  } else {
    fprintf(stderr, "%s: not supported.", clu_GetDeviceTypeName(device_type));
    exit(EXIT_FAILURE);
  }
  program = clu_MakeProgram(context, device, source_dir, source_file,
                            build_option);
  DTIMER_STOP(T_BUILD);

  // 5. Create buffers
  DTIMER_START(T_BUFFER_CREATE);
  m_key_array = clCreateBuffer(context,
                               CL_MEM_READ_WRITE,
                               sizeof(INT_TYPE) * SIZE_OF_BUFFERS,
                               NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_key_array");

  m_key_buff1 = clCreateBuffer(context,
                               CL_MEM_READ_WRITE,
                               sizeof(INT_TYPE) * MAX_KEY,
                               NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_key_buff1");

  m_key_buff2 = clCreateBuffer(context,
                               CL_MEM_READ_WRITE,
                               sizeof(INT_TYPE) * SIZE_OF_BUFFERS,
                               NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_key_buff2");

  // The two verification tables are copied from host memory at creation.
  size_t test_array_size = sizeof(INT_TYPE) * TEST_ARRAY_SIZE;
  m_index_array = clCreateBuffer(context,
                                 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                 test_array_size,
                                 test_index_array, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_index_array");

  m_rank_array = clCreateBuffer(context,
                                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                test_array_size,
                                test_rank_array, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_rank_array");

  m_partial_vals = clCreateBuffer(context,
                                  CL_MEM_WRITE_ONLY,
                                  test_array_size,
                                  NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_partial_vals");

  m_passed_verification = clCreateBuffer(context,
                                         CL_MEM_READ_WRITE,
                                         sizeof(cl_int),
                                         NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_passed_verification");

  if (device_type == CL_DEVICE_TYPE_GPU) {
    // GPU-only buffers.
    m_key_scan = clCreateBuffer(context,
                                CL_MEM_READ_WRITE,
                                sizeof(INT_TYPE) * MAX_KEY,
                                NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_key_buff1_scan");

    m_sum = clCreateBuffer(context,
                           CL_MEM_READ_WRITE,
                           sizeof(INT_TYPE) * work_item_sizes[0],
                           NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_sum");
  } else {
    // CPU-only buffers: NUM_BUCKETS entries per rank work-item.
    size_t bs_size = RANK_GLOBAL_SIZE * sizeof(INT_TYPE) * NUM_BUCKETS;
    m_bucket_size = clCreateBuffer(context,
                                   CL_MEM_READ_WRITE,
                                   bs_size,
                                   NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_bucket_size");

    m_bucket_ptrs = clCreateBuffer(context,
                                   CL_MEM_READ_WRITE,
                                   bs_size,
                                   NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_bucket_ptrs");
  }
  DTIMER_STOP(T_BUFFER_CREATE);

  // 6. Create kernels
  // Status codes of consecutive clSetKernelArg() calls are OR-ed together
  // so a single clu_CheckError() covers every argument of a kernel.
  DTIMER_START(T_OPENCL_API);
  k_rank0 = clCreateKernel(program, "rank0", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rank0");
  ecode  = clSetKernelArg(k_rank0, 0, sizeof(cl_mem), (void*)&m_key_array);
  ecode |= clSetKernelArg(k_rank0, 1, sizeof(cl_mem), (void*)&m_partial_vals);
  ecode |= clSetKernelArg(k_rank0, 2, sizeof(cl_mem), (void*)&m_index_array);
  clu_CheckError(ecode, "clSetKernelArg() for rank0");

  if (device_type == CL_DEVICE_TYPE_GPU) {
    k_rank1 = clCreateKernel(program, "rank1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank1");
    ecode  = clSetKernelArg(k_rank1, 0, sizeof(cl_mem), (void*)&m_key_buff1);
    clu_CheckError(ecode, "clSetKernelArg() for rank1");

    k_rank2 = clCreateKernel(program, "rank2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank2");
    ecode  = clSetKernelArg(k_rank2, 0, sizeof(cl_mem), (void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank2, 1, sizeof(cl_mem), (void*)&m_key_array);
    clu_CheckError(ecode, "clSetKernelArg() for rank2");

    k_rank3_0 = clCreateKernel(program, "rank3_0", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3_0");
    ecode  = clSetKernelArg(k_rank3_0, 0, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3_0, 1, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3_0, 2, sizeof(cl_mem),(void*)&m_sum);
    // NULL argument value with a size: a __local scratch area for the kernel.
    ecode |= clSetKernelArg(k_rank3_0, 3, 
                            sizeof(INT_TYPE) * work_item_sizes[0] * 2,
                            NULL);
    clu_CheckError(ecode, "clSetKernelArg() for rank3_0");

    k_rank3_1 = clCreateKernel(program, "rank3_1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3_1");
    ecode  = clSetKernelArg(k_rank3_1, 0, sizeof(cl_mem), (void*)&m_sum);
    // Accumulate with |= so a failure on argument 0 is not overwritten.
    ecode |= clSetKernelArg(k_rank3_1, 1, sizeof(cl_mem), (void*)&m_sum);
    ecode |= clSetKernelArg(k_rank3_1, 2, 
                            sizeof(INT_TYPE) * work_item_sizes[0] * 2,
                            NULL);
    clu_CheckError(ecode, "clSetKernelArg() for rank3_1");

    k_rank3_2 = clCreateKernel(program, "rank3_2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3_2");
    ecode  = clSetKernelArg(k_rank3_2, 0, sizeof(cl_mem),(void*)&m_key_buff1);
    // Accumulate with |= so a failure on argument 0 is not overwritten.
    ecode |= clSetKernelArg(k_rank3_2, 1, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3_2, 2, sizeof(cl_mem),(void*)&m_sum);
    clu_CheckError(ecode, "clSetKernelArg() for rank3_2");
  } else {
    k_rank1 = clCreateKernel(program, "rank1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank1");
    ecode  = clSetKernelArg(k_rank1, 0, sizeof(cl_mem),(void*)&m_key_array);
    ecode |= clSetKernelArg(k_rank1, 1, sizeof(cl_mem),(void*)&m_bucket_size);
    clu_CheckError(ecode, "clSetKernelArg() for rank1");

    k_rank2 = clCreateKernel(program, "rank2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank2");
    ecode  = clSetKernelArg(k_rank2, 0, sizeof(cl_mem),(void*)&m_key_array);
    ecode |= clSetKernelArg(k_rank2, 1, sizeof(cl_mem),(void*)&m_bucket_size);
    ecode |= clSetKernelArg(k_rank2, 2, sizeof(cl_mem),(void*)&m_bucket_ptrs);
    ecode |= clSetKernelArg(k_rank2, 3, sizeof(cl_mem),(void*)&m_key_buff2);
    clu_CheckError(ecode, "clSetKernelArg() for rank2");

    k_rank3 = clCreateKernel(program, "rank3", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3");
    ecode  = clSetKernelArg(k_rank3, 0, sizeof(cl_mem),(void*)&m_bucket_size);
    ecode |= clSetKernelArg(k_rank3, 1, sizeof(cl_mem),(void*)&m_bucket_ptrs);
    ecode |= clSetKernelArg(k_rank3, 2, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3, 3, sizeof(cl_mem),(void*)&m_key_buff2);
    clu_CheckError(ecode, "clSetKernelArg() for rank3");
  }

  k_rank4 = clCreateKernel(program, "rank4", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rank4");
  ecode  = clSetKernelArg(k_rank4, 0, sizeof(cl_mem), (void*)&m_partial_vals);
  ecode |= clSetKernelArg(k_rank4, 1, sizeof(cl_mem), (void*)&m_key_buff1);
  ecode |= clSetKernelArg(k_rank4, 2, sizeof(cl_mem), (void*)&m_rank_array);
  ecode |= clSetKernelArg(k_rank4, 3, sizeof(cl_mem),
                                      (void*)&m_passed_verification);
  clu_CheckError(ecode, "clSetKernelArg() for rank4");
  DTIMER_STOP(T_OPENCL_API);
}
Пример #4
0
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//---------------------------------------------------------------------
static void setup_opencl(int argc, char *argv[])
{
  int i;
  size_t temp, wg_num;
  cl_int ecode;
  char *source_dir = "LU";

  if (timeron) {
    timer_clear(TIMER_OPENCL);
    timer_clear(TIMER_BUILD);
    timer_clear(TIMER_BUFFER);
    timer_clear(TIMER_RELEASE);

    timer_start(TIMER_OPENCL);
  }

  if (argc > 1) source_dir = argv[1];

  //-----------------------------------------------------------------------
  // 1. Find the default device type and get a device for the device type
  //-----------------------------------------------------------------------
  device_type = clu_GetDefaultDeviceType();
  device      = clu_GetAvailableDevice(device_type);
  device_name = clu_GetDeviceName(device);

  // Device information
  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_ITEM_SIZES,
                          sizeof(work_item_sizes),
                          &work_item_sizes,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_GROUP_SIZE,
                          sizeof(size_t),
                          &max_work_group_size,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_COMPUTE_UNITS,
                          sizeof(cl_uint),
                          &max_compute_units,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  ////////////////////////////////////////////////////////////////////////
  // FIXME: The below values are experimental.
  size_t default_wg_size = 64;
  if (device_type == CL_DEVICE_TYPE_CPU) {
    if (CLASS == 'B') default_wg_size = 128;
  } else {
    if (CLASS == 'B') default_wg_size = 32;
  }
  if (max_work_group_size > default_wg_size) {
    max_work_group_size = default_wg_size;
    int i;
    for (i = 0; i < 3; i++) {
      if (work_item_sizes[i] > default_wg_size) {
        work_item_sizes[i] = default_wg_size;
      }
    }
  }
  if (device_type == CL_DEVICE_TYPE_CPU) {
    SETBV1_DIM = SETBV1_DIM_CPU;
    SETBV2_DIM = SETBV2_DIM_CPU;
    SETBV3_DIM = SETBV3_DIM_CPU;
    SETIV_DIM = SETIV_DIM_CPU;
    ERHS1_DIM = ERHS1_DIM_CPU;
    ERHS2_DIM = ERHS2_DIM_CPU;
    ERHS3_DIM = ERHS3_DIM_CPU;
    ERHS4_DIM = ERHS4_DIM_CPU;
    PINTGR1_DIM = PINTGR1_DIM_CPU;
    PINTGR2_DIM = PINTGR2_DIM_CPU;
    PINTGR3_DIM = PINTGR3_DIM_CPU;
    RHS_DIM  = RHS_DIM_CPU;
    RHSX_DIM = RHSX_DIM_CPU;
    RHSY_DIM = RHSY_DIM_CPU;
    RHSZ_DIM = RHSZ_DIM_CPU;
    SSOR2_DIM = SSOR2_DIM_CPU;
    SSOR3_DIM = SSOR3_DIM_CPU;
  } else {
    SETBV1_DIM = SETBV1_DIM_GPU;
    SETBV2_DIM = SETBV2_DIM_GPU;
    SETBV3_DIM = SETBV3_DIM_GPU;
    SETIV_DIM = SETIV_DIM_GPU;
    ERHS1_DIM = ERHS1_DIM_GPU;
    ERHS2_DIM = ERHS2_DIM_GPU;
    ERHS3_DIM = ERHS3_DIM_GPU;
    ERHS4_DIM = ERHS4_DIM_GPU;
    PINTGR1_DIM = PINTGR1_DIM_GPU;
    PINTGR2_DIM = PINTGR2_DIM_GPU;
    PINTGR3_DIM = PINTGR3_DIM_GPU;
    RHS_DIM  = RHS_DIM_GPU;
    RHSX_DIM = RHSX_DIM_GPU;
    RHSY_DIM = RHSY_DIM_GPU;
    RHSZ_DIM = RHSZ_DIM_GPU;
    SSOR2_DIM = SSOR2_DIM_GPU;
    SSOR3_DIM = SSOR3_DIM_GPU;
  }
  ////////////////////////////////////////////////////////////////////////

  //-----------------------------------------------------------------------
  // 2. Create a context for the specified device
  //-----------------------------------------------------------------------
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode);
  clu_CheckError(ecode, "clCreateContext()");

  //-----------------------------------------------------------------------
  // 3. Create command queues
  //-----------------------------------------------------------------------
  cmd_queue = clCreateCommandQueue(context, device, 0, &ecode);
  clu_CheckError(ecode, "clCreateCommandQueue()");

  max_pipeline = (jend-jst) < max_compute_units ? (jend-jst) : max_compute_units;
  pipe_queue = (cl_command_queue *)malloc(sizeof(cl_command_queue) * max_pipeline);
  for (i = 0; i < max_pipeline; i++) {
    pipe_queue[i] = clCreateCommandQueue(context, device, 0, &ecode);
    clu_CheckError(ecode, "clCreateCommandQueue()");
  }

  //-----------------------------------------------------------------------
  // 4. Build programs
  //-----------------------------------------------------------------------
  if (timeron) timer_start(TIMER_BUILD);
  char build_option[100];

  if (device_type == CL_DEVICE_TYPE_CPU) {
    sprintf(build_option, "-I. -DCLASS=%d -DUSE_CPU", CLASS);
  } else {
    sprintf(build_option, "-I. -DCLASS=\'%c\'", CLASS);
  }

  p_pre = clu_MakeProgram(context, device, source_dir,
                          "kernel_pre.cl",
                          build_option);

  p_main = clu_MakeProgram(context, device, source_dir,
                          (device_type == CL_DEVICE_TYPE_CPU ? "kernel_main_cpu.cl" : "kernel_main_gpu.cl"),
                          build_option);

  p_post = clu_MakeProgram(context, device, source_dir,
                          "kernel_post.cl",
                          build_option);
  if (timeron) timer_stop(TIMER_BUILD);

  //-----------------------------------------------------------------------
  // 5. Create buffers
  //-----------------------------------------------------------------------
  if (timeron) timer_start(TIMER_BUFFER);
  m_ce = clCreateBuffer(context,
                        CL_MEM_READ_ONLY,
                        sizeof(double)*5*13,
                        NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_ce");

  m_u = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u");

  m_rsd = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_rsd");

  m_frct = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_frct");

  m_qs = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1),
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_qs");

  m_rho_i = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1),
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_rho_i");

  // workspace for work-items
  size_t max_work_items;
  if (ERHS2_DIM == 1 && ERHS3_DIM == 1 && ERHS4_DIM == 1) {
    max_work_items = ISIZ3;
  } else {
    max_work_items = ISIZ3*ISIZ2;
  }
  m_flux = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*ISIZ1*5 * max_work_items,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_flux");

  if (RHSZ_DIM == 1) {
    max_work_items = ISIZ2;
  } else {
    max_work_items = ISIZ2*ISIZ1;
  }

  if (device_type == CL_DEVICE_TYPE_CPU) {
    m_utmp = clCreateBuffer(context,
                         CL_MEM_READ_WRITE,
                         sizeof(double)*ISIZ3*6 * max_work_items,
                         NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_utmp");

    m_rtmp = clCreateBuffer(context,
                         CL_MEM_READ_WRITE,
                         sizeof(double)*ISIZ3*5 * max_work_items,
                         NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_rtmp");
  }

  temp = (nz0-2) / max_compute_units;
  l2norm_lws[0] = temp == 0 ? 1 : temp;
  l2norm_gws[0] = clu_RoundWorkSize((size_t)(nz0-2), l2norm_lws[0]);
  wg_num = l2norm_gws[0] / l2norm_lws[0];
  sum_size = sizeof(double) * 5 * wg_num;
  m_sum = clCreateBuffer(context,
                         CL_MEM_READ_WRITE,
                         sum_size, 
                         NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer()");

  if (timeron) timer_stop(TIMER_BUFFER);

  //-----------------------------------------------------------------------
  // 6. Create kernels
  //-----------------------------------------------------------------------
  k_setbv1 = clCreateKernel(p_pre, "setbv1", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for setbv1");
  ecode  = clSetKernelArg(k_setbv1, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_setbv1, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_setbv1, 2, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_setbv1, 3, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_setbv1, 4, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SETBV1_DIM == 3) {
    setbv1_lws[0] = 5;
    temp = max_work_group_size / setbv1_lws[0];
    setbv1_lws[1] = nx < temp ? nx : temp;
    temp = temp / setbv1_lws[1];
    setbv1_lws[2] = ny < temp ? ny : temp;
    setbv1_gws[0] = clu_RoundWorkSize((size_t)5, setbv1_lws[0]);
    setbv1_gws[1] = clu_RoundWorkSize((size_t)nx, setbv1_lws[1]);
    setbv1_gws[2] = clu_RoundWorkSize((size_t)ny, setbv1_lws[2]);
  } else if (SETBV1_DIM == 2) {
    setbv1_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0];
    temp = max_work_group_size / setbv1_lws[0];
    setbv1_lws[1] = ny < temp ? ny : temp;
    setbv1_gws[0] = clu_RoundWorkSize((size_t)nx, setbv1_lws[0]);
    setbv1_gws[1] = clu_RoundWorkSize((size_t)ny, setbv1_lws[1]);
  } else {
    temp = ny / max_compute_units;
    setbv1_lws[0] = temp == 0 ? 1 : temp;
    setbv1_gws[0] = clu_RoundWorkSize((size_t)ny, setbv1_lws[0]);
  }

  k_setbv2 = clCreateKernel(p_pre, "setbv2", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for setbv2");
  ecode  = clSetKernelArg(k_setbv2, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_setbv2, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_setbv2, 2, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_setbv2, 3, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_setbv2, 4, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SETBV2_DIM == 3) {
    setbv2_lws[0] = 5;
    temp = max_work_group_size / setbv2_lws[0];
    setbv2_lws[1] = nx < temp ? nx : temp;
    temp = temp / setbv2_lws[1];
    setbv2_lws[2] = nz < temp ? nz : temp;
    setbv2_gws[0] = clu_RoundWorkSize((size_t)5, setbv2_lws[0]);
    setbv2_gws[1] = clu_RoundWorkSize((size_t)nx, setbv2_lws[1]);
    setbv2_gws[2] = clu_RoundWorkSize((size_t)nz, setbv2_lws[2]);
  } else if (SETBV2_DIM == 2) {
    setbv2_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0];
    temp = max_work_group_size / setbv2_lws[0];
    setbv2_lws[1] = nz < temp ? nz : temp;
    setbv2_gws[0] = clu_RoundWorkSize((size_t)nx, setbv2_lws[0]);
    setbv2_gws[1] = clu_RoundWorkSize((size_t)nz, setbv2_lws[1]);
  } else {
    temp = nz / max_compute_units;
    setbv2_lws[0] = temp == 0 ? 1 : temp;
    setbv2_gws[0] = clu_RoundWorkSize((size_t)nz, setbv2_lws[0]);
  }

  k_setbv3 = clCreateKernel(p_pre, "setbv3", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for setbv3");
  ecode  = clSetKernelArg(k_setbv3, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_setbv3, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_setbv3, 2, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_setbv3, 3, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_setbv3, 4, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SETBV3_DIM == 3) {
    setbv3_lws[0] = 5;
    temp = max_work_group_size / setbv3_lws[0];
    setbv3_lws[1] = ny < temp ? ny : temp;
    temp = temp / setbv3_lws[1];
    setbv3_lws[2] = nz < temp ? nz : temp;
    setbv3_gws[0] = clu_RoundWorkSize((size_t)5, setbv3_lws[0]);
    setbv3_gws[1] = clu_RoundWorkSize((size_t)ny, setbv3_lws[1]);
    setbv3_gws[2] = clu_RoundWorkSize((size_t)nz, setbv3_lws[2]);
  } else if (SETBV3_DIM == 2) {
    setbv3_lws[0] = ny < work_item_sizes[0] ? ny : work_item_sizes[0];
    temp = max_work_group_size / setbv3_lws[0];
    setbv3_lws[1] = nz < temp ? nz : temp;
    setbv3_gws[0] = clu_RoundWorkSize((size_t)ny, setbv3_lws[0]);
    setbv3_gws[1] = clu_RoundWorkSize((size_t)nz, setbv3_lws[1]);
  } else {
    temp = nz / max_compute_units;
    setbv3_lws[0] = temp == 0 ? 1 : temp;
    setbv3_gws[0] = clu_RoundWorkSize((size_t)nz, setbv3_lws[0]);
  }

  k_setiv = clCreateKernel(p_pre, "setiv", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for setiv");
  ecode  = clSetKernelArg(k_setiv, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_setiv, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_setiv, 2, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_setiv, 3, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_setiv, 4, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SETIV_DIM == 3) {
    setiv_lws[0] = (nx-2) < work_item_sizes[0] ? (nx-2) : work_item_sizes[0];
    temp = max_work_group_size / setiv_lws[0];
    setiv_lws[1] = (ny-2) < temp ? (ny-2) : temp;
    temp = temp / setiv_lws[1];
    setiv_lws[2] = (nz-2) < temp ? (nz-2) : temp;
    setiv_gws[0] = clu_RoundWorkSize((size_t)(nx-2), setiv_lws[0]);
    setiv_gws[1] = clu_RoundWorkSize((size_t)(ny-2), setiv_lws[1]);
    setiv_gws[2] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[2]);
  } else if (SETIV_DIM == 2) {
    setiv_lws[0] = (ny-2) < work_item_sizes[0] ? (ny-2) : work_item_sizes[0];
    temp = max_work_group_size / setiv_lws[0];
    setiv_lws[1] = (nz-2) < temp ? (nz-2) : temp;
    setiv_gws[0] = clu_RoundWorkSize((size_t)(ny-2), setiv_lws[0]);
    setiv_gws[1] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[1]);
  } else {
    temp = (nz-2) / max_compute_units;
    setiv_lws[0] = temp == 0 ? 1 : temp;
    setiv_gws[0] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[0]);
  }

  k_l2norm = clCreateKernel(p_main, "l2norm", &ecode);
  clu_CheckError(ecode, "clCreateKernel()");
  ecode  = clSetKernelArg(k_l2norm, 1, sizeof(cl_mem), &m_sum);
  ecode |= clSetKernelArg(k_l2norm, 2, sizeof(double)*5*l2norm_lws[0], NULL);
  clu_CheckError(ecode, "clSetKernelArg()");

  k_rhs = clCreateKernel(p_main, "rhs", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rhs");
  ecode  = clSetKernelArg(k_rhs, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_rhs, 1, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_rhs, 2, sizeof(cl_mem), &m_frct);
  ecode |= clSetKernelArg(k_rhs, 3, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_rhs, 4, sizeof(cl_mem), &m_rho_i);
  ecode |= clSetKernelArg(k_rhs, 5, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_rhs, 6, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_rhs, 7, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (RHS_DIM == 3) {
    rhs_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0];
    temp = max_work_group_size / rhs_lws[0];
    rhs_lws[1] = ny < temp ? ny : temp;
    temp = temp / rhs_lws[1];
    rhs_lws[2] = nz < temp ? nz : temp;
    rhs_gws[0] = clu_RoundWorkSize((size_t)nx, rhs_lws[0]);
    rhs_gws[1] = clu_RoundWorkSize((size_t)ny, rhs_lws[1]);
    rhs_gws[2] = clu_RoundWorkSize((size_t)nz, rhs_lws[2]);
  } else if (RHS_DIM == 2) {
    rhs_lws[0] = ny < work_item_sizes[0] ? ny : work_item_sizes[0];
    temp = max_work_group_size / rhs_lws[0];
    rhs_lws[1] = nz < temp ? nz : temp;
    rhs_gws[0] = clu_RoundWorkSize((size_t)ny, rhs_lws[0]);
    rhs_gws[1] = clu_RoundWorkSize((size_t)nz, rhs_lws[1]);
  } else {
    //temp = nz / max_compute_units;
    temp = 1;
    rhs_lws[0] = temp == 0 ? 1 : temp;
    rhs_gws[0] = clu_RoundWorkSize((size_t)nz, rhs_lws[0]);
  }

  k_rhsx = clCreateKernel(p_main, "rhsx", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rhsx");
  ecode  = clSetKernelArg(k_rhsx, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_rhsx, 1, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_rhsx, 2, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_rhsx, 3, sizeof(cl_mem), &m_rho_i);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_rhsx, 4, sizeof(cl_mem), &m_flux);
    ecode |= clSetKernelArg(k_rhsx, 5, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsx, 6, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsx, 7, sizeof(int), &nz);
  } else {
    ecode |= clSetKernelArg(k_rhsx, 4, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsx, 5, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsx, 6, sizeof(int), &nz);
  }
  clu_CheckError(ecode, "clSetKernelArg()");
  if (RHSX_DIM == 2) {
    rhsx_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
    temp = max_work_group_size / rhsx_lws[0];
    rhsx_lws[1] = (nz-2) < temp ? (nz-2) : temp;
    rhsx_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), rhsx_lws[0]);
    rhsx_gws[1] = clu_RoundWorkSize((size_t)(nz-2), rhsx_lws[1]);
  } else {
    //temp = (nz-2) / max_compute_units;
    temp = 1;
    rhsx_lws[0] = temp == 0 ? 1 : temp;
    rhsx_gws[0] = clu_RoundWorkSize((size_t)(nz-2), rhsx_lws[0]);
  }

  k_rhsy = clCreateKernel(p_main, "rhsy", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rhsy");
  ecode  = clSetKernelArg(k_rhsy, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_rhsy, 1, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_rhsy, 2, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_rhsy, 3, sizeof(cl_mem), &m_rho_i);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_rhsy, 4, sizeof(cl_mem), &m_flux);
    ecode |= clSetKernelArg(k_rhsy, 5, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsy, 6, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsy, 7, sizeof(int), &nz);
  } else {
    ecode |= clSetKernelArg(k_rhsy, 4, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsy, 5, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsy, 6, sizeof(int), &nz);
  }
  clu_CheckError(ecode, "clSetKernelArg()");
  if (RHSY_DIM == 2) {
    rhsy_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0];
    temp = max_work_group_size / rhsy_lws[0];
    rhsy_lws[1] = (nz-2) < temp ? (nz-2) : temp;
    rhsy_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), rhsy_lws[0]);
    rhsy_gws[1] = clu_RoundWorkSize((size_t)(nz-2), rhsy_lws[1]);
  } else {
    //temp = (nz-2) / max_compute_units;
    temp = 1;
    rhsy_lws[0] = temp == 0 ? 1 : temp;
    rhsy_gws[0] = clu_RoundWorkSize((size_t)(nz-2), rhsy_lws[0]);
  }

  k_rhsz = clCreateKernel(p_main, "rhsz", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rhsz");
  ecode  = clSetKernelArg(k_rhsz, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_rhsz, 1, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_rhsz, 2, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_rhsz, 3, sizeof(cl_mem), &m_rho_i);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_rhsz, 4, sizeof(cl_mem), &m_flux);
    ecode |= clSetKernelArg(k_rhsz, 5, sizeof(cl_mem), &m_utmp);
    ecode |= clSetKernelArg(k_rhsz, 6, sizeof(cl_mem), &m_rtmp);
    ecode |= clSetKernelArg(k_rhsz, 7, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsz, 8, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsz, 9, sizeof(int), &nz);
  } else {
    ecode |= clSetKernelArg(k_rhsz, 4, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsz, 5, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsz, 6, sizeof(int), &nz);
  }
  clu_CheckError(ecode, "clSetKernelArg()");
  if (RHSZ_DIM == 2) {
    rhsz_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0];
    temp = max_work_group_size / rhsz_lws[0];
    rhsz_lws[1] = (jend-jst) < temp ? (jend-jst) : temp;
    rhsz_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), rhsz_lws[0]);
    rhsz_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), rhsz_lws[1]);
  } else {
    //temp = (jend-jst) / max_compute_units;
    temp = 1;
    rhsz_lws[0] = temp == 0 ? 1 : temp;
    rhsz_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), rhsz_lws[0]);
  }

  k_ssor2 = clCreateKernel(p_main, "ssor2", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for ssor2");
  ecode  = clSetKernelArg(k_ssor2, 0, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_ssor2, 2, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_ssor2, 3, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_ssor2, 4, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SSOR2_DIM == 3) {
    ssor2_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0];
    temp = max_work_group_size / ssor2_lws[0];
    ssor2_lws[1] = (jend-jst) < temp ? (jend-jst) : temp;
    temp = temp / ssor2_lws[1];
    ssor2_lws[2] = (nz-2) < temp ? (nz-2) : temp;
    ssor2_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), ssor2_lws[0]);
    ssor2_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), ssor2_lws[1]);
    ssor2_gws[2] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[2]);
  } else if (SSOR2_DIM == 2) {
    ssor2_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
    temp = max_work_group_size / ssor2_lws[0];
    ssor2_lws[1] = (nz-2) < temp ? (nz-2) : temp;
    ssor2_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), ssor2_lws[0]);
    ssor2_gws[1] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[1]);
  } else {
    //temp = (nz-2) / max_compute_units;
    temp = 1;
    ssor2_lws[0] = temp == 0 ? 1 : temp;
    ssor2_gws[0] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[0]);
  }

  k_ssor3 = clCreateKernel(p_main, "ssor3", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for ssor3");
  ecode  = clSetKernelArg(k_ssor3, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_ssor3, 1, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_ssor3, 3, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_ssor3, 4, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_ssor3, 5, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SSOR3_DIM == 3) {
    ssor3_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0];
    temp = max_work_group_size / ssor3_lws[0];
    ssor3_lws[1] = (jend-jst) < temp ? (jend-jst) : temp;
    temp = temp / ssor3_lws[1];
    ssor3_lws[2] = (nz-2) < temp ? (nz-2) : temp;
    ssor3_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), ssor3_lws[0]);
    ssor3_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), ssor3_lws[1]);
    ssor3_gws[2] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[2]);
  } else if (SSOR3_DIM == 2) {
    ssor3_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
    temp = max_work_group_size / ssor3_lws[0];
    ssor3_lws[1] = (nz-2) < temp ? (nz-2) : temp;
    ssor3_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), ssor3_lws[0]);
    ssor3_gws[1] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[1]);
  } else {
    //temp = (nz-2) / max_compute_units;
    temp = 1;
    ssor3_lws[0] = temp == 0 ? 1 : temp;
    ssor3_gws[0] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[0]);
  }

  k_blts = clCreateKernel(p_main, "blts", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for blts");
  ecode  = clSetKernelArg(k_blts, 0, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_blts, 1, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_blts, 2, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_blts, 3, sizeof(cl_mem), &m_rho_i);
  ecode |= clSetKernelArg(k_blts, 4, sizeof(int), &nz);
  ecode |= clSetKernelArg(k_blts, 5, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_blts, 6, sizeof(int), &nx);
  clu_CheckError(ecode, "clSetKernelArg()");
  blts_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
  temp = max_work_group_size / blts_lws[0];
  blts_lws[1] = (nz-2) < temp ? (nz-2) : temp;
  blts_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), blts_lws[0]);
  blts_gws[1] = clu_RoundWorkSize((size_t)(nz-2), blts_lws[1]);

  k_buts = clCreateKernel(p_main, "buts", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for buts");
  ecode  = clSetKernelArg(k_buts, 0, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_buts, 1, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_buts, 2, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_buts, 3, sizeof(cl_mem), &m_rho_i);
  ecode |= clSetKernelArg(k_buts, 4, sizeof(int), &nz);
  ecode |= clSetKernelArg(k_buts, 5, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_buts, 6, sizeof(int), &nx);
  clu_CheckError(ecode, "clSetKernelArg()");
  buts_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
  temp = max_work_group_size / buts_lws[0];
  buts_lws[1] = (nz-2) < temp ? (nz-2) : temp;
  buts_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), buts_lws[0]);
  buts_gws[1] = clu_RoundWorkSize((size_t)(nz-2), buts_lws[1]);

  if (timeron) timer_stop(TIMER_OPENCL);
}
Пример #5
0
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//
// In order: picks a device, creates a context and a command queue,
// builds the program, creates the three device buffers (pgq, pgsx, pgsy)
// and creates the "embar" kernel. Results are stored in variables
// declared outside this function (device, context, cmd_queue, program,
// kernel, the buffer handles and their sizes, GROUP_SIZE).
//
// argc, argv: the program's command-line arguments. When argc > 1,
//             argv[1] replaces the default source directory "EP" that is
//             handed to clu_MakeProgram().
//
// Every OpenCL status code produced here is passed to clu_CheckError()
// together with the name of the call that produced it.
//---------------------------------------------------------------------
void setup_opencl(int argc, char *argv[])
{
  cl_int err_code;
  char *source_dir = "EP";
  if (argc > 1) source_dir = argv[1];

#ifdef TIMER_DETAIL
  // Reset every detail timer in the range [T_OPENCL_API, T_END).
  if (timers_enabled) {
    int i;
    for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i);
  }
#endif

  DTIMER_START(T_OPENCL_API);

  // 1. Find the default device type and get a device for the device type
  device_type = clu_GetDefaultDeviceType();
  device      = clu_GetAvailableDevice(device_type);
  device_name = clu_GetDeviceName(device);

  // 2. Create a context for the specified device
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &err_code);
  clu_CheckError(err_code, "clCreateContext()");

  // 3. Create a command queue
  cmd_queue = clCreateCommandQueue(context, device, 0, &err_code);
  clu_CheckError(err_code, "clCreateCommandQueue()");

  DTIMER_STOP(T_OPENCL_API);

  // 4. Build the program
  DTIMER_START(T_BUILD);
  char *source_file;
  char build_option[30];
  // Bounded formatting: snprintf never writes past the end of
  // build_option and always NUL-terminates it, unlike sprintf.
  snprintf(build_option, sizeof(build_option), "-DM=%d -I.", M);
  // The kernel source file and GROUP_SIZE both depend on the device type.
  if (device_type == CL_DEVICE_TYPE_CPU) {
    source_file = "ep_cpu.cl";
    GROUP_SIZE = 16;
  } else {
    source_file = "ep_gpu.cl";
    GROUP_SIZE = 64;
  }
  program = clu_MakeProgram(context, device, source_dir, source_file,
                            build_option);
  DTIMER_STOP(T_BUILD);

  // 5. Create buffers
  DTIMER_START(T_BUFFER_CREATE);

  // Sizes are in bytes: (np / GROUP_SIZE) doubles for pgsx and pgsy, and
  // NQ times that many for pgq.
  // NOTE(review): presumably one slot per work-group -- confirm against
  // the kernel launch code.
  gq_size  = np / GROUP_SIZE * NQ * sizeof(double);
  gsx_size = np / GROUP_SIZE * sizeof(double);
  gsy_size = np / GROUP_SIZE * sizeof(double);

  pgq = clCreateBuffer(context, CL_MEM_READ_WRITE, gq_size, NULL, &err_code);
  clu_CheckError(err_code, "clCreateBuffer() for pgq");

  pgsx = clCreateBuffer(context, CL_MEM_READ_WRITE, gsx_size, NULL, &err_code);
  clu_CheckError(err_code, "clCreateBuffer() for pgsx");

  pgsy = clCreateBuffer(context, CL_MEM_READ_WRITE, gsy_size, NULL, &err_code);
  clu_CheckError(err_code, "clCreateBuffer() for pgsy");

  DTIMER_STOP(T_BUFFER_CREATE);

  // 6. Create a kernel
  DTIMER_START(T_OPENCL_API);
  kernel = clCreateKernel(program, "embar", &err_code);
  clu_CheckError(err_code, "clCreateKernel()");
  DTIMER_STOP(T_OPENCL_API);
}