//--------------------------------------------------------------------- // Set up the OpenCL environment. //--------------------------------------------------------------------- static void setup_opencl(int argc, char *argv[]) { size_t temp; cl_int ecode; char *source_dir = "FT"; if (argc > 1) source_dir = argv[1]; #ifdef TIMER_DETAIL if (timers_enabled) { int i; for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i); } #endif DTIMER_START(T_OPENCL_API); // 1. Find the default device type and get a device for the device type device_type = clu_GetDefaultDeviceType(); device = clu_GetAvailableDevice(device_type); device_name = clu_GetDeviceName(device); // Device information ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(work_item_sizes), &work_item_sizes, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group_size, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); // FIXME: The below values are experimental. if (max_work_group_size > 64) { max_work_group_size = 64; int i; for (i = 0; i < 3; i++) { if (work_item_sizes[i] > 64) { work_item_sizes[i] = 64; } } } ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); // 2. Create a context for the specified device context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode); clu_CheckError(ecode, "clCreateContext()"); // 3. Create a command queue cmd_queue = clCreateCommandQueue(context, device, 0, &ecode); clu_CheckError(ecode, "clCreateCommandQueue()"); DTIMER_STOP(T_OPENCL_API); // 4. Build the program DTIMER_START(T_BUILD); char *source_file; char build_option[50]; if (device_type == CL_DEVICE_TYPE_CPU) { source_file = "ft_cpu.cl"; sprintf(build_option, "-I. -DCLASS=%d -DUSE_CPU", CLASS); COMPUTE_IMAP_DIM = COMPUTE_IMAP_DIM_CPU; EVOLVE_DIM = EVOLVE_DIM_CPU; CFFTS_DIM = CFFTS_DIM_CPU; } else if (device_type == CL_DEVICE_TYPE_GPU) { char vendor[50]; ecode = clGetDeviceInfo(device, CL_DEVICE_VENDOR, 50, vendor, NULL); clu_CheckError(ecode, "clGetDeviceInfo()"); if (strncmp(vendor, DEV_VENDOR_NVIDIA, strlen(DEV_VENDOR_NVIDIA)) == 0) { source_file = "ft_gpu_nvidia.cl"; CFFTS_LSIZE = 32; } else { source_file = "ft_gpu.cl"; CFFTS_LSIZE = 64; } sprintf(build_option, "-I. -DCLASS=\'%c\' -DLSIZE=%lu", CLASS, CFFTS_LSIZE); COMPUTE_IMAP_DIM = COMPUTE_IMAP_DIM_GPU; EVOLVE_DIM = EVOLVE_DIM_GPU; CFFTS_DIM = CFFTS_DIM_GPU; } else { fprintf(stderr, "Set the environment variable OPENCL_DEVICE_TYPE!\n"); exit(EXIT_FAILURE); } program = clu_MakeProgram(context, device, source_dir, source_file, build_option); DTIMER_STOP(T_BUILD); // 5. Create buffers DTIMER_START(T_BUFFER_CREATE); m_u = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(dcomplex) * NXP, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_u"); m_u0 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(dcomplex) * NTOTALP, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_u0"); m_u1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(dcomplex) * NTOTALP, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_u1"); m_twiddle = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double) * NTOTALP, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_twiddle"); if (device_type == CL_DEVICE_TYPE_CPU) { size_t ty1_size, ty2_size; if (CFFTS_DIM == 2) { ty1_size = sizeof(dcomplex) * NX * NY * NZ; ty2_size = sizeof(dcomplex) * NX * NY * NZ; } else { fprintf(stderr, "Wrong CFFTS_DIM: %u\n", CFFTS_DIM); exit(EXIT_FAILURE); } m_ty1 = clCreateBuffer(context, CL_MEM_READ_WRITE, ty1_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_ty1"); m_ty2 = clCreateBuffer(context, CL_MEM_READ_WRITE, ty2_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_ty2"); } if (device_type == CL_DEVICE_TYPE_CPU) { temp = 1024 / max_compute_units; checksum_local_ws = temp == 0 ? 1 : temp; checksum_global_ws = clu_RoundWorkSize((size_t)1024, checksum_local_ws); } else if (device_type == CL_DEVICE_TYPE_GPU) { checksum_local_ws = 32; checksum_global_ws = clu_RoundWorkSize((size_t)1024, checksum_local_ws); } checksum_wg_num = checksum_global_ws / checksum_local_ws; m_chk = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(dcomplex) * checksum_wg_num, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_chk"); g_chk = (dcomplex *)malloc(sizeof(dcomplex) * checksum_wg_num); DTIMER_STOP(T_BUFFER_CREATE); // 6. Create kernels DTIMER_START(T_OPENCL_API); double ap = -4.0 * ALPHA * PI * PI; int d1 = dims[0]; int d2 = dims[1]; int d3 = dims[2]; k_compute_indexmap = clCreateKernel(program, "compute_indexmap", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_indexmap"); ecode = clSetKernelArg(k_compute_indexmap, 0, sizeof(cl_mem), &m_twiddle); ecode |= clSetKernelArg(k_compute_indexmap, 1, sizeof(int), &d1); ecode |= clSetKernelArg(k_compute_indexmap, 2, sizeof(int), &d2); ecode |= clSetKernelArg(k_compute_indexmap, 3, sizeof(int), &d3); ecode |= clSetKernelArg(k_compute_indexmap, 4, sizeof(double), &ap); clu_CheckError(ecode, "clSetKernelArg() for compute_indexmap"); if (COMPUTE_IMAP_DIM == 3) { cimap_lws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0]; temp = max_work_group_size / cimap_lws[0]; cimap_lws[1] = d2 < temp ? d2 : temp; temp = temp / cimap_lws[1]; cimap_lws[2] = d3 < temp ? d3 : temp; cimap_gws[0] = clu_RoundWorkSize((size_t)d1, cimap_lws[0]); cimap_gws[1] = clu_RoundWorkSize((size_t)d2, cimap_lws[1]); cimap_gws[2] = clu_RoundWorkSize((size_t)d3, cimap_lws[2]); } else if (COMPUTE_IMAP_DIM == 2) { cimap_lws[0] = d2 < work_item_sizes[0] ? d2 : work_item_sizes[0]; temp = max_work_group_size / cimap_lws[0]; cimap_lws[1] = d3 < temp ? d3 : temp; cimap_gws[0] = clu_RoundWorkSize((size_t)d2, cimap_lws[0]); cimap_gws[1] = clu_RoundWorkSize((size_t)d3, cimap_lws[1]); } else { //temp = d3 / max_compute_units; temp = 1; cimap_lws[0] = temp == 0 ? 1 : temp; cimap_gws[0] = clu_RoundWorkSize((size_t)d3, cimap_lws[0]); } k_compute_ics = clCreateKernel(program, "compute_initial_conditions", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_initial_conditions"); ecode = clSetKernelArg(k_compute_ics, 2, sizeof(int), &d1); ecode |= clSetKernelArg(k_compute_ics, 3, sizeof(int), &d2); ecode |= clSetKernelArg(k_compute_ics, 4, sizeof(int), &d3); clu_CheckError(ecode, "clSetKernelArg() for compute_initial_conditions"); k_cffts1 = clCreateKernel(program, "cffts1", &ecode); clu_CheckError(ecode, "clCreateKernel() for cffts1"); ecode = clSetKernelArg(k_cffts1, 2, sizeof(cl_mem), &m_u); if (device_type == CL_DEVICE_TYPE_CPU) { ecode |= clSetKernelArg(k_cffts1, 8, sizeof(cl_mem), &m_ty1); ecode |= clSetKernelArg(k_cffts1, 9, sizeof(cl_mem), &m_ty2); } clu_CheckError(ecode, "clSetKernelArg() for k_cffts1"); k_cffts2 = clCreateKernel(program, "cffts2", &ecode); clu_CheckError(ecode, "clCreateKernel() for cffts2"); ecode = clSetKernelArg(k_cffts2, 2, sizeof(cl_mem), &m_u); if (device_type == CL_DEVICE_TYPE_CPU) { ecode |= clSetKernelArg(k_cffts2, 8, sizeof(cl_mem), &m_ty1); ecode |= clSetKernelArg(k_cffts2, 9, sizeof(cl_mem), &m_ty2); } clu_CheckError(ecode, "clSetKernelArg() for k_cffts2"); k_cffts3 = clCreateKernel(program, "cffts3", &ecode); clu_CheckError(ecode, "clCreateKernel() for cffts3"); ecode = clSetKernelArg(k_cffts3, 2, sizeof(cl_mem), &m_u); if (device_type == CL_DEVICE_TYPE_CPU) { ecode |= clSetKernelArg(k_cffts3, 8, sizeof(cl_mem), &m_ty1); ecode |= clSetKernelArg(k_cffts3, 9, sizeof(cl_mem), &m_ty2); } clu_CheckError(ecode, "clSetKernelArg() for k_cffts3"); k_evolve = clCreateKernel(program, "evolve", &ecode); clu_CheckError(ecode, "clCreateKernel() for evolve"); k_checksum = clCreateKernel(program, "checksum", &ecode); clu_CheckError(ecode, "clCreateKernel() for checksum"); ecode = clSetKernelArg(k_checksum, 1, sizeof(cl_mem), &m_chk); ecode |= clSetKernelArg(k_checksum, 2, sizeof(dcomplex)*checksum_local_ws, NULL); ecode |= clSetKernelArg(k_checksum, 3, sizeof(int), &dims[0]); ecode |= clSetKernelArg(k_checksum, 4, sizeof(int), &dims[1]); clu_CheckError(ecode, "clSetKernelArg() for checksum"); DTIMER_STOP(T_OPENCL_API); }
//--------------------------------------------------------------------- // Set up the OpenCL environment. //--------------------------------------------------------------------- static void setup_opencl(int argc, char **argv) { int i, c; // size_t temp; cl_int ecode = 0; char *source_dir = "."; //FIXME int num_subs = DEFAULT_NUM_SUBS; int num_cus; int sqrt_num_command_queues; if (argc > 1) source_dir = argv[1]; devices = (cl_device_id *)malloc(sizeof(cl_device_id) * num_subs); if (timeron) { timer_clear(TIMER_OPENCL); timer_clear(TIMER_BUILD); timer_clear(TIMER_BUFFER); timer_clear(TIMER_RELEASE); timer_start(TIMER_OPENCL); } // 1. Find the default device type and get a device for the device type // Then, create sub-devices from the parent device. //device_type = CL_DEVICE_TYPE_CPU; device_type = CL_DEVICE_TYPE_ALL; //device_type = CL_DEVICE_TYPE_GPU; if(argc <= 2) { printf("Device type argument missing!\n"); exit(-1); } char *device_type_str = argv[2]; if(strcmp(device_type_str, "CPU") == 0 || strcmp(device_type_str, "cpu") == 0) { device_type = CL_DEVICE_TYPE_CPU; } else if(strcmp(device_type_str, "GPU") == 0 || strcmp(device_type_str, "gpu") == 0) { device_type = CL_DEVICE_TYPE_GPU; } else if(strcmp(device_type_str, "ALL") == 0 || strcmp(device_type_str, "all") == 0) { device_type = CL_DEVICE_TYPE_ALL; } else { printf("Unsupported device type!\n"); exit(-1); } cl_uint num_command_queues = 4; char *num_command_queues_str = getenv("SNU_NPB_COMMAND_QUEUES"); if(num_command_queues_str != NULL) num_command_queues = atoi(num_command_queues_str); cl_platform_id platform; ecode = clGetPlatformIDs(1, &platform, NULL); clu_CheckError(ecode, "clGetPlatformIDs()"); ecode = clGetDeviceIDs(platform, device_type, 0, NULL, &num_devices); clu_CheckError(ecode, "clGetDeviceIDs()"); //num_devices = 2; ecode = clGetDeviceIDs(platform, device_type, num_devices, devices, NULL); clu_CheckError(ecode, "clGetDeviceIDs()"); cl_device_id tmp_dev; work_item_sizes[0] = work_item_sizes[1] = work_item_sizes[2] = 1024; max_work_group_size = 1024; max_compute_units = 22; sqrt_num_command_queues = (int)(sqrt((double)(num_command_queues) + 0.00001)); if (num_command_queues != sqrt_num_command_queues * sqrt_num_command_queues) { fprintf(stderr, "Number of devices is not a square of some integer\n"); exit(EXIT_FAILURE); } ncells = (int)(sqrt((double)(num_command_queues) + 0.00001)); MAX_CELL_DIM = ((PROBLEM_SIZE/ncells)+1); IMAX = MAX_CELL_DIM; JMAX = MAX_CELL_DIM; KMAX = MAX_CELL_DIM; IMAXP = (IMAX/2*2+1); JMAXP = (JMAX/2*2+1); //--------------------------------------------------------------------- // +1 at end to avoid zero length arrays for 1 node //--------------------------------------------------------------------- BUF_SIZE = (MAX_CELL_DIM*MAX_CELL_DIM*(MAXCELLS-1)*60*2+1); // FIXME if (max_work_group_size > 64) { max_work_group_size = 64; int i; for (i = 0; i < 3; i++) { if (work_item_sizes[i] > 64) { work_item_sizes[i] = 64; } } } // 2. Create a context for devices #ifdef MINIMD_SNUCL_OPTIMIZATIONS cl_context_properties props[5] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, CL_CONTEXT_SCHEDULER, CL_CONTEXT_SCHEDULER_CODE_SEGMENTED_PERF_MODEL, //CL_CONTEXT_SCHEDULER_PERF_MODEL, //CL_CONTEXT_SCHEDULER_FIRST_EPOCH_BASED_PERF_MODEL, //CL_CONTEXT_SCHEDULER_ALL_EPOCH_BASED_PERF_MODEL, 0 }; context = clCreateContext(props, #elif defined(SOCL_OPTIMIZATIONS) cl_context_properties props[5] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, CL_CONTEXT_SCHEDULER_SOCL, "dmda", //"random", 0 }; context = clCreateContext(props, #else context = clCreateContext(NULL, #endif num_devices, devices, NULL, NULL, &ecode); clu_CheckError(ecode, "clCreateContext()"); // 3. Create a command queue cmd_queue = (cl_command_queue*)malloc(sizeof(cl_command_queue)*num_command_queues*3); for (i = 0; i < num_command_queues * 2; i++) { //cmd_queue[i] = clCreateCommandQueue(context, devices[(i / 2) % num_devices], #ifdef SOCL_OPTIMIZATIONS cmd_queue[i] = clCreateCommandQueue(context, NULL, #else cmd_queue[i] = clCreateCommandQueue(context, devices[num_devices - 1 - ((i / 2) % num_devices)], #endif // cmd_queue[i] = clCreateCommandQueue(context, devices[0], #ifdef MINIMD_SNUCL_OPTIMIZATIONS 0, // CL_QUEUE_AUTO_DEVICE_SELECTION | // CL_QUEUE_ITERATIVE, //CL_QUEUE_COMPUTE_INTENSIVE, #else 0, #endif &ecode); clu_CheckError(ecode, "clCreateCommandQueue()"); } // 4. Build the program if (timeron) timer_start(TIMER_BUILD); char *source_file = "sp_kernel.cl"; //p_program = clu_MakeProgram(context, devices, source_dir, source_file, build_option); p_program = clu_CreateProgram(context, source_dir, source_file); for(i = 0; i < num_devices; i++) { char build_option[200] = {0}; cl_device_type cur_device_type; cl_int err = clGetDeviceInfo(devices[i], CL_DEVICE_TYPE, sizeof(cl_device_type), &cur_device_type, NULL); clu_CheckError(err, "clGetDeviceInfo()"); if (cur_device_type == CL_DEVICE_TYPE_CPU) { sprintf(build_option, "-I. -DCLASS=%d -DUSE_CPU -DMAX_CELL_DIM=%d -DIMAX=%d -DJMAX=%d -DKMAX=%d -DIMAXP=%d -DJMAXP=%d", CLASS, MAX_CELL_DIM, IMAX, JMAX, KMAX, IMAXP, JMAXP); } else { sprintf(build_option, "-I. -DCLASS=%d -DUSE_GPU -DMAX_CELL_DIM=%d -DIMAX=%d -DJMAX=%d -DKMAX=%d -DIMAXP=%d -DJMAXP=%d", CLASS, MAX_CELL_DIM, IMAX, JMAX, KMAX, IMAXP, JMAXP); } clu_MakeProgram(p_program, 1, &devices[i], source_dir, build_option); //clu_MakeProgram(p_program, num_devices, devices, source_dir, build_option); } num_devices = num_command_queues; program = (cl_program *)malloc(sizeof(cl_program) * num_devices); for (i = 0; i < num_devices; i++) { program[i] = p_program; } if (timeron) timer_stop(TIMER_BUILD); // 5. Create kernels size_t asize = sizeof(cl_kernel) * num_devices; k_initialize1 = (cl_kernel *)malloc(asize); k_initialize2 = (cl_kernel *)malloc(asize); k_initialize3 = (cl_kernel *)malloc(asize); k_initialize4 = (cl_kernel *)malloc(asize); k_initialize5 = (cl_kernel *)malloc(asize); k_initialize6 = (cl_kernel *)malloc(asize); k_initialize7 = (cl_kernel *)malloc(asize); k_initialize8 = (cl_kernel *)malloc(asize); k_lhsinit = (cl_kernel *)malloc(asize); k_exact_rhs1 = (cl_kernel *)malloc(asize); k_exact_rhs2 = (cl_kernel *)malloc(asize); k_exact_rhs3 = (cl_kernel *)malloc(asize); k_exact_rhs4 = (cl_kernel *)malloc(asize); k_exact_rhs5 = (cl_kernel *)malloc(asize); k_copy_faces1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_copy_faces2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_copy_faces3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_copy_faces4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_copy_faces5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_copy_faces6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_compute_rhs1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_compute_rhs2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_compute_rhs3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_compute_rhs4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_compute_rhs5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_compute_rhs6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_txinvr = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_lhsx = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_ninvr = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_x_solve1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_x_solve2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_x_solve3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_x_solve4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_x_solve5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_x_solve6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_lhsy = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_pinvr = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_y_solve1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_y_solve2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_y_solve3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_y_solve4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_y_solve5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_y_solve6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_lhsz = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_tzetar = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_z_solve1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_z_solve2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_z_solve3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_z_solve4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_z_solve5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_z_solve6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_add = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_error_norm = (cl_kernel *)malloc(asize); k_rhs_norm = (cl_kernel *)malloc(asize); for (i = 0; i < num_devices; i++) { k_initialize1[i] = clCreateKernel(program[i], "initialize1", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize1"); k_initialize2[i] = clCreateKernel(program[i], "initialize2", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize2"); k_initialize3[i] = clCreateKernel(program[i], "initialize3", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize3"); k_initialize4[i] = clCreateKernel(program[i], "initialize4", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize4"); k_initialize5[i] = clCreateKernel(program[i], "initialize5", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize5"); k_initialize6[i] = clCreateKernel(program[i], "initialize6", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize6"); k_initialize7[i] = clCreateKernel(program[i], "initialize7", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize7"); k_initialize8[i] = clCreateKernel(program[i], "initialize8", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize8"); k_lhsinit[i] = clCreateKernel(program[i], "lhsinit", &ecode); clu_CheckError(ecode, "clCreateKernel() for lhsinit"); k_exact_rhs1[i] = clCreateKernel(program[i], "exact_rhs1", &ecode); clu_CheckError(ecode, "clCreateKernel() for exact_rhs1"); k_exact_rhs2[i] = clCreateKernel(program[i], "exact_rhs2", &ecode); clu_CheckError(ecode, "clCreateKernel() for exact_rhs2"); k_exact_rhs3[i] = clCreateKernel(program[i], "exact_rhs3", &ecode); clu_CheckError(ecode, "clCreateKernel() for exact_rhs3"); k_exact_rhs4[i] = clCreateKernel(program[i], "exact_rhs4", &ecode); clu_CheckError(ecode, "clCreateKernel() for exact_rhs4"); k_exact_rhs5[i] = clCreateKernel(program[i], "exact_rhs5", &ecode); clu_CheckError(ecode, "clCreateKernel() for exact_rhs5"); for (c = 0; c < MAXCELLS; c++) { k_copy_faces1[i][c] = clCreateKernel(program[i], "copy_faces1", &ecode); clu_CheckError(ecode, "clCreateKernel() for copy_faces1"); k_copy_faces2[i][c] = clCreateKernel(program[i], "copy_faces2", &ecode); clu_CheckError(ecode, "clCreateKernel() for copy_faces2"); k_copy_faces3[i][c] = clCreateKernel(program[i], "copy_faces3", &ecode); clu_CheckError(ecode, "clCreateKernel() for copy_faces3"); k_copy_faces4[i][c] = clCreateKernel(program[i], "copy_faces4", &ecode); clu_CheckError(ecode, "clCreateKernel() for copy_faces4"); k_copy_faces5[i][c] = clCreateKernel(program[i], "copy_faces5", &ecode); clu_CheckError(ecode, "clCreateKernel() for copy_faces5"); k_copy_faces6[i][c] = clCreateKernel(program[i], "copy_faces6", &ecode); clu_CheckError(ecode, "clCreateKernel() for copy_faces6"); k_compute_rhs1[i][c] = clCreateKernel(program[i], "compute_rhs1", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_rhs1"); k_compute_rhs2[i][c] = clCreateKernel(program[i], "compute_rhs2", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_rhs2"); k_compute_rhs3[i][c] = clCreateKernel(program[i], "compute_rhs3", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_rhs3"); k_compute_rhs4[i][c] = clCreateKernel(program[i], "compute_rhs4", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_rhs4"); k_compute_rhs5[i][c] = clCreateKernel(program[i], "compute_rhs5", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_rhs5"); k_compute_rhs6[i][c] = clCreateKernel(program[i], "compute_rhs6", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_rhs6"); k_txinvr[i][c] = clCreateKernel(program[i], "txinvr", &ecode); clu_CheckError(ecode, "clCreateKernel() for txinvr"); k_lhsx[i][c] = clCreateKernel(program[i], "lhsx", &ecode); clu_CheckError(ecode, "clCreateKernel() for lhsx"); k_ninvr[i][c] = clCreateKernel(program[i], "ninvr", &ecode); clu_CheckError(ecode, "clCreateKernel() for ninvr"); k_x_solve1[i][c] = clCreateKernel(program[i], "x_solve1", &ecode); clu_CheckError(ecode, "clCreateKernel() for x_solve1"); k_x_solve2[i][c] = clCreateKernel(program[i], "x_solve2", &ecode); clu_CheckError(ecode, "clCreateKernel() for x_solve2"); k_x_solve3[i][c] = clCreateKernel(program[i], "x_solve3", &ecode); clu_CheckError(ecode, "clCreateKernel() for x_solve3"); k_x_solve4[i][c] = clCreateKernel(program[i], "x_solve4", &ecode); clu_CheckError(ecode, "clCreateKernel() for x_solve4"); k_x_solve5[i][c] = clCreateKernel(program[i], "x_solve5", &ecode); clu_CheckError(ecode, "clCreateKernel() for x_solve5"); k_x_solve6[i][c] = clCreateKernel(program[i], "x_solve6", &ecode); clu_CheckError(ecode, "clCreateKernel() for x_solve6"); k_lhsy[i][c] = clCreateKernel(program[i], "lhsy", &ecode); clu_CheckError(ecode, "clCreateKernel() for lhsy"); k_pinvr[i][c] = clCreateKernel(program[i], "pinvr", &ecode); clu_CheckError(ecode, "clCreateKernel() for pinvr"); k_y_solve1[i][c] = clCreateKernel(program[i], "y_solve1", &ecode); clu_CheckError(ecode, "clCreateKernel() for y_solve1"); k_y_solve2[i][c] = clCreateKernel(program[i], "y_solve2", &ecode); clu_CheckError(ecode, "clCreateKernel() for y_solve2"); k_y_solve3[i][c] = clCreateKernel(program[i], "y_solve3", &ecode); clu_CheckError(ecode, "clCreateKernel() for y_solve3"); k_y_solve4[i][c] = clCreateKernel(program[i], "y_solve4", &ecode); clu_CheckError(ecode, "clCreateKernel() for y_solve4"); k_y_solve5[i][c] = clCreateKernel(program[i], "y_solve5", &ecode); clu_CheckError(ecode, "clCreateKernel() for y_solve5"); k_y_solve6[i][c] = clCreateKernel(program[i], "y_solve6", &ecode); clu_CheckError(ecode, "clCreateKernel() for y_solve6"); k_lhsz[i][c] = clCreateKernel(program[i], "lhsz", &ecode); clu_CheckError(ecode, "clCreateKernel() for lhsz"); k_tzetar[i][c] = clCreateKernel(program[i], "tzetar", &ecode); clu_CheckError(ecode, "clCreateKernel() for tzetar"); k_z_solve1[i][c] = clCreateKernel(program[i], "z_solve1", &ecode); clu_CheckError(ecode, "clCreateKernel() for z_solve1"); k_z_solve2[i][c] = clCreateKernel(program[i], "z_solve2", &ecode); clu_CheckError(ecode, "clCreateKernel() for z_solve2"); k_z_solve3[i][c] = clCreateKernel(program[i], "z_solve3", &ecode); clu_CheckError(ecode, "clCreateKernel() for z_solve3"); k_z_solve4[i][c] = clCreateKernel(program[i], "z_solve4", &ecode); clu_CheckError(ecode, "clCreateKernel() for z_solve4"); k_z_solve5[i][c] = clCreateKernel(program[i], "z_solve5", &ecode); clu_CheckError(ecode, "clCreateKernel() for z_solve5"); k_z_solve6[i][c] = clCreateKernel(program[i], "z_solve6", &ecode); clu_CheckError(ecode, "clCreateKernel() for z_solve6"); k_add[i][c] = clCreateKernel(program[i], "add", &ecode); clu_CheckError(ecode, "clCreateKernel() for add"); } k_error_norm[i] = clCreateKernel(program[i], "error_norm", &ecode); clu_CheckError(ecode, "clCreateKernel() for error_norm"); k_rhs_norm[i] = clCreateKernel(program[i], "rhs_norm", &ecode); clu_CheckError(ecode, "clCreateKernel() for rhs_norm"); } // 6. Create buffers if (timeron) timer_start(TIMER_BUFFER); asize = sizeof(cl_mem) * num_devices; m_u = (cl_mem *)malloc(asize); m_us = (cl_mem *)malloc(asize); m_vs = (cl_mem *)malloc(asize); m_ws = (cl_mem *)malloc(asize); m_qs = (cl_mem *)malloc(asize); m_ainv = (cl_mem *)malloc(asize); m_rho_i = (cl_mem *)malloc(asize); m_speed = (cl_mem *)malloc(asize); m_square = (cl_mem *)malloc(asize); m_rhs = (cl_mem *)malloc(asize); m_forcing = (cl_mem *)malloc(asize); m_lhs = (cl_mem *)malloc(asize); m_in_buffer = (cl_mem *)malloc(asize); m_out_buffer = (cl_mem *)malloc(asize); m_ce = (cl_mem *)malloc(asize); for (i = 0; i < num_devices; i++) { m_u[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+4)*(JMAXP+4)*(IMAXP+4)*5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_u"); m_us[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_us"); m_vs[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_vs"); m_ws[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_ws"); m_qs[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_qs"); m_ainv[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_ainv"); m_rho_i[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_rho_i"); m_speed[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_speed"); m_square[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_square"); m_rhs[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*KMAX*JMAXP*IMAXP*5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_rhs"); m_forcing[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*KMAX*JMAXP*IMAXP*5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_forcing"); m_lhs[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*KMAX*JMAXP*IMAXP*15, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_lhs"); m_in_buffer[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*BUF_SIZE, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_in_buffer"); m_out_buffer[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*BUF_SIZE, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_out_buffer"); m_ce[i] = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(double)*5*13, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_ce"); } if (timeron) timer_stop(TIMER_BUFFER); if (timeron) timer_stop(TIMER_OPENCL); }
//--------------------------------------------------------------------- // Set up the OpenCL environment. //--------------------------------------------------------------------- static void setup_opencl(int argc, char *argv[]) { cl_int ecode; char *source_dir = "IS"; if (argc > 1) source_dir = argv[1]; #ifdef TIMER_DETAIL if (timer_on) { int i; for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i); } #endif DTIMER_START(T_OPENCL_API); // 1. Find the default device type and get a device for the device type device_type = clu_GetDefaultDeviceType(); device = clu_GetAvailableDevice(device_type); device_name = clu_GetDeviceName(device); // Device information ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(work_item_sizes), &work_item_sizes, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group_size, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); // FIXME: The below values are experimental. if (max_work_group_size > 256) { max_work_group_size = 256; int i; for (i = 0; i < 3; i++) { if (work_item_sizes[i] > 256) { work_item_sizes[i] = 256; } } } // 2. Create a context for the specified device context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode); clu_CheckError(ecode, "clCreateContext()"); // 3. Create a command queue cmd_queue = clCreateCommandQueue(context, device, 0, &ecode); clu_CheckError(ecode, "clCreateCommandQueue()"); DTIMER_STOP(T_OPENCL_API); // 4. Build the program DTIMER_START(T_BUILD); char *source_file; char build_option[30]; if (device_type == CL_DEVICE_TYPE_CPU) { source_file = "is_cpu.cl"; sprintf(build_option, "-DCLASS=%d -I.", CLASS); CREATE_SEQ_GROUP_SIZE = 64; CREATE_SEQ_GLOBAL_SIZE = CREATE_SEQ_GROUP_SIZE * 256; RANK_GROUP_SIZE = 1; RANK_GLOBAL_SIZE = RANK_GROUP_SIZE * 128; RANK1_GROUP_SIZE = 1; RANK1_GLOBAL_SIZE = RANK1_GROUP_SIZE * RANK_GLOBAL_SIZE;; RANK2_GROUP_SIZE = RANK_GROUP_SIZE; RANK2_GLOBAL_SIZE = RANK_GLOBAL_SIZE;; FV2_GROUP_SIZE = 64; FV2_GLOBAL_SIZE = FV2_GROUP_SIZE * 256; } else if (device_type == CL_DEVICE_TYPE_GPU) { source_file = "is_gpu.cl"; sprintf(build_option, "-DCLASS=\'%c\' -I.", CLASS); CREATE_SEQ_GROUP_SIZE = 64; CREATE_SEQ_GLOBAL_SIZE = CREATE_SEQ_GROUP_SIZE * 256; RANK1_GROUP_SIZE = work_item_sizes[0]; RANK1_GLOBAL_SIZE = MAX_KEY; RANK2_GROUP_SIZE = work_item_sizes[0]; RANK2_GLOBAL_SIZE = NUM_KEYS; FV2_GROUP_SIZE = work_item_sizes[0]; FV2_GLOBAL_SIZE = NUM_KEYS; } else { fprintf(stderr, "%s: not supported.", clu_GetDeviceTypeName(device_type)); exit(EXIT_FAILURE); } program = clu_MakeProgram(context, device, source_dir, source_file, build_option); DTIMER_STOP(T_BUILD); // 5. Create buffers DTIMER_START(T_BUFFER_CREATE); m_key_array = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(INT_TYPE) * SIZE_OF_BUFFERS, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_key_array"); m_key_buff1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(INT_TYPE) * MAX_KEY, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_key_buff1"); m_key_buff2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(INT_TYPE) * SIZE_OF_BUFFERS, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_key_buff2"); size_t test_array_size = sizeof(INT_TYPE) * TEST_ARRAY_SIZE; m_index_array = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, test_array_size, test_index_array, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_index_array"); m_rank_array = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, test_array_size, test_rank_array, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_rank_array"); m_partial_vals = clCreateBuffer(context, CL_MEM_WRITE_ONLY, test_array_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_partial_vals"); m_passed_verification = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_passed_verification"); if (device_type == CL_DEVICE_TYPE_GPU) { m_key_scan = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(INT_TYPE) * MAX_KEY, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_key_buff1_scan"); m_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(INT_TYPE) * work_item_sizes[0], NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_sum"); } else { size_t bs_size = RANK_GLOBAL_SIZE * sizeof(INT_TYPE) * NUM_BUCKETS; m_bucket_size = clCreateBuffer(context, CL_MEM_READ_WRITE, bs_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_bucket_size"); m_bucket_ptrs = clCreateBuffer(context, CL_MEM_READ_WRITE, bs_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_bucket_ptrs"); } DTIMER_STOP(T_BUFFER_CREATE); // 6. Create kernels DTIMER_START(T_OPENCL_API); k_rank0 = clCreateKernel(program, "rank0", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank0"); ecode = clSetKernelArg(k_rank0, 0, sizeof(cl_mem), (void*)&m_key_array); ecode |= clSetKernelArg(k_rank0, 1, sizeof(cl_mem), (void*)&m_partial_vals); ecode |= clSetKernelArg(k_rank0, 2, sizeof(cl_mem), (void*)&m_index_array); clu_CheckError(ecode, "clSetKernelArg() for rank0"); if (device_type == CL_DEVICE_TYPE_GPU) { k_rank1 = clCreateKernel(program, "rank1", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank1"); ecode = clSetKernelArg(k_rank1, 0, sizeof(cl_mem), (void*)&m_key_buff1); clu_CheckError(ecode, "clSetKernelArg() for rank1"); k_rank2 = clCreateKernel(program, "rank2", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank2"); ecode = clSetKernelArg(k_rank2, 0, sizeof(cl_mem), (void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank2, 1, sizeof(cl_mem), (void*)&m_key_array); clu_CheckError(ecode, "clSetKernelArg() for rank2"); k_rank3_0 = clCreateKernel(program, "rank3_0", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank3_0"); ecode = clSetKernelArg(k_rank3_0, 0, sizeof(cl_mem),(void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank3_0, 1, sizeof(cl_mem),(void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank3_0, 2, sizeof(cl_mem),(void*)&m_sum); ecode |= clSetKernelArg(k_rank3_0, 3, sizeof(INT_TYPE) * work_item_sizes[0] * 2, NULL); clu_CheckError(ecode, "clSetKernelArg() for rank3_0"); k_rank3_1 = clCreateKernel(program, "rank3_1", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank3_1"); ecode = clSetKernelArg(k_rank3_1, 0, sizeof(cl_mem), (void*)&m_sum); ecode = clSetKernelArg(k_rank3_1, 1, sizeof(cl_mem), (void*)&m_sum); ecode |= clSetKernelArg(k_rank3_1, 2, sizeof(INT_TYPE) * work_item_sizes[0] * 2, NULL); clu_CheckError(ecode, "clSetKernelArg() for rank3_1"); k_rank3_2 = clCreateKernel(program, "rank3_2", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank3_2"); ecode = clSetKernelArg(k_rank3_2, 0, sizeof(cl_mem),(void*)&m_key_buff1); ecode = clSetKernelArg(k_rank3_2, 1, sizeof(cl_mem),(void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank3_2, 2, sizeof(cl_mem),(void*)&m_sum); clu_CheckError(ecode, "clSetKernelArg() for rank3_2"); } else { k_rank1 = clCreateKernel(program, "rank1", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank1"); ecode = clSetKernelArg(k_rank1, 0, sizeof(cl_mem),(void*)&m_key_array); ecode |= clSetKernelArg(k_rank1, 1, sizeof(cl_mem),(void*)&m_bucket_size); clu_CheckError(ecode, "clSetKernelArg() for rank1"); k_rank2 = clCreateKernel(program, "rank2", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank2"); ecode = clSetKernelArg(k_rank2, 0, sizeof(cl_mem),(void*)&m_key_array); ecode |= clSetKernelArg(k_rank2, 1, sizeof(cl_mem),(void*)&m_bucket_size); ecode |= clSetKernelArg(k_rank2, 2, sizeof(cl_mem),(void*)&m_bucket_ptrs); ecode |= clSetKernelArg(k_rank2, 3, sizeof(cl_mem),(void*)&m_key_buff2); clu_CheckError(ecode, "clSetKernelArg() for rank2"); k_rank3 = clCreateKernel(program, "rank3", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank3"); ecode = clSetKernelArg(k_rank3, 0, sizeof(cl_mem),(void*)&m_bucket_size); ecode |= clSetKernelArg(k_rank3, 1, sizeof(cl_mem),(void*)&m_bucket_ptrs); ecode |= clSetKernelArg(k_rank3, 2, sizeof(cl_mem),(void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank3, 3, sizeof(cl_mem),(void*)&m_key_buff2); clu_CheckError(ecode, "clSetKernelArg() for rank3"); } k_rank4 = clCreateKernel(program, "rank4", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank4"); ecode = clSetKernelArg(k_rank4, 0, sizeof(cl_mem), (void*)&m_partial_vals); ecode |= clSetKernelArg(k_rank4, 1, sizeof(cl_mem), (void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank4, 2, sizeof(cl_mem), (void*)&m_rank_array); ecode |= clSetKernelArg(k_rank4, 3, sizeof(cl_mem), (void*)&m_passed_verification); clu_CheckError(ecode, "clSetKernelArg() for rank4"); DTIMER_STOP(T_OPENCL_API); }
//--------------------------------------------------------------------- // Set up the OpenCL environment. //--------------------------------------------------------------------- static void setup_opencl(int argc, char *argv[]) { int i; size_t temp, wg_num; cl_int ecode; char *source_dir = "LU"; if (timeron) { timer_clear(TIMER_OPENCL); timer_clear(TIMER_BUILD); timer_clear(TIMER_BUFFER); timer_clear(TIMER_RELEASE); timer_start(TIMER_OPENCL); } if (argc > 1) source_dir = argv[1]; //----------------------------------------------------------------------- // 1. Find the default device type and get a device for the device type //----------------------------------------------------------------------- device_type = clu_GetDefaultDeviceType(); device = clu_GetAvailableDevice(device_type); device_name = clu_GetDeviceName(device); // Device information ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(work_item_sizes), &work_item_sizes, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group_size, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); //////////////////////////////////////////////////////////////////////// // FIXME: The below values are experimental. size_t default_wg_size = 64; if (device_type == CL_DEVICE_TYPE_CPU) { if (CLASS == 'B') default_wg_size = 128; } else { if (CLASS == 'B') default_wg_size = 32; } if (max_work_group_size > default_wg_size) { max_work_group_size = default_wg_size; int i; for (i = 0; i < 3; i++) { if (work_item_sizes[i] > default_wg_size) { work_item_sizes[i] = default_wg_size; } } } if (device_type == CL_DEVICE_TYPE_CPU) { SETBV1_DIM = SETBV1_DIM_CPU; SETBV2_DIM = SETBV2_DIM_CPU; SETBV3_DIM = SETBV3_DIM_CPU; SETIV_DIM = SETIV_DIM_CPU; ERHS1_DIM = ERHS1_DIM_CPU; ERHS2_DIM = ERHS2_DIM_CPU; ERHS3_DIM = ERHS3_DIM_CPU; ERHS4_DIM = ERHS4_DIM_CPU; PINTGR1_DIM = PINTGR1_DIM_CPU; PINTGR2_DIM = PINTGR2_DIM_CPU; PINTGR3_DIM = PINTGR3_DIM_CPU; RHS_DIM = RHS_DIM_CPU; RHSX_DIM = RHSX_DIM_CPU; RHSY_DIM = RHSY_DIM_CPU; RHSZ_DIM = RHSZ_DIM_CPU; SSOR2_DIM = SSOR2_DIM_CPU; SSOR3_DIM = SSOR3_DIM_CPU; } else { SETBV1_DIM = SETBV1_DIM_GPU; SETBV2_DIM = SETBV2_DIM_GPU; SETBV3_DIM = SETBV3_DIM_GPU; SETIV_DIM = SETIV_DIM_GPU; ERHS1_DIM = ERHS1_DIM_GPU; ERHS2_DIM = ERHS2_DIM_GPU; ERHS3_DIM = ERHS3_DIM_GPU; ERHS4_DIM = ERHS4_DIM_GPU; PINTGR1_DIM = PINTGR1_DIM_GPU; PINTGR2_DIM = PINTGR2_DIM_GPU; PINTGR3_DIM = PINTGR3_DIM_GPU; RHS_DIM = RHS_DIM_GPU; RHSX_DIM = RHSX_DIM_GPU; RHSY_DIM = RHSY_DIM_GPU; RHSZ_DIM = RHSZ_DIM_GPU; SSOR2_DIM = SSOR2_DIM_GPU; SSOR3_DIM = SSOR3_DIM_GPU; } //////////////////////////////////////////////////////////////////////// //----------------------------------------------------------------------- // 2. Create a context for the specified device //----------------------------------------------------------------------- context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode); clu_CheckError(ecode, "clCreateContext()"); //----------------------------------------------------------------------- // 3. Create command queues //----------------------------------------------------------------------- cmd_queue = clCreateCommandQueue(context, device, 0, &ecode); clu_CheckError(ecode, "clCreateCommandQueue()"); max_pipeline = (jend-jst) < max_compute_units ? (jend-jst) : max_compute_units; pipe_queue = (cl_command_queue *)malloc(sizeof(cl_command_queue) * max_pipeline); for (i = 0; i < max_pipeline; i++) { pipe_queue[i] = clCreateCommandQueue(context, device, 0, &ecode); clu_CheckError(ecode, "clCreateCommandQueue()"); } //----------------------------------------------------------------------- // 4. Build programs //----------------------------------------------------------------------- if (timeron) timer_start(TIMER_BUILD); char build_option[100]; if (device_type == CL_DEVICE_TYPE_CPU) { sprintf(build_option, "-I. -DCLASS=%d -DUSE_CPU", CLASS); } else { sprintf(build_option, "-I. -DCLASS=\'%c\'", CLASS); } p_pre = clu_MakeProgram(context, device, source_dir, "kernel_pre.cl", build_option); p_main = clu_MakeProgram(context, device, source_dir, (device_type == CL_DEVICE_TYPE_CPU ? "kernel_main_cpu.cl" : "kernel_main_gpu.cl"), build_option); p_post = clu_MakeProgram(context, device, source_dir, "kernel_post.cl", build_option); if (timeron) timer_stop(TIMER_BUILD); //----------------------------------------------------------------------- // 5. Create buffers //----------------------------------------------------------------------- if (timeron) timer_start(TIMER_BUFFER); m_ce = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(double)*5*13, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_ce"); m_u = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_u"); m_rsd = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_rsd"); m_frct = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_frct"); m_qs = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_qs"); m_rho_i = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_rho_i"); // workspace for work-items size_t max_work_items; if (ERHS2_DIM == 1 && ERHS3_DIM == 1 && ERHS4_DIM == 1) { max_work_items = ISIZ3; } else { max_work_items = ISIZ3*ISIZ2; } m_flux = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*ISIZ1*5 * max_work_items, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_flux"); if (RHSZ_DIM == 1) { max_work_items = ISIZ2; } else { max_work_items = ISIZ2*ISIZ1; } if (device_type == CL_DEVICE_TYPE_CPU) { m_utmp = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*ISIZ3*6 * max_work_items, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_utmp"); m_rtmp = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*ISIZ3*5 * max_work_items, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_rtmp"); } temp = (nz0-2) / max_compute_units; l2norm_lws[0] = temp == 0 ? 1 : temp; l2norm_gws[0] = clu_RoundWorkSize((size_t)(nz0-2), l2norm_lws[0]); wg_num = l2norm_gws[0] / l2norm_lws[0]; sum_size = sizeof(double) * 5 * wg_num; m_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, sum_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer()"); if (timeron) timer_stop(TIMER_BUFFER); //----------------------------------------------------------------------- // 6. Create kernels //----------------------------------------------------------------------- k_setbv1 = clCreateKernel(p_pre, "setbv1", &ecode); clu_CheckError(ecode, "clCreateKernel() for setbv1"); ecode = clSetKernelArg(k_setbv1, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_setbv1, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_setbv1, 2, sizeof(int), &nx); ecode |= clSetKernelArg(k_setbv1, 3, sizeof(int), &ny); ecode |= clSetKernelArg(k_setbv1, 4, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (SETBV1_DIM == 3) { setbv1_lws[0] = 5; temp = max_work_group_size / setbv1_lws[0]; setbv1_lws[1] = nx < temp ? nx : temp; temp = temp / setbv1_lws[1]; setbv1_lws[2] = ny < temp ? ny : temp; setbv1_gws[0] = clu_RoundWorkSize((size_t)5, setbv1_lws[0]); setbv1_gws[1] = clu_RoundWorkSize((size_t)nx, setbv1_lws[1]); setbv1_gws[2] = clu_RoundWorkSize((size_t)ny, setbv1_lws[2]); } else if (SETBV1_DIM == 2) { setbv1_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0]; temp = max_work_group_size / setbv1_lws[0]; setbv1_lws[1] = ny < temp ? ny : temp; setbv1_gws[0] = clu_RoundWorkSize((size_t)nx, setbv1_lws[0]); setbv1_gws[1] = clu_RoundWorkSize((size_t)ny, setbv1_lws[1]); } else { temp = ny / max_compute_units; setbv1_lws[0] = temp == 0 ? 1 : temp; setbv1_gws[0] = clu_RoundWorkSize((size_t)ny, setbv1_lws[0]); } k_setbv2 = clCreateKernel(p_pre, "setbv2", &ecode); clu_CheckError(ecode, "clCreateKernel() for setbv2"); ecode = clSetKernelArg(k_setbv2, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_setbv2, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_setbv2, 2, sizeof(int), &nx); ecode |= clSetKernelArg(k_setbv2, 3, sizeof(int), &ny); ecode |= clSetKernelArg(k_setbv2, 4, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (SETBV2_DIM == 3) { setbv2_lws[0] = 5; temp = max_work_group_size / setbv2_lws[0]; setbv2_lws[1] = nx < temp ? nx : temp; temp = temp / setbv2_lws[1]; setbv2_lws[2] = nz < temp ? nz : temp; setbv2_gws[0] = clu_RoundWorkSize((size_t)5, setbv2_lws[0]); setbv2_gws[1] = clu_RoundWorkSize((size_t)nx, setbv2_lws[1]); setbv2_gws[2] = clu_RoundWorkSize((size_t)nz, setbv2_lws[2]); } else if (SETBV2_DIM == 2) { setbv2_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0]; temp = max_work_group_size / setbv2_lws[0]; setbv2_lws[1] = nz < temp ? nz : temp; setbv2_gws[0] = clu_RoundWorkSize((size_t)nx, setbv2_lws[0]); setbv2_gws[1] = clu_RoundWorkSize((size_t)nz, setbv2_lws[1]); } else { temp = nz / max_compute_units; setbv2_lws[0] = temp == 0 ? 1 : temp; setbv2_gws[0] = clu_RoundWorkSize((size_t)nz, setbv2_lws[0]); } k_setbv3 = clCreateKernel(p_pre, "setbv3", &ecode); clu_CheckError(ecode, "clCreateKernel() for setbv3"); ecode = clSetKernelArg(k_setbv3, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_setbv3, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_setbv3, 2, sizeof(int), &nx); ecode |= clSetKernelArg(k_setbv3, 3, sizeof(int), &ny); ecode |= clSetKernelArg(k_setbv3, 4, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (SETBV3_DIM == 3) { setbv3_lws[0] = 5; temp = max_work_group_size / setbv3_lws[0]; setbv3_lws[1] = ny < temp ? ny : temp; temp = temp / setbv3_lws[1]; setbv3_lws[2] = nz < temp ? nz : temp; setbv3_gws[0] = clu_RoundWorkSize((size_t)5, setbv3_lws[0]); setbv3_gws[1] = clu_RoundWorkSize((size_t)ny, setbv3_lws[1]); setbv3_gws[2] = clu_RoundWorkSize((size_t)nz, setbv3_lws[2]); } else if (SETBV3_DIM == 2) { setbv3_lws[0] = ny < work_item_sizes[0] ? ny : work_item_sizes[0]; temp = max_work_group_size / setbv3_lws[0]; setbv3_lws[1] = nz < temp ? nz : temp; setbv3_gws[0] = clu_RoundWorkSize((size_t)ny, setbv3_lws[0]); setbv3_gws[1] = clu_RoundWorkSize((size_t)nz, setbv3_lws[1]); } else { temp = nz / max_compute_units; setbv3_lws[0] = temp == 0 ? 1 : temp; setbv3_gws[0] = clu_RoundWorkSize((size_t)nz, setbv3_lws[0]); } k_setiv = clCreateKernel(p_pre, "setiv", &ecode); clu_CheckError(ecode, "clCreateKernel() for setiv"); ecode = clSetKernelArg(k_setiv, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_setiv, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_setiv, 2, sizeof(int), &nx); ecode |= clSetKernelArg(k_setiv, 3, sizeof(int), &ny); ecode |= clSetKernelArg(k_setiv, 4, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (SETIV_DIM == 3) { setiv_lws[0] = (nx-2) < work_item_sizes[0] ? (nx-2) : work_item_sizes[0]; temp = max_work_group_size / setiv_lws[0]; setiv_lws[1] = (ny-2) < temp ? (ny-2) : temp; temp = temp / setiv_lws[1]; setiv_lws[2] = (nz-2) < temp ? (nz-2) : temp; setiv_gws[0] = clu_RoundWorkSize((size_t)(nx-2), setiv_lws[0]); setiv_gws[1] = clu_RoundWorkSize((size_t)(ny-2), setiv_lws[1]); setiv_gws[2] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[2]); } else if (SETIV_DIM == 2) { setiv_lws[0] = (ny-2) < work_item_sizes[0] ? (ny-2) : work_item_sizes[0]; temp = max_work_group_size / setiv_lws[0]; setiv_lws[1] = (nz-2) < temp ? (nz-2) : temp; setiv_gws[0] = clu_RoundWorkSize((size_t)(ny-2), setiv_lws[0]); setiv_gws[1] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[1]); } else { temp = (nz-2) / max_compute_units; setiv_lws[0] = temp == 0 ? 1 : temp; setiv_gws[0] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[0]); } k_l2norm = clCreateKernel(p_main, "l2norm", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_l2norm, 1, sizeof(cl_mem), &m_sum); ecode |= clSetKernelArg(k_l2norm, 2, sizeof(double)*5*l2norm_lws[0], NULL); clu_CheckError(ecode, "clSetKernelArg()"); k_rhs = clCreateKernel(p_main, "rhs", &ecode); clu_CheckError(ecode, "clCreateKernel() for rhs"); ecode = clSetKernelArg(k_rhs, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_rhs, 1, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_rhs, 2, sizeof(cl_mem), &m_frct); ecode |= clSetKernelArg(k_rhs, 3, sizeof(cl_mem), &m_qs); ecode |= clSetKernelArg(k_rhs, 4, sizeof(cl_mem), &m_rho_i); ecode |= clSetKernelArg(k_rhs, 5, sizeof(int), &nx); ecode |= clSetKernelArg(k_rhs, 6, sizeof(int), &ny); ecode |= clSetKernelArg(k_rhs, 7, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (RHS_DIM == 3) { rhs_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0]; temp = max_work_group_size / rhs_lws[0]; rhs_lws[1] = ny < temp ? ny : temp; temp = temp / rhs_lws[1]; rhs_lws[2] = nz < temp ? nz : temp; rhs_gws[0] = clu_RoundWorkSize((size_t)nx, rhs_lws[0]); rhs_gws[1] = clu_RoundWorkSize((size_t)ny, rhs_lws[1]); rhs_gws[2] = clu_RoundWorkSize((size_t)nz, rhs_lws[2]); } else if (RHS_DIM == 2) { rhs_lws[0] = ny < work_item_sizes[0] ? ny : work_item_sizes[0]; temp = max_work_group_size / rhs_lws[0]; rhs_lws[1] = nz < temp ? nz : temp; rhs_gws[0] = clu_RoundWorkSize((size_t)ny, rhs_lws[0]); rhs_gws[1] = clu_RoundWorkSize((size_t)nz, rhs_lws[1]); } else { //temp = nz / max_compute_units; temp = 1; rhs_lws[0] = temp == 0 ? 1 : temp; rhs_gws[0] = clu_RoundWorkSize((size_t)nz, rhs_lws[0]); } k_rhsx = clCreateKernel(p_main, "rhsx", &ecode); clu_CheckError(ecode, "clCreateKernel() for rhsx"); ecode = clSetKernelArg(k_rhsx, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_rhsx, 1, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_rhsx, 2, sizeof(cl_mem), &m_qs); ecode |= clSetKernelArg(k_rhsx, 3, sizeof(cl_mem), &m_rho_i); if (device_type == CL_DEVICE_TYPE_CPU) { ecode |= clSetKernelArg(k_rhsx, 4, sizeof(cl_mem), &m_flux); ecode |= clSetKernelArg(k_rhsx, 5, sizeof(int), &nx); ecode |= clSetKernelArg(k_rhsx, 6, sizeof(int), &ny); ecode |= clSetKernelArg(k_rhsx, 7, sizeof(int), &nz); } else { ecode |= clSetKernelArg(k_rhsx, 4, sizeof(int), &nx); ecode |= clSetKernelArg(k_rhsx, 5, sizeof(int), &ny); ecode |= clSetKernelArg(k_rhsx, 6, sizeof(int), &nz); } clu_CheckError(ecode, "clSetKernelArg()"); if (RHSX_DIM == 2) { rhsx_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0]; temp = max_work_group_size / rhsx_lws[0]; rhsx_lws[1] = (nz-2) < temp ? (nz-2) : temp; rhsx_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), rhsx_lws[0]); rhsx_gws[1] = clu_RoundWorkSize((size_t)(nz-2), rhsx_lws[1]); } else { //temp = (nz-2) / max_compute_units; temp = 1; rhsx_lws[0] = temp == 0 ? 1 : temp; rhsx_gws[0] = clu_RoundWorkSize((size_t)(nz-2), rhsx_lws[0]); } k_rhsy = clCreateKernel(p_main, "rhsy", &ecode); clu_CheckError(ecode, "clCreateKernel() for rhsy"); ecode = clSetKernelArg(k_rhsy, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_rhsy, 1, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_rhsy, 2, sizeof(cl_mem), &m_qs); ecode |= clSetKernelArg(k_rhsy, 3, sizeof(cl_mem), &m_rho_i); if (device_type == CL_DEVICE_TYPE_CPU) { ecode |= clSetKernelArg(k_rhsy, 4, sizeof(cl_mem), &m_flux); ecode |= clSetKernelArg(k_rhsy, 5, sizeof(int), &nx); ecode |= clSetKernelArg(k_rhsy, 6, sizeof(int), &ny); ecode |= clSetKernelArg(k_rhsy, 7, sizeof(int), &nz); } else { ecode |= clSetKernelArg(k_rhsy, 4, sizeof(int), &nx); ecode |= clSetKernelArg(k_rhsy, 5, sizeof(int), &ny); ecode |= clSetKernelArg(k_rhsy, 6, sizeof(int), &nz); } clu_CheckError(ecode, "clSetKernelArg()"); if (RHSY_DIM == 2) { rhsy_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0]; temp = max_work_group_size / rhsy_lws[0]; rhsy_lws[1] = (nz-2) < temp ? (nz-2) : temp; rhsy_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), rhsy_lws[0]); rhsy_gws[1] = clu_RoundWorkSize((size_t)(nz-2), rhsy_lws[1]); } else { //temp = (nz-2) / max_compute_units; temp = 1; rhsy_lws[0] = temp == 0 ? 1 : temp; rhsy_gws[0] = clu_RoundWorkSize((size_t)(nz-2), rhsy_lws[0]); } k_rhsz = clCreateKernel(p_main, "rhsz", &ecode); clu_CheckError(ecode, "clCreateKernel() for rhsz"); ecode = clSetKernelArg(k_rhsz, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_rhsz, 1, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_rhsz, 2, sizeof(cl_mem), &m_qs); ecode |= clSetKernelArg(k_rhsz, 3, sizeof(cl_mem), &m_rho_i); if (device_type == CL_DEVICE_TYPE_CPU) { ecode |= clSetKernelArg(k_rhsz, 4, sizeof(cl_mem), &m_flux); ecode |= clSetKernelArg(k_rhsz, 5, sizeof(cl_mem), &m_utmp); ecode |= clSetKernelArg(k_rhsz, 6, sizeof(cl_mem), &m_rtmp); ecode |= clSetKernelArg(k_rhsz, 7, sizeof(int), &nx); ecode |= clSetKernelArg(k_rhsz, 8, sizeof(int), &ny); ecode |= clSetKernelArg(k_rhsz, 9, sizeof(int), &nz); } else { ecode |= clSetKernelArg(k_rhsz, 4, sizeof(int), &nx); ecode |= clSetKernelArg(k_rhsz, 5, sizeof(int), &ny); ecode |= clSetKernelArg(k_rhsz, 6, sizeof(int), &nz); } clu_CheckError(ecode, "clSetKernelArg()"); if (RHSZ_DIM == 2) { rhsz_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0]; temp = max_work_group_size / rhsz_lws[0]; rhsz_lws[1] = (jend-jst) < temp ? (jend-jst) : temp; rhsz_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), rhsz_lws[0]); rhsz_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), rhsz_lws[1]); } else { //temp = (jend-jst) / max_compute_units; temp = 1; rhsz_lws[0] = temp == 0 ? 1 : temp; rhsz_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), rhsz_lws[0]); } k_ssor2 = clCreateKernel(p_main, "ssor2", &ecode); clu_CheckError(ecode, "clCreateKernel() for ssor2"); ecode = clSetKernelArg(k_ssor2, 0, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_ssor2, 2, sizeof(int), &nx); ecode |= clSetKernelArg(k_ssor2, 3, sizeof(int), &ny); ecode |= clSetKernelArg(k_ssor2, 4, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (SSOR2_DIM == 3) { ssor2_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0]; temp = max_work_group_size / ssor2_lws[0]; ssor2_lws[1] = (jend-jst) < temp ? (jend-jst) : temp; temp = temp / ssor2_lws[1]; ssor2_lws[2] = (nz-2) < temp ? (nz-2) : temp; ssor2_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), ssor2_lws[0]); ssor2_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), ssor2_lws[1]); ssor2_gws[2] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[2]); } else if (SSOR2_DIM == 2) { ssor2_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0]; temp = max_work_group_size / ssor2_lws[0]; ssor2_lws[1] = (nz-2) < temp ? (nz-2) : temp; ssor2_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), ssor2_lws[0]); ssor2_gws[1] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[1]); } else { //temp = (nz-2) / max_compute_units; temp = 1; ssor2_lws[0] = temp == 0 ? 1 : temp; ssor2_gws[0] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[0]); } k_ssor3 = clCreateKernel(p_main, "ssor3", &ecode); clu_CheckError(ecode, "clCreateKernel() for ssor3"); ecode = clSetKernelArg(k_ssor3, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_ssor3, 1, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_ssor3, 3, sizeof(int), &nx); ecode |= clSetKernelArg(k_ssor3, 4, sizeof(int), &ny); ecode |= clSetKernelArg(k_ssor3, 5, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (SSOR3_DIM == 3) { ssor3_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0]; temp = max_work_group_size / ssor3_lws[0]; ssor3_lws[1] = (jend-jst) < temp ? (jend-jst) : temp; temp = temp / ssor3_lws[1]; ssor3_lws[2] = (nz-2) < temp ? (nz-2) : temp; ssor3_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), ssor3_lws[0]); ssor3_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), ssor3_lws[1]); ssor3_gws[2] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[2]); } else if (SSOR3_DIM == 2) { ssor3_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0]; temp = max_work_group_size / ssor3_lws[0]; ssor3_lws[1] = (nz-2) < temp ? (nz-2) : temp; ssor3_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), ssor3_lws[0]); ssor3_gws[1] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[1]); } else { //temp = (nz-2) / max_compute_units; temp = 1; ssor3_lws[0] = temp == 0 ? 1 : temp; ssor3_gws[0] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[0]); } k_blts = clCreateKernel(p_main, "blts", &ecode); clu_CheckError(ecode, "clCreateKernel() for blts"); ecode = clSetKernelArg(k_blts, 0, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_blts, 1, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_blts, 2, sizeof(cl_mem), &m_qs); ecode |= clSetKernelArg(k_blts, 3, sizeof(cl_mem), &m_rho_i); ecode |= clSetKernelArg(k_blts, 4, sizeof(int), &nz); ecode |= clSetKernelArg(k_blts, 5, sizeof(int), &ny); ecode |= clSetKernelArg(k_blts, 6, sizeof(int), &nx); clu_CheckError(ecode, "clSetKernelArg()"); blts_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0]; temp = max_work_group_size / blts_lws[0]; blts_lws[1] = (nz-2) < temp ? (nz-2) : temp; blts_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), blts_lws[0]); blts_gws[1] = clu_RoundWorkSize((size_t)(nz-2), blts_lws[1]); k_buts = clCreateKernel(p_main, "buts", &ecode); clu_CheckError(ecode, "clCreateKernel() for buts"); ecode = clSetKernelArg(k_buts, 0, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_buts, 1, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_buts, 2, sizeof(cl_mem), &m_qs); ecode |= clSetKernelArg(k_buts, 3, sizeof(cl_mem), &m_rho_i); ecode |= clSetKernelArg(k_buts, 4, sizeof(int), &nz); ecode |= clSetKernelArg(k_buts, 5, sizeof(int), &ny); ecode |= clSetKernelArg(k_buts, 6, sizeof(int), &nx); clu_CheckError(ecode, "clSetKernelArg()"); buts_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0]; temp = max_work_group_size / buts_lws[0]; buts_lws[1] = (nz-2) < temp ? (nz-2) : temp; buts_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), buts_lws[0]); buts_gws[1] = clu_RoundWorkSize((size_t)(nz-2), buts_lws[1]); if (timeron) timer_stop(TIMER_OPENCL); }
//--------------------------------------------------------------------- // Set up the OpenCL environment. //--------------------------------------------------------------------- void setup_opencl(int argc, char *argv[]) { cl_int err_code; char *source_dir = "EP"; if (argc > 1) source_dir = argv[1]; #ifdef TIMER_DETAIL if (timers_enabled) { int i; for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i); } #endif DTIMER_START(T_OPENCL_API); // 1. Find the default device type and get a device for the device type device_type = clu_GetDefaultDeviceType(); device = clu_GetAvailableDevice(device_type); device_name = clu_GetDeviceName(device); // 2. Create a context for the specified device context = clCreateContext(NULL, 1, &device, NULL, NULL, &err_code); clu_CheckError(err_code, "clCreateContext()"); // 3. Create a command queue cmd_queue = clCreateCommandQueue(context, device, 0, &err_code); clu_CheckError(err_code, "clCreateCommandQueue()"); DTIMER_STOP(T_OPENCL_API); // 4. Build the program DTIMER_START(T_BUILD); char *source_file; char build_option[30]; sprintf(build_option, "-DM=%d -I.", M); if (device_type == CL_DEVICE_TYPE_CPU) { source_file = "ep_cpu.cl"; GROUP_SIZE = 16; } else { source_file = "ep_gpu.cl"; GROUP_SIZE = 64; } program = clu_MakeProgram(context, device, source_dir, source_file, build_option); DTIMER_STOP(T_BUILD); // 5. Create buffers DTIMER_START(T_BUFFER_CREATE); gq_size = np / GROUP_SIZE * NQ * sizeof(double); gsx_size = np / GROUP_SIZE * sizeof(double); gsy_size = np / GROUP_SIZE * sizeof(double); pgq = clCreateBuffer(context, CL_MEM_READ_WRITE, gq_size, NULL, &err_code); clu_CheckError(err_code, "clCreateBuffer() for pgq"); pgsx = clCreateBuffer(context, CL_MEM_READ_WRITE, gsx_size,NULL, &err_code); clu_CheckError(err_code, "clCreateBuffer() for pgsx"); pgsy = clCreateBuffer(context, CL_MEM_READ_WRITE, gsy_size,NULL, &err_code); clu_CheckError(err_code, "clCreateBuffer() for pgsy"); DTIMER_STOP(T_BUFFER_CREATE); // 6. Create a kernel DTIMER_START(T_OPENCL_API); kernel = clCreateKernel(program, "embar", &err_code); clu_CheckError(err_code, "clCreateKernel()"); DTIMER_STOP(T_OPENCL_API); }