//--------------------------------------------------------------------- // Set up the OpenCL environment. //--------------------------------------------------------------------- static void setup_opencl(int argc, char *argv[]) { size_t temp; cl_int ecode; char *source_dir = "FT"; if (argc > 1) source_dir = argv[1]; #ifdef TIMER_DETAIL if (timers_enabled) { int i; for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i); } #endif DTIMER_START(T_OPENCL_API); // 1. Find the default device type and get a device for the device type device_type = clu_GetDefaultDeviceType(); device = clu_GetAvailableDevice(device_type); device_name = clu_GetDeviceName(device); // Device information ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(work_item_sizes), &work_item_sizes, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group_size, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); // FIXME: The below values are experimental. if (max_work_group_size > 64) { max_work_group_size = 64; int i; for (i = 0; i < 3; i++) { if (work_item_sizes[i] > 64) { work_item_sizes[i] = 64; } } } ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); // 2. Create a context for the specified device context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode); clu_CheckError(ecode, "clCreateContext()"); // 3. Create a command queue cmd_queue = clCreateCommandQueue(context, device, 0, &ecode); clu_CheckError(ecode, "clCreateCommandQueue()"); DTIMER_STOP(T_OPENCL_API); // 4. Build the program DTIMER_START(T_BUILD); char *source_file; char build_option[50]; if (device_type == CL_DEVICE_TYPE_CPU) { source_file = "ft_cpu.cl"; sprintf(build_option, "-I. 
-DCLASS=%d -DUSE_CPU", CLASS); COMPUTE_IMAP_DIM = COMPUTE_IMAP_DIM_CPU; EVOLVE_DIM = EVOLVE_DIM_CPU; CFFTS_DIM = CFFTS_DIM_CPU; } else if (device_type == CL_DEVICE_TYPE_GPU) { char vendor[50]; ecode = clGetDeviceInfo(device, CL_DEVICE_VENDOR, 50, vendor, NULL); clu_CheckError(ecode, "clGetDeviceInfo()"); if (strncmp(vendor, DEV_VENDOR_NVIDIA, strlen(DEV_VENDOR_NVIDIA)) == 0) { source_file = "ft_gpu_nvidia.cl"; CFFTS_LSIZE = 32; } else { source_file = "ft_gpu.cl"; CFFTS_LSIZE = 64; } sprintf(build_option, "-I. -DCLASS=\'%c\' -DLSIZE=%lu", CLASS, CFFTS_LSIZE); COMPUTE_IMAP_DIM = COMPUTE_IMAP_DIM_GPU; EVOLVE_DIM = EVOLVE_DIM_GPU; CFFTS_DIM = CFFTS_DIM_GPU; } else { fprintf(stderr, "Set the environment variable OPENCL_DEVICE_TYPE!\n"); exit(EXIT_FAILURE); } program = clu_MakeProgram(context, device, source_dir, source_file, build_option); DTIMER_STOP(T_BUILD); // 5. Create buffers DTIMER_START(T_BUFFER_CREATE); m_u = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(dcomplex) * NXP, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_u"); m_u0 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(dcomplex) * NTOTALP, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_u0"); m_u1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(dcomplex) * NTOTALP, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_u1"); m_twiddle = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double) * NTOTALP, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_twiddle"); if (device_type == CL_DEVICE_TYPE_CPU) { size_t ty1_size, ty2_size; if (CFFTS_DIM == 2) { ty1_size = sizeof(dcomplex) * NX * NY * NZ; ty2_size = sizeof(dcomplex) * NX * NY * NZ; } else { fprintf(stderr, "Wrong CFFTS_DIM: %u\n", CFFTS_DIM); exit(EXIT_FAILURE); } m_ty1 = clCreateBuffer(context, CL_MEM_READ_WRITE, ty1_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_ty1"); m_ty2 = clCreateBuffer(context, CL_MEM_READ_WRITE, ty2_size, NULL, &ecode); clu_CheckError(ecode, 
"clCreateBuffer() for m_ty2"); } if (device_type == CL_DEVICE_TYPE_CPU) { temp = 1024 / max_compute_units; checksum_local_ws = temp == 0 ? 1 : temp; checksum_global_ws = clu_RoundWorkSize((size_t)1024, checksum_local_ws); } else if (device_type == CL_DEVICE_TYPE_GPU) { checksum_local_ws = 32; checksum_global_ws = clu_RoundWorkSize((size_t)1024, checksum_local_ws); } checksum_wg_num = checksum_global_ws / checksum_local_ws; m_chk = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(dcomplex) * checksum_wg_num, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_chk"); g_chk = (dcomplex *)malloc(sizeof(dcomplex) * checksum_wg_num); DTIMER_STOP(T_BUFFER_CREATE); // 6. Create kernels DTIMER_START(T_OPENCL_API); double ap = -4.0 * ALPHA * PI * PI; int d1 = dims[0]; int d2 = dims[1]; int d3 = dims[2]; k_compute_indexmap = clCreateKernel(program, "compute_indexmap", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_indexmap"); ecode = clSetKernelArg(k_compute_indexmap, 0, sizeof(cl_mem), &m_twiddle); ecode |= clSetKernelArg(k_compute_indexmap, 1, sizeof(int), &d1); ecode |= clSetKernelArg(k_compute_indexmap, 2, sizeof(int), &d2); ecode |= clSetKernelArg(k_compute_indexmap, 3, sizeof(int), &d3); ecode |= clSetKernelArg(k_compute_indexmap, 4, sizeof(double), &ap); clu_CheckError(ecode, "clSetKernelArg() for compute_indexmap"); if (COMPUTE_IMAP_DIM == 3) { cimap_lws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0]; temp = max_work_group_size / cimap_lws[0]; cimap_lws[1] = d2 < temp ? d2 : temp; temp = temp / cimap_lws[1]; cimap_lws[2] = d3 < temp ? d3 : temp; cimap_gws[0] = clu_RoundWorkSize((size_t)d1, cimap_lws[0]); cimap_gws[1] = clu_RoundWorkSize((size_t)d2, cimap_lws[1]); cimap_gws[2] = clu_RoundWorkSize((size_t)d3, cimap_lws[2]); } else if (COMPUTE_IMAP_DIM == 2) { cimap_lws[0] = d2 < work_item_sizes[0] ? d2 : work_item_sizes[0]; temp = max_work_group_size / cimap_lws[0]; cimap_lws[1] = d3 < temp ? 
d3 : temp; cimap_gws[0] = clu_RoundWorkSize((size_t)d2, cimap_lws[0]); cimap_gws[1] = clu_RoundWorkSize((size_t)d3, cimap_lws[1]); } else { //temp = d3 / max_compute_units; temp = 1; cimap_lws[0] = temp == 0 ? 1 : temp; cimap_gws[0] = clu_RoundWorkSize((size_t)d3, cimap_lws[0]); } k_compute_ics = clCreateKernel(program, "compute_initial_conditions", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_initial_conditions"); ecode = clSetKernelArg(k_compute_ics, 2, sizeof(int), &d1); ecode |= clSetKernelArg(k_compute_ics, 3, sizeof(int), &d2); ecode |= clSetKernelArg(k_compute_ics, 4, sizeof(int), &d3); clu_CheckError(ecode, "clSetKernelArg() for compute_initial_conditions"); k_cffts1 = clCreateKernel(program, "cffts1", &ecode); clu_CheckError(ecode, "clCreateKernel() for cffts1"); ecode = clSetKernelArg(k_cffts1, 2, sizeof(cl_mem), &m_u); if (device_type == CL_DEVICE_TYPE_CPU) { ecode |= clSetKernelArg(k_cffts1, 8, sizeof(cl_mem), &m_ty1); ecode |= clSetKernelArg(k_cffts1, 9, sizeof(cl_mem), &m_ty2); } clu_CheckError(ecode, "clSetKernelArg() for k_cffts1"); k_cffts2 = clCreateKernel(program, "cffts2", &ecode); clu_CheckError(ecode, "clCreateKernel() for cffts2"); ecode = clSetKernelArg(k_cffts2, 2, sizeof(cl_mem), &m_u); if (device_type == CL_DEVICE_TYPE_CPU) { ecode |= clSetKernelArg(k_cffts2, 8, sizeof(cl_mem), &m_ty1); ecode |= clSetKernelArg(k_cffts2, 9, sizeof(cl_mem), &m_ty2); } clu_CheckError(ecode, "clSetKernelArg() for k_cffts2"); k_cffts3 = clCreateKernel(program, "cffts3", &ecode); clu_CheckError(ecode, "clCreateKernel() for cffts3"); ecode = clSetKernelArg(k_cffts3, 2, sizeof(cl_mem), &m_u); if (device_type == CL_DEVICE_TYPE_CPU) { ecode |= clSetKernelArg(k_cffts3, 8, sizeof(cl_mem), &m_ty1); ecode |= clSetKernelArg(k_cffts3, 9, sizeof(cl_mem), &m_ty2); } clu_CheckError(ecode, "clSetKernelArg() for k_cffts3"); k_evolve = clCreateKernel(program, "evolve", &ecode); clu_CheckError(ecode, "clCreateKernel() for evolve"); k_checksum = 
clCreateKernel(program, "checksum", &ecode); clu_CheckError(ecode, "clCreateKernel() for checksum"); ecode = clSetKernelArg(k_checksum, 1, sizeof(cl_mem), &m_chk); ecode |= clSetKernelArg(k_checksum, 2, sizeof(dcomplex)*checksum_local_ws, NULL); ecode |= clSetKernelArg(k_checksum, 3, sizeof(int), &dims[0]); ecode |= clSetKernelArg(k_checksum, 4, sizeof(int), &dims[1]); clu_CheckError(ecode, "clSetKernelArg() for checksum"); DTIMER_STOP(T_OPENCL_API); }
//--------------------------------------------------------------------- // Set up the OpenCL environment. //--------------------------------------------------------------------- static void setup_opencl(int argc, char *argv[]) { cl_int ecode; char *source_dir = "IS"; if (argc > 1) source_dir = argv[1]; #ifdef TIMER_DETAIL if (timer_on) { int i; for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i); } #endif DTIMER_START(T_OPENCL_API); // 1. Find the default device type and get a device for the device type device_type = clu_GetDefaultDeviceType(); device = clu_GetAvailableDevice(device_type); device_name = clu_GetDeviceName(device); // Device information ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(work_item_sizes), &work_item_sizes, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group_size, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); // FIXME: The below values are experimental. if (max_work_group_size > 256) { max_work_group_size = 256; int i; for (i = 0; i < 3; i++) { if (work_item_sizes[i] > 256) { work_item_sizes[i] = 256; } } } // 2. Create a context for the specified device context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode); clu_CheckError(ecode, "clCreateContext()"); // 3. Create a command queue cmd_queue = clCreateCommandQueue(context, device, 0, &ecode); clu_CheckError(ecode, "clCreateCommandQueue()"); DTIMER_STOP(T_OPENCL_API); // 4. 
Build the program DTIMER_START(T_BUILD); char *source_file; char build_option[30]; if (device_type == CL_DEVICE_TYPE_CPU) { source_file = "is_cpu.cl"; sprintf(build_option, "-DCLASS=%d -I.", CLASS); CREATE_SEQ_GROUP_SIZE = 64; CREATE_SEQ_GLOBAL_SIZE = CREATE_SEQ_GROUP_SIZE * 256; RANK_GROUP_SIZE = 1; RANK_GLOBAL_SIZE = RANK_GROUP_SIZE * 128; RANK1_GROUP_SIZE = 1; RANK1_GLOBAL_SIZE = RANK1_GROUP_SIZE * RANK_GLOBAL_SIZE;; RANK2_GROUP_SIZE = RANK_GROUP_SIZE; RANK2_GLOBAL_SIZE = RANK_GLOBAL_SIZE;; FV2_GROUP_SIZE = 64; FV2_GLOBAL_SIZE = FV2_GROUP_SIZE * 256; } else if (device_type == CL_DEVICE_TYPE_GPU) { source_file = "is_gpu.cl"; sprintf(build_option, "-DCLASS=\'%c\' -I.", CLASS); CREATE_SEQ_GROUP_SIZE = 64; CREATE_SEQ_GLOBAL_SIZE = CREATE_SEQ_GROUP_SIZE * 256; RANK1_GROUP_SIZE = work_item_sizes[0]; RANK1_GLOBAL_SIZE = MAX_KEY; RANK2_GROUP_SIZE = work_item_sizes[0]; RANK2_GLOBAL_SIZE = NUM_KEYS; FV2_GROUP_SIZE = work_item_sizes[0]; FV2_GLOBAL_SIZE = NUM_KEYS; } else { fprintf(stderr, "%s: not supported.", clu_GetDeviceTypeName(device_type)); exit(EXIT_FAILURE); } program = clu_MakeProgram(context, device, source_dir, source_file, build_option); DTIMER_STOP(T_BUILD); // 5. 
Create buffers DTIMER_START(T_BUFFER_CREATE); m_key_array = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(INT_TYPE) * SIZE_OF_BUFFERS, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_key_array"); m_key_buff1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(INT_TYPE) * MAX_KEY, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_key_buff1"); m_key_buff2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(INT_TYPE) * SIZE_OF_BUFFERS, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_key_buff2"); size_t test_array_size = sizeof(INT_TYPE) * TEST_ARRAY_SIZE; m_index_array = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, test_array_size, test_index_array, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_index_array"); m_rank_array = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, test_array_size, test_rank_array, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_rank_array"); m_partial_vals = clCreateBuffer(context, CL_MEM_WRITE_ONLY, test_array_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_partial_vals"); m_passed_verification = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_passed_verification"); if (device_type == CL_DEVICE_TYPE_GPU) { m_key_scan = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(INT_TYPE) * MAX_KEY, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_key_buff1_scan"); m_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(INT_TYPE) * work_item_sizes[0], NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_sum"); } else { size_t bs_size = RANK_GLOBAL_SIZE * sizeof(INT_TYPE) * NUM_BUCKETS; m_bucket_size = clCreateBuffer(context, CL_MEM_READ_WRITE, bs_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_bucket_size"); m_bucket_ptrs = clCreateBuffer(context, CL_MEM_READ_WRITE, bs_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for 
m_bucket_ptrs"); } DTIMER_STOP(T_BUFFER_CREATE); // 6. Create kernels DTIMER_START(T_OPENCL_API); k_rank0 = clCreateKernel(program, "rank0", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank0"); ecode = clSetKernelArg(k_rank0, 0, sizeof(cl_mem), (void*)&m_key_array); ecode |= clSetKernelArg(k_rank0, 1, sizeof(cl_mem), (void*)&m_partial_vals); ecode |= clSetKernelArg(k_rank0, 2, sizeof(cl_mem), (void*)&m_index_array); clu_CheckError(ecode, "clSetKernelArg() for rank0"); if (device_type == CL_DEVICE_TYPE_GPU) { k_rank1 = clCreateKernel(program, "rank1", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank1"); ecode = clSetKernelArg(k_rank1, 0, sizeof(cl_mem), (void*)&m_key_buff1); clu_CheckError(ecode, "clSetKernelArg() for rank1"); k_rank2 = clCreateKernel(program, "rank2", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank2"); ecode = clSetKernelArg(k_rank2, 0, sizeof(cl_mem), (void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank2, 1, sizeof(cl_mem), (void*)&m_key_array); clu_CheckError(ecode, "clSetKernelArg() for rank2"); k_rank3_0 = clCreateKernel(program, "rank3_0", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank3_0"); ecode = clSetKernelArg(k_rank3_0, 0, sizeof(cl_mem),(void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank3_0, 1, sizeof(cl_mem),(void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank3_0, 2, sizeof(cl_mem),(void*)&m_sum); ecode |= clSetKernelArg(k_rank3_0, 3, sizeof(INT_TYPE) * work_item_sizes[0] * 2, NULL); clu_CheckError(ecode, "clSetKernelArg() for rank3_0"); k_rank3_1 = clCreateKernel(program, "rank3_1", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank3_1"); ecode = clSetKernelArg(k_rank3_1, 0, sizeof(cl_mem), (void*)&m_sum); ecode = clSetKernelArg(k_rank3_1, 1, sizeof(cl_mem), (void*)&m_sum); ecode |= clSetKernelArg(k_rank3_1, 2, sizeof(INT_TYPE) * work_item_sizes[0] * 2, NULL); clu_CheckError(ecode, "clSetKernelArg() for rank3_1"); k_rank3_2 = clCreateKernel(program, "rank3_2", &ecode); 
clu_CheckError(ecode, "clCreateKernel() for rank3_2"); ecode = clSetKernelArg(k_rank3_2, 0, sizeof(cl_mem),(void*)&m_key_buff1); ecode = clSetKernelArg(k_rank3_2, 1, sizeof(cl_mem),(void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank3_2, 2, sizeof(cl_mem),(void*)&m_sum); clu_CheckError(ecode, "clSetKernelArg() for rank3_2"); } else { k_rank1 = clCreateKernel(program, "rank1", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank1"); ecode = clSetKernelArg(k_rank1, 0, sizeof(cl_mem),(void*)&m_key_array); ecode |= clSetKernelArg(k_rank1, 1, sizeof(cl_mem),(void*)&m_bucket_size); clu_CheckError(ecode, "clSetKernelArg() for rank1"); k_rank2 = clCreateKernel(program, "rank2", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank2"); ecode = clSetKernelArg(k_rank2, 0, sizeof(cl_mem),(void*)&m_key_array); ecode |= clSetKernelArg(k_rank2, 1, sizeof(cl_mem),(void*)&m_bucket_size); ecode |= clSetKernelArg(k_rank2, 2, sizeof(cl_mem),(void*)&m_bucket_ptrs); ecode |= clSetKernelArg(k_rank2, 3, sizeof(cl_mem),(void*)&m_key_buff2); clu_CheckError(ecode, "clSetKernelArg() for rank2"); k_rank3 = clCreateKernel(program, "rank3", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank3"); ecode = clSetKernelArg(k_rank3, 0, sizeof(cl_mem),(void*)&m_bucket_size); ecode |= clSetKernelArg(k_rank3, 1, sizeof(cl_mem),(void*)&m_bucket_ptrs); ecode |= clSetKernelArg(k_rank3, 2, sizeof(cl_mem),(void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank3, 3, sizeof(cl_mem),(void*)&m_key_buff2); clu_CheckError(ecode, "clSetKernelArg() for rank3"); } k_rank4 = clCreateKernel(program, "rank4", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank4"); ecode = clSetKernelArg(k_rank4, 0, sizeof(cl_mem), (void*)&m_partial_vals); ecode |= clSetKernelArg(k_rank4, 1, sizeof(cl_mem), (void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank4, 2, sizeof(cl_mem), (void*)&m_rank_array); ecode |= clSetKernelArg(k_rank4, 3, sizeof(cl_mem), (void*)&m_passed_verification); clu_CheckError(ecode, 
"clSetKernelArg() for rank4"); DTIMER_STOP(T_OPENCL_API); }
//--------------------------------------------------------------------- // Set up the OpenCL environment. //--------------------------------------------------------------------- static void setup_opencl(int argc, char *argv[]) { int i; size_t temp, wg_num; cl_int ecode; char *source_dir = "LU"; if (timeron) { timer_clear(TIMER_OPENCL); timer_clear(TIMER_BUILD); timer_clear(TIMER_BUFFER); timer_clear(TIMER_RELEASE); timer_start(TIMER_OPENCL); } if (argc > 1) source_dir = argv[1]; //----------------------------------------------------------------------- // 1. Find the default device type and get a device for the device type //----------------------------------------------------------------------- device_type = clu_GetDefaultDeviceType(); device = clu_GetAvailableDevice(device_type); device_name = clu_GetDeviceName(device); // Device information ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(work_item_sizes), &work_item_sizes, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group_size, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); //////////////////////////////////////////////////////////////////////// // FIXME: The below values are experimental. 
size_t default_wg_size = 64; if (device_type == CL_DEVICE_TYPE_CPU) { if (CLASS == 'B') default_wg_size = 128; } else { if (CLASS == 'B') default_wg_size = 32; } if (max_work_group_size > default_wg_size) { max_work_group_size = default_wg_size; int i; for (i = 0; i < 3; i++) { if (work_item_sizes[i] > default_wg_size) { work_item_sizes[i] = default_wg_size; } } } if (device_type == CL_DEVICE_TYPE_CPU) { SETBV1_DIM = SETBV1_DIM_CPU; SETBV2_DIM = SETBV2_DIM_CPU; SETBV3_DIM = SETBV3_DIM_CPU; SETIV_DIM = SETIV_DIM_CPU; ERHS1_DIM = ERHS1_DIM_CPU; ERHS2_DIM = ERHS2_DIM_CPU; ERHS3_DIM = ERHS3_DIM_CPU; ERHS4_DIM = ERHS4_DIM_CPU; PINTGR1_DIM = PINTGR1_DIM_CPU; PINTGR2_DIM = PINTGR2_DIM_CPU; PINTGR3_DIM = PINTGR3_DIM_CPU; RHS_DIM = RHS_DIM_CPU; RHSX_DIM = RHSX_DIM_CPU; RHSY_DIM = RHSY_DIM_CPU; RHSZ_DIM = RHSZ_DIM_CPU; SSOR2_DIM = SSOR2_DIM_CPU; SSOR3_DIM = SSOR3_DIM_CPU; } else { SETBV1_DIM = SETBV1_DIM_GPU; SETBV2_DIM = SETBV2_DIM_GPU; SETBV3_DIM = SETBV3_DIM_GPU; SETIV_DIM = SETIV_DIM_GPU; ERHS1_DIM = ERHS1_DIM_GPU; ERHS2_DIM = ERHS2_DIM_GPU; ERHS3_DIM = ERHS3_DIM_GPU; ERHS4_DIM = ERHS4_DIM_GPU; PINTGR1_DIM = PINTGR1_DIM_GPU; PINTGR2_DIM = PINTGR2_DIM_GPU; PINTGR3_DIM = PINTGR3_DIM_GPU; RHS_DIM = RHS_DIM_GPU; RHSX_DIM = RHSX_DIM_GPU; RHSY_DIM = RHSY_DIM_GPU; RHSZ_DIM = RHSZ_DIM_GPU; SSOR2_DIM = SSOR2_DIM_GPU; SSOR3_DIM = SSOR3_DIM_GPU; } //////////////////////////////////////////////////////////////////////// //----------------------------------------------------------------------- // 2. Create a context for the specified device //----------------------------------------------------------------------- context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode); clu_CheckError(ecode, "clCreateContext()"); //----------------------------------------------------------------------- // 3. 
Create command queues //----------------------------------------------------------------------- cmd_queue = clCreateCommandQueue(context, device, 0, &ecode); clu_CheckError(ecode, "clCreateCommandQueue()"); max_pipeline = (jend-jst) < max_compute_units ? (jend-jst) : max_compute_units; pipe_queue = (cl_command_queue *)malloc(sizeof(cl_command_queue) * max_pipeline); for (i = 0; i < max_pipeline; i++) { pipe_queue[i] = clCreateCommandQueue(context, device, 0, &ecode); clu_CheckError(ecode, "clCreateCommandQueue()"); } //----------------------------------------------------------------------- // 4. Build programs //----------------------------------------------------------------------- if (timeron) timer_start(TIMER_BUILD); char build_option[100]; if (device_type == CL_DEVICE_TYPE_CPU) { sprintf(build_option, "-I. -DCLASS=%d -DUSE_CPU", CLASS); } else { sprintf(build_option, "-I. -DCLASS=\'%c\'", CLASS); } p_pre = clu_MakeProgram(context, device, source_dir, "kernel_pre.cl", build_option); p_main = clu_MakeProgram(context, device, source_dir, (device_type == CL_DEVICE_TYPE_CPU ? "kernel_main_cpu.cl" : "kernel_main_gpu.cl"), build_option); p_post = clu_MakeProgram(context, device, source_dir, "kernel_post.cl", build_option); if (timeron) timer_stop(TIMER_BUILD); //----------------------------------------------------------------------- // 5. 
Create buffers //----------------------------------------------------------------------- if (timeron) timer_start(TIMER_BUFFER); m_ce = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(double)*5*13, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_ce"); m_u = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_u"); m_rsd = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_rsd"); m_frct = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_frct"); m_qs = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_qs"); m_rho_i = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_rho_i"); // workspace for work-items size_t max_work_items; if (ERHS2_DIM == 1 && ERHS3_DIM == 1 && ERHS4_DIM == 1) { max_work_items = ISIZ3; } else { max_work_items = ISIZ3*ISIZ2; } m_flux = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*ISIZ1*5 * max_work_items, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_flux"); if (RHSZ_DIM == 1) { max_work_items = ISIZ2; } else { max_work_items = ISIZ2*ISIZ1; } if (device_type == CL_DEVICE_TYPE_CPU) { m_utmp = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*ISIZ3*6 * max_work_items, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_utmp"); m_rtmp = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*ISIZ3*5 * max_work_items, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_rtmp"); } temp = (nz0-2) / max_compute_units; l2norm_lws[0] = temp == 0 ? 
1 : temp; l2norm_gws[0] = clu_RoundWorkSize((size_t)(nz0-2), l2norm_lws[0]); wg_num = l2norm_gws[0] / l2norm_lws[0]; sum_size = sizeof(double) * 5 * wg_num; m_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, sum_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer()"); if (timeron) timer_stop(TIMER_BUFFER); //----------------------------------------------------------------------- // 6. Create kernels //----------------------------------------------------------------------- k_setbv1 = clCreateKernel(p_pre, "setbv1", &ecode); clu_CheckError(ecode, "clCreateKernel() for setbv1"); ecode = clSetKernelArg(k_setbv1, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_setbv1, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_setbv1, 2, sizeof(int), &nx); ecode |= clSetKernelArg(k_setbv1, 3, sizeof(int), &ny); ecode |= clSetKernelArg(k_setbv1, 4, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (SETBV1_DIM == 3) { setbv1_lws[0] = 5; temp = max_work_group_size / setbv1_lws[0]; setbv1_lws[1] = nx < temp ? nx : temp; temp = temp / setbv1_lws[1]; setbv1_lws[2] = ny < temp ? ny : temp; setbv1_gws[0] = clu_RoundWorkSize((size_t)5, setbv1_lws[0]); setbv1_gws[1] = clu_RoundWorkSize((size_t)nx, setbv1_lws[1]); setbv1_gws[2] = clu_RoundWorkSize((size_t)ny, setbv1_lws[2]); } else if (SETBV1_DIM == 2) { setbv1_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0]; temp = max_work_group_size / setbv1_lws[0]; setbv1_lws[1] = ny < temp ? ny : temp; setbv1_gws[0] = clu_RoundWorkSize((size_t)nx, setbv1_lws[0]); setbv1_gws[1] = clu_RoundWorkSize((size_t)ny, setbv1_lws[1]); } else { temp = ny / max_compute_units; setbv1_lws[0] = temp == 0 ? 
1 : temp; setbv1_gws[0] = clu_RoundWorkSize((size_t)ny, setbv1_lws[0]); } k_setbv2 = clCreateKernel(p_pre, "setbv2", &ecode); clu_CheckError(ecode, "clCreateKernel() for setbv2"); ecode = clSetKernelArg(k_setbv2, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_setbv2, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_setbv2, 2, sizeof(int), &nx); ecode |= clSetKernelArg(k_setbv2, 3, sizeof(int), &ny); ecode |= clSetKernelArg(k_setbv2, 4, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (SETBV2_DIM == 3) { setbv2_lws[0] = 5; temp = max_work_group_size / setbv2_lws[0]; setbv2_lws[1] = nx < temp ? nx : temp; temp = temp / setbv2_lws[1]; setbv2_lws[2] = nz < temp ? nz : temp; setbv2_gws[0] = clu_RoundWorkSize((size_t)5, setbv2_lws[0]); setbv2_gws[1] = clu_RoundWorkSize((size_t)nx, setbv2_lws[1]); setbv2_gws[2] = clu_RoundWorkSize((size_t)nz, setbv2_lws[2]); } else if (SETBV2_DIM == 2) { setbv2_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0]; temp = max_work_group_size / setbv2_lws[0]; setbv2_lws[1] = nz < temp ? nz : temp; setbv2_gws[0] = clu_RoundWorkSize((size_t)nx, setbv2_lws[0]); setbv2_gws[1] = clu_RoundWorkSize((size_t)nz, setbv2_lws[1]); } else { temp = nz / max_compute_units; setbv2_lws[0] = temp == 0 ? 1 : temp; setbv2_gws[0] = clu_RoundWorkSize((size_t)nz, setbv2_lws[0]); } k_setbv3 = clCreateKernel(p_pre, "setbv3", &ecode); clu_CheckError(ecode, "clCreateKernel() for setbv3"); ecode = clSetKernelArg(k_setbv3, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_setbv3, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_setbv3, 2, sizeof(int), &nx); ecode |= clSetKernelArg(k_setbv3, 3, sizeof(int), &ny); ecode |= clSetKernelArg(k_setbv3, 4, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (SETBV3_DIM == 3) { setbv3_lws[0] = 5; temp = max_work_group_size / setbv3_lws[0]; setbv3_lws[1] = ny < temp ? ny : temp; temp = temp / setbv3_lws[1]; setbv3_lws[2] = nz < temp ? 
nz : temp; setbv3_gws[0] = clu_RoundWorkSize((size_t)5, setbv3_lws[0]); setbv3_gws[1] = clu_RoundWorkSize((size_t)ny, setbv3_lws[1]); setbv3_gws[2] = clu_RoundWorkSize((size_t)nz, setbv3_lws[2]); } else if (SETBV3_DIM == 2) { setbv3_lws[0] = ny < work_item_sizes[0] ? ny : work_item_sizes[0]; temp = max_work_group_size / setbv3_lws[0]; setbv3_lws[1] = nz < temp ? nz : temp; setbv3_gws[0] = clu_RoundWorkSize((size_t)ny, setbv3_lws[0]); setbv3_gws[1] = clu_RoundWorkSize((size_t)nz, setbv3_lws[1]); } else { temp = nz / max_compute_units; setbv3_lws[0] = temp == 0 ? 1 : temp; setbv3_gws[0] = clu_RoundWorkSize((size_t)nz, setbv3_lws[0]); } k_setiv = clCreateKernel(p_pre, "setiv", &ecode); clu_CheckError(ecode, "clCreateKernel() for setiv"); ecode = clSetKernelArg(k_setiv, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_setiv, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_setiv, 2, sizeof(int), &nx); ecode |= clSetKernelArg(k_setiv, 3, sizeof(int), &ny); ecode |= clSetKernelArg(k_setiv, 4, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (SETIV_DIM == 3) { setiv_lws[0] = (nx-2) < work_item_sizes[0] ? (nx-2) : work_item_sizes[0]; temp = max_work_group_size / setiv_lws[0]; setiv_lws[1] = (ny-2) < temp ? (ny-2) : temp; temp = temp / setiv_lws[1]; setiv_lws[2] = (nz-2) < temp ? (nz-2) : temp; setiv_gws[0] = clu_RoundWorkSize((size_t)(nx-2), setiv_lws[0]); setiv_gws[1] = clu_RoundWorkSize((size_t)(ny-2), setiv_lws[1]); setiv_gws[2] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[2]); } else if (SETIV_DIM == 2) { setiv_lws[0] = (ny-2) < work_item_sizes[0] ? (ny-2) : work_item_sizes[0]; temp = max_work_group_size / setiv_lws[0]; setiv_lws[1] = (nz-2) < temp ? (nz-2) : temp; setiv_gws[0] = clu_RoundWorkSize((size_t)(ny-2), setiv_lws[0]); setiv_gws[1] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[1]); } else { temp = (nz-2) / max_compute_units; setiv_lws[0] = temp == 0 ? 
1 : temp; setiv_gws[0] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[0]); }  // completes "setiv_lws[0] = (temp == 0) ? 1 : temp;" begun above; ends setiv work-size setup

//---------------------------------------------------------------------
// l2norm kernel.  Arg 0 (the vector to reduce) is presumably set per
// invocation by the host-side l2norm wrapper -- TODO confirm.  Arg 2 is
// work-group-local scratch: 5 doubles per work-item.
//---------------------------------------------------------------------
k_l2norm = clCreateKernel(p_main, "l2norm", &ecode);
clu_CheckError(ecode, "clCreateKernel()");
ecode = clSetKernelArg(k_l2norm, 1, sizeof(cl_mem), &m_sum);
ecode |= clSetKernelArg(k_l2norm, 2, sizeof(double)*5*l2norm_lws[0], NULL);
clu_CheckError(ecode, "clSetKernelArg()");

//---------------------------------------------------------------------
// rhs kernel: full right-hand-side evaluation over the grid.
// Work sizes depend on the compile-time dimensionality RHS_DIM: local
// sizes are clamped to the device limits (work_item_sizes /
// max_work_group_size), then global sizes are rounded up to a multiple
// of the local size.
//---------------------------------------------------------------------
k_rhs = clCreateKernel(p_main, "rhs", &ecode);
clu_CheckError(ecode, "clCreateKernel() for rhs");
ecode = clSetKernelArg(k_rhs, 0, sizeof(cl_mem), &m_u);
ecode |= clSetKernelArg(k_rhs, 1, sizeof(cl_mem), &m_rsd);
ecode |= clSetKernelArg(k_rhs, 2, sizeof(cl_mem), &m_frct);
ecode |= clSetKernelArg(k_rhs, 3, sizeof(cl_mem), &m_qs);
ecode |= clSetKernelArg(k_rhs, 4, sizeof(cl_mem), &m_rho_i);
ecode |= clSetKernelArg(k_rhs, 5, sizeof(int), &nx);
ecode |= clSetKernelArg(k_rhs, 6, sizeof(int), &ny);
ecode |= clSetKernelArg(k_rhs, 7, sizeof(int), &nz);
clu_CheckError(ecode, "clSetKernelArg()");
if (RHS_DIM == 3) {
  rhs_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0];
  temp = max_work_group_size / rhs_lws[0];
  rhs_lws[1] = ny < temp ? ny : temp;
  temp = temp / rhs_lws[1];
  rhs_lws[2] = nz < temp ? nz : temp;
  rhs_gws[0] = clu_RoundWorkSize((size_t)nx, rhs_lws[0]);
  rhs_gws[1] = clu_RoundWorkSize((size_t)ny, rhs_lws[1]);
  rhs_gws[2] = clu_RoundWorkSize((size_t)nz, rhs_lws[2]);
} else if (RHS_DIM == 2) {
  rhs_lws[0] = ny < work_item_sizes[0] ? ny : work_item_sizes[0];
  temp = max_work_group_size / rhs_lws[0];
  rhs_lws[1] = nz < temp ? nz : temp;
  rhs_gws[0] = clu_RoundWorkSize((size_t)ny, rhs_lws[0]);
  rhs_gws[1] = clu_RoundWorkSize((size_t)nz, rhs_lws[1]);
} else {
  // 1-D fallback: one work-item per group (the per-compute-unit
  // heuristic below was disabled).
  //temp = nz / max_compute_units;
  temp = 1;
  rhs_lws[0] = temp == 0 ? 1 : temp;
  rhs_gws[0] = clu_RoundWorkSize((size_t)nz, rhs_lws[0]);
}

//---------------------------------------------------------------------
// rhsx kernel (x-direction fluxes).  The CPU variant takes an extra
// m_flux scratch buffer, so the trailing nx/ny/nz argument indices
// shift by one between the CPU and non-CPU paths.
//---------------------------------------------------------------------
k_rhsx = clCreateKernel(p_main, "rhsx", &ecode);
clu_CheckError(ecode, "clCreateKernel() for rhsx");
ecode = clSetKernelArg(k_rhsx, 0, sizeof(cl_mem), &m_u);
ecode |= clSetKernelArg(k_rhsx, 1, sizeof(cl_mem), &m_rsd);
ecode |= clSetKernelArg(k_rhsx, 2, sizeof(cl_mem), &m_qs);
ecode |= clSetKernelArg(k_rhsx, 3, sizeof(cl_mem), &m_rho_i);
if (device_type == CL_DEVICE_TYPE_CPU) {
  ecode |= clSetKernelArg(k_rhsx, 4, sizeof(cl_mem), &m_flux);
  ecode |= clSetKernelArg(k_rhsx, 5, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_rhsx, 6, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_rhsx, 7, sizeof(int), &nz);
} else {
  ecode |= clSetKernelArg(k_rhsx, 4, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_rhsx, 5, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_rhsx, 6, sizeof(int), &nz);
}
clu_CheckError(ecode, "clSetKernelArg()");
if (RHSX_DIM == 2) {
  // 2-D range over the interior j (jst..jend) and k (1..nz-2) planes.
  rhsx_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
  temp = max_work_group_size / rhsx_lws[0];
  rhsx_lws[1] = (nz-2) < temp ? (nz-2) : temp;
  rhsx_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), rhsx_lws[0]);
  rhsx_gws[1] = clu_RoundWorkSize((size_t)(nz-2), rhsx_lws[1]);
} else {
  //temp = (nz-2) / max_compute_units;
  temp = 1;
  rhsx_lws[0] = temp == 0 ? 1 : temp;
  rhsx_gws[0] = clu_RoundWorkSize((size_t)(nz-2), rhsx_lws[0]);
}

//---------------------------------------------------------------------
// rhsy kernel (y-direction fluxes) -- same CPU/GPU argument split as
// rhsx, but the 2-D range runs over i (ist..iend) and k.
//---------------------------------------------------------------------
k_rhsy = clCreateKernel(p_main, "rhsy", &ecode);
clu_CheckError(ecode, "clCreateKernel() for rhsy");
ecode = clSetKernelArg(k_rhsy, 0, sizeof(cl_mem), &m_u);
ecode |= clSetKernelArg(k_rhsy, 1, sizeof(cl_mem), &m_rsd);
ecode |= clSetKernelArg(k_rhsy, 2, sizeof(cl_mem), &m_qs);
ecode |= clSetKernelArg(k_rhsy, 3, sizeof(cl_mem), &m_rho_i);
if (device_type == CL_DEVICE_TYPE_CPU) {
  ecode |= clSetKernelArg(k_rhsy, 4, sizeof(cl_mem), &m_flux);
  ecode |= clSetKernelArg(k_rhsy, 5, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_rhsy, 6, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_rhsy, 7, sizeof(int), &nz);
} else {
  ecode |= clSetKernelArg(k_rhsy, 4, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_rhsy, 5, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_rhsy, 6, sizeof(int), &nz);
}
clu_CheckError(ecode, "clSetKernelArg()");
if (RHSY_DIM == 2) {
  rhsy_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0];
  temp = max_work_group_size / rhsy_lws[0];
  rhsy_lws[1] = (nz-2) < temp ? (nz-2) : temp;
  rhsy_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), rhsy_lws[0]);
  rhsy_gws[1] = clu_RoundWorkSize((size_t)(nz-2), rhsy_lws[1]);
} else {
  //temp = (nz-2) / max_compute_units;
  temp = 1;
  rhsy_lws[0] = temp == 0 ? 1 : temp;
  rhsy_gws[0] = clu_RoundWorkSize((size_t)(nz-2), rhsy_lws[0]);
}

//---------------------------------------------------------------------
// rhsz kernel (z-direction fluxes).  The CPU variant needs two extra
// scratch buffers (m_utmp, m_rtmp) on top of m_flux, so the nx/ny/nz
// indices shift to 7..9 on CPU.
//---------------------------------------------------------------------
k_rhsz = clCreateKernel(p_main, "rhsz", &ecode);
clu_CheckError(ecode, "clCreateKernel() for rhsz");
ecode = clSetKernelArg(k_rhsz, 0, sizeof(cl_mem), &m_u);
ecode |= clSetKernelArg(k_rhsz, 1, sizeof(cl_mem), &m_rsd);
ecode |= clSetKernelArg(k_rhsz, 2, sizeof(cl_mem), &m_qs);
ecode |= clSetKernelArg(k_rhsz, 3, sizeof(cl_mem), &m_rho_i);
if (device_type == CL_DEVICE_TYPE_CPU) {
  ecode |= clSetKernelArg(k_rhsz, 4, sizeof(cl_mem), &m_flux);
  ecode |= clSetKernelArg(k_rhsz, 5, sizeof(cl_mem), &m_utmp);
  ecode |= clSetKernelArg(k_rhsz, 6, sizeof(cl_mem), &m_rtmp);
  ecode |= clSetKernelArg(k_rhsz, 7, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_rhsz, 8, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_rhsz, 9, sizeof(int), &nz);
} else {
  ecode |= clSetKernelArg(k_rhsz, 4, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_rhsz, 5, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_rhsz, 6, sizeof(int), &nz);
}
clu_CheckError(ecode, "clSetKernelArg()");
if (RHSZ_DIM == 2) {
  // 2-D range over i (ist..iend) and j (jst..jend).
  rhsz_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0];
  temp = max_work_group_size / rhsz_lws[0];
  rhsz_lws[1] = (jend-jst) < temp ? (jend-jst) : temp;
  rhsz_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), rhsz_lws[0]);
  rhsz_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), rhsz_lws[1]);
} else {
  //temp = (jend-jst) / max_compute_units;
  temp = 1;
  rhsz_lws[0] = temp == 0 ? 1 : temp;
  rhsz_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), rhsz_lws[0]);
}

//---------------------------------------------------------------------
// ssor2 kernel.  Arg 1 is intentionally skipped here -- presumably a
// per-iteration scalar set by the SSOR driver loop (TODO confirm).
//---------------------------------------------------------------------
k_ssor2 = clCreateKernel(p_main, "ssor2", &ecode);
clu_CheckError(ecode, "clCreateKernel() for ssor2");
ecode = clSetKernelArg(k_ssor2, 0, sizeof(cl_mem), &m_rsd);
ecode |= clSetKernelArg(k_ssor2, 2, sizeof(int), &nx);
ecode |= clSetKernelArg(k_ssor2, 3, sizeof(int), &ny);
ecode |= clSetKernelArg(k_ssor2, 4, sizeof(int), &nz);
clu_CheckError(ecode, "clSetKernelArg()");
if (SSOR2_DIM == 3) {
  ssor2_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0];
  temp = max_work_group_size / ssor2_lws[0];
  ssor2_lws[1] = (jend-jst) < temp ? (jend-jst) : temp;
  temp = temp / ssor2_lws[1];
  ssor2_lws[2] = (nz-2) < temp ? (nz-2) : temp;
  ssor2_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), ssor2_lws[0]);
  ssor2_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), ssor2_lws[1]);
  ssor2_gws[2] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[2]);
} else if (SSOR2_DIM == 2) {
  ssor2_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
  temp = max_work_group_size / ssor2_lws[0];
  ssor2_lws[1] = (nz-2) < temp ? (nz-2) : temp;
  ssor2_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), ssor2_lws[0]);
  ssor2_gws[1] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[1]);
} else {
  //temp = (nz-2) / max_compute_units;
  temp = 1;
  ssor2_lws[0] = temp == 0 ? 1 : temp;
  ssor2_gws[0] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[0]);
}

//---------------------------------------------------------------------
// ssor3 kernel.  Arg 2 skipped, mirroring ssor2 arg 1 -- presumably a
// per-iteration scalar (TODO confirm).  Same dimensionality scheme.
//---------------------------------------------------------------------
k_ssor3 = clCreateKernel(p_main, "ssor3", &ecode);
clu_CheckError(ecode, "clCreateKernel() for ssor3");
ecode = clSetKernelArg(k_ssor3, 0, sizeof(cl_mem), &m_u);
ecode |= clSetKernelArg(k_ssor3, 1, sizeof(cl_mem), &m_rsd);
ecode |= clSetKernelArg(k_ssor3, 3, sizeof(int), &nx);
ecode |= clSetKernelArg(k_ssor3, 4, sizeof(int), &ny);
ecode |= clSetKernelArg(k_ssor3, 5, sizeof(int), &nz);
clu_CheckError(ecode, "clSetKernelArg()");
if (SSOR3_DIM == 3) {
  ssor3_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0];
  temp = max_work_group_size / ssor3_lws[0];
  ssor3_lws[1] = (jend-jst) < temp ? (jend-jst) : temp;
  temp = temp / ssor3_lws[1];
  ssor3_lws[2] = (nz-2) < temp ? (nz-2) : temp;
  ssor3_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), ssor3_lws[0]);
  ssor3_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), ssor3_lws[1]);
  ssor3_gws[2] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[2]);
} else if (SSOR3_DIM == 2) {
  ssor3_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
  temp = max_work_group_size / ssor3_lws[0];
  ssor3_lws[1] = (nz-2) < temp ? (nz-2) : temp;
  ssor3_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), ssor3_lws[0]);
  ssor3_gws[1] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[1]);
} else {
  //temp = (nz-2) / max_compute_units;
  temp = 1;
  ssor3_lws[0] = temp == 0 ? 1 : temp;
  ssor3_gws[0] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[0]);
}

//---------------------------------------------------------------------
// blts kernel (lower-triangular solve).  Note the dimension arguments
// are passed in nz, ny, nx order, unlike the rhs* kernels.  Work sizes
// are always 2-D over j (jst..jend) and k (1..nz-2).
//---------------------------------------------------------------------
k_blts = clCreateKernel(p_main, "blts", &ecode);
clu_CheckError(ecode, "clCreateKernel() for blts");
ecode = clSetKernelArg(k_blts, 0, sizeof(cl_mem), &m_rsd);
ecode |= clSetKernelArg(k_blts, 1, sizeof(cl_mem), &m_u);
ecode |= clSetKernelArg(k_blts, 2, sizeof(cl_mem), &m_qs);
ecode |= clSetKernelArg(k_blts, 3, sizeof(cl_mem), &m_rho_i);
ecode |= clSetKernelArg(k_blts, 4, sizeof(int), &nz);
ecode |= clSetKernelArg(k_blts, 5, sizeof(int), &ny);
ecode |= clSetKernelArg(k_blts, 6, sizeof(int), &nx);
clu_CheckError(ecode, "clSetKernelArg()");
blts_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
temp = max_work_group_size / blts_lws[0];
blts_lws[1] = (nz-2) < temp ? (nz-2) : temp;
blts_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), blts_lws[0]);
blts_gws[1] = clu_RoundWorkSize((size_t)(nz-2), blts_lws[1]);

//---------------------------------------------------------------------
// buts kernel (upper-triangular solve) -- mirror of blts: same
// arguments (nz, ny, nx order) and the same 2-D work-size layout.
//---------------------------------------------------------------------
k_buts = clCreateKernel(p_main, "buts", &ecode);
clu_CheckError(ecode, "clCreateKernel() for buts");
ecode = clSetKernelArg(k_buts, 0, sizeof(cl_mem), &m_rsd);
ecode |= clSetKernelArg(k_buts, 1, sizeof(cl_mem), &m_u);
ecode |= clSetKernelArg(k_buts, 2, sizeof(cl_mem), &m_qs);
ecode |= clSetKernelArg(k_buts, 3, sizeof(cl_mem), &m_rho_i);
ecode |= clSetKernelArg(k_buts, 4, sizeof(int), &nz);
ecode |= clSetKernelArg(k_buts, 5, sizeof(int), &ny);
ecode |= clSetKernelArg(k_buts, 6, sizeof(int), &nx);
clu_CheckError(ecode, "clSetKernelArg()");
buts_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
temp = max_work_group_size / buts_lws[0];
buts_lws[1] = (nz-2) < temp ? (nz-2) : temp;
buts_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), buts_lws[0]);
buts_gws[1] = clu_RoundWorkSize((size_t)(nz-2), buts_lws[1]);

if (timeron) timer_stop(TIMER_OPENCL);
}
//--------------------------------------------------------------------- // Set up the OpenCL environment. //--------------------------------------------------------------------- void setup_opencl(int argc, char *argv[]) { cl_int err_code; char *source_dir = "EP"; if (argc > 1) source_dir = argv[1]; #ifdef TIMER_DETAIL if (timers_enabled) { int i; for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i); } #endif DTIMER_START(T_OPENCL_API); // 1. Find the default device type and get a device for the device type device_type = clu_GetDefaultDeviceType(); device = clu_GetAvailableDevice(device_type); device_name = clu_GetDeviceName(device); // 2. Create a context for the specified device context = clCreateContext(NULL, 1, &device, NULL, NULL, &err_code); clu_CheckError(err_code, "clCreateContext()"); // 3. Create a command queue cmd_queue = clCreateCommandQueue(context, device, 0, &err_code); clu_CheckError(err_code, "clCreateCommandQueue()"); DTIMER_STOP(T_OPENCL_API); // 4. Build the program DTIMER_START(T_BUILD); char *source_file; char build_option[30]; sprintf(build_option, "-DM=%d -I.", M); if (device_type == CL_DEVICE_TYPE_CPU) { source_file = "ep_cpu.cl"; GROUP_SIZE = 16; } else { source_file = "ep_gpu.cl"; GROUP_SIZE = 64; } program = clu_MakeProgram(context, device, source_dir, source_file, build_option); DTIMER_STOP(T_BUILD); // 5. Create buffers DTIMER_START(T_BUFFER_CREATE); gq_size = np / GROUP_SIZE * NQ * sizeof(double); gsx_size = np / GROUP_SIZE * sizeof(double); gsy_size = np / GROUP_SIZE * sizeof(double); pgq = clCreateBuffer(context, CL_MEM_READ_WRITE, gq_size, NULL, &err_code); clu_CheckError(err_code, "clCreateBuffer() for pgq"); pgsx = clCreateBuffer(context, CL_MEM_READ_WRITE, gsx_size,NULL, &err_code); clu_CheckError(err_code, "clCreateBuffer() for pgsx"); pgsy = clCreateBuffer(context, CL_MEM_READ_WRITE, gsy_size,NULL, &err_code); clu_CheckError(err_code, "clCreateBuffer() for pgsy"); DTIMER_STOP(T_BUFFER_CREATE); // 6. 
Create a kernel DTIMER_START(T_OPENCL_API); kernel = clCreateKernel(program, "embar", &err_code); clu_CheckError(err_code, "clCreateKernel()"); DTIMER_STOP(T_OPENCL_API); }