//--------------------------------------------------------------------- // set the boundary values of dependent variables //--------------------------------------------------------------------- void setbv() { DTIMER_START(t_setbv); cl_int ecode; ecode = clEnqueueNDRangeKernel(cmd_queue, k_setbv1, SETBV1_DIM, NULL, setbv1_gws, setbv1_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); ecode = clEnqueueNDRangeKernel(cmd_queue, k_setbv2, SETBV2_DIM, NULL, setbv2_gws, setbv2_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); ecode = clEnqueueNDRangeKernel(cmd_queue, k_setbv3, SETBV3_DIM, NULL, setbv3_gws, setbv3_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); CHECK_FINISH(); DTIMER_STOP(t_setbv); }
void lhsinit() { int i; size_t d0_size, d1_size, d2_size; size_t local_ws[3], global_ws[3], temp; cl_kernel *k_lhsinit; cl_int ecode; k_lhsinit = (cl_kernel *)malloc(sizeof(cl_kernel) * num_devices); for (i = 0; i < num_devices; i++) { d0_size = max_cell_size[i][1]; d1_size = max_cell_size[i][2]; d2_size = ncells; local_ws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0]; temp = max_work_group_size / local_ws[0]; local_ws[1] = d1_size < temp ? d1_size : temp; temp = temp / local_ws[1]; local_ws[2] = d2_size < temp ? d2_size : temp; global_ws[0] = clu_RoundWorkSize(d0_size, local_ws[0]); global_ws[1] = clu_RoundWorkSize(d1_size, local_ws[1]); global_ws[2] = clu_RoundWorkSize(d2_size, local_ws[2]); k_lhsinit[i] = clCreateKernel(p_initialize[i], "lhsinit", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_lhsinit[i], 0, sizeof(cl_mem), &m_lhsc[i]); ecode |= clSetKernelArg(k_lhsinit[i], 1, sizeof(cl_mem), &m_start[i]); ecode |= clSetKernelArg(k_lhsinit[i], 2, sizeof(cl_mem), &m_end[i]); ecode |= clSetKernelArg(k_lhsinit[i], 3, sizeof(cl_mem),&m_cell_coord[i]); ecode |= clSetKernelArg(k_lhsinit[i], 4, sizeof(cl_mem), &m_cell_size[i]); ecode |= clSetKernelArg(k_lhsinit[i], 5, sizeof(int), &ncells); clu_CheckError(ecode, "clSetKernelArg()"); ecode = clEnqueueNDRangeKernel(cmd_queue[i], k_lhsinit[i], 3, NULL, global_ws, local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); } for (i = 0; i < num_devices; i++) { ecode = clFinish(cmd_queue[i]); clu_CheckError(ecode, "clFinish()"); } for (i = 0; i < num_devices; i++) { clReleaseKernel(k_lhsinit[i]); } free(k_lhsinit); }
//---------------------------------------------------------------------
// Enqueue the cffts3 kernel (FFT along the third dimension).
// Arguments 0/1 (in/out buffers) and 3..7 (is, d1..d3, log2(d3)) are
// rebound on every call; launch geometry depends on the device type.
//---------------------------------------------------------------------
static void cffts3(int is, int d1, int d2, int d3, cl_mem *x, cl_mem *xout)
{
  int log_d3 = ilog2(d3);
  size_t lws[2], gws[2], avail;
  cl_int err;

  if (timers_enabled) timer_start(T_fftz);

  err  = clSetKernelArg(k_cffts3, 0, sizeof(cl_mem), x);
  err |= clSetKernelArg(k_cffts3, 1, sizeof(cl_mem), xout);
  err |= clSetKernelArg(k_cffts3, 3, sizeof(int), &is);
  err |= clSetKernelArg(k_cffts3, 4, sizeof(int), &d1);
  err |= clSetKernelArg(k_cffts3, 5, sizeof(int), &d2);
  err |= clSetKernelArg(k_cffts3, 6, sizeof(int), &d3);
  err |= clSetKernelArg(k_cffts3, 7, sizeof(int), &log_d3);
  clu_CheckError(err, "clSetKernelArg() for k_cffts3");

  if (device_type == CL_DEVICE_TYPE_CPU) {
    // Split the work-group budget over (d1, d2).
    lws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0];
    avail = max_work_group_size / lws[0];
    lws[1] = d2 < avail ? d2 : avail;
    gws[0] = clu_RoundWorkSize((size_t)d1, lws[0]);
    gws[1] = clu_RoundWorkSize((size_t)d2, lws[1]);
  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    // GPU path: fixed work-group width of CFFTS_LSIZE in both variants.
    lws[0] = CFFTS_LSIZE;
    if (CFFTS_DIM == 2) {
      lws[1] = 1;
      gws[0] = d1 * lws[0];
      gws[1] = d2;
    } else {
      gws[0] = d2 * lws[0];
    }
  }

  err = clEnqueueNDRangeKernel(cmd_queue, k_cffts3, CFFTS_DIM, NULL,
                               gws, lws, 0, NULL, NULL);
  clu_CheckError(err, "clEnqueueNDRangeKernel() for cffts3");
  CHECK_FINISH();

  if (timers_enabled) timer_stop(T_fftz);
}
//--------------------------------------------------------------------- // evolve u0 -> u1 (t time steps) in fourier space //--------------------------------------------------------------------- static void evolve(cl_mem *u0, cl_mem *u1, cl_mem *twiddle, int d1, int d2, int d3) { cl_int ecode; size_t local_ws[3], global_ws[3]; ecode = clSetKernelArg(k_evolve, 0, sizeof(cl_mem), u0); ecode |= clSetKernelArg(k_evolve, 1, sizeof(cl_mem), u1); ecode |= clSetKernelArg(k_evolve, 2, sizeof(cl_mem), twiddle); ecode |= clSetKernelArg(k_evolve, 3, sizeof(int), &d1); ecode |= clSetKernelArg(k_evolve, 4, sizeof(int), &d2); ecode |= clSetKernelArg(k_evolve, 5, sizeof(int), &d3); clu_CheckError(ecode, "clSetKernelArg() for evolve"); if (EVOLVE_DIM == 3) { local_ws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0]; int temp = max_work_group_size / local_ws[0]; local_ws[1] = d2 < temp ? d2 : temp; temp = temp / local_ws[1]; local_ws[2] = d3 < temp ? d3 : temp; global_ws[0] = clu_RoundWorkSize((size_t)d1, local_ws[0]); global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]); global_ws[2] = clu_RoundWorkSize((size_t)d3, local_ws[2]); } else if (EVOLVE_DIM == 2) { local_ws[0] = d2 < work_item_sizes[0] ? d2 : work_item_sizes[0]; int temp = max_work_group_size / local_ws[0]; local_ws[1] = d3 < temp ? d3 : temp; global_ws[0] = clu_RoundWorkSize((size_t)d2, local_ws[0]); global_ws[1] = clu_RoundWorkSize((size_t)d3, local_ws[1]); } else { int temp = d3 / max_compute_units; local_ws[0] = temp == 0 ? 1 : temp; global_ws[0] = clu_RoundWorkSize((size_t)d3, local_ws[0]); } ecode = clEnqueueNDRangeKernel(cmd_queue, k_evolve, EVOLVE_DIM, NULL, global_ws, local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for evolve"); CHECK_FINISH(); }
//--------------------------------------------------------------------- // compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2 // for time evolution exponent. //--------------------------------------------------------------------- static void compute_indexmap(cl_mem *twiddle, int d1, int d2, int d3) { cl_int ecode; ecode = clEnqueueNDRangeKernel(cmd_queue, k_compute_indexmap, COMPUTE_IMAP_DIM, NULL, cimap_gws, cimap_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for compute_indexmap"); CHECK_FINISH(); }
void create_seq( double seed, double a ) { cl_kernel k_cs; cl_int ecode; size_t cs_lws[1], cs_gws[1]; DTIMER_START(T_OPENCL_API); // Create a kernel k_cs = clCreateKernel(program, "create_seq", &ecode); clu_CheckError(ecode, "clCreateKernel() for create_seq"); DTIMER_STOP(T_OPENCL_API); DTIMER_START(T_KERNEL_CREATE_SEQ); // Set kernel arguments ecode = clSetKernelArg(k_cs, 0, sizeof(cl_mem), (void*)&m_key_array); ecode |= clSetKernelArg(k_cs, 1, sizeof(cl_double), (void*)&seed); ecode |= clSetKernelArg(k_cs, 2, sizeof(cl_double), (void*)&a); clu_CheckError(ecode, "clSetKernelArg() for create_seq"); // Launch the kernel cs_lws[0] = CREATE_SEQ_GROUP_SIZE; cs_gws[0] = CREATE_SEQ_GLOBAL_SIZE; ecode = clEnqueueNDRangeKernel(cmd_queue, k_cs, 1, NULL, cs_gws, cs_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for create_seq"); ecode = clFinish(cmd_queue); clu_CheckError(ecode, "clFinish"); DTIMER_STOP(T_KERNEL_CREATE_SEQ); DTIMER_START(T_RELEASE); clReleaseKernel(k_cs); DTIMER_STOP(T_RELEASE); }
//--------------------------------------------------------------------- // touch all the big data //--------------------------------------------------------------------- static void init_ui(cl_mem *u0, cl_mem *u1, cl_mem *twiddle, int d1, int d2, int d3) { cl_kernel k_init_ui; cl_int ecode; DTIMER_START(T_OPENCL_API); // Create a kernel k_init_ui = clCreateKernel(program, "init_ui", &ecode); clu_CheckError(ecode, "clCreateKernel() for init_ui"); DTIMER_STOP(T_OPENCL_API); int n = d3 * d2 * (d1+1); ecode = clSetKernelArg(k_init_ui, 0, sizeof(cl_mem), (void*)u0); ecode |= clSetKernelArg(k_init_ui, 1, sizeof(cl_mem), (void*)u1); ecode |= clSetKernelArg(k_init_ui, 2, sizeof(cl_mem), (void*)twiddle); ecode |= clSetKernelArg(k_init_ui, 3, sizeof(int), (void*)&n); clu_CheckError(ecode, "clSetKernelArg() for init_ui"); size_t local_ws = work_item_sizes[0]; size_t global_ws = clu_RoundWorkSize((size_t)n, local_ws); ecode = clEnqueueNDRangeKernel(cmd_queue, k_init_ui, 1, NULL, &global_ws, &local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for init_ui"); ecode = clFinish(cmd_queue); clu_CheckError(ecode, "clFinish()"); DTIMER_START(T_RELEASE); clReleaseKernel(k_init_ui); DTIMER_STOP(T_RELEASE); }
//---------------------------------------------------------------------
// Device-side partial checksum of u1 followed by a host-side reduction
// over the per-work-group results; stores the value in sums[i].
// d1/d2/d3 are unused here (kept for the call signature).
//---------------------------------------------------------------------
static void checksum(int i, cl_mem *u1, int d1, int d2, int d3)
{
  dcomplex total = dcmplx(0.0, 0.0);
  int w;
  cl_int err;

  // Bind the current source buffer; the remaining arguments were set
  // once in setup_opencl().
  err = clSetKernelArg(k_checksum, 0, sizeof(cl_mem), u1);
  clu_CheckError(err, "clSetKernelArg() for checksum");

  err = clEnqueueNDRangeKernel(cmd_queue, k_checksum, 1, NULL,
                               &checksum_global_ws, &checksum_local_ws,
                               0, NULL, NULL);
  clu_CheckError(err, "clEnqueueNDRangeKernel()");
  CHECK_FINISH();

  // Blocking read of one partial sum per work-group.
  err = clEnqueueReadBuffer(cmd_queue, m_chk, CL_TRUE, 0,
                            checksum_wg_num * sizeof(dcomplex), g_chk,
                            0, NULL, NULL);
  clu_CheckError(err, "clReadBuffer()");

  // Final reduction on the host.
  for (w = 0; w < checksum_wg_num; w++) {
    total = dcmplx_add(total, g_chk[w]);
  }
  total = dcmplx_div2(total, (double)(NTOTAL));

  printf(" T =%5d Checksum =%22.12E%22.12E\n", i, total.real, total.imag);
  sums[i] = total;
}
//--------------------------------------------------------------------- // addition of update to the vector u //--------------------------------------------------------------------- void add() { cl_int ecode; if (timeron) timer_start(t_add); ecode = clEnqueueNDRangeKernel(cmd_queue, k_add, ADD_DIM, NULL, add_gws, add_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); CHECK_FINISH(); if (timeron) timer_stop(t_add); }
//--------------------------------------------------------------------- // block-diagonal matrix-vector multiplication //--------------------------------------------------------------------- void ninvr() { cl_int ecode; if (timeron) timer_start(t_ninvr); ecode = clEnqueueNDRangeKernel(cmd_queue, k_ninvr, NINVR_DIM, NULL, ninvr_gws, ninvr_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); CHECK_FINISH(); if (timeron) timer_stop(t_ninvr); }
//--------------------------------------------------------------------- // this function performs the solution of the approximate factorization // step in the z-direction for all five matrix components // simultaneously. The Thomas algorithm is employed to solve the // systems for the z-lines. Boundary conditions are non-periodic //--------------------------------------------------------------------- void z_solve() { cl_int ecode; if (timeron) timer_start(t_zsolve); ecode = clEnqueueNDRangeKernel(cmd_queue, k_z_solve, Z_SOLVE_DIM, NULL, z_solve_gws, z_solve_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); CHECK_FINISH(); if (timeron) timer_stop(t_zsolve); tzetar(); }
//--------------------------------------------------------------------- // compute the roots-of-unity array that will be used for subsequent FFTs. //--------------------------------------------------------------------- static void fft_init(int n) { int m, nu, ku, i, j, ln; double t, ti; //--------------------------------------------------------------------- // Initialize the U array with sines and cosines in a manner that permits // stride one access at each FFT iteration. //--------------------------------------------------------------------- nu = n; m = ilog2(n); u[0] = dcmplx(m, 0.0); ku = 2; ln = 1; for (j = 1; j <= m; j++) { t = PI / ln; for (i = 0; i <= ln - 1; i++) { ti = i * t; u[i+ku-1] = dcmplx(cos(ti), sin(ti)); } ku = ku + ln; ln = 2 * ln; } int ecode; ecode = clEnqueueWriteBuffer(cmd_queue, m_u, CL_FALSE, 0, sizeof(dcomplex) * NXP, u, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueWriteBuffer() for m_u"); }
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//---------------------------------------------------------------------
static void setup_opencl(int argc, char *argv[])
{
  size_t temp;
  cl_int ecode;
  // Directory holding the .cl kernel sources; overridable by argv[1].
  char *source_dir = "FT";
  if (argc > 1) source_dir = argv[1];

#ifdef TIMER_DETAIL
  if (timers_enabled) {
    int i;
    for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i);
  }
#endif

  DTIMER_START(T_OPENCL_API);

  // 1. Find the default device type and get a device for the device type
  device_type = clu_GetDefaultDeviceType();
  device = clu_GetAvailableDevice(device_type);
  device_name = clu_GetDeviceName(device);

  // Device information
  ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES,
                          sizeof(work_item_sizes), &work_item_sizes, NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
                          sizeof(size_t), &max_work_group_size, NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  // FIXME: The below values are experimental.
  // Clamp the work-group capacity (and each per-dimension limit) to 64.
  if (max_work_group_size > 64) {
    max_work_group_size = 64;
    int i;
    for (i = 0; i < 3; i++) {
      if (work_item_sizes[i] > 64) {
        work_item_sizes[i] = 64;
      }
    }
  }

  ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
                          sizeof(cl_uint), &max_compute_units, NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  // 2. Create a context for the specified device
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode);
  clu_CheckError(ecode, "clCreateContext()");

  // 3. Create a command queue
  cmd_queue = clCreateCommandQueue(context, device, 0, &ecode);
  clu_CheckError(ecode, "clCreateCommandQueue()");

  DTIMER_STOP(T_OPENCL_API);

  // 4. Build the program
  DTIMER_START(T_BUILD);
  char *source_file;
  char build_option[50];
  if (device_type == CL_DEVICE_TYPE_CPU) {
    source_file = "ft_cpu.cl";
    sprintf(build_option, "-I. -DCLASS=%d -DUSE_CPU", CLASS);
    COMPUTE_IMAP_DIM = COMPUTE_IMAP_DIM_CPU;
    EVOLVE_DIM = EVOLVE_DIM_CPU;
    CFFTS_DIM = CFFTS_DIM_CPU;
  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    // NVIDIA GPUs get a dedicated kernel file and a smaller cffts
    // work-group size (32 vs 64).
    char vendor[50];
    ecode = clGetDeviceInfo(device, CL_DEVICE_VENDOR, 50, vendor, NULL);
    clu_CheckError(ecode, "clGetDeviceInfo()");
    if (strncmp(vendor, DEV_VENDOR_NVIDIA, strlen(DEV_VENDOR_NVIDIA)) == 0) {
      source_file = "ft_gpu_nvidia.cl";
      CFFTS_LSIZE = 32;
    } else {
      source_file = "ft_gpu.cl";
      CFFTS_LSIZE = 64;
    }
    sprintf(build_option, "-I. -DCLASS=\'%c\' -DLSIZE=%lu",
            CLASS, CFFTS_LSIZE);
    COMPUTE_IMAP_DIM = COMPUTE_IMAP_DIM_GPU;
    EVOLVE_DIM = EVOLVE_DIM_GPU;
    CFFTS_DIM = CFFTS_DIM_GPU;
  } else {
    fprintf(stderr, "Set the environment variable OPENCL_DEVICE_TYPE!\n");
    exit(EXIT_FAILURE);
  }
  program = clu_MakeProgram(context, device, source_dir, source_file,
                            build_option);
  DTIMER_STOP(T_BUILD);

  // 5. Create buffers
  DTIMER_START(T_BUFFER_CREATE);
  m_u = clCreateBuffer(context, CL_MEM_READ_ONLY,
                       sizeof(dcomplex) * NXP, NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u");

  m_u0 = clCreateBuffer(context, CL_MEM_READ_WRITE,
                        sizeof(dcomplex) * NTOTALP, NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u0");

  m_u1 = clCreateBuffer(context, CL_MEM_READ_WRITE,
                        sizeof(dcomplex) * NTOTALP, NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u1");

  m_twiddle = clCreateBuffer(context, CL_MEM_READ_WRITE,
                             sizeof(double) * NTOTALP, NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_twiddle");

  // Scratch buffers used only by the CPU variants of the cffts kernels
  // (bound as arguments 8/9 below).
  if (device_type == CL_DEVICE_TYPE_CPU) {
    size_t ty1_size, ty2_size;
    if (CFFTS_DIM == 2) {
      ty1_size = sizeof(dcomplex) * NX * NY * NZ;
      ty2_size = sizeof(dcomplex) * NX * NY * NZ;
    } else {
      fprintf(stderr, "Wrong CFFTS_DIM: %u\n", CFFTS_DIM);
      exit(EXIT_FAILURE);
    }
    m_ty1 = clCreateBuffer(context, CL_MEM_READ_WRITE, ty1_size,
                           NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_ty1");
    m_ty2 = clCreateBuffer(context, CL_MEM_READ_WRITE, ty2_size,
                           NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_ty2");
  }

  // Work sizes for the checksum reduction (1024 work-items total).
  if (device_type == CL_DEVICE_TYPE_CPU) {
    temp = 1024 / max_compute_units;
    checksum_local_ws = temp == 0 ? 1 : temp;
    checksum_global_ws = clu_RoundWorkSize((size_t)1024, checksum_local_ws);
  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    checksum_local_ws = 32;
    checksum_global_ws = clu_RoundWorkSize((size_t)1024, checksum_local_ws);
  }
  checksum_wg_num = checksum_global_ws / checksum_local_ws;
  m_chk = clCreateBuffer(context, CL_MEM_READ_WRITE,
                         sizeof(dcomplex) * checksum_wg_num, NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_chk");
  // NOTE(review): this malloc result is not checked before later use.
  g_chk = (dcomplex *)malloc(sizeof(dcomplex) * checksum_wg_num);
  DTIMER_STOP(T_BUFFER_CREATE);

  // 6. Create kernels
  DTIMER_START(T_OPENCL_API);
  double ap = -4.0 * ALPHA * PI * PI;
  int d1 = dims[0];
  int d2 = dims[1];
  int d3 = dims[2];

  k_compute_indexmap = clCreateKernel(program, "compute_indexmap", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for compute_indexmap");
  ecode = clSetKernelArg(k_compute_indexmap, 0, sizeof(cl_mem), &m_twiddle);
  ecode |= clSetKernelArg(k_compute_indexmap, 1, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_compute_indexmap, 2, sizeof(int), &d2);
  ecode |= clSetKernelArg(k_compute_indexmap, 3, sizeof(int), &d3);
  ecode |= clSetKernelArg(k_compute_indexmap, 4, sizeof(double), &ap);
  clu_CheckError(ecode, "clSetKernelArg() for compute_indexmap");

  // Precompute the launch geometry used by compute_indexmap().
  if (COMPUTE_IMAP_DIM == 3) {
    cimap_lws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0];
    temp = max_work_group_size / cimap_lws[0];
    cimap_lws[1] = d2 < temp ? d2 : temp;
    temp = temp / cimap_lws[1];
    cimap_lws[2] = d3 < temp ? d3 : temp;
    cimap_gws[0] = clu_RoundWorkSize((size_t)d1, cimap_lws[0]);
    cimap_gws[1] = clu_RoundWorkSize((size_t)d2, cimap_lws[1]);
    cimap_gws[2] = clu_RoundWorkSize((size_t)d3, cimap_lws[2]);
  } else if (COMPUTE_IMAP_DIM == 2) {
    cimap_lws[0] = d2 < work_item_sizes[0] ? d2 : work_item_sizes[0];
    temp = max_work_group_size / cimap_lws[0];
    cimap_lws[1] = d3 < temp ? d3 : temp;
    cimap_gws[0] = clu_RoundWorkSize((size_t)d2, cimap_lws[0]);
    cimap_gws[1] = clu_RoundWorkSize((size_t)d3, cimap_lws[1]);
  } else {
    //temp = d3 / max_compute_units;
    temp = 1;
    cimap_lws[0] = temp == 0 ? 1 : temp;
    cimap_gws[0] = clu_RoundWorkSize((size_t)d3, cimap_lws[0]);
  }

  k_compute_ics = clCreateKernel(program, "compute_initial_conditions",
                                 &ecode);
  clu_CheckError(ecode, "clCreateKernel() for compute_initial_conditions");
  // NOTE(review): arguments 0 and 1 appear to be bound elsewhere
  // (not in this chunk) — confirm at the call site.
  ecode = clSetKernelArg(k_compute_ics, 2, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_compute_ics, 3, sizeof(int), &d2);
  ecode |= clSetKernelArg(k_compute_ics, 4, sizeof(int), &d3);
  clu_CheckError(ecode, "clSetKernelArg() for compute_initial_conditions");

  // The three cffts kernels share the same static bindings: the
  // roots-of-unity table as argument 2 and, on CPU, the scratch
  // buffers as arguments 8/9.  Per-call arguments are bound in the
  // cffts* launch functions.
  k_cffts1 = clCreateKernel(program, "cffts1", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for cffts1");
  ecode = clSetKernelArg(k_cffts1, 2, sizeof(cl_mem), &m_u);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_cffts1, 8, sizeof(cl_mem), &m_ty1);
    ecode |= clSetKernelArg(k_cffts1, 9, sizeof(cl_mem), &m_ty2);
  }
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts1");

  k_cffts2 = clCreateKernel(program, "cffts2", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for cffts2");
  ecode = clSetKernelArg(k_cffts2, 2, sizeof(cl_mem), &m_u);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_cffts2, 8, sizeof(cl_mem), &m_ty1);
    ecode |= clSetKernelArg(k_cffts2, 9, sizeof(cl_mem), &m_ty2);
  }
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts2");

  k_cffts3 = clCreateKernel(program, "cffts3", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for cffts3");
  ecode = clSetKernelArg(k_cffts3, 2, sizeof(cl_mem), &m_u);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_cffts3, 8, sizeof(cl_mem), &m_ty1);
    ecode |= clSetKernelArg(k_cffts3, 9, sizeof(cl_mem), &m_ty2);
  }
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts3");

  k_evolve = clCreateKernel(program, "evolve", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for evolve");

  // checksum(): argument 0 (the input buffer) is bound per call;
  // argument 2 is local (per-work-group) scratch, hence the NULL value.
  k_checksum = clCreateKernel(program, "checksum", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for checksum");
  ecode = clSetKernelArg(k_checksum, 1, sizeof(cl_mem), &m_chk);
  ecode |= clSetKernelArg(k_checksum, 2, sizeof(dcomplex)*checksum_local_ws,
                          NULL);
  ecode |= clSetKernelArg(k_checksum, 3, sizeof(int), &dims[0]);
  ecode |= clSetKernelArg(k_checksum, 4, sizeof(int), &dims[1]);
  clu_CheckError(ecode, "clSetKernelArg() for checksum");
  DTIMER_STOP(T_OPENCL_API);
}
//--------------------------------------------------------------------- // this function computes the norm of the difference between the // computed solution and the exact solution //--------------------------------------------------------------------- void error_norm(double rms[5]) { int i, m, d; cl_kernel *k_en; cl_mem *m_rms; double (*g_rms)[5]; cl_int ecode; g_rms = (double (*)[5])malloc(sizeof(double)*5 * num_devices); m_rms = (cl_mem *)malloc(sizeof(cl_mem) * num_devices); k_en = (cl_kernel *)malloc(sizeof(cl_kernel) * num_devices); for (i = 0; i < num_devices; i++) { m_rms[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double) * 5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer()"); k_en[i] = clCreateKernel(p_error[i], "error_norm", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_en[i], 0, sizeof(cl_mem), &m_u[i]); ecode |= clSetKernelArg(k_en[i], 1, sizeof(cl_mem), &m_ce[i]); ecode |= clSetKernelArg(k_en[i], 2, sizeof(cl_mem), &m_rms[i]); ecode |= clSetKernelArg(k_en[i], 3, sizeof(cl_mem), &m_cell_low[i]); ecode |= clSetKernelArg(k_en[i], 4, sizeof(cl_mem), &m_cell_high[i]); ecode |= clSetKernelArg(k_en[i], 5, sizeof(int), &ncells); clu_CheckError(ecode, "clSetKernelArg()"); ecode = clEnqueueTask(cmd_queue[i], k_en[i], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueTask()"); clFinish(cmd_queue[i]); ecode = clEnqueueReadBuffer(cmd_queue[i], m_rms[i], CL_TRUE, 0, sizeof(double)*5, &g_rms[i], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueReadBuffer()"); } for (m = 0; m < 5; m++) { rms[m] = 0.0; } for (i = 0; i < num_devices; i++) { ecode = clFinish(cmd_queue[i]); clu_CheckError(ecode, "clFinish()"); } // reduction for (i = 0; i < num_devices; i++) { for (m = 0; m < 5; m++) { rms[m] += g_rms[i][m]; } } for (m = 0; m < 5; m++) { for (d = 0; d < 3; d++) { rms[m] = rms[m] / (double)(grid_points[d]-2); } rms[m] = sqrt(rms[m]); } for (i = 0; i < num_devices; i++) { clReleaseMemObject(m_rms[i]); clReleaseKernel(k_en[i]); 
} free(g_rms); free(m_rms); free(k_en); }
//--------------------------------------------------------------------- // Set up the OpenCL environment. //--------------------------------------------------------------------- static void setup_opencl(int argc, char **argv) { int i, c; // size_t temp; cl_int ecode = 0; char *source_dir = "."; //FIXME int num_subs = DEFAULT_NUM_SUBS; int num_cus; int sqrt_num_command_queues; if (argc > 1) source_dir = argv[1]; devices = (cl_device_id *)malloc(sizeof(cl_device_id) * num_subs); if (timeron) { timer_clear(TIMER_OPENCL); timer_clear(TIMER_BUILD); timer_clear(TIMER_BUFFER); timer_clear(TIMER_RELEASE); timer_start(TIMER_OPENCL); } // 1. Find the default device type and get a device for the device type // Then, create sub-devices from the parent device. //device_type = CL_DEVICE_TYPE_CPU; device_type = CL_DEVICE_TYPE_ALL; //device_type = CL_DEVICE_TYPE_GPU; if(argc <= 2) { printf("Device type argument missing!\n"); exit(-1); } char *device_type_str = argv[2]; if(strcmp(device_type_str, "CPU") == 0 || strcmp(device_type_str, "cpu") == 0) { device_type = CL_DEVICE_TYPE_CPU; } else if(strcmp(device_type_str, "GPU") == 0 || strcmp(device_type_str, "gpu") == 0) { device_type = CL_DEVICE_TYPE_GPU; } else if(strcmp(device_type_str, "ALL") == 0 || strcmp(device_type_str, "all") == 0) { device_type = CL_DEVICE_TYPE_ALL; } else { printf("Unsupported device type!\n"); exit(-1); } cl_uint num_command_queues = 4; char *num_command_queues_str = getenv("SNU_NPB_COMMAND_QUEUES"); if(num_command_queues_str != NULL) num_command_queues = atoi(num_command_queues_str); cl_platform_id platform; ecode = clGetPlatformIDs(1, &platform, NULL); clu_CheckError(ecode, "clGetPlatformIDs()"); ecode = clGetDeviceIDs(platform, device_type, 0, NULL, &num_devices); clu_CheckError(ecode, "clGetDeviceIDs()"); //num_devices = 2; ecode = clGetDeviceIDs(platform, device_type, num_devices, devices, NULL); clu_CheckError(ecode, "clGetDeviceIDs()"); cl_device_id tmp_dev; work_item_sizes[0] = work_item_sizes[1] = 
work_item_sizes[2] = 1024; max_work_group_size = 1024; max_compute_units = 22; sqrt_num_command_queues = (int)(sqrt((double)(num_command_queues) + 0.00001)); if (num_command_queues != sqrt_num_command_queues * sqrt_num_command_queues) { fprintf(stderr, "Number of devices is not a square of some integer\n"); exit(EXIT_FAILURE); } ncells = (int)(sqrt((double)(num_command_queues) + 0.00001)); MAX_CELL_DIM = ((PROBLEM_SIZE/ncells)+1); IMAX = MAX_CELL_DIM; JMAX = MAX_CELL_DIM; KMAX = MAX_CELL_DIM; IMAXP = (IMAX/2*2+1); JMAXP = (JMAX/2*2+1); //--------------------------------------------------------------------- // +1 at end to avoid zero length arrays for 1 node //--------------------------------------------------------------------- BUF_SIZE = (MAX_CELL_DIM*MAX_CELL_DIM*(MAXCELLS-1)*60*2+1); // FIXME if (max_work_group_size > 64) { max_work_group_size = 64; int i; for (i = 0; i < 3; i++) { if (work_item_sizes[i] > 64) { work_item_sizes[i] = 64; } } } // 2. Create a context for devices #ifdef MINIMD_SNUCL_OPTIMIZATIONS cl_context_properties props[5] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, CL_CONTEXT_SCHEDULER, CL_CONTEXT_SCHEDULER_CODE_SEGMENTED_PERF_MODEL, //CL_CONTEXT_SCHEDULER_PERF_MODEL, //CL_CONTEXT_SCHEDULER_FIRST_EPOCH_BASED_PERF_MODEL, //CL_CONTEXT_SCHEDULER_ALL_EPOCH_BASED_PERF_MODEL, 0 }; context = clCreateContext(props, #elif defined(SOCL_OPTIMIZATIONS) cl_context_properties props[5] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, CL_CONTEXT_SCHEDULER_SOCL, "dmda", //"random", 0 }; context = clCreateContext(props, #else context = clCreateContext(NULL, #endif num_devices, devices, NULL, NULL, &ecode); clu_CheckError(ecode, "clCreateContext()"); // 3. 
Create a command queue cmd_queue = (cl_command_queue*)malloc(sizeof(cl_command_queue)*num_command_queues*3); for (i = 0; i < num_command_queues * 2; i++) { //cmd_queue[i] = clCreateCommandQueue(context, devices[(i / 2) % num_devices], #ifdef SOCL_OPTIMIZATIONS cmd_queue[i] = clCreateCommandQueue(context, NULL, #else cmd_queue[i] = clCreateCommandQueue(context, devices[num_devices - 1 - ((i / 2) % num_devices)], #endif // cmd_queue[i] = clCreateCommandQueue(context, devices[0], #ifdef MINIMD_SNUCL_OPTIMIZATIONS 0, // CL_QUEUE_AUTO_DEVICE_SELECTION | // CL_QUEUE_ITERATIVE, //CL_QUEUE_COMPUTE_INTENSIVE, #else 0, #endif &ecode); clu_CheckError(ecode, "clCreateCommandQueue()"); } // 4. Build the program if (timeron) timer_start(TIMER_BUILD); char *source_file = "sp_kernel.cl"; //p_program = clu_MakeProgram(context, devices, source_dir, source_file, build_option); p_program = clu_CreateProgram(context, source_dir, source_file); for(i = 0; i < num_devices; i++) { char build_option[200] = {0}; cl_device_type cur_device_type; cl_int err = clGetDeviceInfo(devices[i], CL_DEVICE_TYPE, sizeof(cl_device_type), &cur_device_type, NULL); clu_CheckError(err, "clGetDeviceInfo()"); if (cur_device_type == CL_DEVICE_TYPE_CPU) { sprintf(build_option, "-I. -DCLASS=%d -DUSE_CPU -DMAX_CELL_DIM=%d -DIMAX=%d -DJMAX=%d -DKMAX=%d -DIMAXP=%d -DJMAXP=%d", CLASS, MAX_CELL_DIM, IMAX, JMAX, KMAX, IMAXP, JMAXP); } else { sprintf(build_option, "-I. -DCLASS=%d -DUSE_GPU -DMAX_CELL_DIM=%d -DIMAX=%d -DJMAX=%d -DKMAX=%d -DIMAXP=%d -DJMAXP=%d", CLASS, MAX_CELL_DIM, IMAX, JMAX, KMAX, IMAXP, JMAXP); } clu_MakeProgram(p_program, 1, &devices[i], source_dir, build_option); //clu_MakeProgram(p_program, num_devices, devices, source_dir, build_option); } num_devices = num_command_queues; program = (cl_program *)malloc(sizeof(cl_program) * num_devices); for (i = 0; i < num_devices; i++) { program[i] = p_program; } if (timeron) timer_stop(TIMER_BUILD); // 5. 
Create kernels size_t asize = sizeof(cl_kernel) * num_devices; k_initialize1 = (cl_kernel *)malloc(asize); k_initialize2 = (cl_kernel *)malloc(asize); k_initialize3 = (cl_kernel *)malloc(asize); k_initialize4 = (cl_kernel *)malloc(asize); k_initialize5 = (cl_kernel *)malloc(asize); k_initialize6 = (cl_kernel *)malloc(asize); k_initialize7 = (cl_kernel *)malloc(asize); k_initialize8 = (cl_kernel *)malloc(asize); k_lhsinit = (cl_kernel *)malloc(asize); k_exact_rhs1 = (cl_kernel *)malloc(asize); k_exact_rhs2 = (cl_kernel *)malloc(asize); k_exact_rhs3 = (cl_kernel *)malloc(asize); k_exact_rhs4 = (cl_kernel *)malloc(asize); k_exact_rhs5 = (cl_kernel *)malloc(asize); k_copy_faces1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_copy_faces2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_copy_faces3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_copy_faces4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_copy_faces5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_copy_faces6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_compute_rhs1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_compute_rhs2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_compute_rhs3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_compute_rhs4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_compute_rhs5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_compute_rhs6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_txinvr = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_lhsx = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_ninvr = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_x_solve1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_x_solve2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_x_solve3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_x_solve4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_x_solve5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_x_solve6 = (cl_kernel 
(*)[MAXCELLS])malloc(asize*MAXCELLS); k_lhsy = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_pinvr = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_y_solve1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_y_solve2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_y_solve3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_y_solve4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_y_solve5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_y_solve6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_lhsz = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_tzetar = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_z_solve1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_z_solve2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_z_solve3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_z_solve4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_z_solve5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_z_solve6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_add = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_error_norm = (cl_kernel *)malloc(asize); k_rhs_norm = (cl_kernel *)malloc(asize); for (i = 0; i < num_devices; i++) { k_initialize1[i] = clCreateKernel(program[i], "initialize1", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize1"); k_initialize2[i] = clCreateKernel(program[i], "initialize2", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize2"); k_initialize3[i] = clCreateKernel(program[i], "initialize3", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize3"); k_initialize4[i] = clCreateKernel(program[i], "initialize4", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize4"); k_initialize5[i] = clCreateKernel(program[i], "initialize5", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize5"); k_initialize6[i] = clCreateKernel(program[i], "initialize6", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize6"); k_initialize7[i] = 
clCreateKernel(program[i], "initialize7", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize7"); k_initialize8[i] = clCreateKernel(program[i], "initialize8", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize8"); k_lhsinit[i] = clCreateKernel(program[i], "lhsinit", &ecode); clu_CheckError(ecode, "clCreateKernel() for lhsinit"); k_exact_rhs1[i] = clCreateKernel(program[i], "exact_rhs1", &ecode); clu_CheckError(ecode, "clCreateKernel() for exact_rhs1"); k_exact_rhs2[i] = clCreateKernel(program[i], "exact_rhs2", &ecode); clu_CheckError(ecode, "clCreateKernel() for exact_rhs2"); k_exact_rhs3[i] = clCreateKernel(program[i], "exact_rhs3", &ecode); clu_CheckError(ecode, "clCreateKernel() for exact_rhs3"); k_exact_rhs4[i] = clCreateKernel(program[i], "exact_rhs4", &ecode); clu_CheckError(ecode, "clCreateKernel() for exact_rhs4"); k_exact_rhs5[i] = clCreateKernel(program[i], "exact_rhs5", &ecode); clu_CheckError(ecode, "clCreateKernel() for exact_rhs5"); for (c = 0; c < MAXCELLS; c++) { k_copy_faces1[i][c] = clCreateKernel(program[i], "copy_faces1", &ecode); clu_CheckError(ecode, "clCreateKernel() for copy_faces1"); k_copy_faces2[i][c] = clCreateKernel(program[i], "copy_faces2", &ecode); clu_CheckError(ecode, "clCreateKernel() for copy_faces2"); k_copy_faces3[i][c] = clCreateKernel(program[i], "copy_faces3", &ecode); clu_CheckError(ecode, "clCreateKernel() for copy_faces3"); k_copy_faces4[i][c] = clCreateKernel(program[i], "copy_faces4", &ecode); clu_CheckError(ecode, "clCreateKernel() for copy_faces4"); k_copy_faces5[i][c] = clCreateKernel(program[i], "copy_faces5", &ecode); clu_CheckError(ecode, "clCreateKernel() for copy_faces5"); k_copy_faces6[i][c] = clCreateKernel(program[i], "copy_faces6", &ecode); clu_CheckError(ecode, "clCreateKernel() for copy_faces6"); k_compute_rhs1[i][c] = clCreateKernel(program[i], "compute_rhs1", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_rhs1"); k_compute_rhs2[i][c] = clCreateKernel(program[i], 
"compute_rhs2", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_rhs2"); k_compute_rhs3[i][c] = clCreateKernel(program[i], "compute_rhs3", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_rhs3"); k_compute_rhs4[i][c] = clCreateKernel(program[i], "compute_rhs4", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_rhs4"); k_compute_rhs5[i][c] = clCreateKernel(program[i], "compute_rhs5", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_rhs5"); k_compute_rhs6[i][c] = clCreateKernel(program[i], "compute_rhs6", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_rhs6"); k_txinvr[i][c] = clCreateKernel(program[i], "txinvr", &ecode); clu_CheckError(ecode, "clCreateKernel() for txinvr"); k_lhsx[i][c] = clCreateKernel(program[i], "lhsx", &ecode); clu_CheckError(ecode, "clCreateKernel() for lhsx"); k_ninvr[i][c] = clCreateKernel(program[i], "ninvr", &ecode); clu_CheckError(ecode, "clCreateKernel() for ninvr"); k_x_solve1[i][c] = clCreateKernel(program[i], "x_solve1", &ecode); clu_CheckError(ecode, "clCreateKernel() for x_solve1"); k_x_solve2[i][c] = clCreateKernel(program[i], "x_solve2", &ecode); clu_CheckError(ecode, "clCreateKernel() for x_solve2"); k_x_solve3[i][c] = clCreateKernel(program[i], "x_solve3", &ecode); clu_CheckError(ecode, "clCreateKernel() for x_solve3"); k_x_solve4[i][c] = clCreateKernel(program[i], "x_solve4", &ecode); clu_CheckError(ecode, "clCreateKernel() for x_solve4"); k_x_solve5[i][c] = clCreateKernel(program[i], "x_solve5", &ecode); clu_CheckError(ecode, "clCreateKernel() for x_solve5"); k_x_solve6[i][c] = clCreateKernel(program[i], "x_solve6", &ecode); clu_CheckError(ecode, "clCreateKernel() for x_solve6"); k_lhsy[i][c] = clCreateKernel(program[i], "lhsy", &ecode); clu_CheckError(ecode, "clCreateKernel() for lhsy"); k_pinvr[i][c] = clCreateKernel(program[i], "pinvr", &ecode); clu_CheckError(ecode, "clCreateKernel() for pinvr"); k_y_solve1[i][c] = clCreateKernel(program[i], "y_solve1", &ecode); 
clu_CheckError(ecode, "clCreateKernel() for y_solve1"); k_y_solve2[i][c] = clCreateKernel(program[i], "y_solve2", &ecode); clu_CheckError(ecode, "clCreateKernel() for y_solve2"); k_y_solve3[i][c] = clCreateKernel(program[i], "y_solve3", &ecode); clu_CheckError(ecode, "clCreateKernel() for y_solve3"); k_y_solve4[i][c] = clCreateKernel(program[i], "y_solve4", &ecode); clu_CheckError(ecode, "clCreateKernel() for y_solve4"); k_y_solve5[i][c] = clCreateKernel(program[i], "y_solve5", &ecode); clu_CheckError(ecode, "clCreateKernel() for y_solve5"); k_y_solve6[i][c] = clCreateKernel(program[i], "y_solve6", &ecode); clu_CheckError(ecode, "clCreateKernel() for y_solve6"); k_lhsz[i][c] = clCreateKernel(program[i], "lhsz", &ecode); clu_CheckError(ecode, "clCreateKernel() for lhsz"); k_tzetar[i][c] = clCreateKernel(program[i], "tzetar", &ecode); clu_CheckError(ecode, "clCreateKernel() for tzetar"); k_z_solve1[i][c] = clCreateKernel(program[i], "z_solve1", &ecode); clu_CheckError(ecode, "clCreateKernel() for z_solve1"); k_z_solve2[i][c] = clCreateKernel(program[i], "z_solve2", &ecode); clu_CheckError(ecode, "clCreateKernel() for z_solve2"); k_z_solve3[i][c] = clCreateKernel(program[i], "z_solve3", &ecode); clu_CheckError(ecode, "clCreateKernel() for z_solve3"); k_z_solve4[i][c] = clCreateKernel(program[i], "z_solve4", &ecode); clu_CheckError(ecode, "clCreateKernel() for z_solve4"); k_z_solve5[i][c] = clCreateKernel(program[i], "z_solve5", &ecode); clu_CheckError(ecode, "clCreateKernel() for z_solve5"); k_z_solve6[i][c] = clCreateKernel(program[i], "z_solve6", &ecode); clu_CheckError(ecode, "clCreateKernel() for z_solve6"); k_add[i][c] = clCreateKernel(program[i], "add", &ecode); clu_CheckError(ecode, "clCreateKernel() for add"); } k_error_norm[i] = clCreateKernel(program[i], "error_norm", &ecode); clu_CheckError(ecode, "clCreateKernel() for error_norm"); k_rhs_norm[i] = clCreateKernel(program[i], "rhs_norm", &ecode); clu_CheckError(ecode, "clCreateKernel() for rhs_norm"); } // 
6. Create buffers if (timeron) timer_start(TIMER_BUFFER); asize = sizeof(cl_mem) * num_devices; m_u = (cl_mem *)malloc(asize); m_us = (cl_mem *)malloc(asize); m_vs = (cl_mem *)malloc(asize); m_ws = (cl_mem *)malloc(asize); m_qs = (cl_mem *)malloc(asize); m_ainv = (cl_mem *)malloc(asize); m_rho_i = (cl_mem *)malloc(asize); m_speed = (cl_mem *)malloc(asize); m_square = (cl_mem *)malloc(asize); m_rhs = (cl_mem *)malloc(asize); m_forcing = (cl_mem *)malloc(asize); m_lhs = (cl_mem *)malloc(asize); m_in_buffer = (cl_mem *)malloc(asize); m_out_buffer = (cl_mem *)malloc(asize); m_ce = (cl_mem *)malloc(asize); for (i = 0; i < num_devices; i++) { m_u[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+4)*(JMAXP+4)*(IMAXP+4)*5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_u"); m_us[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_us"); m_vs[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_vs"); m_ws[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_ws"); m_qs[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_qs"); m_ainv[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_ainv"); m_rho_i[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_rho_i"); m_speed[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); 
clu_CheckError(ecode, "clCreateBuffer() for m_speed"); m_square[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_square"); m_rhs[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*KMAX*JMAXP*IMAXP*5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_rhs"); m_forcing[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*KMAX*JMAXP*IMAXP*5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_forcing"); m_lhs[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*KMAX*JMAXP*IMAXP*15, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_lhs"); m_in_buffer[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*BUF_SIZE, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_in_buffer"); m_out_buffer[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*BUF_SIZE, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_out_buffer"); m_ce[i] = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(double)*5*13, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_ce"); } if (timeron) timer_stop(TIMER_BUFFER); if (timeron) timer_stop(TIMER_OPENCL); }
//--------------------------------------------------------------------- // This subroutine initializes the field variable u using // tri-linear transfinite interpolation of the boundary values //--------------------------------------------------------------------- void initialize() { int i; size_t d0_size, d1_size, d2_size; size_t local_ws[3], global_ws[3], temp; cl_kernel *k_initialize1; cl_kernel *k_initialize2; cl_kernel *k_initialize3; cl_kernel *k_initialize4; cl_kernel *k_initialize5; cl_kernel *k_initialize6; cl_kernel *k_initialize7; cl_kernel *k_initialize8; cl_int ecode; k_initialize1 = (cl_kernel *)malloc(sizeof(cl_kernel) * num_devices); k_initialize2 = (cl_kernel *)malloc(sizeof(cl_kernel) * num_devices); k_initialize3 = (cl_kernel *)malloc(sizeof(cl_kernel) * num_devices); k_initialize4 = (cl_kernel *)malloc(sizeof(cl_kernel) * num_devices); k_initialize5 = (cl_kernel *)malloc(sizeof(cl_kernel) * num_devices); k_initialize6 = (cl_kernel *)malloc(sizeof(cl_kernel) * num_devices); k_initialize7 = (cl_kernel *)malloc(sizeof(cl_kernel) * num_devices); k_initialize8 = (cl_kernel *)malloc(sizeof(cl_kernel) * num_devices); //----------------------------------------------------------------------- d0_size = JMAX+2; d1_size = KMAX+2; d2_size = ncells; local_ws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0]; temp = max_work_group_size / local_ws[0]; local_ws[1] = d1_size < temp ? d1_size : temp; temp = temp / local_ws[1]; local_ws[2] = d2_size < temp ? 
d2_size : temp; global_ws[0] = clu_RoundWorkSize(d0_size, local_ws[0]); global_ws[1] = clu_RoundWorkSize(d1_size, local_ws[1]); global_ws[2] = clu_RoundWorkSize(d2_size, local_ws[2]); for (i = 0; i < num_devices; i++) { k_initialize1[i] = clCreateKernel(p_initialize[i], "initialize1", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_initialize1[i], 0, sizeof(cl_mem), &m_u[i]); ecode |= clSetKernelArg(k_initialize1[i], 1, sizeof(int), &ncells); clu_CheckError(ecode, "clSetKernelArg()"); ecode = clEnqueueNDRangeKernel(cmd_queue[i], k_initialize1[i], 3, NULL, global_ws, local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); ecode = clFinish(cmd_queue[i]); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); } //----------------------------------------------------------------------- //--------------------------------------------------------------------- // first store the "interpolated" values everywhere on the grid //--------------------------------------------------------------------- for (i = 0; i < num_devices; i++) { d0_size = max_cell_size[i][1]; d1_size = max_cell_size[i][2]; d2_size = ncells; local_ws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0]; temp = max_work_group_size / local_ws[0]; local_ws[1] = d1_size < temp ? d1_size : temp; temp = temp / local_ws[1]; local_ws[2] = d2_size < temp ? 
d2_size : temp; global_ws[0] = clu_RoundWorkSize(d0_size, local_ws[0]); global_ws[1] = clu_RoundWorkSize(d1_size, local_ws[1]); global_ws[2] = clu_RoundWorkSize(d2_size, local_ws[2]); k_initialize2[i] = clCreateKernel(p_initialize[i], "initialize2", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_initialize2[i], 0, sizeof(cl_mem), &m_u[i]); ecode |= clSetKernelArg(k_initialize2[i], 1, sizeof(cl_mem), &m_cell_low[i]); ecode |= clSetKernelArg(k_initialize2[i], 2, sizeof(cl_mem), &m_cell_high[i]); ecode |= clSetKernelArg(k_initialize2[i], 3, sizeof(cl_mem), &m_ce[i]); ecode |= clSetKernelArg(k_initialize2[i], 4, sizeof(int), &ncells); clu_CheckError(ecode, "clSetKernelArg()"); ecode = clEnqueueNDRangeKernel(cmd_queue[i], k_initialize2[i], 3, NULL, global_ws, local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); } //----------------------------------------------------------------------- //--------------------------------------------------------------------- // now store the exact values on the boundaries //--------------------------------------------------------------------- //--------------------------------------------------------------------- // west face //--------------------------------------------------------------------- for (i = 0; i < num_devices; i++) { d0_size = max_cell_size[i][1]; d1_size = max_cell_size[i][2]; local_ws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0]; temp = max_work_group_size / local_ws[0]; local_ws[1] = d1_size < temp ? 
d1_size : temp; global_ws[0] = clu_RoundWorkSize(d0_size, local_ws[0]); global_ws[1] = clu_RoundWorkSize(d1_size, local_ws[1]); k_initialize3[i] = clCreateKernel(p_initialize[i], "initialize3", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_initialize3[i], 0, sizeof(cl_mem), &m_u[i]); ecode |= clSetKernelArg(k_initialize3[i], 1, sizeof(cl_mem), &m_cell_low[i]); ecode |= clSetKernelArg(k_initialize3[i], 2, sizeof(cl_mem), &m_cell_high[i]); ecode |= clSetKernelArg(k_initialize3[i], 3, sizeof(cl_mem), &m_slice[i]); ecode |= clSetKernelArg(k_initialize3[i], 4, sizeof(cl_mem), &m_ce[i]); clu_CheckError(ecode, "clSetKernelArg()"); ecode = clEnqueueNDRangeKernel(cmd_queue[i], k_initialize3[i], 2, NULL, global_ws, local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); } //----------------------------------------------------------------------- //--------------------------------------------------------------------- // east face //--------------------------------------------------------------------- for (i = 0; i < num_devices; i++) { d0_size = max_cell_size[i][1]; d1_size = max_cell_size[i][2]; local_ws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0]; temp = max_work_group_size / local_ws[0]; local_ws[1] = d1_size < temp ? 
d1_size : temp; global_ws[0] = clu_RoundWorkSize(d0_size, local_ws[0]); global_ws[1] = clu_RoundWorkSize(d1_size, local_ws[1]); k_initialize4[i] = clCreateKernel(p_initialize[i], "initialize4", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_initialize4[i], 0, sizeof(cl_mem), &m_u[i]); ecode |= clSetKernelArg(k_initialize4[i], 1, sizeof(cl_mem), &m_cell_low[i]); ecode |= clSetKernelArg(k_initialize4[i], 2, sizeof(cl_mem), &m_cell_high[i]); ecode |= clSetKernelArg(k_initialize4[i], 3, sizeof(cl_mem), &m_cell_size[i]); ecode |= clSetKernelArg(k_initialize4[i], 4, sizeof(cl_mem), &m_slice[i]); ecode = clSetKernelArg(k_initialize4[i], 5, sizeof(cl_mem), &m_ce[i]); ecode |= clSetKernelArg(k_initialize4[i], 6, sizeof(int), &ncells); clu_CheckError(ecode, "clSetKernelArg()"); ecode = clEnqueueNDRangeKernel(cmd_queue[i], k_initialize4[i], 2, NULL, global_ws, local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); } //----------------------------------------------------------------------- //--------------------------------------------------------------------- // south face //--------------------------------------------------------------------- for (i = 0; i < num_devices; i++) { d0_size = max_cell_size[i][0]; d1_size = max_cell_size[i][2]; local_ws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0]; temp = max_work_group_size / local_ws[0]; local_ws[1] = d1_size < temp ? 
d1_size : temp; global_ws[0] = clu_RoundWorkSize(d0_size, local_ws[0]); global_ws[1] = clu_RoundWorkSize(d1_size, local_ws[1]); k_initialize5[i] = clCreateKernel(p_initialize[i], "initialize5", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_initialize5[i], 0, sizeof(cl_mem), &m_u[i]); ecode |= clSetKernelArg(k_initialize5[i], 1, sizeof(cl_mem), &m_cell_low[i]); ecode |= clSetKernelArg(k_initialize5[i], 2, sizeof(cl_mem), &m_cell_high[i]); ecode |= clSetKernelArg(k_initialize5[i], 3, sizeof(cl_mem), &m_slice[i]); ecode |= clSetKernelArg(k_initialize5[i], 4, sizeof(cl_mem), &m_ce[i]); clu_CheckError(ecode, "clSetKernelArg()"); ecode = clEnqueueNDRangeKernel(cmd_queue[i], k_initialize5[i], 2, NULL, global_ws, local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); } //----------------------------------------------------------------------- //--------------------------------------------------------------------- // north face //----------------------------------------------------------------------- for (i = 0; i < num_devices; i++) { d0_size = max_cell_size[i][0]; d1_size = max_cell_size[i][2]; local_ws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0]; temp = max_work_group_size / local_ws[0]; local_ws[1] = d1_size < temp ? 
d1_size : temp; global_ws[0] = clu_RoundWorkSize(d0_size, local_ws[0]); global_ws[1] = clu_RoundWorkSize(d1_size, local_ws[1]); k_initialize6[i] = clCreateKernel(p_initialize[i], "initialize6", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_initialize6[i], 0, sizeof(cl_mem), &m_u[i]); ecode |= clSetKernelArg(k_initialize6[i], 1, sizeof(cl_mem), &m_cell_low[i]); ecode |= clSetKernelArg(k_initialize6[i], 2, sizeof(cl_mem), &m_cell_high[i]); ecode |= clSetKernelArg(k_initialize6[i], 3, sizeof(cl_mem), &m_cell_size[i]); ecode |= clSetKernelArg(k_initialize6[i], 4, sizeof(cl_mem), &m_slice[i]); ecode = clSetKernelArg(k_initialize6[i], 5, sizeof(cl_mem), &m_ce[i]); ecode |= clSetKernelArg(k_initialize6[i], 6, sizeof(int), &ncells); clu_CheckError(ecode, "clSetKernelArg()"); ecode = clEnqueueNDRangeKernel(cmd_queue[i], k_initialize6[i], 2, NULL, global_ws, local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); } //----------------------------------------------------------------------- //--------------------------------------------------------------------- // bottom face //----------------------------------------------------------------------- for (i = 0; i < num_devices; i++) { d0_size = max_cell_size[i][0]; d1_size = max_cell_size[i][1]; local_ws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0]; temp = max_work_group_size / local_ws[0]; local_ws[1] = d1_size < temp ? 
d1_size : temp; global_ws[0] = clu_RoundWorkSize(d0_size, local_ws[0]); global_ws[1] = clu_RoundWorkSize(d1_size, local_ws[1]); k_initialize7[i] = clCreateKernel(p_initialize[i], "initialize7", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_initialize7[i], 0, sizeof(cl_mem), &m_u[i]); ecode |= clSetKernelArg(k_initialize7[i], 1, sizeof(cl_mem), &m_cell_low[i]); ecode |= clSetKernelArg(k_initialize7[i], 2, sizeof(cl_mem), &m_cell_high[i]); ecode |= clSetKernelArg(k_initialize7[i], 3, sizeof(cl_mem), &m_slice[i]); ecode |= clSetKernelArg(k_initialize7[i], 4, sizeof(cl_mem), &m_ce[i]); clu_CheckError(ecode, "clSetKernelArg()"); ecode = clEnqueueNDRangeKernel(cmd_queue[i], k_initialize7[i], 2, NULL, global_ws, local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); } //----------------------------------------------------------------------- //--------------------------------------------------------------------- // top face //----------------------------------------------------------------------- for (i = 0; i < num_devices; i++) { d0_size = max_cell_size[i][0]; d1_size = max_cell_size[i][1]; local_ws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0]; temp = max_work_group_size / local_ws[0]; local_ws[1] = d1_size < temp ? 
d1_size : temp; global_ws[0] = clu_RoundWorkSize(d0_size, local_ws[0]); global_ws[1] = clu_RoundWorkSize(d1_size, local_ws[1]); k_initialize8[i] = clCreateKernel(p_initialize[i], "initialize8", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_initialize8[i], 0, sizeof(cl_mem), &m_u[i]); ecode |= clSetKernelArg(k_initialize8[i], 1, sizeof(cl_mem), &m_cell_low[i]); ecode |= clSetKernelArg(k_initialize8[i], 2, sizeof(cl_mem), &m_cell_high[i]); ecode |= clSetKernelArg(k_initialize8[i], 3, sizeof(cl_mem), &m_cell_size[i]); ecode |= clSetKernelArg(k_initialize8[i], 4, sizeof(cl_mem), &m_slice[i]); ecode = clSetKernelArg(k_initialize8[i], 5, sizeof(cl_mem), &m_ce[i]); ecode |= clSetKernelArg(k_initialize8[i], 6, sizeof(int), &ncells); clu_CheckError(ecode, "clSetKernelArg()"); ecode = clEnqueueNDRangeKernel(cmd_queue[i], k_initialize8[i], 2, NULL, global_ws, local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); } //----------------------------------------------------------------------- for (i = 0; i < num_devices; i++) { ecode = clFinish(cmd_queue[i]); clu_CheckError(ecode, "clFinish()"); } for (i = 0; i < num_devices; i++) { clReleaseKernel(k_initialize1[i]); clReleaseKernel(k_initialize2[i]); clReleaseKernel(k_initialize3[i]); clReleaseKernel(k_initialize4[i]); clReleaseKernel(k_initialize5[i]); clReleaseKernel(k_initialize6[i]); clReleaseKernel(k_initialize7[i]); clReleaseKernel(k_initialize8[i]); } free(k_initialize1); free(k_initialize2); free(k_initialize3); free(k_initialize4); free(k_initialize5); free(k_initialize6); free(k_initialize7); free(k_initialize8); }
//--------------------------------------------------------------------- // Set up the OpenCL environment. //--------------------------------------------------------------------- static void setup_opencl(int argc, char *argv[]) { int i; size_t temp, wg_num; cl_int ecode; char *source_dir = "LU"; if (timeron) { timer_clear(TIMER_OPENCL); timer_clear(TIMER_BUILD); timer_clear(TIMER_BUFFER); timer_clear(TIMER_RELEASE); timer_start(TIMER_OPENCL); } if (argc > 1) source_dir = argv[1]; //----------------------------------------------------------------------- // 1. Find the default device type and get a device for the device type //----------------------------------------------------------------------- device_type = clu_GetDefaultDeviceType(); device = clu_GetAvailableDevice(device_type); device_name = clu_GetDeviceName(device); // Device information ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(work_item_sizes), &work_item_sizes, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group_size, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); //////////////////////////////////////////////////////////////////////// // FIXME: The below values are experimental. 
size_t default_wg_size = 64; if (device_type == CL_DEVICE_TYPE_CPU) { if (CLASS == 'B') default_wg_size = 128; } else { if (CLASS == 'B') default_wg_size = 32; } if (max_work_group_size > default_wg_size) { max_work_group_size = default_wg_size; int i; for (i = 0; i < 3; i++) { if (work_item_sizes[i] > default_wg_size) { work_item_sizes[i] = default_wg_size; } } } if (device_type == CL_DEVICE_TYPE_CPU) { SETBV1_DIM = SETBV1_DIM_CPU; SETBV2_DIM = SETBV2_DIM_CPU; SETBV3_DIM = SETBV3_DIM_CPU; SETIV_DIM = SETIV_DIM_CPU; ERHS1_DIM = ERHS1_DIM_CPU; ERHS2_DIM = ERHS2_DIM_CPU; ERHS3_DIM = ERHS3_DIM_CPU; ERHS4_DIM = ERHS4_DIM_CPU; PINTGR1_DIM = PINTGR1_DIM_CPU; PINTGR2_DIM = PINTGR2_DIM_CPU; PINTGR3_DIM = PINTGR3_DIM_CPU; RHS_DIM = RHS_DIM_CPU; RHSX_DIM = RHSX_DIM_CPU; RHSY_DIM = RHSY_DIM_CPU; RHSZ_DIM = RHSZ_DIM_CPU; SSOR2_DIM = SSOR2_DIM_CPU; SSOR3_DIM = SSOR3_DIM_CPU; } else { SETBV1_DIM = SETBV1_DIM_GPU; SETBV2_DIM = SETBV2_DIM_GPU; SETBV3_DIM = SETBV3_DIM_GPU; SETIV_DIM = SETIV_DIM_GPU; ERHS1_DIM = ERHS1_DIM_GPU; ERHS2_DIM = ERHS2_DIM_GPU; ERHS3_DIM = ERHS3_DIM_GPU; ERHS4_DIM = ERHS4_DIM_GPU; PINTGR1_DIM = PINTGR1_DIM_GPU; PINTGR2_DIM = PINTGR2_DIM_GPU; PINTGR3_DIM = PINTGR3_DIM_GPU; RHS_DIM = RHS_DIM_GPU; RHSX_DIM = RHSX_DIM_GPU; RHSY_DIM = RHSY_DIM_GPU; RHSZ_DIM = RHSZ_DIM_GPU; SSOR2_DIM = SSOR2_DIM_GPU; SSOR3_DIM = SSOR3_DIM_GPU; } //////////////////////////////////////////////////////////////////////// //----------------------------------------------------------------------- // 2. Create a context for the specified device //----------------------------------------------------------------------- context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode); clu_CheckError(ecode, "clCreateContext()"); //----------------------------------------------------------------------- // 3. 
Create command queues //----------------------------------------------------------------------- cmd_queue = clCreateCommandQueue(context, device, 0, &ecode); clu_CheckError(ecode, "clCreateCommandQueue()"); max_pipeline = (jend-jst) < max_compute_units ? (jend-jst) : max_compute_units; pipe_queue = (cl_command_queue *)malloc(sizeof(cl_command_queue) * max_pipeline); for (i = 0; i < max_pipeline; i++) { pipe_queue[i] = clCreateCommandQueue(context, device, 0, &ecode); clu_CheckError(ecode, "clCreateCommandQueue()"); } //----------------------------------------------------------------------- // 4. Build programs //----------------------------------------------------------------------- if (timeron) timer_start(TIMER_BUILD); char build_option[100]; if (device_type == CL_DEVICE_TYPE_CPU) { sprintf(build_option, "-I. -DCLASS=%d -DUSE_CPU", CLASS); } else { sprintf(build_option, "-I. -DCLASS=\'%c\'", CLASS); } p_pre = clu_MakeProgram(context, device, source_dir, "kernel_pre.cl", build_option); p_main = clu_MakeProgram(context, device, source_dir, (device_type == CL_DEVICE_TYPE_CPU ? "kernel_main_cpu.cl" : "kernel_main_gpu.cl"), build_option); p_post = clu_MakeProgram(context, device, source_dir, "kernel_post.cl", build_option); if (timeron) timer_stop(TIMER_BUILD); //----------------------------------------------------------------------- // 5. 
Create buffers //----------------------------------------------------------------------- if (timeron) timer_start(TIMER_BUFFER); m_ce = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(double)*5*13, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_ce"); m_u = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_u"); m_rsd = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_rsd"); m_frct = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_frct"); m_qs = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_qs"); m_rho_i = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_rho_i"); // workspace for work-items size_t max_work_items; if (ERHS2_DIM == 1 && ERHS3_DIM == 1 && ERHS4_DIM == 1) { max_work_items = ISIZ3; } else { max_work_items = ISIZ3*ISIZ2; } m_flux = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*ISIZ1*5 * max_work_items, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_flux"); if (RHSZ_DIM == 1) { max_work_items = ISIZ2; } else { max_work_items = ISIZ2*ISIZ1; } if (device_type == CL_DEVICE_TYPE_CPU) { m_utmp = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*ISIZ3*6 * max_work_items, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_utmp"); m_rtmp = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*ISIZ3*5 * max_work_items, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_rtmp"); } temp = (nz0-2) / max_compute_units; l2norm_lws[0] = temp == 0 ? 
1 : temp; l2norm_gws[0] = clu_RoundWorkSize((size_t)(nz0-2), l2norm_lws[0]); wg_num = l2norm_gws[0] / l2norm_lws[0]; sum_size = sizeof(double) * 5 * wg_num; m_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, sum_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer()"); if (timeron) timer_stop(TIMER_BUFFER); //----------------------------------------------------------------------- // 6. Create kernels //----------------------------------------------------------------------- k_setbv1 = clCreateKernel(p_pre, "setbv1", &ecode); clu_CheckError(ecode, "clCreateKernel() for setbv1"); ecode = clSetKernelArg(k_setbv1, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_setbv1, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_setbv1, 2, sizeof(int), &nx); ecode |= clSetKernelArg(k_setbv1, 3, sizeof(int), &ny); ecode |= clSetKernelArg(k_setbv1, 4, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (SETBV1_DIM == 3) { setbv1_lws[0] = 5; temp = max_work_group_size / setbv1_lws[0]; setbv1_lws[1] = nx < temp ? nx : temp; temp = temp / setbv1_lws[1]; setbv1_lws[2] = ny < temp ? ny : temp; setbv1_gws[0] = clu_RoundWorkSize((size_t)5, setbv1_lws[0]); setbv1_gws[1] = clu_RoundWorkSize((size_t)nx, setbv1_lws[1]); setbv1_gws[2] = clu_RoundWorkSize((size_t)ny, setbv1_lws[2]); } else if (SETBV1_DIM == 2) { setbv1_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0]; temp = max_work_group_size / setbv1_lws[0]; setbv1_lws[1] = ny < temp ? ny : temp; setbv1_gws[0] = clu_RoundWorkSize((size_t)nx, setbv1_lws[0]); setbv1_gws[1] = clu_RoundWorkSize((size_t)ny, setbv1_lws[1]); } else { temp = ny / max_compute_units; setbv1_lws[0] = temp == 0 ? 
1 : temp; setbv1_gws[0] = clu_RoundWorkSize((size_t)ny, setbv1_lws[0]); } k_setbv2 = clCreateKernel(p_pre, "setbv2", &ecode); clu_CheckError(ecode, "clCreateKernel() for setbv2"); ecode = clSetKernelArg(k_setbv2, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_setbv2, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_setbv2, 2, sizeof(int), &nx); ecode |= clSetKernelArg(k_setbv2, 3, sizeof(int), &ny); ecode |= clSetKernelArg(k_setbv2, 4, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (SETBV2_DIM == 3) { setbv2_lws[0] = 5; temp = max_work_group_size / setbv2_lws[0]; setbv2_lws[1] = nx < temp ? nx : temp; temp = temp / setbv2_lws[1]; setbv2_lws[2] = nz < temp ? nz : temp; setbv2_gws[0] = clu_RoundWorkSize((size_t)5, setbv2_lws[0]); setbv2_gws[1] = clu_RoundWorkSize((size_t)nx, setbv2_lws[1]); setbv2_gws[2] = clu_RoundWorkSize((size_t)nz, setbv2_lws[2]); } else if (SETBV2_DIM == 2) { setbv2_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0]; temp = max_work_group_size / setbv2_lws[0]; setbv2_lws[1] = nz < temp ? nz : temp; setbv2_gws[0] = clu_RoundWorkSize((size_t)nx, setbv2_lws[0]); setbv2_gws[1] = clu_RoundWorkSize((size_t)nz, setbv2_lws[1]); } else { temp = nz / max_compute_units; setbv2_lws[0] = temp == 0 ? 1 : temp; setbv2_gws[0] = clu_RoundWorkSize((size_t)nz, setbv2_lws[0]); } k_setbv3 = clCreateKernel(p_pre, "setbv3", &ecode); clu_CheckError(ecode, "clCreateKernel() for setbv3"); ecode = clSetKernelArg(k_setbv3, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_setbv3, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_setbv3, 2, sizeof(int), &nx); ecode |= clSetKernelArg(k_setbv3, 3, sizeof(int), &ny); ecode |= clSetKernelArg(k_setbv3, 4, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (SETBV3_DIM == 3) { setbv3_lws[0] = 5; temp = max_work_group_size / setbv3_lws[0]; setbv3_lws[1] = ny < temp ? ny : temp; temp = temp / setbv3_lws[1]; setbv3_lws[2] = nz < temp ? 
nz : temp; setbv3_gws[0] = clu_RoundWorkSize((size_t)5, setbv3_lws[0]); setbv3_gws[1] = clu_RoundWorkSize((size_t)ny, setbv3_lws[1]); setbv3_gws[2] = clu_RoundWorkSize((size_t)nz, setbv3_lws[2]); } else if (SETBV3_DIM == 2) { setbv3_lws[0] = ny < work_item_sizes[0] ? ny : work_item_sizes[0]; temp = max_work_group_size / setbv3_lws[0]; setbv3_lws[1] = nz < temp ? nz : temp; setbv3_gws[0] = clu_RoundWorkSize((size_t)ny, setbv3_lws[0]); setbv3_gws[1] = clu_RoundWorkSize((size_t)nz, setbv3_lws[1]); } else { temp = nz / max_compute_units; setbv3_lws[0] = temp == 0 ? 1 : temp; setbv3_gws[0] = clu_RoundWorkSize((size_t)nz, setbv3_lws[0]); } k_setiv = clCreateKernel(p_pre, "setiv", &ecode); clu_CheckError(ecode, "clCreateKernel() for setiv"); ecode = clSetKernelArg(k_setiv, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_setiv, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_setiv, 2, sizeof(int), &nx); ecode |= clSetKernelArg(k_setiv, 3, sizeof(int), &ny); ecode |= clSetKernelArg(k_setiv, 4, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (SETIV_DIM == 3) { setiv_lws[0] = (nx-2) < work_item_sizes[0] ? (nx-2) : work_item_sizes[0]; temp = max_work_group_size / setiv_lws[0]; setiv_lws[1] = (ny-2) < temp ? (ny-2) : temp; temp = temp / setiv_lws[1]; setiv_lws[2] = (nz-2) < temp ? (nz-2) : temp; setiv_gws[0] = clu_RoundWorkSize((size_t)(nx-2), setiv_lws[0]); setiv_gws[1] = clu_RoundWorkSize((size_t)(ny-2), setiv_lws[1]); setiv_gws[2] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[2]); } else if (SETIV_DIM == 2) { setiv_lws[0] = (ny-2) < work_item_sizes[0] ? (ny-2) : work_item_sizes[0]; temp = max_work_group_size / setiv_lws[0]; setiv_lws[1] = (nz-2) < temp ? (nz-2) : temp; setiv_gws[0] = clu_RoundWorkSize((size_t)(ny-2), setiv_lws[0]); setiv_gws[1] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[1]); } else { temp = (nz-2) / max_compute_units; setiv_lws[0] = temp == 0 ? 
1 : temp; setiv_gws[0] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[0]); } k_l2norm = clCreateKernel(p_main, "l2norm", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_l2norm, 1, sizeof(cl_mem), &m_sum); ecode |= clSetKernelArg(k_l2norm, 2, sizeof(double)*5*l2norm_lws[0], NULL); clu_CheckError(ecode, "clSetKernelArg()"); k_rhs = clCreateKernel(p_main, "rhs", &ecode); clu_CheckError(ecode, "clCreateKernel() for rhs"); ecode = clSetKernelArg(k_rhs, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_rhs, 1, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_rhs, 2, sizeof(cl_mem), &m_frct); ecode |= clSetKernelArg(k_rhs, 3, sizeof(cl_mem), &m_qs); ecode |= clSetKernelArg(k_rhs, 4, sizeof(cl_mem), &m_rho_i); ecode |= clSetKernelArg(k_rhs, 5, sizeof(int), &nx); ecode |= clSetKernelArg(k_rhs, 6, sizeof(int), &ny); ecode |= clSetKernelArg(k_rhs, 7, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (RHS_DIM == 3) { rhs_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0]; temp = max_work_group_size / rhs_lws[0]; rhs_lws[1] = ny < temp ? ny : temp; temp = temp / rhs_lws[1]; rhs_lws[2] = nz < temp ? nz : temp; rhs_gws[0] = clu_RoundWorkSize((size_t)nx, rhs_lws[0]); rhs_gws[1] = clu_RoundWorkSize((size_t)ny, rhs_lws[1]); rhs_gws[2] = clu_RoundWorkSize((size_t)nz, rhs_lws[2]); } else if (RHS_DIM == 2) { rhs_lws[0] = ny < work_item_sizes[0] ? ny : work_item_sizes[0]; temp = max_work_group_size / rhs_lws[0]; rhs_lws[1] = nz < temp ? nz : temp; rhs_gws[0] = clu_RoundWorkSize((size_t)ny, rhs_lws[0]); rhs_gws[1] = clu_RoundWorkSize((size_t)nz, rhs_lws[1]); } else { //temp = nz / max_compute_units; temp = 1; rhs_lws[0] = temp == 0 ? 
1 : temp; rhs_gws[0] = clu_RoundWorkSize((size_t)nz, rhs_lws[0]); } k_rhsx = clCreateKernel(p_main, "rhsx", &ecode); clu_CheckError(ecode, "clCreateKernel() for rhsx"); ecode = clSetKernelArg(k_rhsx, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_rhsx, 1, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_rhsx, 2, sizeof(cl_mem), &m_qs); ecode |= clSetKernelArg(k_rhsx, 3, sizeof(cl_mem), &m_rho_i); if (device_type == CL_DEVICE_TYPE_CPU) { ecode |= clSetKernelArg(k_rhsx, 4, sizeof(cl_mem), &m_flux); ecode |= clSetKernelArg(k_rhsx, 5, sizeof(int), &nx); ecode |= clSetKernelArg(k_rhsx, 6, sizeof(int), &ny); ecode |= clSetKernelArg(k_rhsx, 7, sizeof(int), &nz); } else { ecode |= clSetKernelArg(k_rhsx, 4, sizeof(int), &nx); ecode |= clSetKernelArg(k_rhsx, 5, sizeof(int), &ny); ecode |= clSetKernelArg(k_rhsx, 6, sizeof(int), &nz); } clu_CheckError(ecode, "clSetKernelArg()"); if (RHSX_DIM == 2) { rhsx_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0]; temp = max_work_group_size / rhsx_lws[0]; rhsx_lws[1] = (nz-2) < temp ? (nz-2) : temp; rhsx_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), rhsx_lws[0]); rhsx_gws[1] = clu_RoundWorkSize((size_t)(nz-2), rhsx_lws[1]); } else { //temp = (nz-2) / max_compute_units; temp = 1; rhsx_lws[0] = temp == 0 ? 
1 : temp; rhsx_gws[0] = clu_RoundWorkSize((size_t)(nz-2), rhsx_lws[0]); } k_rhsy = clCreateKernel(p_main, "rhsy", &ecode); clu_CheckError(ecode, "clCreateKernel() for rhsy"); ecode = clSetKernelArg(k_rhsy, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_rhsy, 1, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_rhsy, 2, sizeof(cl_mem), &m_qs); ecode |= clSetKernelArg(k_rhsy, 3, sizeof(cl_mem), &m_rho_i); if (device_type == CL_DEVICE_TYPE_CPU) { ecode |= clSetKernelArg(k_rhsy, 4, sizeof(cl_mem), &m_flux); ecode |= clSetKernelArg(k_rhsy, 5, sizeof(int), &nx); ecode |= clSetKernelArg(k_rhsy, 6, sizeof(int), &ny); ecode |= clSetKernelArg(k_rhsy, 7, sizeof(int), &nz); } else { ecode |= clSetKernelArg(k_rhsy, 4, sizeof(int), &nx); ecode |= clSetKernelArg(k_rhsy, 5, sizeof(int), &ny); ecode |= clSetKernelArg(k_rhsy, 6, sizeof(int), &nz); } clu_CheckError(ecode, "clSetKernelArg()"); if (RHSY_DIM == 2) { rhsy_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0]; temp = max_work_group_size / rhsy_lws[0]; rhsy_lws[1] = (nz-2) < temp ? (nz-2) : temp; rhsy_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), rhsy_lws[0]); rhsy_gws[1] = clu_RoundWorkSize((size_t)(nz-2), rhsy_lws[1]); } else { //temp = (nz-2) / max_compute_units; temp = 1; rhsy_lws[0] = temp == 0 ? 
1 : temp; rhsy_gws[0] = clu_RoundWorkSize((size_t)(nz-2), rhsy_lws[0]); } k_rhsz = clCreateKernel(p_main, "rhsz", &ecode); clu_CheckError(ecode, "clCreateKernel() for rhsz"); ecode = clSetKernelArg(k_rhsz, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_rhsz, 1, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_rhsz, 2, sizeof(cl_mem), &m_qs); ecode |= clSetKernelArg(k_rhsz, 3, sizeof(cl_mem), &m_rho_i); if (device_type == CL_DEVICE_TYPE_CPU) { ecode |= clSetKernelArg(k_rhsz, 4, sizeof(cl_mem), &m_flux); ecode |= clSetKernelArg(k_rhsz, 5, sizeof(cl_mem), &m_utmp); ecode |= clSetKernelArg(k_rhsz, 6, sizeof(cl_mem), &m_rtmp); ecode |= clSetKernelArg(k_rhsz, 7, sizeof(int), &nx); ecode |= clSetKernelArg(k_rhsz, 8, sizeof(int), &ny); ecode |= clSetKernelArg(k_rhsz, 9, sizeof(int), &nz); } else { ecode |= clSetKernelArg(k_rhsz, 4, sizeof(int), &nx); ecode |= clSetKernelArg(k_rhsz, 5, sizeof(int), &ny); ecode |= clSetKernelArg(k_rhsz, 6, sizeof(int), &nz); } clu_CheckError(ecode, "clSetKernelArg()"); if (RHSZ_DIM == 2) { rhsz_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0]; temp = max_work_group_size / rhsz_lws[0]; rhsz_lws[1] = (jend-jst) < temp ? (jend-jst) : temp; rhsz_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), rhsz_lws[0]); rhsz_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), rhsz_lws[1]); } else { //temp = (jend-jst) / max_compute_units; temp = 1; rhsz_lws[0] = temp == 0 ? 1 : temp; rhsz_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), rhsz_lws[0]); } k_ssor2 = clCreateKernel(p_main, "ssor2", &ecode); clu_CheckError(ecode, "clCreateKernel() for ssor2"); ecode = clSetKernelArg(k_ssor2, 0, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_ssor2, 2, sizeof(int), &nx); ecode |= clSetKernelArg(k_ssor2, 3, sizeof(int), &ny); ecode |= clSetKernelArg(k_ssor2, 4, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (SSOR2_DIM == 3) { ssor2_lws[0] = (iend-ist) < work_item_sizes[0] ? 
(iend-ist) : work_item_sizes[0]; temp = max_work_group_size / ssor2_lws[0]; ssor2_lws[1] = (jend-jst) < temp ? (jend-jst) : temp; temp = temp / ssor2_lws[1]; ssor2_lws[2] = (nz-2) < temp ? (nz-2) : temp; ssor2_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), ssor2_lws[0]); ssor2_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), ssor2_lws[1]); ssor2_gws[2] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[2]); } else if (SSOR2_DIM == 2) { ssor2_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0]; temp = max_work_group_size / ssor2_lws[0]; ssor2_lws[1] = (nz-2) < temp ? (nz-2) : temp; ssor2_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), ssor2_lws[0]); ssor2_gws[1] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[1]); } else { //temp = (nz-2) / max_compute_units; temp = 1; ssor2_lws[0] = temp == 0 ? 1 : temp; ssor2_gws[0] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[0]); } k_ssor3 = clCreateKernel(p_main, "ssor3", &ecode); clu_CheckError(ecode, "clCreateKernel() for ssor3"); ecode = clSetKernelArg(k_ssor3, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_ssor3, 1, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_ssor3, 3, sizeof(int), &nx); ecode |= clSetKernelArg(k_ssor3, 4, sizeof(int), &ny); ecode |= clSetKernelArg(k_ssor3, 5, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (SSOR3_DIM == 3) { ssor3_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0]; temp = max_work_group_size / ssor3_lws[0]; ssor3_lws[1] = (jend-jst) < temp ? (jend-jst) : temp; temp = temp / ssor3_lws[1]; ssor3_lws[2] = (nz-2) < temp ? (nz-2) : temp; ssor3_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), ssor3_lws[0]); ssor3_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), ssor3_lws[1]); ssor3_gws[2] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[2]); } else if (SSOR3_DIM == 2) { ssor3_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0]; temp = max_work_group_size / ssor3_lws[0]; ssor3_lws[1] = (nz-2) < temp ? 
(nz-2) : temp; ssor3_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), ssor3_lws[0]); ssor3_gws[1] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[1]); } else { //temp = (nz-2) / max_compute_units; temp = 1; ssor3_lws[0] = temp == 0 ? 1 : temp; ssor3_gws[0] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[0]); } k_blts = clCreateKernel(p_main, "blts", &ecode); clu_CheckError(ecode, "clCreateKernel() for blts"); ecode = clSetKernelArg(k_blts, 0, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_blts, 1, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_blts, 2, sizeof(cl_mem), &m_qs); ecode |= clSetKernelArg(k_blts, 3, sizeof(cl_mem), &m_rho_i); ecode |= clSetKernelArg(k_blts, 4, sizeof(int), &nz); ecode |= clSetKernelArg(k_blts, 5, sizeof(int), &ny); ecode |= clSetKernelArg(k_blts, 6, sizeof(int), &nx); clu_CheckError(ecode, "clSetKernelArg()"); blts_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0]; temp = max_work_group_size / blts_lws[0]; blts_lws[1] = (nz-2) < temp ? (nz-2) : temp; blts_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), blts_lws[0]); blts_gws[1] = clu_RoundWorkSize((size_t)(nz-2), blts_lws[1]); k_buts = clCreateKernel(p_main, "buts", &ecode); clu_CheckError(ecode, "clCreateKernel() for buts"); ecode = clSetKernelArg(k_buts, 0, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_buts, 1, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_buts, 2, sizeof(cl_mem), &m_qs); ecode |= clSetKernelArg(k_buts, 3, sizeof(cl_mem), &m_rho_i); ecode |= clSetKernelArg(k_buts, 4, sizeof(int), &nz); ecode |= clSetKernelArg(k_buts, 5, sizeof(int), &ny); ecode |= clSetKernelArg(k_buts, 6, sizeof(int), &nx); clu_CheckError(ecode, "clSetKernelArg()"); buts_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0]; temp = max_work_group_size / buts_lws[0]; buts_lws[1] = (nz-2) < temp ? 
(nz-2) : temp; buts_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), buts_lws[0]); buts_gws[1] = clu_RoundWorkSize((size_t)(nz-2), buts_lws[1]); if (timeron) timer_stop(TIMER_OPENCL); }
//--------------------------------------------------------------------- // Fill in array u0 with initial conditions from // random number generator //--------------------------------------------------------------------- static void compute_initial_conditions(cl_mem *u0, int d1, int d2, int d3) { int k; double start, an, dummy, starts[NZ]; size_t local_ws, global_ws, temp; cl_mem m_starts; cl_int ecode; start = SEED; //--------------------------------------------------------------------- // Jump to the starting element for our first plane. //--------------------------------------------------------------------- an = ipow46(A, 0); dummy = randlc(&start, an); an = ipow46(A, 2*NX*NY); starts[0] = start; for (k = 1; k < dims[2]; k++) { dummy = randlc(&start, an); starts[k] = start; } if (device_type == CL_DEVICE_TYPE_CPU) { m_starts = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(double) * NZ, starts, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_starts"); local_ws = 1; global_ws = clu_RoundWorkSize((size_t)d2, local_ws); } else { //GPU m_starts = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(double) * NZ, starts, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_starts"); temp = d2 / max_compute_units; local_ws = temp == 0 ? 1 : ((temp > work_item_sizes[0]) ? work_item_sizes[0] : temp); global_ws = clu_RoundWorkSize((size_t)d2, local_ws); } ecode = clSetKernelArg(k_compute_ics, 0, sizeof(cl_mem), u0); ecode |= clSetKernelArg(k_compute_ics, 1, sizeof(cl_mem), &m_starts); clu_CheckError(ecode, "clSetKernelArg() for compute_initial_conditions"); ecode = clEnqueueNDRangeKernel(cmd_queue, k_compute_ics, 1, NULL, &global_ws, &local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); ecode = clFinish(cmd_queue); clu_CheckError(ecode, "clFinish()"); DTIMER_START(T_RELEASE); clReleaseMemObject(m_starts); DTIMER_STOP(T_RELEASE); }
//---------------------------------------------------------------------
// compute the right hand side based on exact solution
//---------------------------------------------------------------------
// Host-side driver: launches the exact_rhs1..exact_rhs5 kernels on every
// device, one 2-D NDRange per cell, to build the forcing term in
// m_forcing.  Work sizes come from compute_ws_dim_2(d, lws, gws); queue
// i*2 is used for every launch here (odd queues are presumably reserved
// for transfers -- see copy_faces; TODO confirm).
// The kernel-argument layouts below are positional contracts with the
// device code; the index order must not be changed.
void exact_rhs()
{
  int c, i;
  int range_1, range_0;                    // upper loop bounds handed to the kernels
  size_t d[3], local_ws[3], global_ws[3];  // problem dims and chosen work sizes
  cl_int ecode = 0;

  //-------------------------------------------------------------------
  // Stage 1: exact_rhs1 over the full extent of each cell
  // (args: forcing buffer, cell index, then the three cell dimensions
  // in z,y,x order).  Presumably initializes the forcing array -- the
  // kernel body is not visible here.
  //-------------------------------------------------------------------
  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      ecode = clSetKernelArg(k_exact_rhs1[i], 0, sizeof(cl_mem), &m_forcing[i]);
      ecode |= clSetKernelArg(k_exact_rhs1[i], 1, sizeof(int), &c);
      ecode |= clSetKernelArg(k_exact_rhs1[i], 2, sizeof(int), &cell_size[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs1[i], 3, sizeof(int), &cell_size[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs1[i], 4, sizeof(int), &cell_size[i][c][0]);
      clu_CheckError(ecode, "clSetKernelArg() for exact_rhs1");

      d[0] = cell_size[i][c][1];
      d[1] = cell_size[i][c][2];
      compute_ws_dim_2(d, local_ws, global_ws);

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_exact_rhs1[i],
                                     2, NULL,
                                     global_ws,
                                     local_ws,
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for exact_rhs1");
    }
    // Synchronize all devices before moving to the next cell.
    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }
  }

  //-------------------------------------------------------------------
  // Stage 2: exact_rhs2.  Buffer arguments (slots 0-1) are cell-
  // invariant and set once per device; per-cell scalars go in 2-12.
  //-------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    ecode = clSetKernelArg(k_exact_rhs2[i], 0, sizeof(cl_mem), &m_forcing[i]);
    ecode |= clSetKernelArg(k_exact_rhs2[i], 1, sizeof(cl_mem), &m_ce[i]);
    clu_CheckError(ecode, "clSetKernelArg() for exact_rhs2");
  }
  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      // Interior extent in dims 2 and 1 (cell size minus end padding).
      range_1 = cell_size[i][c][2] - cell_end[i][c][2];
      range_0 = cell_size[i][c][1] - cell_end[i][c][1];

      ecode = clSetKernelArg(k_exact_rhs2[i], 2, sizeof(int), &c);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 3, sizeof(int), &cell_start[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 4, sizeof(int), &range_1);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 5, sizeof(int), &cell_start[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 6, sizeof(int), &range_0);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 7, sizeof(int), &cell_size[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 8, sizeof(int), &cell_start[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 9, sizeof(int), &cell_end[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 10, sizeof(int), &cell_low[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 11, sizeof(int), &cell_low[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 12, sizeof(int), &cell_low[i][c][0]);
      clu_CheckError(ecode, "clSetKernelArg() for exact_rhs2");

      // Launch only over the interior (size minus start and end padding).
      d[0] = cell_size[i][c][1] - cell_start[i][c][1] - cell_end[i][c][1];
      d[1] = cell_size[i][c][2] - cell_start[i][c][2] - cell_end[i][c][2];
      compute_ws_dim_2(d, local_ws, global_ws);

      // For the first cell, make sure the shared arg slots set above are
      // not still in use by a previous launch on this queue.
      if (c == 0) CHECK_FINISH(i * 2);
      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_exact_rhs2[i],
                                     2, NULL,
                                     global_ws,
                                     local_ws,
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for exact_rhs2");
    }
    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }
  }

  //-------------------------------------------------------------------
  // Stage 3: exact_rhs3 -- same pattern as stage 2, but the interior is
  // taken over dims 0 and 2, with dim-1 bounds passed in slots 7-9.
  //-------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    ecode = clSetKernelArg(k_exact_rhs3[i], 0, sizeof(cl_mem), &m_forcing[i]);
    ecode |= clSetKernelArg(k_exact_rhs3[i], 1, sizeof(cl_mem), &m_ce[i]);
    clu_CheckError(ecode, "clSetKernelArg() for exact_rhs3");
  }
  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      range_1 = cell_size[i][c][2] - cell_end[i][c][2];
      range_0 = cell_size[i][c][0] - cell_end[i][c][0];

      ecode = clSetKernelArg(k_exact_rhs3[i], 2, sizeof(int), &c);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 3, sizeof(int), &cell_start[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 4, sizeof(int), &range_1);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 5, sizeof(int), &cell_start[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 6, sizeof(int), &range_0);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 7, sizeof(int), &cell_size[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 8, sizeof(int), &cell_start[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 9, sizeof(int), &cell_end[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 10, sizeof(int), &cell_low[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 11, sizeof(int), &cell_low[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 12, sizeof(int), &cell_low[i][c][0]);
      clu_CheckError(ecode, "clSetKernelArg() for exact_rhs3");

      d[0] = cell_size[i][c][0] - cell_start[i][c][0] - cell_end[i][c][0];
      d[1] = cell_size[i][c][2] - cell_start[i][c][2] - cell_end[i][c][2];
      compute_ws_dim_2(d, local_ws, global_ws);

      if (c == 0) CHECK_FINISH(i * 2);
      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_exact_rhs3[i],
                                     2, NULL,
                                     global_ws,
                                     local_ws,
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for exact_rhs3");
    }
    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }
  }

  //-------------------------------------------------------------------
  // Stage 4: exact_rhs4 -- interior over dims 0 and 1, dim-2 bounds in
  // slots 7-9.
  //-------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    ecode = clSetKernelArg(k_exact_rhs4[i], 0, sizeof(cl_mem), &m_forcing[i]);
    ecode |= clSetKernelArg(k_exact_rhs4[i], 1, sizeof(cl_mem), &m_ce[i]);
    clu_CheckError(ecode, "clSetKernelArg() for exact_rhs4");
  }
  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      range_1 = cell_size[i][c][1] - cell_end[i][c][1];
      range_0 = cell_size[i][c][0] - cell_end[i][c][0];

      ecode = clSetKernelArg(k_exact_rhs4[i], 2, sizeof(int), &c);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 3, sizeof(int), &cell_start[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 4, sizeof(int), &range_1);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 5, sizeof(int), &cell_start[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 6, sizeof(int), &range_0);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 7, sizeof(int), &cell_size[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 8, sizeof(int), &cell_start[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 9, sizeof(int), &cell_end[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 10, sizeof(int), &cell_low[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 11, sizeof(int), &cell_low[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 12, sizeof(int), &cell_low[i][c][0]);
      clu_CheckError(ecode, "clSetKernelArg() for exact_rhs4");

      d[0] = cell_size[i][c][0] - cell_start[i][c][0] - cell_end[i][c][0];
      d[1] = cell_size[i][c][1] - cell_start[i][c][1] - cell_end[i][c][1];
      compute_ws_dim_2(d, local_ws, global_ws);

      if (c == 0) CHECK_FINISH(i * 2);
      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_exact_rhs4[i],
                                     2, NULL,
                                     global_ws,
                                     local_ws,
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for exact_rhs4");
    }
    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }
  }

  //-------------------------------------------------------------------
  // Stage 5: exact_rhs5 -- final pass; no m_ce argument, and all args
  // (0-8) are set per cell.
  //-------------------------------------------------------------------
  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      range_1 = cell_size[i][c][2] - cell_end[i][c][2];
      range_0 = cell_size[i][c][1] - cell_end[i][c][1];

      ecode = clSetKernelArg(k_exact_rhs5[i], 0, sizeof(cl_mem), &m_forcing[i]);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 1, sizeof(int), &c);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 2, sizeof(int), &cell_start[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 3, sizeof(int), &range_1);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 4, sizeof(int), &cell_start[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 5, sizeof(int), &range_0);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 6, sizeof(int), &cell_size[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 7, sizeof(int), &cell_start[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 8, sizeof(int), &cell_end[i][c][0]);
      clu_CheckError(ecode, "clSetKernelArg() for exact_rhs5");

      d[0] = cell_size[i][c][1] - cell_start[i][c][1] - cell_end[i][c][1];
      d[1] = cell_size[i][c][2] - cell_start[i][c][2] - cell_end[i][c][2];
      compute_ws_dim_2(d, local_ws, global_ws);

      if (c == 0) CHECK_FINISH(i * 2);
      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     k_exact_rhs5[i],
                                     2, NULL,
                                     global_ws,
                                     local_ws,
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for exact_rhs5");
    }
    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }
  }

  // Final barrier: make sure every device's compute queue has drained.
  for (i = 0; i < num_devices; i++) CHECK_FINISH(i * 2);
}
// Host-side driver for the right-hand-side computation: launches the
// compute_rhs1..compute_rhs6 kernels on every device.  Each loop first
// derives a local/global work size (clamping each dimension by the
// device's work_item_sizes / max_work_group_size budget in order), then
// binds the kernel's positional arguments and enqueues.  The argument
// index order is a contract with the device code and must not change.
// No intermediate synchronization: kernels on the same in-order queue
// run back to back; CHECK_FINISH() at the end waits for completion.
void compute_rhs()
{
  int i;
  size_t d0_size, d1_size, d2_size;
  cl_int ecode;

  if (timeron) timer_start(t_rhs);

  //-------------------------------------------------------------------
  // compute the reciprocal of density, and the kinetic energy,
  // and the speed of sound.
  //-------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    size_t compute_rhs1_lws[3], compute_rhs1_gws[3], temp;

    if (COMPUTE_RHS1_DIM == 3) {
      // NOTE(review): the "+ 2" widens the range beyond the cell size,
      // presumably to cover ghost/boundary layers -- confirm against the
      // compute_rhs1 kernel.
      d0_size = max_cell_size[i][1] + 2;
      d1_size = max_cell_size[i][2] + 2;
      d2_size = ncells;

      compute_rhs1_lws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0];
      temp = max_work_group_size / compute_rhs1_lws[0];
      compute_rhs1_lws[1] = d1_size < temp ? d1_size : temp;
      temp = temp / compute_rhs1_lws[1];
      compute_rhs1_lws[2] = d2_size < temp ? d2_size : temp;

      compute_rhs1_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs1_lws[0]);
      compute_rhs1_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs1_lws[1]);
      compute_rhs1_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs1_lws[2]);
    } else {
      // 2-D variant: iterate (dim-2 extent, cell index) only.
      d0_size = max_cell_size[i][2] + 2;
      d1_size = ncells;

      compute_rhs1_lws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0];
      temp = max_work_group_size / compute_rhs1_lws[0];
      compute_rhs1_lws[1] = d1_size < temp ? d1_size : temp;

      compute_rhs1_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs1_lws[0]);
      compute_rhs1_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs1_lws[1]);
    }

    // Args: state u, then the derived fields written by the kernel
    // (us, vs, ws, qs, rho_i, square), cell geometry, and cell count.
    ecode = clSetKernelArg(k_compute_rhs1[i], 0, sizeof(cl_mem), &m_u[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 1, sizeof(cl_mem), &m_us[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 2, sizeof(cl_mem), &m_vs[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 3, sizeof(cl_mem), &m_ws[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 4, sizeof(cl_mem), &m_qs[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 5, sizeof(cl_mem), &m_rho_i[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 6, sizeof(cl_mem), &m_square[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 7, sizeof(cl_mem), &m_cell_size[i]);
    ecode |= clSetKernelArg(k_compute_rhs1[i], 8, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_compute_rhs1[i],
                                   COMPUTE_RHS1_DIM, NULL,
                                   compute_rhs1_gws,
                                   compute_rhs1_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-------------------------------------------------------------------

  //-------------------------------------------------------------------
  // copy the exact forcing term to the right hand side;  because
  // this forcing term is known, we can store it on the whole of every
  // cell,  including the boundary
  //-------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    size_t compute_rhs2_lws[3], compute_rhs2_gws[3], temp;

    if (COMPUTE_RHS2_DIM == 3) {
      d0_size = max_cell_size[i][1];
      d1_size = max_cell_size[i][2];
      d2_size = ncells;

      compute_rhs2_lws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0];
      temp = max_work_group_size / compute_rhs2_lws[0];
      compute_rhs2_lws[1] = d1_size < temp ? d1_size : temp;
      temp = temp / compute_rhs2_lws[1];
      compute_rhs2_lws[2] = d2_size < temp ? d2_size : temp;

      compute_rhs2_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs2_lws[0]);
      compute_rhs2_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs2_lws[1]);
      compute_rhs2_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs2_lws[2]);
    } else {
      d0_size = max_cell_size[i][2];
      d1_size = ncells;

      compute_rhs2_lws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0];
      temp = max_work_group_size / compute_rhs2_lws[0];
      compute_rhs2_lws[1] = d1_size < temp ? d1_size : temp;

      compute_rhs2_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs2_lws[0]);
      compute_rhs2_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs2_lws[1]);
    }

    ecode = clSetKernelArg(k_compute_rhs2[i], 0, sizeof(cl_mem), &m_forcing[i]);
    ecode |= clSetKernelArg(k_compute_rhs2[i], 1, sizeof(cl_mem), &m_rhs[i]);
    ecode |= clSetKernelArg(k_compute_rhs2[i], 2, sizeof(cl_mem), &m_cell_size[i]);
    ecode |= clSetKernelArg(k_compute_rhs2[i], 3, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_compute_rhs2[i],
                                   COMPUTE_RHS2_DIM, NULL,
                                   compute_rhs2_gws,
                                   compute_rhs2_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-------------------------------------------------------------------

  //-------------------------------------------------------------------
  // compute xi-direction fluxes
  //-------------------------------------------------------------------
  // Stages 3-5 share the same 12-argument layout and always launch 3-D;
  // they differ only in which two cell dimensions span the NDRange.
  for (i = 0; i < num_devices; i++) {
    size_t compute_rhs3_lws[3], compute_rhs3_gws[3], temp;

    d0_size = max_cell_size[i][1];
    d1_size = max_cell_size[i][2];
    d2_size = ncells;

    compute_rhs3_lws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0];
    temp = max_work_group_size / compute_rhs3_lws[0];
    compute_rhs3_lws[1] = d1_size < temp ? d1_size : temp;
    temp = temp / compute_rhs3_lws[1];
    compute_rhs3_lws[2] = d2_size < temp ? d2_size : temp;

    compute_rhs3_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs3_lws[0]);
    compute_rhs3_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs3_lws[1]);
    compute_rhs3_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs3_lws[2]);

    ecode = clSetKernelArg(k_compute_rhs3[i], 0, sizeof(cl_mem), &m_u[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 1, sizeof(cl_mem), &m_us[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 2, sizeof(cl_mem), &m_vs[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 3, sizeof(cl_mem), &m_ws[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 4, sizeof(cl_mem), &m_qs[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 5, sizeof(cl_mem), &m_rho_i[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 6, sizeof(cl_mem), &m_square[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 7, sizeof(cl_mem), &m_rhs[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 8, sizeof(cl_mem), &m_cell_size[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 9, sizeof(cl_mem), &m_start[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 10, sizeof(cl_mem), &m_end[i]);
    ecode |= clSetKernelArg(k_compute_rhs3[i], 11, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_compute_rhs3[i],
                                   3, NULL,
                                   compute_rhs3_gws,
                                   compute_rhs3_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-------------------------------------------------------------------

  //-------------------------------------------------------------------
  // compute eta-direction fluxes
  //-------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    size_t compute_rhs4_lws[3], compute_rhs4_gws[3], temp;

    // Range spans cell dims 0 and 2 for the eta direction.
    d0_size = max_cell_size[i][0];
    d1_size = max_cell_size[i][2];
    d2_size = ncells;

    compute_rhs4_lws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0];
    temp = max_work_group_size / compute_rhs4_lws[0];
    compute_rhs4_lws[1] = d1_size < temp ? d1_size : temp;
    temp = temp / compute_rhs4_lws[1];
    compute_rhs4_lws[2] = d2_size < temp ? d2_size : temp;

    compute_rhs4_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs4_lws[0]);
    compute_rhs4_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs4_lws[1]);
    compute_rhs4_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs4_lws[2]);

    ecode = clSetKernelArg(k_compute_rhs4[i], 0, sizeof(cl_mem), &m_u[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 1, sizeof(cl_mem), &m_us[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 2, sizeof(cl_mem), &m_vs[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 3, sizeof(cl_mem), &m_ws[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 4, sizeof(cl_mem), &m_qs[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 5, sizeof(cl_mem), &m_rho_i[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 6, sizeof(cl_mem), &m_square[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 7, sizeof(cl_mem), &m_rhs[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 8, sizeof(cl_mem), &m_cell_size[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 9, sizeof(cl_mem), &m_start[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 10, sizeof(cl_mem), &m_end[i]);
    ecode |= clSetKernelArg(k_compute_rhs4[i], 11, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_compute_rhs4[i],
                                   3, NULL,
                                   compute_rhs4_gws,
                                   compute_rhs4_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-------------------------------------------------------------------

  //-------------------------------------------------------------------
  // compute zeta-direction fluxes
  //-------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    size_t compute_rhs5_lws[3], compute_rhs5_gws[3], temp;

    // Range spans cell dims 0 and 1 for the zeta direction.
    d0_size = max_cell_size[i][0];
    d1_size = max_cell_size[i][1];
    d2_size = ncells;

    compute_rhs5_lws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0];
    temp = max_work_group_size / compute_rhs5_lws[0];
    compute_rhs5_lws[1] = d1_size < temp ? d1_size : temp;
    temp = temp / compute_rhs5_lws[1];
    compute_rhs5_lws[2] = d2_size < temp ? d2_size : temp;

    compute_rhs5_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs5_lws[0]);
    compute_rhs5_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs5_lws[1]);
    compute_rhs5_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs5_lws[2]);

    ecode = clSetKernelArg(k_compute_rhs5[i], 0, sizeof(cl_mem), &m_u[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 1, sizeof(cl_mem), &m_us[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 2, sizeof(cl_mem), &m_vs[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 3, sizeof(cl_mem), &m_ws[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 4, sizeof(cl_mem), &m_qs[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 5, sizeof(cl_mem), &m_rho_i[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 6, sizeof(cl_mem), &m_square[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 7, sizeof(cl_mem), &m_rhs[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 8, sizeof(cl_mem), &m_cell_size[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 9, sizeof(cl_mem), &m_start[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 10, sizeof(cl_mem), &m_end[i]);
    ecode |= clSetKernelArg(k_compute_rhs5[i], 11, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_compute_rhs5[i],
                                   3, NULL,
                                   compute_rhs5_gws,
                                   compute_rhs5_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-------------------------------------------------------------------

  //-------------------------------------------------------------------
  // Stage 6: final pass over m_rhs (2-D or 3-D per COMPUTE_RHS6_DIM).
  //-------------------------------------------------------------------
  for (i = 0; i < num_devices; i++) {
    size_t compute_rhs6_lws[3], compute_rhs6_gws[3], temp;

    if (COMPUTE_RHS6_DIM == 3) {
      d0_size = max_cell_size[i][1];
      d1_size = max_cell_size[i][2];
      d2_size = ncells;

      compute_rhs6_lws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0];
      temp = max_work_group_size / compute_rhs6_lws[0];
      compute_rhs6_lws[1] = d1_size < temp ? d1_size : temp;
      temp = temp / compute_rhs6_lws[1];
      compute_rhs6_lws[2] = d2_size < temp ? d2_size : temp;

      compute_rhs6_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs6_lws[0]);
      compute_rhs6_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs6_lws[1]);
      compute_rhs6_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs6_lws[2]);
    } else {
      d0_size = max_cell_size[i][2];
      d1_size = ncells;

      compute_rhs6_lws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0];
      temp = max_work_group_size / compute_rhs6_lws[0];
      compute_rhs6_lws[1] = d1_size < temp ? d1_size : temp;

      compute_rhs6_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs6_lws[0]);
      compute_rhs6_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs6_lws[1]);
    }

    ecode = clSetKernelArg(k_compute_rhs6[i], 0, sizeof(cl_mem), &m_rhs[i]);
    ecode |= clSetKernelArg(k_compute_rhs6[i], 1, sizeof(cl_mem), &m_cell_size[i]);
    ecode |= clSetKernelArg(k_compute_rhs6[i], 2, sizeof(cl_mem), &m_start[i]);
    ecode |= clSetKernelArg(k_compute_rhs6[i], 3, sizeof(cl_mem), &m_end[i]);
    ecode |= clSetKernelArg(k_compute_rhs6[i], 4, sizeof(int), &ncells);
    clu_CheckError(ecode, "clSetKernelArg()");

    ecode = clEnqueueNDRangeKernel(cmd_queue[i],
                                   k_compute_rhs6[i],
                                   COMPUTE_RHS6_DIM, NULL,
                                   compute_rhs6_gws,
                                   compute_rhs6_lws,
                                   0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel()");
  }
  //-------------------------------------------------------------------

  // Wait for all enqueued kernels before stopping the timer.
  CHECK_FINISH();

  if (timeron) timer_stop(t_rhs);
}
//--------------------------------------------------------------------- // this function copies the face values of a variable defined on a set // of cells to the overlap locations of the adjacent sets of cells. // Because a set of cells interfaces in each direction with exactly one // other set, we only need to fill six different buffers. We could try to // overlap communication with computation, by computing // some internal values while communicating boundary values, but this // adds so much overhead that it's not clearly useful. //--------------------------------------------------------------------- void copy_faces() { int c, i; cl_int ecode = 0; //--------------------------------------------------------------------- // exit immediately if there are no faces to be copied //--------------------------------------------------------------------- if (num_devices == 1) { compute_rhs(); return; } //--------------------------------------------------------------------- // because the difference stencil for the diagonalized scheme is // orthogonal, we do not have to perform the staged copying of faces, // but can send all face information simultaneously to the neighboring // cells in all directions //--------------------------------------------------------------------- if (timeron) timer_start(t_bpack); for (c = 0; c < ncells; c++) { for (i = 0; i < num_devices; i++) { ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_copy_faces1[i][c], COPY_FACES1_DIM, NULL, copy_faces1_gw[i][c], copy_faces1_lw[i][c], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces1"); } for (i = 0; i < num_devices; i++) { ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_copy_faces2[i][c], COPY_FACES2_DIM, NULL, copy_faces2_gw[i][c], copy_faces2_lw[i][c], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces2"); ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_copy_faces3[i][c], COPY_FACES3_DIM, NULL, copy_faces3_gw[i][c], copy_faces3_lw[i][c], 0, NULL, 
NULL); clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces3"); } for (i = 0; i < num_devices; i++) { CHECK_FINISH(i * 2); } } if (timeron) timer_stop(t_bpack); if (timeron) timer_start(t_exch); for (i = 0; i < num_devices; i++) { CHECK_FINISH(i * 2); ecode = clEnqueueCopyBuffer(cmd_queue[successor[i][0] * 2 + 1], m_out_buffer[i], m_in_buffer[successor[i][0]], start_send_east[i]*sizeof(double), start_recv_west[successor[i][0]]*sizeof(double), east_size[i]*sizeof(double), 0, NULL, NULL); CHECK_FINISH(successor[i][0] * 2 + 1); } for (i = 0; i < num_devices; i++) { ecode = clEnqueueCopyBuffer(cmd_queue[predecessor[i][0] * 2 + 1], m_out_buffer[i], m_in_buffer[predecessor[i][0]], start_send_west[i]*sizeof(double), start_recv_east[predecessor[i][0]]*sizeof(double), west_size[i]*sizeof(double), 0, NULL, NULL); CHECK_FINISH(predecessor[i][0] * 2 + 1); ecode = clEnqueueCopyBuffer(cmd_queue[successor[i][1] * 2 + 1], m_out_buffer[i], m_in_buffer[successor[i][1]], start_send_north[i]*sizeof(double), start_recv_south[successor[i][1]]*sizeof(double), north_size[i]*sizeof(double), 0, NULL, NULL); CHECK_FINISH(successor[i][1] * 2 + 1); ecode = clEnqueueCopyBuffer(cmd_queue[predecessor[i][1] * 2 + 1], m_out_buffer[i], m_in_buffer[predecessor[i][1]], start_send_south[i]*sizeof(double), start_recv_north[predecessor[i][1]]*sizeof(double), south_size[i]*sizeof(double), 0, NULL, NULL); CHECK_FINISH(predecessor[i][1] * 2 + 1); ecode = clEnqueueCopyBuffer(cmd_queue[successor[i][2] * 2 + 1], m_out_buffer[i], m_in_buffer[successor[i][2]], start_send_top[i]*sizeof(double), start_recv_bottom[successor[i][2]]*sizeof(double), top_size[i]*sizeof(double), 0, NULL, NULL); CHECK_FINISH(successor[i][2] * 2 + 1); ecode = clEnqueueCopyBuffer(cmd_queue[predecessor[i][2] * 2 + 1], m_out_buffer[i], m_in_buffer[predecessor[i][2]], start_send_bottom[i]*sizeof(double), start_recv_top[predecessor[i][2]]*sizeof(double), bottom_size[i]*sizeof(double), 0, NULL, NULL); CHECK_FINISH(predecessor[i][2] * 2 + 
1); } if (timeron) timer_stop(t_exch); //--------------------------------------------------------------------- // unpack the data that has just been received; //--------------------------------------------------------------------- if (timeron) timer_start(t_bpack); for (c = 0; c < ncells; c++) { for (i = 0; i < num_devices; i++) { if (c == 0) CHECK_FINISH(i * 2 + 1); ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_copy_faces4[i][c], COPY_FACES4_DIM, NULL, copy_faces4_gw[i][c], copy_faces4_lw[i][c], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces4"); } for (i = 0; i < num_devices; i++) { ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_copy_faces5[i][c], COPY_FACES5_DIM, NULL, copy_faces5_gw[i][c], copy_faces5_lw[i][c], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces5"); ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_copy_faces6[i][c], COPY_FACES6_DIM, NULL, copy_faces6_gw[i][c], copy_faces6_lw[i][c], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces6"); } for (i = 0; i < num_devices; i++) { CHECK_FINISH(i * 2); } } if (timeron) timer_stop(t_bpack); for (i = 0; i < num_devices; i++) CHECK_FINISH(i * 2); //--------------------------------------------------------------------- // now that we have all the data, compute the rhs //--------------------------------------------------------------------- compute_rhs(); }
void opencl_info() { cl_int err_code; cl_platform_id *platforms; cl_device_type device_type; cl_uint num_devices; cl_device_id *devices; // Get OpenCL platforms // - Get the number of available platforms cl_uint num_platforms; err_code = clGetPlatformIDs(0, NULL, &num_platforms); clu_CheckError(err_code, "clGetPlatformIDs() for num_platforms"); if (num_platforms == 0) { fprintf(stderr, "No OpenCL platform!\n"); exit(EXIT_FAILURE); } // - Get platform IDs platforms = (cl_platform_id *)malloc(num_platforms*sizeof(cl_platform_id)); err_code = clGetPlatformIDs(num_platforms, platforms, NULL); clu_CheckError(err_code, "clGetPlatformIDs()"); // Get platform informations printf("\nNumber of platforms: %u\n\n", num_platforms); char tmp_buf[1024]; for (cl_uint i = 0; i < num_platforms; i++) { printf("platform: %u\n", i); err_code = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 1024, &tmp_buf, NULL); clu_CheckError(err_code, "clGetPlatformInfo() for CL_PLATFORM_NAME"); printf("- CL_PLATFORM_NAME : %s\n", tmp_buf); err_code = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 1024, &tmp_buf, NULL); clu_CheckError(err_code, "clGetPlatformInfo() for CL_PLATFORM_VENDOR"); printf("- CL_PLATFORM_VENDOR : %s\n", tmp_buf); err_code = clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 1024, &tmp_buf, NULL); clu_CheckError(err_code, "clGetPlatformInfo() for CL_PLATFORM_PROFILE"); printf("- CL_PLATFORM_PROFILE : %s\n", tmp_buf); err_code = clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 1024, &tmp_buf, NULL); clu_CheckError(err_code, "clGetPlatformInfo() for CL_PLATFORM_VERSION"); printf("- CL_PLATFORM_VERSION : %s\n", tmp_buf); err_code = clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 1024, &tmp_buf, NULL); clu_CheckError(err_code,"clGetPlatformInfo() for CL_PLATFORM_EXTENSIONS"); printf("- CL_PLATFORM_EXTENSIONS: %s\n", tmp_buf); printf("\n"); // Get the number of devices err_code = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); 
clu_CheckError(err_code, "clGetDeviceIDs for num_devices"); if (num_devices == 0) { fprintf(stderr, "No OpenCL device in this platform!\n"); exit(EXIT_FAILURE); } printf("Number of devices: %u\n", num_devices); // Get the default device cl_device_id default_device; cl_uint num_defaults; err_code = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_DEFAULT, 1, &default_device, &num_defaults); clu_CheckError(err_code, "clGetDeviceIDs() for CL_DEVICE_TYPE_DEFAULT"); if (num_defaults != 1) { printf("- # of default devices: %u\n", num_defaults); } // Get device IDs devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); err_code = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); clu_CheckError(err_code, "clGetDeviceIDs()"); for (cl_uint k = 0; k < num_devices; k++) { printf("device: %u (", k); err_code = clGetDeviceInfo(devices[k], CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL); if (device_type & CL_DEVICE_TYPE_CPU) printf("CL_DEVICE_TYPE_CPU"); if (device_type & CL_DEVICE_TYPE_GPU) printf("CL_DEVICE_TYPE_GPU"); if (device_type & CL_DEVICE_TYPE_ACCELERATOR) printf("CL_DEVICE_TYPE_ACCELERATOR"); if (device_type & CL_DEVICE_TYPE_DEFAULT) printf("CL_DEVICE_TYPE_DEFAULT"); printf(")"); if (default_device == devices[k]) printf(" default"); printf("\n"); err_code = clGetDeviceInfo(devices[k], CL_DEVICE_NAME, 1024, tmp_buf, NULL); printf(" - CL_DEVICE_NAME : %s\n", tmp_buf); err_code = clGetDeviceInfo(devices[k], CL_DEVICE_VENDOR, 1024, tmp_buf, NULL); printf(" - CL_DEVICE_VENDOR : %s\n", tmp_buf); err_code = clGetDeviceInfo(devices[k], CL_DRIVER_VERSION, 1024, tmp_buf, NULL); printf(" - CL_DRIVER_VERSION : %s\n", tmp_buf); err_code = clGetDeviceInfo(devices[k], CL_DEVICE_PROFILE, 1024, tmp_buf, NULL); printf(" - CL_DEVICE_PROFILE : %s\n", tmp_buf); err_code = clGetDeviceInfo(devices[k], CL_DEVICE_VERSION, 1024, tmp_buf, NULL); printf(" - CL_DEVICE_VERSION : %s\n", tmp_buf); err_code = clGetDeviceInfo(devices[k], 
CL_DEVICE_EXTENSIONS, 1024, tmp_buf, NULL); //CL_DEVICE_MAX_COMPUTE_UNITS //CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS //CL_DEVICE_MAX_WORK_GROUP_SIZE //CL_DEVICE_MAX_WORK_ITEM_SIZES // cl_uint usize; err_code = clGetDeviceInfo(devices[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(usize), &usize, NULL); printf(" - CL_DEVICE_MAX_COMPUTE_UNITS : %d\n", usize); err_code = clGetDeviceInfo(devices[k], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(usize), &usize, NULL); printf(" - CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS : %d\n", usize); size_t size; err_code = clGetDeviceInfo(devices[k], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size), &size, NULL); printf(" - CL_DEVICE_MAX_WORK_GROUP_SIZE : %d\n",size); err_code = clGetDeviceInfo(devices[k], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size), &size, NULL); printf(" - CL_DEVICE_MAX_WORK_ITEM_SIZES : %d\n", size); printf("\n"); } free(devices); printf("\n"); } free(platforms); }
//---------------------------------------------------------------------
// IS benchmark driver: seeds the verification arrays for the chosen
// CLASS, sets up OpenCL, generates the key sequence, runs one warm-up
// rank() plus MAX_ITERATIONS timed iterations, verifies, and prints
// the standard NPB result block.
//---------------------------------------------------------------------
int main( int argc, char **argv )
{
  int i, iteration;
  double timecounter;
  FILE *fp;
  cl_int ecode;

  // kernel source directory is the single required argument
  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
    exit(-1);
  }

  /* Initialize timers: optional timers enabled by presence of timer.flag */
  timer_on = 0;
  if ((fp = fopen("timer.flag", "r")) != NULL) {
    fclose(fp);
    timer_on = 1;
  }
  timer_clear( 0 );
  if (timer_on) {
    timer_clear( 1 );
    timer_clear( 2 );
    timer_clear( 3 );
  }

  if (timer_on) timer_start( 3 );  // timer 3 = total execution

  /* Initialize the verification arrays if a valid class */
  for( i=0; i<TEST_ARRAY_SIZE; i++ )
    switch( CLASS ) {
      case 'S':
        test_index_array[i] = S_test_index_array[i];
        test_rank_array[i] = S_test_rank_array[i];
        break;
      case 'A':
        test_index_array[i] = A_test_index_array[i];
        test_rank_array[i] = A_test_rank_array[i];
        break;
      case 'W':
        test_index_array[i] = W_test_index_array[i];
        test_rank_array[i] = W_test_rank_array[i];
        break;
      case 'B':
        test_index_array[i] = B_test_index_array[i];
        test_rank_array[i] = B_test_rank_array[i];
        break;
      case 'C':
        test_index_array[i] = C_test_index_array[i];
        test_rank_array[i] = C_test_rank_array[i];
        break;
      case 'D':
        test_index_array[i] = D_test_index_array[i];
        test_rank_array[i] = D_test_rank_array[i];
        break;
    };

  /* set up the OpenCL environment. */
  setup_opencl(argc, argv);

  /* Printout initial NPB info */
  printf( "\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - IS Benchmark\n\n" );
  printf( " Size: %ld (class %c)\n", (long)TOTAL_KEYS, CLASS );
  printf( " Iterations: %d\n", MAX_ITERATIONS );

  if (timer_on) timer_start( 1 );  // timer 1 = initialization

  /* Generate random number sequence and subsequent keys on all procs */
  create_seq( 314159265.00,   /* Random number gen seed */
              1220703125.00 );/* Random number gen mult */
  if (timer_on) timer_stop( 1 );

  /* Do one interation for free (i.e., untimed) to guarantee
     initialization of all data and code pages and respective tables */
  rank( 1 );

  /* Start verification counter */
  passed_verification = 0;
  // the counter lives in a device buffer during the run; push the zero
  DTIMER_START(T_BUFFER_WRITE);
  ecode = clEnqueueWriteBuffer(cmd_queue, m_passed_verification,
                               CL_TRUE, 0, sizeof(cl_int),
                               &passed_verification, 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueWriteBuffer() for m_passed_verification");
  DTIMER_STOP(T_BUFFER_WRITE);

  if( CLASS != 'S' ) printf( "\n iteration\n" );

  /* Start timer */
  timer_start( 0 );  // timer 0 = benchmarked region

  /* This is the main iteration */
  for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) {
    if( CLASS != 'S' ) printf( " %d\n", iteration );
    rank( iteration );
  }

  // pull the verification counter back from the device (blocking read)
  DTIMER_START(T_BUFFER_READ);
  ecode = clEnqueueReadBuffer(cmd_queue, m_passed_verification,
                              CL_TRUE, 0, sizeof(cl_int),
                              &passed_verification, 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueReadBuffer() for m_passed_verification");
  DTIMER_STOP(T_BUFFER_READ);

  /* End of timing, obtain maximum time of all processors */
  timer_stop( 0 );
  timecounter = timer_read( 0 );

  /* This tests that keys are in sequence: sorting of last ranked key seq
     occurs here, but is an untimed operation */
  if (timer_on) timer_start( 2 );  // timer 2 = sorting/verification
  full_verify();
  if (timer_on) timer_stop( 2 );

  if (timer_on) timer_stop( 3 );

  /* The final printout */
  // each rank() contributes 5 partial-verification passes, plus 1 from
  // full_verify(); anything else counts as total failure
  if( passed_verification != 5*MAX_ITERATIONS + 1 )
    passed_verification = 0;
  c_print_results( "IS", CLASS, (int)(TOTAL_KEYS/64), 64, 0,
                   MAX_ITERATIONS, timecounter,
                   ((double)(MAX_ITERATIONS*TOTAL_KEYS))
                                             /timecounter/1000000.,
                   "keys ranked", passed_verification,
                   NPBVERSION, COMPILETIME, CC, CLINK, C_LIB, C_INC,
                   CFLAGS, CLINKFLAGS, "",
                   clu_GetDeviceTypeName(device_type), device_name);

  /* Print additional timers */
  if (timer_on) {
    double t_total, t_percent;
    t_total = timer_read( 3 );
    printf("\nAdditional timers -\n");
    printf(" Total execution: %8.3f\n", t_total);
    if (t_total == 0.0) t_total = 1.0;  // avoid division by zero below
    timecounter = timer_read(1);
    t_percent = timecounter/t_total * 100.;
    printf(" Initialization : %8.3f (%5.2f%%)\n", timecounter, t_percent);
    timecounter = timer_read(0);
    t_percent = timecounter/t_total * 100.;
    printf(" Benchmarking : %8.3f (%5.2f%%)\n", timecounter, t_percent);
    timecounter = timer_read(2);
    t_percent = timecounter/t_total * 100.;
    printf(" Sorting : %8.3f (%5.2f%%)\n", timecounter, t_percent);
  }

  release_opencl();

  fflush(stdout);

  return 0;
  /**************************/
} /* E N D P R O G R A M */
void full_verify( void ) { cl_kernel k_fv1, k_fv2; cl_mem m_j; INT_TYPE *g_j; INT_TYPE j = 0, i; size_t j_size; size_t fv1_lws[1], fv1_gws[1]; size_t fv2_lws[1], fv2_gws[1]; cl_int ecode; DTIMER_START(T_BUFFER_CREATE); // Create buffers j_size = sizeof(INT_TYPE) * (FV2_GLOBAL_SIZE / FV2_GROUP_SIZE); m_j = clCreateBuffer(context, CL_MEM_READ_WRITE, j_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer for m_j"); DTIMER_STOP(T_BUFFER_CREATE); DTIMER_START(T_OPENCL_API); // Create kernels k_fv1 = clCreateKernel(program, "full_verify1", &ecode); clu_CheckError(ecode, "clCreateKernel() for full_verify1"); k_fv2 = clCreateKernel(program, "full_verify2", &ecode); clu_CheckError(ecode, "clCreateKernel() for full_verify2"); DTIMER_STOP(T_OPENCL_API); if (device_type == CL_DEVICE_TYPE_GPU) { cl_kernel k_fv0; size_t fv0_lws[1], fv0_gws[1]; DTIMER_START(T_OPENCL_API); // Create kernels k_fv0 = clCreateKernel(program, "full_verify0", &ecode); clu_CheckError(ecode, "clCreateKernel() for full_verify0"); DTIMER_STOP(T_OPENCL_API); // Kernel execution DTIMER_START(T_KERNEL_FV0); ecode = clSetKernelArg(k_fv0, 0, sizeof(cl_mem), (void*)&m_key_array); ecode |= clSetKernelArg(k_fv0, 1, sizeof(cl_mem), (void*)&m_key_buff2); clu_CheckError(ecode, "clSetKernelArg() for full_verify0"); fv0_lws[0] = work_item_sizes[0]; fv0_gws[0] = NUM_KEYS; ecode = clEnqueueNDRangeKernel(cmd_queue, k_fv0, 1, NULL, fv0_gws, fv0_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify0"); CHECK_FINISH(); DTIMER_STOP(T_KERNEL_FV0); DTIMER_START(T_KERNEL_FV1); ecode = clSetKernelArg(k_fv1, 0, sizeof(cl_mem), (void*)&m_key_buff2); ecode |= clSetKernelArg(k_fv1, 1, sizeof(cl_mem), (void*)&m_key_buff1); ecode |= clSetKernelArg(k_fv1, 2, sizeof(cl_mem), (void*)&m_key_array); clu_CheckError(ecode, "clSetKernelArg() for full_verify1"); fv1_lws[0] = work_item_sizes[0]; fv1_gws[0] = NUM_KEYS; ecode = clEnqueueNDRangeKernel(cmd_queue, k_fv1, 1, NULL, fv1_gws, fv1_lws, 0, NULL, NULL); 
clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify1"); CHECK_FINISH(); DTIMER_STOP(T_KERNEL_FV1); DTIMER_START(T_KERNEL_FV2); ecode = clSetKernelArg(k_fv2, 0, sizeof(cl_mem), (void*)&m_key_array); ecode |= clSetKernelArg(k_fv2, 1, sizeof(cl_mem), (void*)&m_j); ecode |= clSetKernelArg(k_fv2, 2, sizeof(INT_TYPE)*FV2_GROUP_SIZE, NULL); clu_CheckError(ecode, "clSetKernelArg() for full_verify2"); fv2_lws[0] = FV2_GROUP_SIZE; fv2_gws[0] = FV2_GLOBAL_SIZE; ecode = clEnqueueNDRangeKernel(cmd_queue, k_fv2, 1, NULL, fv2_gws, fv2_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify2"); CHECK_FINISH(); DTIMER_STOP(T_KERNEL_FV2); g_j = (INT_TYPE *)malloc(j_size); DTIMER_START(T_BUFFER_READ); ecode = clEnqueueReadBuffer(cmd_queue, m_j, CL_TRUE, 0, j_size, g_j, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueReadBuffer() for m_j"); DTIMER_STOP(T_BUFFER_READ); // reduction for (i = 0; i < j_size/sizeof(INT_TYPE); i++) { j += g_j[i]; } DTIMER_START(T_RELEASE); clReleaseKernel(k_fv0); DTIMER_STOP(T_RELEASE); } else { // Kernel execution DTIMER_START(T_KERNEL_FV1); ecode = clSetKernelArg(k_fv1, 0, sizeof(cl_mem), (void*)&m_bucket_ptrs); ecode |= clSetKernelArg(k_fv1, 1, sizeof(cl_mem), (void*)&m_key_buff2); ecode |= clSetKernelArg(k_fv1, 2, sizeof(cl_mem), (void*)&m_key_buff1); ecode |= clSetKernelArg(k_fv1, 3, sizeof(cl_mem), (void*)&m_key_array); clu_CheckError(ecode, "clSetKernelArg() for full_verify1"); fv1_lws[0] = RANK_GROUP_SIZE; fv1_gws[0] = RANK_GLOBAL_SIZE; ecode = clEnqueueNDRangeKernel(cmd_queue, k_fv1, 1, NULL, fv1_gws, fv1_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify1"); CHECK_FINISH(); DTIMER_STOP(T_KERNEL_FV1); DTIMER_START(T_KERNEL_FV2); ecode = clSetKernelArg(k_fv2, 0, sizeof(cl_mem), (void*)&m_key_array); ecode |= clSetKernelArg(k_fv2, 1, sizeof(cl_mem), (void*)&m_j); ecode |= clSetKernelArg(k_fv2, 2, sizeof(INT_TYPE)*FV2_GROUP_SIZE, NULL); clu_CheckError(ecode, "clSetKernelArg() 
for full_verify2"); fv2_lws[0] = FV2_GROUP_SIZE; fv2_gws[0] = FV2_GLOBAL_SIZE; ecode = clEnqueueNDRangeKernel(cmd_queue, k_fv2, 1, NULL, fv2_gws, fv2_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify2"); CHECK_FINISH(); DTIMER_STOP(T_KERNEL_FV2); g_j = (INT_TYPE *)malloc(j_size); DTIMER_START(T_BUFFER_READ); ecode = clEnqueueReadBuffer(cmd_queue, m_j, CL_TRUE, 0, j_size, g_j, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueReadBuffer() for m_j"); DTIMER_STOP(T_BUFFER_READ); // reduction for (i = 0; i < j_size/sizeof(INT_TYPE); i++) { j += g_j[i]; } } DTIMER_START(T_RELEASE); free(g_j); clReleaseMemObject(m_j); clReleaseKernel(k_fv1); clReleaseKernel(k_fv2); DTIMER_STOP(T_RELEASE); if (j != 0) printf( "Full_verify: number of keys out of sort: %ld\n", (long)j ); else passed_verification++; }
void rank( int iteration ) { size_t r1_lws[1], r1_gws[1]; size_t r2_lws[1], r2_gws[1]; size_t r3_lws[1], r3_gws[1]; cl_int ecode; DTIMER_START(T_KERNEL_RANK0); // rank0 ecode = clSetKernelArg(k_rank0, 3, sizeof(cl_int), (void*)&iteration); clu_CheckError(ecode, "clSetKernelArg() for rank0: iteration"); ecode = clEnqueueTask(cmd_queue, k_rank0, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueTask() for rank0"); CHECK_FINISH(); DTIMER_STOP(T_KERNEL_RANK0); DTIMER_START(T_KERNEL_RANK1); // rank1 r1_lws[0] = RANK1_GROUP_SIZE; r1_gws[0] = RANK1_GLOBAL_SIZE; ecode = clEnqueueNDRangeKernel(cmd_queue, k_rank1, 1, NULL, r1_gws, r1_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank1"); CHECK_FINISH(); DTIMER_STOP(T_KERNEL_RANK1); DTIMER_START(T_KERNEL_RANK2); // rank2 r2_lws[0] = RANK2_GROUP_SIZE; r2_gws[0] = RANK2_GLOBAL_SIZE; ecode = clEnqueueNDRangeKernel(cmd_queue, k_rank2, 1, NULL, r2_gws, r2_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank2"); CHECK_FINISH(); DTIMER_STOP(T_KERNEL_RANK2); DTIMER_START(T_KERNEL_RANK3); // rank3 if (device_type == CL_DEVICE_TYPE_GPU) { r3_lws[0] = work_item_sizes[0]; r3_gws[0] = work_item_sizes[0] * work_item_sizes[0]; if (r3_gws[0] > MAX_KEY) r3_gws[0] = MAX_KEY; ecode = clEnqueueNDRangeKernel(cmd_queue, k_rank3_0, 1, NULL, r3_gws, r3_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3_0"); r3_lws[0] = work_item_sizes[0]; r3_gws[0] = work_item_sizes[0]; ecode = clEnqueueNDRangeKernel(cmd_queue, k_rank3_1, 1, NULL, r3_gws, r3_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3_1"); r3_lws[0] = work_item_sizes[0]; r3_gws[0] = work_item_sizes[0] * work_item_sizes[0]; if (r3_gws[0] > MAX_KEY) r3_gws[0] = MAX_KEY; ecode = clEnqueueNDRangeKernel(cmd_queue, k_rank3_2, 1, NULL, r3_gws, r3_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3_2"); } else { r3_lws[0] = RANK_GROUP_SIZE; r3_gws[0] = RANK_GLOBAL_SIZE; 
ecode = clEnqueueNDRangeKernel(cmd_queue, k_rank3, 1, NULL, r3_gws, r3_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3"); } CHECK_FINISH(); DTIMER_STOP(T_KERNEL_RANK3); // rank4 - partial verification DTIMER_START(T_KERNEL_RANK4); ecode = clSetKernelArg(k_rank4, 4, sizeof(cl_int), (void*)&iteration); clu_CheckError(ecode, "clSetKernelArg() for rank4"); ecode = clEnqueueTask(cmd_queue, k_rank4, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueTask() for rank4"); ecode = clFinish(cmd_queue); clu_CheckError(ecode, "clFinish"); DTIMER_STOP(T_KERNEL_RANK4); }
//--------------------------------------------------------------------- // this function computes the norm of the difference between the // computed solution and the exact solution //--------------------------------------------------------------------- void error_norm(double rms[5]) { int c, i, m, d; int k, j, kk, jj; double rms_work[5]; size_t one = 1; cl_mem m_rms[MAX_DEVICE_NUM]; cl_int ecode = 0; for (m = 0; m < 5; m++) { rms[m] = 0.0; rms_work[m] = 0.0; } for (i = 0; i < num_devices; i++) { m_rms[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_rms"); ecode = clEnqueueWriteBuffer(cmd_queue[i * 2], m_rms[i], CL_TRUE, 0, sizeof(double)*5, rms_work, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueWriteBuffer() for m_rms"); } for (c = 0; c < ncells; c++) { for (i = 0; i < num_devices; i++) { kk = 2; for (k = cell_low[i][c][2]; k <= cell_high[i][c][2]; k++) { jj = 2; for (j = cell_low[i][c][1]; j <= cell_high[i][c][1]; j++) { ecode = clSetKernelArg(k_error_norm[i], 0, sizeof(cl_mem), &m_u[i]); ecode |= clSetKernelArg(k_error_norm[i], 1, sizeof(cl_mem), &m_rms[i]); ecode |= clSetKernelArg(k_error_norm[i], 2, sizeof(cl_mem), &m_ce[i]); ecode |= clSetKernelArg(k_error_norm[i], 3, sizeof(int), &c); ecode |= clSetKernelArg(k_error_norm[i], 4, sizeof(int), &k); ecode |= clSetKernelArg(k_error_norm[i], 5, sizeof(int), &kk); ecode |= clSetKernelArg(k_error_norm[i], 6, sizeof(int), &j); ecode |= clSetKernelArg(k_error_norm[i], 7, sizeof(int), &jj); ecode |= clSetKernelArg(k_error_norm[i], 8, sizeof(int), &cell_low[i][c][0]); ecode |= clSetKernelArg(k_error_norm[i], 9, sizeof(int), &cell_high[i][c][0]); clu_CheckError(ecode, "clSetKernelArg() for error_norm"); ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_error_norm[i], 1, NULL, &one, &one, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for error_norm"); jj = jj + 1; CHECK_FINISH(i * 2); } kk = kk + 1; } } } for (i = 0; i < 
num_devices; i++) { ecode = clEnqueueReadBuffer(cmd_queue[i * 2], m_rms[i], CL_TRUE, 0, sizeof(double)*5, rms_work, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueReadBuffer() for m_rms"); for (m = 0; m < 5; m++) rms[m] += rms_work[m]; clReleaseMemObject(m_rms[i]); } for (m = 0; m < 5; m++) { for (d = 0; d < 3; d++) { rms[m] = rms[m] / (double)(grid_points[d]-2); } rms[m] = sqrt(rms[m]); } }
//--------------------------------------------------------------------- // Set up the OpenCL environment. //--------------------------------------------------------------------- static void setup_opencl(int argc, char *argv[]) { cl_int ecode; char *source_dir = "IS"; if (argc > 1) source_dir = argv[1]; #ifdef TIMER_DETAIL if (timer_on) { int i; for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i); } #endif DTIMER_START(T_OPENCL_API); // 1. Find the default device type and get a device for the device type device_type = clu_GetDefaultDeviceType(); device = clu_GetAvailableDevice(device_type); device_name = clu_GetDeviceName(device); // Device information ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(work_item_sizes), &work_item_sizes, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group_size, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); // FIXME: The below values are experimental. if (max_work_group_size > 256) { max_work_group_size = 256; int i; for (i = 0; i < 3; i++) { if (work_item_sizes[i] > 256) { work_item_sizes[i] = 256; } } } // 2. Create a context for the specified device context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode); clu_CheckError(ecode, "clCreateContext()"); // 3. Create a command queue cmd_queue = clCreateCommandQueue(context, device, 0, &ecode); clu_CheckError(ecode, "clCreateCommandQueue()"); DTIMER_STOP(T_OPENCL_API); // 4. 
Build the program DTIMER_START(T_BUILD); char *source_file; char build_option[30]; if (device_type == CL_DEVICE_TYPE_CPU) { source_file = "is_cpu.cl"; sprintf(build_option, "-DCLASS=%d -I.", CLASS); CREATE_SEQ_GROUP_SIZE = 64; CREATE_SEQ_GLOBAL_SIZE = CREATE_SEQ_GROUP_SIZE * 256; RANK_GROUP_SIZE = 1; RANK_GLOBAL_SIZE = RANK_GROUP_SIZE * 128; RANK1_GROUP_SIZE = 1; RANK1_GLOBAL_SIZE = RANK1_GROUP_SIZE * RANK_GLOBAL_SIZE;; RANK2_GROUP_SIZE = RANK_GROUP_SIZE; RANK2_GLOBAL_SIZE = RANK_GLOBAL_SIZE;; FV2_GROUP_SIZE = 64; FV2_GLOBAL_SIZE = FV2_GROUP_SIZE * 256; } else if (device_type == CL_DEVICE_TYPE_GPU) { source_file = "is_gpu.cl"; sprintf(build_option, "-DCLASS=\'%c\' -I.", CLASS); CREATE_SEQ_GROUP_SIZE = 64; CREATE_SEQ_GLOBAL_SIZE = CREATE_SEQ_GROUP_SIZE * 256; RANK1_GROUP_SIZE = work_item_sizes[0]; RANK1_GLOBAL_SIZE = MAX_KEY; RANK2_GROUP_SIZE = work_item_sizes[0]; RANK2_GLOBAL_SIZE = NUM_KEYS; FV2_GROUP_SIZE = work_item_sizes[0]; FV2_GLOBAL_SIZE = NUM_KEYS; } else { fprintf(stderr, "%s: not supported.", clu_GetDeviceTypeName(device_type)); exit(EXIT_FAILURE); } program = clu_MakeProgram(context, device, source_dir, source_file, build_option); DTIMER_STOP(T_BUILD); // 5. 
Create buffers DTIMER_START(T_BUFFER_CREATE); m_key_array = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(INT_TYPE) * SIZE_OF_BUFFERS, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_key_array"); m_key_buff1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(INT_TYPE) * MAX_KEY, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_key_buff1"); m_key_buff2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(INT_TYPE) * SIZE_OF_BUFFERS, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_key_buff2"); size_t test_array_size = sizeof(INT_TYPE) * TEST_ARRAY_SIZE; m_index_array = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, test_array_size, test_index_array, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_index_array"); m_rank_array = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, test_array_size, test_rank_array, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_rank_array"); m_partial_vals = clCreateBuffer(context, CL_MEM_WRITE_ONLY, test_array_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_partial_vals"); m_passed_verification = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_passed_verification"); if (device_type == CL_DEVICE_TYPE_GPU) { m_key_scan = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(INT_TYPE) * MAX_KEY, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_key_buff1_scan"); m_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(INT_TYPE) * work_item_sizes[0], NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_sum"); } else { size_t bs_size = RANK_GLOBAL_SIZE * sizeof(INT_TYPE) * NUM_BUCKETS; m_bucket_size = clCreateBuffer(context, CL_MEM_READ_WRITE, bs_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_bucket_size"); m_bucket_ptrs = clCreateBuffer(context, CL_MEM_READ_WRITE, bs_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for 
m_bucket_ptrs"); } DTIMER_STOP(T_BUFFER_CREATE); // 6. Create kernels DTIMER_START(T_OPENCL_API); k_rank0 = clCreateKernel(program, "rank0", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank0"); ecode = clSetKernelArg(k_rank0, 0, sizeof(cl_mem), (void*)&m_key_array); ecode |= clSetKernelArg(k_rank0, 1, sizeof(cl_mem), (void*)&m_partial_vals); ecode |= clSetKernelArg(k_rank0, 2, sizeof(cl_mem), (void*)&m_index_array); clu_CheckError(ecode, "clSetKernelArg() for rank0"); if (device_type == CL_DEVICE_TYPE_GPU) { k_rank1 = clCreateKernel(program, "rank1", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank1"); ecode = clSetKernelArg(k_rank1, 0, sizeof(cl_mem), (void*)&m_key_buff1); clu_CheckError(ecode, "clSetKernelArg() for rank1"); k_rank2 = clCreateKernel(program, "rank2", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank2"); ecode = clSetKernelArg(k_rank2, 0, sizeof(cl_mem), (void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank2, 1, sizeof(cl_mem), (void*)&m_key_array); clu_CheckError(ecode, "clSetKernelArg() for rank2"); k_rank3_0 = clCreateKernel(program, "rank3_0", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank3_0"); ecode = clSetKernelArg(k_rank3_0, 0, sizeof(cl_mem),(void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank3_0, 1, sizeof(cl_mem),(void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank3_0, 2, sizeof(cl_mem),(void*)&m_sum); ecode |= clSetKernelArg(k_rank3_0, 3, sizeof(INT_TYPE) * work_item_sizes[0] * 2, NULL); clu_CheckError(ecode, "clSetKernelArg() for rank3_0"); k_rank3_1 = clCreateKernel(program, "rank3_1", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank3_1"); ecode = clSetKernelArg(k_rank3_1, 0, sizeof(cl_mem), (void*)&m_sum); ecode = clSetKernelArg(k_rank3_1, 1, sizeof(cl_mem), (void*)&m_sum); ecode |= clSetKernelArg(k_rank3_1, 2, sizeof(INT_TYPE) * work_item_sizes[0] * 2, NULL); clu_CheckError(ecode, "clSetKernelArg() for rank3_1"); k_rank3_2 = clCreateKernel(program, "rank3_2", &ecode); 
clu_CheckError(ecode, "clCreateKernel() for rank3_2"); ecode = clSetKernelArg(k_rank3_2, 0, sizeof(cl_mem),(void*)&m_key_buff1); ecode = clSetKernelArg(k_rank3_2, 1, sizeof(cl_mem),(void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank3_2, 2, sizeof(cl_mem),(void*)&m_sum); clu_CheckError(ecode, "clSetKernelArg() for rank3_2"); } else { k_rank1 = clCreateKernel(program, "rank1", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank1"); ecode = clSetKernelArg(k_rank1, 0, sizeof(cl_mem),(void*)&m_key_array); ecode |= clSetKernelArg(k_rank1, 1, sizeof(cl_mem),(void*)&m_bucket_size); clu_CheckError(ecode, "clSetKernelArg() for rank1"); k_rank2 = clCreateKernel(program, "rank2", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank2"); ecode = clSetKernelArg(k_rank2, 0, sizeof(cl_mem),(void*)&m_key_array); ecode |= clSetKernelArg(k_rank2, 1, sizeof(cl_mem),(void*)&m_bucket_size); ecode |= clSetKernelArg(k_rank2, 2, sizeof(cl_mem),(void*)&m_bucket_ptrs); ecode |= clSetKernelArg(k_rank2, 3, sizeof(cl_mem),(void*)&m_key_buff2); clu_CheckError(ecode, "clSetKernelArg() for rank2"); k_rank3 = clCreateKernel(program, "rank3", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank3"); ecode = clSetKernelArg(k_rank3, 0, sizeof(cl_mem),(void*)&m_bucket_size); ecode |= clSetKernelArg(k_rank3, 1, sizeof(cl_mem),(void*)&m_bucket_ptrs); ecode |= clSetKernelArg(k_rank3, 2, sizeof(cl_mem),(void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank3, 3, sizeof(cl_mem),(void*)&m_key_buff2); clu_CheckError(ecode, "clSetKernelArg() for rank3"); } k_rank4 = clCreateKernel(program, "rank4", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank4"); ecode = clSetKernelArg(k_rank4, 0, sizeof(cl_mem), (void*)&m_partial_vals); ecode |= clSetKernelArg(k_rank4, 1, sizeof(cl_mem), (void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank4, 2, sizeof(cl_mem), (void*)&m_rank_array); ecode |= clSetKernelArg(k_rank4, 3, sizeof(cl_mem), (void*)&m_passed_verification); clu_CheckError(ecode, 
"clSetKernelArg() for rank4"); DTIMER_STOP(T_OPENCL_API); }
//---------------------------------------------------------------------
// Fill in every host-side scalar constant used by the solver kernels,
// then push the exact-solution coefficient table `ce` to each device.
//---------------------------------------------------------------------
void set_constants()
{
  // ce[m][n]: coefficients of the polynomial/trigonometric terms of the
  // exact solution, one row per flow variable (5 rows x 13 columns).
  ce[0][0] = 2.0; ce[0][1] = 0.0; ce[0][2] = 0.0; ce[0][3] = 4.0;
  ce[0][4] = 5.0; ce[0][5] = 3.0; ce[0][6] = 0.5; ce[0][7] = 0.02;
  ce[0][8] = 0.01; ce[0][9] = 0.03; ce[0][10] = 0.5; ce[0][11] = 0.4;
  ce[0][12] = 0.3;

  ce[1][0] = 1.0; ce[1][1] = 0.0; ce[1][2] = 0.0; ce[1][3] = 0.0;
  ce[1][4] = 1.0; ce[1][5] = 2.0; ce[1][6] = 3.0; ce[1][7] = 0.01;
  ce[1][8] = 0.03; ce[1][9] = 0.02; ce[1][10] = 0.4; ce[1][11] = 0.3;
  ce[1][12] = 0.5;

  ce[2][0] = 2.0; ce[2][1] = 2.0; ce[2][2] = 0.0; ce[2][3] = 0.0;
  ce[2][4] = 0.0; ce[2][5] = 2.0; ce[2][6] = 3.0; ce[2][7] = 0.04;
  ce[2][8] = 0.03; ce[2][9] = 0.05; ce[2][10] = 0.3; ce[2][11] = 0.5;
  ce[2][12] = 0.4;

  ce[3][0] = 2.0; ce[3][1] = 2.0; ce[3][2] = 0.0; ce[3][3] = 0.0;
  ce[3][4] = 0.0; ce[3][5] = 2.0; ce[3][6] = 3.0; ce[3][7] = 0.03;
  ce[3][8] = 0.05; ce[3][9] = 0.04; ce[3][10] = 0.2; ce[3][11] = 0.1;
  ce[3][12] = 0.3;

  ce[4][0] = 5.0; ce[4][1] = 4.0; ce[4][2] = 3.0; ce[4][3] = 2.0;
  ce[4][4] = 0.1; ce[4][5] = 0.4; ce[4][6] = 0.3; ce[4][7] = 0.05;
  ce[4][8] = 0.04; ce[4][9] = 0.03; ce[4][10] = 0.1; ce[4][11] = 0.3;
  ce[4][12] = 0.2;

  // Fluid constants (c1 = gamma, c2 = gamma-1, ...).
  c1 = 1.4;
  c2 = 0.4;
  c3 = 0.1;
  c4 = 1.0;
  c5 = 1.4;

  // Mesh spacings: 1/(N-1) in each direction.
  dnxm1 = 1.0 / (double)(grid_points[0]-1);
  dnym1 = 1.0 / (double)(grid_points[1]-1);
  dnzm1 = 1.0 / (double)(grid_points[2]-1);

  // Frequently used products of the fluid constants.
  c1c2 = c1 * c2;
  c1c5 = c1 * c5;
  c3c4 = c3 * c4;
  c1345 = c1c5 * c3c4;
  conz1 = (1.0-c1c5);

  // Inverse spacings: tx1 = 1/dx^2, tx2 = 1/(2 dx), tx3 = 1/dx
  // (and the analogous ty*/tz* for the y and z directions).
  tx1 = 1.0 / (dnxm1 * dnxm1);
  tx2 = 1.0 / (2.0 * dnxm1);
  tx3 = 1.0 / dnxm1;

  ty1 = 1.0 / (dnym1 * dnym1);
  ty2 = 1.0 / (2.0 * dnym1);
  ty3 = 1.0 / dnym1;

  tz1 = 1.0 / (dnzm1 * dnzm1);
  tz2 = 1.0 / (2.0 * dnzm1);
  tz3 = 1.0 / dnzm1;

  // Per-direction diffusion coefficients.
  dx1 = 0.75; dx2 = 0.75; dx3 = 0.75; dx4 = 0.75; dx5 = 0.75;
  dy1 = 0.75; dy2 = 0.75; dy3 = 0.75; dy4 = 0.75; dy5 = 0.75;
  dz1 = 1.0;  dz2 = 1.0;  dz3 = 1.0;  dz4 = 1.0;  dz5 = 1.0;

  dxmax = max(dx3, dx4);
  dymax = max(dy2, dy4);
  dzmax = max(dz2, dz3);

  // Fourth-order artificial dissipation coefficient and its multiples.
  dssp = 0.25 * max(dx1, max(dy1, dz1) );
  c4dssp = 4.0 * dssp;
  c5dssp = 5.0 * dssp;

  // dt-scaled inverse spacings (dt is set elsewhere before this runs).
  dttx1 = dt*tx1;
  dttx2 = dt*tx2;
  dtty1 = dt*ty1;
  dtty2 = dt*ty2;
  dttz1 = dt*tz1;
  dttz2 = dt*tz2;

  c2dttx1 = 2.0*dttx1;
  c2dtty1 = 2.0*dtty1;
  c2dttz1 = 2.0*dttz1;

  // Dissipation smoothing coefficients.
  dtdssp = dt*dssp;
  comz1 = dtdssp;
  comz4 = 4.0*dtdssp;
  comz5 = 5.0*dtdssp;
  comz6 = 6.0*dtdssp;

  c3c4tx3 = c3c4*tx3;
  c3c4ty3 = c3c4*ty3;
  c3c4tz3 = c3c4*tz3;

  dx1tx1 = dx1*tx1;
  dx2tx1 = dx2*tx1;
  dx3tx1 = dx3*tx1;
  dx4tx1 = dx4*tx1;
  dx5tx1 = dx5*tx1;

  dy1ty1 = dy1*ty1;
  dy2ty1 = dy2*ty1;
  dy3ty1 = dy3*ty1;
  dy4ty1 = dy4*ty1;
  dy5ty1 = dy5*ty1;

  dz1tz1 = dz1*tz1;
  dz2tz1 = dz2*tz1;
  dz3tz1 = dz3*tz1;
  dz4tz1 = dz4*tz1;
  dz5tz1 = dz5*tz1;

  c2iv  = 2.5;
  con43 = 4.0/3.0;
  con16 = 1.0/6.0;

  // Viscous-term coefficients for the x, y and z sweeps.
  xxcon1 = c3c4tx3*con43*tx3;
  xxcon2 = c3c4tx3*tx3;
  xxcon3 = c3c4tx3*conz1*tx3;
  xxcon4 = c3c4tx3*con16*tx3;
  xxcon5 = c3c4tx3*c1c5*tx3;

  yycon1 = c3c4ty3*con43*ty3;
  yycon2 = c3c4ty3*ty3;
  yycon3 = c3c4ty3*conz1*ty3;
  yycon4 = c3c4ty3*con16*ty3;
  yycon5 = c3c4ty3*c1c5*ty3;

  zzcon1 = c3c4tz3*con43*tz3;
  zzcon2 = c3c4tz3*tz3;
  zzcon3 = c3c4tz3*conz1*tz3;
  zzcon4 = c3c4tz3*con16*tz3;
  zzcon5 = c3c4tz3*c1c5*tz3;

  //------------------------------------------------------------------------
  // Mirror the coefficient table to every device (blocking write, so the
  // host array `ce` can be reused immediately afterwards).
  //------------------------------------------------------------------------
  cl_int ecode;
  int i;
  for (i = 0; i < num_devices; i++) {
    ecode = clEnqueueWriteBuffer(cmd_queue[i],
                                 m_ce[i],
                                 CL_TRUE, 0,
                                 sizeof(double)*5*13, ce,
                                 0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueWriteBuffer() for m_ce");
  }
  //------------------------------------------------------------------------
}
void compute_rhs() { int c, i; cl_int ecode = 0; if (timeron) timer_start(t_rhs); //--------------------------------------------------------------------- // loop over all cells owned by this node //--------------------------------------------------------------------- for (c = 0; c < ncells; c++) { for (i = 0; i < num_devices; i++) { ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_compute_rhs1[i][c], COMPUTE_RHS1_DIM, NULL, compute_rhs1_gw[i][c], compute_rhs1_lw[i][c], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs1"); } for (i = 0; i < num_devices; i++) { ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_compute_rhs2[i][c], COMPUTE_RHS2_DIM, NULL, compute_rhs2_gw[i][c], compute_rhs2_lw[i][c], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs2"); ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_compute_rhs3[i][c], 2, NULL, compute_rhs3_gw[i][c], compute_rhs3_lw[i][c], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs3"); ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_compute_rhs4[i][c], 2, NULL, compute_rhs4_gw[i][c], compute_rhs4_lw[i][c], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs4"); ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_compute_rhs5[i][c], 2, NULL, compute_rhs5_gw[i][c], compute_rhs5_lw[i][c], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs5"); ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_compute_rhs6[i][c], COMPUTE_RHS6_DIM, NULL, compute_rhs6_gw[i][c], compute_rhs6_lw[i][c], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs6"); } for (i = 0; i < num_devices; i++) { CHECK_FINISH(i * 2); } } for (i = 0; i < num_devices; i++) CHECK_FINISH(i * 2); if (timeron) timer_stop(t_rhs); }
//--------------------------------------------------------------------- // This subroutine initializes the field variable u using // tri-linear transfinite interpolation of the boundary values //--------------------------------------------------------------------- void initialize() { cl_kernel k_initialize1; cl_kernel k_initialize2; cl_kernel k_initialize3; cl_kernel k_initialize4; cl_kernel k_initialize5; size_t local_ws[3], global_ws[3], temp; cl_int ecode; int d0 = grid_points[0]; int d1 = grid_points[1]; int d2 = grid_points[2]; //----------------------------------------------------------------------- k_initialize1 = clCreateKernel(p_initialize, "initialize1", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_initialize1, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_initialize1, 1, sizeof(int), &d0); ecode |= clSetKernelArg(k_initialize1, 2, sizeof(int), &d1); ecode |= clSetKernelArg(k_initialize1, 3, sizeof(int), &d2); clu_CheckError(ecode, "clSetKernelArg()"); local_ws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0]; temp = max_work_group_size / local_ws[0]; local_ws[1] = d2 < temp ? 
d2 : temp; global_ws[0] = clu_RoundWorkSize((size_t)d1, local_ws[0]); global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]); ecode = clEnqueueNDRangeKernel(cmd_queue, k_initialize1, 2, NULL, global_ws, local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); //----------------------------------------------------------------------- //--------------------------------------------------------------------- // first store the "interpolated" values everywhere on the grid //--------------------------------------------------------------------- k_initialize2 = clCreateKernel(p_initialize, "initialize2", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_initialize2, 0, sizeof(cl_mem), &m_u); ecode = clSetKernelArg(k_initialize2, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_initialize2, 2, sizeof(int), &d0); ecode |= clSetKernelArg(k_initialize2, 3, sizeof(int), &d1); ecode |= clSetKernelArg(k_initialize2, 4, sizeof(int), &d2); clu_CheckError(ecode, "clSetKernelArg()"); if (INITIALIZE2_DIM == 3) { local_ws[0] = d0 < work_item_sizes[0] ? d0 : work_item_sizes[0]; temp = max_work_group_size / local_ws[0]; local_ws[1] = d1 < temp ? d1 : temp; temp = temp / local_ws[1]; local_ws[2] = d2 < temp ? d2 : temp; global_ws[0] = clu_RoundWorkSize((size_t)d0, local_ws[0]); global_ws[1] = clu_RoundWorkSize((size_t)d1, local_ws[1]); global_ws[2] = clu_RoundWorkSize((size_t)d2, local_ws[2]); } else if (INITIALIZE2_DIM == 2) { local_ws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0]; temp = max_work_group_size / local_ws[0]; local_ws[1] = d2 < temp ? 
d2 : temp; global_ws[0] = clu_RoundWorkSize((size_t)d1, local_ws[0]); global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]); } CHECK_FINISH(); ecode = clEnqueueNDRangeKernel(cmd_queue, k_initialize2, INITIALIZE2_DIM, NULL, global_ws, local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); //----------------------------------------------------------------------- //--------------------------------------------------------------------- // now store the exact values on the boundaries //--------------------------------------------------------------------- k_initialize3 = clCreateKernel(p_initialize, "initialize3", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_initialize3, 0, sizeof(cl_mem), &m_u); ecode = clSetKernelArg(k_initialize3, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_initialize3, 2, sizeof(int), &d0); ecode |= clSetKernelArg(k_initialize3, 3, sizeof(int), &d1); ecode |= clSetKernelArg(k_initialize3, 4, sizeof(int), &d2); clu_CheckError(ecode, "clSetKernelArg()"); local_ws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0]; temp = max_work_group_size / local_ws[0]; local_ws[1] = d2 < temp ? 
d2 : temp; global_ws[0] = clu_RoundWorkSize((size_t)d1, local_ws[0]); global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]); CHECK_FINISH(); ecode = clEnqueueNDRangeKernel(cmd_queue, k_initialize3, 2, NULL, global_ws, local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); //----------------------------------------------------------------------- k_initialize4 = clCreateKernel(p_initialize, "initialize4", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_initialize4, 0, sizeof(cl_mem), &m_u); ecode = clSetKernelArg(k_initialize4, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_initialize4, 2, sizeof(int), &d0); ecode |= clSetKernelArg(k_initialize4, 3, sizeof(int), &d1); ecode |= clSetKernelArg(k_initialize4, 4, sizeof(int), &d2); clu_CheckError(ecode, "clSetKernelArg()"); local_ws[0] = d0 < work_item_sizes[0] ? d0 : work_item_sizes[0]; temp = max_work_group_size / local_ws[0]; local_ws[1] = d2 < temp ? d2 : temp; global_ws[0] = clu_RoundWorkSize((size_t)d0, local_ws[0]); global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]); CHECK_FINISH(); ecode = clEnqueueNDRangeKernel(cmd_queue, k_initialize4, 2, NULL, global_ws, local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); //----------------------------------------------------------------------- k_initialize5 = clCreateKernel(p_initialize, "initialize5", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_initialize5, 0, sizeof(cl_mem), &m_u); ecode = clSetKernelArg(k_initialize5, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_initialize5, 2, sizeof(int), &d0); ecode |= clSetKernelArg(k_initialize5, 3, sizeof(int), &d1); ecode |= clSetKernelArg(k_initialize5, 4, sizeof(int), &d2); clu_CheckError(ecode, "clSetKernelArg()"); local_ws[0] = d0 < work_item_sizes[0] ? d0 : work_item_sizes[0]; temp = max_work_group_size / local_ws[0]; local_ws[1] = d1 < temp ? 
d1 : temp; global_ws[0] = clu_RoundWorkSize((size_t)d0, local_ws[0]); global_ws[1] = clu_RoundWorkSize((size_t)d1, local_ws[1]); CHECK_FINISH(); ecode = clEnqueueNDRangeKernel(cmd_queue, k_initialize5, 2, NULL, global_ws, local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); //----------------------------------------------------------------------- clReleaseKernel(k_initialize1); clReleaseKernel(k_initialize2); clReleaseKernel(k_initialize3); clReleaseKernel(k_initialize4); CHECK_FINISH(); clReleaseKernel(k_initialize5); }