//--------------------------------------------------------------------- // set the boundary values of dependent variables //--------------------------------------------------------------------- void setbv() { DTIMER_START(t_setbv); cl_int ecode; ecode = clEnqueueNDRangeKernel(cmd_queue, k_setbv1, SETBV1_DIM, NULL, setbv1_gws, setbv1_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); ecode = clEnqueueNDRangeKernel(cmd_queue, k_setbv2, SETBV2_DIM, NULL, setbv2_gws, setbv2_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); ecode = clEnqueueNDRangeKernel(cmd_queue, k_setbv3, SETBV3_DIM, NULL, setbv3_gws, setbv3_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); CHECK_FINISH(); DTIMER_STOP(t_setbv); }
//--------------------------------------------------------------------- // compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2 // for time evolution exponent. //--------------------------------------------------------------------- static void compute_indexmap(cl_mem *twiddle, int d1, int d2, int d3) { cl_int ecode; ecode = clEnqueueNDRangeKernel(cmd_queue, k_compute_indexmap, COMPUTE_IMAP_DIM, NULL, cimap_gws, cimap_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for compute_indexmap"); CHECK_FINISH(); }
// Perform the FFT sweep along the third dimension: bind the per-call
// kernel arguments, pick a launch geometry for the current device type,
// and launch k_cffts3.  'is' selects the transform direction;
// logd3 = log2(d3) is precomputed for the kernel.
static void cffts3(int is, int d1, int d2, int d3, cl_mem *x, cl_mem *xout)
{
  int logd3 = ilog2(d3);
  size_t local_ws[2], global_ws[2], temp;
  cl_int ecode;

  if (timers_enabled) timer_start(T_fftz);

  // Kernel argument 2 is intentionally skipped here; it is presumably a
  // buffer bound once during setup -- confirm against the setup code.
  ecode  = clSetKernelArg(k_cffts3, 0, sizeof(cl_mem), x);
  ecode |= clSetKernelArg(k_cffts3, 1, sizeof(cl_mem), xout);
  ecode |= clSetKernelArg(k_cffts3, 3, sizeof(int), &is);
  ecode |= clSetKernelArg(k_cffts3, 4, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_cffts3, 5, sizeof(int), &d2);
  ecode |= clSetKernelArg(k_cffts3, 6, sizeof(int), &d3);
  ecode |= clSetKernelArg(k_cffts3, 7, sizeof(int), &logd3);
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts3");

  if (device_type == CL_DEVICE_TYPE_CPU) {
    // One work-item per (i,j) point, capped by the device limits.
    local_ws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0];
    temp = max_work_group_size / local_ws[0];
    local_ws[1] = d2 < temp ? d2 : temp;
    global_ws[0] = clu_RoundWorkSize((size_t)d1, local_ws[0]);
    global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]);
  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    if (CFFTS_DIM == 2) {
      local_ws[0] = CFFTS_LSIZE;
      local_ws[1] = 1;
      global_ws[0] = d1 * local_ws[0];
      global_ws[1] = d2;
    } else {
      local_ws[0] = CFFTS_LSIZE;
      global_ws[0] = d2 * local_ws[0];
    }
  } else {
    // BUG FIX: previously any device type other than CPU/GPU (e.g.
    // CL_DEVICE_TYPE_ACCELERATOR) fell through with local_ws/global_ws
    // left uninitialized, so the enqueue below read indeterminate values
    // (undefined behavior).  Fall back to the CPU-style sizing.
    local_ws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0];
    temp = max_work_group_size / local_ws[0];
    local_ws[1] = d2 < temp ? d2 : temp;
    global_ws[0] = clu_RoundWorkSize((size_t)d1, local_ws[0]);
    global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]);
  }

  ecode = clEnqueueNDRangeKernel(cmd_queue, k_cffts3, CFFTS_DIM, NULL,
                                 global_ws, local_ws, 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel() for cffts3");
  CHECK_FINISH();

  if (timers_enabled) timer_stop(T_fftz);
}
//--------------------------------------------------------------------- // evolve u0 -> u1 (t time steps) in fourier space //--------------------------------------------------------------------- static void evolve(cl_mem *u0, cl_mem *u1, cl_mem *twiddle, int d1, int d2, int d3) { cl_int ecode; size_t local_ws[3], global_ws[3]; ecode = clSetKernelArg(k_evolve, 0, sizeof(cl_mem), u0); ecode |= clSetKernelArg(k_evolve, 1, sizeof(cl_mem), u1); ecode |= clSetKernelArg(k_evolve, 2, sizeof(cl_mem), twiddle); ecode |= clSetKernelArg(k_evolve, 3, sizeof(int), &d1); ecode |= clSetKernelArg(k_evolve, 4, sizeof(int), &d2); ecode |= clSetKernelArg(k_evolve, 5, sizeof(int), &d3); clu_CheckError(ecode, "clSetKernelArg() for evolve"); if (EVOLVE_DIM == 3) { local_ws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0]; int temp = max_work_group_size / local_ws[0]; local_ws[1] = d2 < temp ? d2 : temp; temp = temp / local_ws[1]; local_ws[2] = d3 < temp ? d3 : temp; global_ws[0] = clu_RoundWorkSize((size_t)d1, local_ws[0]); global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]); global_ws[2] = clu_RoundWorkSize((size_t)d3, local_ws[2]); } else if (EVOLVE_DIM == 2) { local_ws[0] = d2 < work_item_sizes[0] ? d2 : work_item_sizes[0]; int temp = max_work_group_size / local_ws[0]; local_ws[1] = d3 < temp ? d3 : temp; global_ws[0] = clu_RoundWorkSize((size_t)d2, local_ws[0]); global_ws[1] = clu_RoundWorkSize((size_t)d3, local_ws[1]); } else { int temp = d3 / max_compute_units; local_ws[0] = temp == 0 ? 1 : temp; global_ws[0] = clu_RoundWorkSize((size_t)d3, local_ws[0]); } ecode = clEnqueueNDRangeKernel(cmd_queue, k_evolve, EVOLVE_DIM, NULL, global_ws, local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for evolve"); CHECK_FINISH(); }
//--------------------------------------------------------------------- // block-diagonal matrix-vector multiplication //--------------------------------------------------------------------- void ninvr() { cl_int ecode; if (timeron) timer_start(t_ninvr); ecode = clEnqueueNDRangeKernel(cmd_queue, k_ninvr, NINVR_DIM, NULL, ninvr_gws, ninvr_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); CHECK_FINISH(); if (timeron) timer_stop(t_ninvr); }
//--------------------------------------------------------------------- // addition of update to the vector u //--------------------------------------------------------------------- void add() { cl_int ecode; if (timeron) timer_start(t_add); ecode = clEnqueueNDRangeKernel(cmd_queue, k_add, ADD_DIM, NULL, add_gws, add_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); CHECK_FINISH(); if (timeron) timer_stop(t_add); }
//--------------------------------------------------------------------- // this function performs the solution of the approximate factorization // step in the z-direction for all five matrix components // simultaneously. The Thomas algorithm is employed to solve the // systems for the z-lines. Boundary conditions are non-periodic //--------------------------------------------------------------------- void z_solve() { cl_int ecode; if (timeron) timer_start(t_zsolve); ecode = clEnqueueNDRangeKernel(cmd_queue, k_z_solve, Z_SOLVE_DIM, NULL, z_solve_gws, z_solve_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); CHECK_FINISH(); if (timeron) timer_stop(t_zsolve); tzetar(); }
// Compute the checksum of u1 on the device, reduce the per-work-group
// partial sums on the host, print the result, and record it in sums[i].
static void checksum(int i, cl_mem *u1, int d1, int d2, int d3)
{
  // NOTE(review): d1/d2/d3 are unused here; the kernel's remaining
  // arguments are presumably bound during setup -- confirm.
  dcomplex total = dcmplx(0.0, 0.0);
  int w;
  cl_int status;

  status = clSetKernelArg(k_checksum, 0, sizeof(cl_mem), u1);
  clu_CheckError(status, "clSetKernelArg() for checksum");

  status = clEnqueueNDRangeKernel(cmd_queue, k_checksum, 1, NULL,
                                  &checksum_global_ws, &checksum_local_ws,
                                  0, NULL, NULL);
  clu_CheckError(status, "clEnqueueNDRangeKernel()");
  CHECK_FINISH();

  // Blocking read: one partial sum per work-group lands in g_chk.
  status = clEnqueueReadBuffer(cmd_queue, m_chk, CL_TRUE, 0,
                               checksum_wg_num * sizeof(dcomplex), g_chk,
                               0, NULL, NULL);
  clu_CheckError(status, "clReadBuffer()");

  // Host-side reduction of the partial sums, then normalization.
  for (w = 0; w < checksum_wg_num; w++) {
    total = dcmplx_add(total, g_chk[w]);
  }
  total = dcmplx_div2(total, (double)(NTOTAL));

  printf(" T =%5d Checksum =%22.12E%22.12E\n", i, total.real, total.imag);
  sums[i] = total;
}
//---------------------------------------------------------------------
// this function computes the norm of the difference between the
// computed solution and the exact solution
//---------------------------------------------------------------------
// Per device: a 5-element accumulator buffer is zeroed, the error_norm
// kernel is run once per (cell, k, j) line as a single work-item
// (&one, &one below), the per-device partial sums are read back and
// added into rms[], and finally each component is normalized by the
// interior grid extents and square-rooted.
void error_norm(double rms[5]) {
  int c, i, m, d;
  int k, j, kk, jj;                 // kk/jj: cell-local plane/line counters, start at 2
  double rms_work[5];               // host-side scratch: zeros out, partial sums in
  size_t one = 1;                   // single work-item launch geometry
  cl_mem m_rms[MAX_DEVICE_NUM];     // one accumulator buffer per device
  cl_int ecode = 0;

  for (m = 0; m < 5; m++) {
    rms[m] = 0.0;
    rms_work[m] = 0.0;
  }

  // Create and zero one accumulator buffer per device (blocking write of
  // the zeroed rms_work).
  for (i = 0; i < num_devices; i++) {
    m_rms[i] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                              sizeof(double)*5, NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_rms");
    ecode = clEnqueueWriteBuffer(cmd_queue[i * 2], m_rms[i], CL_TRUE, 0,
                                 sizeof(double)*5, rms_work, 0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueWriteBuffer() for m_rms");
  }

  // One serialized kernel launch per (cell, k-plane, j-line); the kernel
  // accumulates into m_rms[i].  Arguments are rebound every iteration
  // because k/kk/j/jj change.
  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      kk = 2;
      for (k = cell_low[i][c][2]; k <= cell_high[i][c][2]; k++) {
        jj = 2;
        for (j = cell_low[i][c][1]; j <= cell_high[i][c][1]; j++) {
          ecode = clSetKernelArg(k_error_norm[i], 0, sizeof(cl_mem), &m_u[i]);
          ecode |= clSetKernelArg(k_error_norm[i], 1, sizeof(cl_mem), &m_rms[i]);
          ecode |= clSetKernelArg(k_error_norm[i], 2, sizeof(cl_mem), &m_ce[i]);
          ecode |= clSetKernelArg(k_error_norm[i], 3, sizeof(int), &c);
          ecode |= clSetKernelArg(k_error_norm[i], 4, sizeof(int), &k);
          ecode |= clSetKernelArg(k_error_norm[i], 5, sizeof(int), &kk);
          ecode |= clSetKernelArg(k_error_norm[i], 6, sizeof(int), &j);
          ecode |= clSetKernelArg(k_error_norm[i], 7, sizeof(int), &jj);
          ecode |= clSetKernelArg(k_error_norm[i], 8, sizeof(int), &cell_low[i][c][0]);
          ecode |= clSetKernelArg(k_error_norm[i], 9, sizeof(int), &cell_high[i][c][0]);
          clu_CheckError(ecode, "clSetKernelArg() for error_norm");
          // Single work-item launch; finished before the args are rebound.
          ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_error_norm[i],
                                         1, NULL, &one, &one, 0, NULL, NULL);
          clu_CheckError(ecode, "clEnqueueNDRangeKernel() for error_norm");
          jj = jj + 1;
          CHECK_FINISH(i * 2);
        }
        kk = kk + 1;
      }
    }
  }

  // Blocking read of each device's partial sums; accumulate into rms[]
  // and release the per-device buffer.
  for (i = 0; i < num_devices; i++) {
    ecode = clEnqueueReadBuffer(cmd_queue[i * 2], m_rms[i], CL_TRUE, 0,
                                sizeof(double)*5, rms_work, 0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueReadBuffer() for m_rms");
    for (m = 0; m < 5; m++) rms[m] += rms_work[m];
    clReleaseMemObject(m_rms[i]);
  }

  // Normalize by the interior point count in each direction, then take
  // the square root to obtain the RMS norm.
  for (m = 0; m < 5; m++) {
    for (d = 0; d < 3; d++) {
      rms[m] = rms[m] / (double)(grid_points[d]-2);
    }
    rms[m] = sqrt(rms[m]);
  }
}
//--------------------------------------------------------------------- // This subroutine initializes the field variable u using // tri-linear transfinite interpolation of the boundary values //--------------------------------------------------------------------- void initialize() { cl_kernel k_initialize1; cl_kernel k_initialize2; cl_kernel k_initialize3; cl_kernel k_initialize4; cl_kernel k_initialize5; size_t local_ws[3], global_ws[3], temp; cl_int ecode; int d0 = grid_points[0]; int d1 = grid_points[1]; int d2 = grid_points[2]; //----------------------------------------------------------------------- k_initialize1 = clCreateKernel(p_initialize, "initialize1", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_initialize1, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_initialize1, 1, sizeof(int), &d0); ecode |= clSetKernelArg(k_initialize1, 2, sizeof(int), &d1); ecode |= clSetKernelArg(k_initialize1, 3, sizeof(int), &d2); clu_CheckError(ecode, "clSetKernelArg()"); local_ws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0]; temp = max_work_group_size / local_ws[0]; local_ws[1] = d2 < temp ? 
d2 : temp; global_ws[0] = clu_RoundWorkSize((size_t)d1, local_ws[0]); global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]); ecode = clEnqueueNDRangeKernel(cmd_queue, k_initialize1, 2, NULL, global_ws, local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); //----------------------------------------------------------------------- //--------------------------------------------------------------------- // first store the "interpolated" values everywhere on the grid //--------------------------------------------------------------------- k_initialize2 = clCreateKernel(p_initialize, "initialize2", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_initialize2, 0, sizeof(cl_mem), &m_u); ecode = clSetKernelArg(k_initialize2, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_initialize2, 2, sizeof(int), &d0); ecode |= clSetKernelArg(k_initialize2, 3, sizeof(int), &d1); ecode |= clSetKernelArg(k_initialize2, 4, sizeof(int), &d2); clu_CheckError(ecode, "clSetKernelArg()"); if (INITIALIZE2_DIM == 3) { local_ws[0] = d0 < work_item_sizes[0] ? d0 : work_item_sizes[0]; temp = max_work_group_size / local_ws[0]; local_ws[1] = d1 < temp ? d1 : temp; temp = temp / local_ws[1]; local_ws[2] = d2 < temp ? d2 : temp; global_ws[0] = clu_RoundWorkSize((size_t)d0, local_ws[0]); global_ws[1] = clu_RoundWorkSize((size_t)d1, local_ws[1]); global_ws[2] = clu_RoundWorkSize((size_t)d2, local_ws[2]); } else if (INITIALIZE2_DIM == 2) { local_ws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0]; temp = max_work_group_size / local_ws[0]; local_ws[1] = d2 < temp ? 
d2 : temp; global_ws[0] = clu_RoundWorkSize((size_t)d1, local_ws[0]); global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]); } CHECK_FINISH(); ecode = clEnqueueNDRangeKernel(cmd_queue, k_initialize2, INITIALIZE2_DIM, NULL, global_ws, local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); //----------------------------------------------------------------------- //--------------------------------------------------------------------- // now store the exact values on the boundaries //--------------------------------------------------------------------- k_initialize3 = clCreateKernel(p_initialize, "initialize3", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_initialize3, 0, sizeof(cl_mem), &m_u); ecode = clSetKernelArg(k_initialize3, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_initialize3, 2, sizeof(int), &d0); ecode |= clSetKernelArg(k_initialize3, 3, sizeof(int), &d1); ecode |= clSetKernelArg(k_initialize3, 4, sizeof(int), &d2); clu_CheckError(ecode, "clSetKernelArg()"); local_ws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0]; temp = max_work_group_size / local_ws[0]; local_ws[1] = d2 < temp ? 
d2 : temp; global_ws[0] = clu_RoundWorkSize((size_t)d1, local_ws[0]); global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]); CHECK_FINISH(); ecode = clEnqueueNDRangeKernel(cmd_queue, k_initialize3, 2, NULL, global_ws, local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); //----------------------------------------------------------------------- k_initialize4 = clCreateKernel(p_initialize, "initialize4", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_initialize4, 0, sizeof(cl_mem), &m_u); ecode = clSetKernelArg(k_initialize4, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_initialize4, 2, sizeof(int), &d0); ecode |= clSetKernelArg(k_initialize4, 3, sizeof(int), &d1); ecode |= clSetKernelArg(k_initialize4, 4, sizeof(int), &d2); clu_CheckError(ecode, "clSetKernelArg()"); local_ws[0] = d0 < work_item_sizes[0] ? d0 : work_item_sizes[0]; temp = max_work_group_size / local_ws[0]; local_ws[1] = d2 < temp ? d2 : temp; global_ws[0] = clu_RoundWorkSize((size_t)d0, local_ws[0]); global_ws[1] = clu_RoundWorkSize((size_t)d2, local_ws[1]); CHECK_FINISH(); ecode = clEnqueueNDRangeKernel(cmd_queue, k_initialize4, 2, NULL, global_ws, local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); //----------------------------------------------------------------------- k_initialize5 = clCreateKernel(p_initialize, "initialize5", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_initialize5, 0, sizeof(cl_mem), &m_u); ecode = clSetKernelArg(k_initialize5, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_initialize5, 2, sizeof(int), &d0); ecode |= clSetKernelArg(k_initialize5, 3, sizeof(int), &d1); ecode |= clSetKernelArg(k_initialize5, 4, sizeof(int), &d2); clu_CheckError(ecode, "clSetKernelArg()"); local_ws[0] = d0 < work_item_sizes[0] ? d0 : work_item_sizes[0]; temp = max_work_group_size / local_ws[0]; local_ws[1] = d1 < temp ? 
d1 : temp; global_ws[0] = clu_RoundWorkSize((size_t)d0, local_ws[0]); global_ws[1] = clu_RoundWorkSize((size_t)d1, local_ws[1]); CHECK_FINISH(); ecode = clEnqueueNDRangeKernel(cmd_queue, k_initialize5, 2, NULL, global_ws, local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); //----------------------------------------------------------------------- clReleaseKernel(k_initialize1); clReleaseKernel(k_initialize2); clReleaseKernel(k_initialize3); clReleaseKernel(k_initialize4); CHECK_FINISH(); clReleaseKernel(k_initialize5); }
int main(int argc, char *argv[]) { double Mops, t1, t2; double tsx, tsy, tm, an, tt, gc; double sx_verify_value, sy_verify_value, sx_err, sy_err; int i, nit; int k_offset, j; logical verified; char size[16]; FILE *fp; if (argc == 1) { fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]); exit(-1); } if ((fp = fopen("timer.flag", "r")) == NULL) { timers_enabled = false; } else { timers_enabled = true; fclose(fp); } //-------------------------------------------------------------------- // Because the size of the problem is too large to store in a 32-bit // integer for some classes, we put it into a string (for printing). // Have to strip off the decimal point put in there by the floating // point print statement (internal file) //-------------------------------------------------------------------- sprintf(size, "%15.0lf", pow(2.0, M+1)); j = 14; if (size[j] == '.') j--; size[j+1] = '\0'; printf("\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - EP Benchmark\n"); printf("\n Number of random numbers generated: %15s\n", size); verified = false; //-------------------------------------------------------------------- // Compute the number of "batches" of random number pairs generated // per processor. Adjust if the number of processors does not evenly // divide the total number //-------------------------------------------------------------------- np = NN; setup_opencl(argc, argv); timer_clear(0); timer_start(0); //-------------------------------------------------------------------- // Compute AN = A ^ (2 * NK) (mod 2^46). //-------------------------------------------------------------------- t1 = A; for (i = 0; i < MK + 1; i++) { t2 = randlc(&t1, t1); } an = t1; tt = S; //-------------------------------------------------------------------- // Each instance of this loop may be performed independently. 
We compute // the k offsets separately to take into account the fact that some nodes // have more numbers to generate than others //-------------------------------------------------------------------- k_offset = -1; DTIMER_START(T_KERNEL_EMBAR); // Launch the kernel int q_size = GROUP_SIZE * NQ * sizeof(cl_double); int sx_size = GROUP_SIZE * sizeof(cl_double); int sy_size = GROUP_SIZE * sizeof(cl_double); err_code = clSetKernelArg(kernel, 0, q_size, NULL); err_code |= clSetKernelArg(kernel, 1, sx_size, NULL); err_code |= clSetKernelArg(kernel, 2, sy_size, NULL); err_code |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&pgq); err_code |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&pgsx); err_code |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&pgsy); err_code |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&k_offset); err_code |= clSetKernelArg(kernel, 7, sizeof(cl_double), (void*)&an); clu_CheckError(err_code, "clSetKernelArg()"); size_t localWorkSize[] = { GROUP_SIZE }; size_t globalWorkSize[] = { np }; err_code = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueNDRangeKernel()"); CHECK_FINISH(); DTIMER_STOP(T_KERNEL_EMBAR); double (*gq)[NQ] = (double (*)[NQ])malloc(gq_size); double *gsx = (double*)malloc(gsx_size); double *gsy = (double*)malloc(gsy_size); gc = 0.0; tsx = 0.0; tsy = 0.0; for (i = 0; i < NQ; i++) { q[i] = 0.0; } // 9. 
Get the result DTIMER_START(T_BUFFER_READ); err_code = clEnqueueReadBuffer(cmd_queue, pgq, CL_FALSE, 0, gq_size, gq, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueReadbuffer()"); err_code = clEnqueueReadBuffer(cmd_queue, pgsx, CL_FALSE, 0, gsx_size, gsx, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueReadbuffer()"); err_code = clEnqueueReadBuffer(cmd_queue, pgsy, CL_TRUE, 0, gsy_size, gsy, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueReadbuffer()"); DTIMER_STOP(T_BUFFER_READ); for (i = 0; i < np/localWorkSize[0]; i++) { for (j = 0; j < NQ; j++ ){ q[j] = q[j] + gq[i][j]; } tsx = tsx + gsx[i]; tsy = tsy + gsy[i]; } for (i = 0; i < NQ; i++) { gc = gc + q[i]; } timer_stop(0); tm = timer_read(0); nit = 0; verified = true; if (M == 24) { sx_verify_value = -3.247834652034740e+3; sy_verify_value = -6.958407078382297e+3; } else if (M == 25) { sx_verify_value = -2.863319731645753e+3; sy_verify_value = -6.320053679109499e+3; } else if (M == 28) { sx_verify_value = -4.295875165629892e+3; sy_verify_value = -1.580732573678431e+4; } else if (M == 30) { sx_verify_value = 4.033815542441498e+4; sy_verify_value = -2.660669192809235e+4; } else if (M == 32) { sx_verify_value = 4.764367927995374e+4; sy_verify_value = -8.084072988043731e+4; } else if (M == 36) { sx_verify_value = 1.982481200946593e+5; sy_verify_value = -1.020596636361769e+5; } else if (M == 40) { sx_verify_value = -5.319717441530e+05; sy_verify_value = -3.688834557731e+05; } else { verified = false; } if (verified) { sx_err = fabs((tsx - sx_verify_value) / sx_verify_value); sy_err = fabs((tsy - sy_verify_value) / sy_verify_value); verified = ((sx_err <= EPSILON) && (sy_err <= EPSILON)); } Mops = pow(2.0, M+1) / tm / 1000000.0; printf("\nEP Benchmark Results:\n\n"); printf("CPU Time =%10.4lf\n", tm); printf("N = 2^%5d\n", M); printf("No. 
Gaussian Pairs = %15.0lf\n", gc); printf("Sums = %25.15lE %25.15lE\n", tsx, tsy); printf("Counts: \n"); for (i = 0; i < NQ; i++) { printf("%3d%15.0lf\n", i, q[i]); } c_print_results("EP", CLASS, M+1, 0, 0, nit, tm, Mops, "Random numbers generated", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7, clu_GetDeviceTypeName(device_type), device_name); if (timers_enabled) { if (tm <= 0.0) tm = 1.0; tt = timer_read(0); printf("\nTotal time: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm); } free(gq); free(gsx); free(gsy); release_opencl(); fflush(stdout); return 0; }
//---------------------------------------------------------------------
// compute the right hand side based on exact solution
//---------------------------------------------------------------------
// Drives five device kernels (exact_rhs1..5), one launch per (cell, device),
// on each device's even-indexed queue (cmd_queue[i*2]).  Phase 1 covers the
// full cell extents; phases 2-4 cover the cell interior along the three
// directions (presumably the xi-, eta- and zeta-direction sweeps -- confirm
// against the kernel sources); phase 5 finishes the remaining terms.
void exact_rhs() {
  int c, i;
  int range_1, range_0;                    // interior extents: cell_size - cell_end
  size_t d[3], local_ws[3], global_ws[3];  // 2-D launch geometry, via compute_ws_dim_2
  cl_int ecode = 0;

  // Phase 1: exact_rhs1 over the full (j,k) extents of each cell;
  // all devices are drained before moving to the next cell.
  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      ecode = clSetKernelArg(k_exact_rhs1[i], 0, sizeof(cl_mem), &m_forcing[i]);
      ecode |= clSetKernelArg(k_exact_rhs1[i], 1, sizeof(int), &c);
      ecode |= clSetKernelArg(k_exact_rhs1[i], 2, sizeof(int), &cell_size[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs1[i], 3, sizeof(int), &cell_size[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs1[i], 4, sizeof(int), &cell_size[i][c][0]);
      clu_CheckError(ecode, "clSetKernelArg() for exact_rhs1");
      d[0] = cell_size[i][c][1];
      d[1] = cell_size[i][c][2];
      compute_ws_dim_2(d, local_ws, global_ws);
      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_exact_rhs1[i], 2, NULL,
                                     global_ws, local_ws, 0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for exact_rhs1");
    }
    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }
  }

  // Phase 2: bind the invariant buffer args once per device...
  for (i = 0; i < num_devices; i++) {
    ecode = clSetKernelArg(k_exact_rhs2[i], 0, sizeof(cl_mem), &m_forcing[i]);
    ecode |= clSetKernelArg(k_exact_rhs2[i], 1, sizeof(cl_mem), &m_ce[i]);
    clu_CheckError(ecode, "clSetKernelArg() for exact_rhs2");
  }
  // ...then launch over the (j,k) interior of each cell.
  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      range_1 = cell_size[i][c][2] - cell_end[i][c][2];
      range_0 = cell_size[i][c][1] - cell_end[i][c][1];
      ecode = clSetKernelArg(k_exact_rhs2[i], 2, sizeof(int), &c);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 3, sizeof(int), &cell_start[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 4, sizeof(int), &range_1);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 5, sizeof(int), &cell_start[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 6, sizeof(int), &range_0);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 7, sizeof(int), &cell_size[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 8, sizeof(int), &cell_start[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 9, sizeof(int), &cell_end[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 10, sizeof(int), &cell_low[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 11, sizeof(int), &cell_low[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs2[i], 12, sizeof(int), &cell_low[i][c][0]);
      clu_CheckError(ecode, "clSetKernelArg() for exact_rhs2");
      d[0] = cell_size[i][c][1] - cell_start[i][c][1] - cell_end[i][c][1];
      d[1] = cell_size[i][c][2] - cell_start[i][c][2] - cell_end[i][c][2];
      compute_ws_dim_2(d, local_ws, global_ws);
      // Only the first cell needs an explicit drain here; later cells are
      // already synchronized at the bottom of the previous iteration.
      if (c == 0) CHECK_FINISH(i * 2);
      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_exact_rhs2[i], 2, NULL,
                                     global_ws, local_ws, 0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for exact_rhs2");
    }
    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }
  }

  // Phase 3: same structure as phase 2, but the interior is taken along
  // dims 0 and 2 while dim 1 extents travel as scalar args.
  for (i = 0; i < num_devices; i++) {
    ecode = clSetKernelArg(k_exact_rhs3[i], 0, sizeof(cl_mem), &m_forcing[i]);
    ecode |= clSetKernelArg(k_exact_rhs3[i], 1, sizeof(cl_mem), &m_ce[i]);
    clu_CheckError(ecode, "clSetKernelArg() for exact_rhs3");
  }
  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      range_1 = cell_size[i][c][2] - cell_end[i][c][2];
      range_0 = cell_size[i][c][0] - cell_end[i][c][0];
      ecode = clSetKernelArg(k_exact_rhs3[i], 2, sizeof(int), &c);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 3, sizeof(int), &cell_start[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 4, sizeof(int), &range_1);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 5, sizeof(int), &cell_start[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 6, sizeof(int), &range_0);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 7, sizeof(int), &cell_size[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 8, sizeof(int), &cell_start[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 9, sizeof(int), &cell_end[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 10, sizeof(int), &cell_low[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 11, sizeof(int), &cell_low[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs3[i], 12, sizeof(int), &cell_low[i][c][0]);
      clu_CheckError(ecode, "clSetKernelArg() for exact_rhs3");
      d[0] = cell_size[i][c][0] - cell_start[i][c][0] - cell_end[i][c][0];
      d[1] = cell_size[i][c][2] - cell_start[i][c][2] - cell_end[i][c][2];
      compute_ws_dim_2(d, local_ws, global_ws);
      if (c == 0) CHECK_FINISH(i * 2);
      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_exact_rhs3[i], 2, NULL,
                                     global_ws, local_ws, 0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for exact_rhs3");
    }
    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }
  }

  // Phase 4: interior along dims 0 and 1; dim 2 extents as scalar args.
  for (i = 0; i < num_devices; i++) {
    ecode = clSetKernelArg(k_exact_rhs4[i], 0, sizeof(cl_mem), &m_forcing[i]);
    ecode |= clSetKernelArg(k_exact_rhs4[i], 1, sizeof(cl_mem), &m_ce[i]);
    clu_CheckError(ecode, "clSetKernelArg() for exact_rhs4");
  }
  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      range_1 = cell_size[i][c][1] - cell_end[i][c][1];
      range_0 = cell_size[i][c][0] - cell_end[i][c][0];
      ecode = clSetKernelArg(k_exact_rhs4[i], 2, sizeof(int), &c);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 3, sizeof(int), &cell_start[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 4, sizeof(int), &range_1);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 5, sizeof(int), &cell_start[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 6, sizeof(int), &range_0);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 7, sizeof(int), &cell_size[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 8, sizeof(int), &cell_start[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 9, sizeof(int), &cell_end[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 10, sizeof(int), &cell_low[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 11, sizeof(int), &cell_low[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs4[i], 12, sizeof(int), &cell_low[i][c][0]);
      clu_CheckError(ecode, "clSetKernelArg() for exact_rhs4");
      d[0] = cell_size[i][c][0] - cell_start[i][c][0] - cell_end[i][c][0];
      d[1] = cell_size[i][c][1] - cell_start[i][c][1] - cell_end[i][c][1];
      compute_ws_dim_2(d, local_ws, global_ws);
      if (c == 0) CHECK_FINISH(i * 2);
      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_exact_rhs4[i], 2, NULL,
                                     global_ws, local_ws, 0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for exact_rhs4");
    }
    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }
  }

  // Phase 5: exact_rhs5 takes only m_forcing; all args are rebound per
  // iteration (no per-device invariant binding step here).
  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      range_1 = cell_size[i][c][2] - cell_end[i][c][2];
      range_0 = cell_size[i][c][1] - cell_end[i][c][1];
      ecode = clSetKernelArg(k_exact_rhs5[i], 0, sizeof(cl_mem), &m_forcing[i]);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 1, sizeof(int), &c);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 2, sizeof(int), &cell_start[i][c][2]);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 3, sizeof(int), &range_1);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 4, sizeof(int), &cell_start[i][c][1]);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 5, sizeof(int), &range_0);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 6, sizeof(int), &cell_size[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 7, sizeof(int), &cell_start[i][c][0]);
      ecode |= clSetKernelArg(k_exact_rhs5[i], 8, sizeof(int), &cell_end[i][c][0]);
      clu_CheckError(ecode, "clSetKernelArg() for exact_rhs5");
      d[0] = cell_size[i][c][1] - cell_start[i][c][1] - cell_end[i][c][1];
      d[1] = cell_size[i][c][2] - cell_start[i][c][2] - cell_end[i][c][2];
      compute_ws_dim_2(d, local_ws, global_ws);
      if (c == 0) CHECK_FINISH(i * 2);
      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_exact_rhs5[i], 2, NULL,
                                     global_ws, local_ws, 0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for exact_rhs5");
    }
    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);
    }
  }

  // Final drain of every device queue before returning.
  for (i = 0; i < num_devices; i++) CHECK_FINISH(i * 2);
}
void compute_rhs() { int i; size_t d0_size, d1_size, d2_size; cl_int ecode; if (timeron) timer_start(t_rhs); //------------------------------------------------------------------------- // compute the reciprocal of density, and the kinetic energy, // and the speed of sound. //------------------------------------------------------------------------- for (i = 0; i < num_devices; i++) { size_t compute_rhs1_lws[3], compute_rhs1_gws[3], temp; if (COMPUTE_RHS1_DIM == 3) { d0_size = max_cell_size[i][1] + 2; d1_size = max_cell_size[i][2] + 2; d2_size = ncells; compute_rhs1_lws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0]; temp = max_work_group_size / compute_rhs1_lws[0]; compute_rhs1_lws[1] = d1_size < temp ? d1_size : temp; temp = temp / compute_rhs1_lws[1]; compute_rhs1_lws[2] = d2_size < temp ? d2_size : temp; compute_rhs1_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs1_lws[0]); compute_rhs1_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs1_lws[1]); compute_rhs1_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs1_lws[2]); } else { d0_size = max_cell_size[i][2] + 2; d1_size = ncells; compute_rhs1_lws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0]; temp = max_work_group_size / compute_rhs1_lws[0]; compute_rhs1_lws[1] = d1_size < temp ? 
d1_size : temp; compute_rhs1_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs1_lws[0]); compute_rhs1_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs1_lws[1]); } ecode = clSetKernelArg(k_compute_rhs1[i], 0, sizeof(cl_mem), &m_u[i]); ecode |= clSetKernelArg(k_compute_rhs1[i], 1, sizeof(cl_mem), &m_us[i]); ecode |= clSetKernelArg(k_compute_rhs1[i], 2, sizeof(cl_mem), &m_vs[i]); ecode |= clSetKernelArg(k_compute_rhs1[i], 3, sizeof(cl_mem), &m_ws[i]); ecode |= clSetKernelArg(k_compute_rhs1[i], 4, sizeof(cl_mem), &m_qs[i]); ecode |= clSetKernelArg(k_compute_rhs1[i], 5, sizeof(cl_mem), &m_rho_i[i]); ecode |= clSetKernelArg(k_compute_rhs1[i], 6, sizeof(cl_mem), &m_square[i]); ecode |= clSetKernelArg(k_compute_rhs1[i], 7, sizeof(cl_mem), &m_cell_size[i]); ecode |= clSetKernelArg(k_compute_rhs1[i], 8, sizeof(int), &ncells); clu_CheckError(ecode, "clSetKernelArg()"); ecode = clEnqueueNDRangeKernel(cmd_queue[i], k_compute_rhs1[i], COMPUTE_RHS1_DIM, NULL, compute_rhs1_gws, compute_rhs1_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); } //------------------------------------------------------------------------- //------------------------------------------------------------------------- // copy the exact forcing term to the right hand side; because // this forcing term is known, we can store it on the whole of every // cell, including the boundary //------------------------------------------------------------------------- for (i = 0; i < num_devices; i++) { size_t compute_rhs2_lws[3], compute_rhs2_gws[3], temp; if (COMPUTE_RHS2_DIM == 3) { d0_size = max_cell_size[i][1]; d1_size = max_cell_size[i][2]; d2_size = ncells; compute_rhs2_lws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0]; temp = max_work_group_size / compute_rhs2_lws[0]; compute_rhs2_lws[1] = d1_size < temp ? d1_size : temp; temp = temp / compute_rhs2_lws[1]; compute_rhs2_lws[2] = d2_size < temp ? 
d2_size : temp; compute_rhs2_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs2_lws[0]); compute_rhs2_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs2_lws[1]); compute_rhs2_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs2_lws[2]); } else { d0_size = max_cell_size[i][2]; d1_size = ncells; compute_rhs2_lws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0]; temp = max_work_group_size / compute_rhs2_lws[0]; compute_rhs2_lws[1] = d1_size < temp ? d1_size : temp; compute_rhs2_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs2_lws[0]); compute_rhs2_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs2_lws[1]); } ecode = clSetKernelArg(k_compute_rhs2[i], 0, sizeof(cl_mem), &m_forcing[i]); ecode |= clSetKernelArg(k_compute_rhs2[i], 1, sizeof(cl_mem), &m_rhs[i]); ecode |= clSetKernelArg(k_compute_rhs2[i], 2, sizeof(cl_mem), &m_cell_size[i]); ecode |= clSetKernelArg(k_compute_rhs2[i], 3, sizeof(int), &ncells); clu_CheckError(ecode, "clSetKernelArg()"); ecode = clEnqueueNDRangeKernel(cmd_queue[i], k_compute_rhs2[i], COMPUTE_RHS2_DIM, NULL, compute_rhs2_gws, compute_rhs2_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); } //------------------------------------------------------------------------- //------------------------------------------------------------------------- // compute xi-direction fluxes //------------------------------------------------------------------------- for (i = 0; i < num_devices; i++) { size_t compute_rhs3_lws[3], compute_rhs3_gws[3], temp; d0_size = max_cell_size[i][1]; d1_size = max_cell_size[i][2]; d2_size = ncells; compute_rhs3_lws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0]; temp = max_work_group_size / compute_rhs3_lws[0]; compute_rhs3_lws[1] = d1_size < temp ? d1_size : temp; temp = temp / compute_rhs3_lws[1]; compute_rhs3_lws[2] = d2_size < temp ? 
d2_size : temp; compute_rhs3_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs3_lws[0]); compute_rhs3_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs3_lws[1]); compute_rhs3_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs3_lws[2]); ecode = clSetKernelArg(k_compute_rhs3[i], 0, sizeof(cl_mem), &m_u[i]); ecode |= clSetKernelArg(k_compute_rhs3[i], 1, sizeof(cl_mem), &m_us[i]); ecode |= clSetKernelArg(k_compute_rhs3[i], 2, sizeof(cl_mem), &m_vs[i]); ecode |= clSetKernelArg(k_compute_rhs3[i], 3, sizeof(cl_mem), &m_ws[i]); ecode |= clSetKernelArg(k_compute_rhs3[i], 4, sizeof(cl_mem), &m_qs[i]); ecode |= clSetKernelArg(k_compute_rhs3[i], 5, sizeof(cl_mem), &m_rho_i[i]); ecode |= clSetKernelArg(k_compute_rhs3[i], 6, sizeof(cl_mem), &m_square[i]); ecode |= clSetKernelArg(k_compute_rhs3[i], 7, sizeof(cl_mem), &m_rhs[i]); ecode |= clSetKernelArg(k_compute_rhs3[i], 8, sizeof(cl_mem), &m_cell_size[i]); ecode |= clSetKernelArg(k_compute_rhs3[i], 9, sizeof(cl_mem),&m_start[i]); ecode |= clSetKernelArg(k_compute_rhs3[i], 10, sizeof(cl_mem), &m_end[i]); ecode |= clSetKernelArg(k_compute_rhs3[i], 11, sizeof(int), &ncells); clu_CheckError(ecode, "clSetKernelArg()"); ecode = clEnqueueNDRangeKernel(cmd_queue[i], k_compute_rhs3[i], 3, NULL, compute_rhs3_gws, compute_rhs3_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); } //------------------------------------------------------------------------- //------------------------------------------------------------------------- // compute eta-direction fluxes //------------------------------------------------------------------------- for (i = 0; i < num_devices; i++) { size_t compute_rhs4_lws[3], compute_rhs4_gws[3], temp; d0_size = max_cell_size[i][0]; d1_size = max_cell_size[i][2]; d2_size = ncells; compute_rhs4_lws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0]; temp = max_work_group_size / compute_rhs4_lws[0]; compute_rhs4_lws[1] = d1_size < temp ? 
d1_size : temp; temp = temp / compute_rhs4_lws[1]; compute_rhs4_lws[2] = d2_size < temp ? d2_size : temp; compute_rhs4_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs4_lws[0]); compute_rhs4_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs4_lws[1]); compute_rhs4_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs4_lws[2]); ecode = clSetKernelArg(k_compute_rhs4[i], 0, sizeof(cl_mem), &m_u[i]); ecode |= clSetKernelArg(k_compute_rhs4[i], 1, sizeof(cl_mem), &m_us[i]); ecode |= clSetKernelArg(k_compute_rhs4[i], 2, sizeof(cl_mem), &m_vs[i]); ecode |= clSetKernelArg(k_compute_rhs4[i], 3, sizeof(cl_mem), &m_ws[i]); ecode |= clSetKernelArg(k_compute_rhs4[i], 4, sizeof(cl_mem), &m_qs[i]); ecode |= clSetKernelArg(k_compute_rhs4[i], 5, sizeof(cl_mem), &m_rho_i[i]); ecode |= clSetKernelArg(k_compute_rhs4[i], 6, sizeof(cl_mem), &m_square[i]); ecode |= clSetKernelArg(k_compute_rhs4[i], 7, sizeof(cl_mem), &m_rhs[i]); ecode |= clSetKernelArg(k_compute_rhs4[i], 8, sizeof(cl_mem), &m_cell_size[i]); ecode |= clSetKernelArg(k_compute_rhs4[i], 9, sizeof(cl_mem),&m_start[i]); ecode |= clSetKernelArg(k_compute_rhs4[i], 10, sizeof(cl_mem), &m_end[i]); ecode |= clSetKernelArg(k_compute_rhs4[i], 11, sizeof(int), &ncells); clu_CheckError(ecode, "clSetKernelArg()"); ecode = clEnqueueNDRangeKernel(cmd_queue[i], k_compute_rhs4[i], 3, NULL, compute_rhs4_gws, compute_rhs4_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); } //------------------------------------------------------------------------- //------------------------------------------------------------------------- // compute zeta-direction fluxes //------------------------------------------------------------------------- for (i = 0; i < num_devices; i++) { size_t compute_rhs5_lws[3], compute_rhs5_gws[3], temp; d0_size = max_cell_size[i][0]; d1_size = max_cell_size[i][1]; d2_size = ncells; compute_rhs5_lws[0] = d0_size < work_item_sizes[0] ? 
d0_size : work_item_sizes[0]; temp = max_work_group_size / compute_rhs5_lws[0]; compute_rhs5_lws[1] = d1_size < temp ? d1_size : temp; temp = temp / compute_rhs5_lws[1]; compute_rhs5_lws[2] = d2_size < temp ? d2_size : temp; compute_rhs5_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs5_lws[0]); compute_rhs5_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs5_lws[1]); compute_rhs5_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs5_lws[2]); ecode = clSetKernelArg(k_compute_rhs5[i], 0, sizeof(cl_mem), &m_u[i]); ecode |= clSetKernelArg(k_compute_rhs5[i], 1, sizeof(cl_mem), &m_us[i]); ecode |= clSetKernelArg(k_compute_rhs5[i], 2, sizeof(cl_mem), &m_vs[i]); ecode |= clSetKernelArg(k_compute_rhs5[i], 3, sizeof(cl_mem), &m_ws[i]); ecode |= clSetKernelArg(k_compute_rhs5[i], 4, sizeof(cl_mem), &m_qs[i]); ecode |= clSetKernelArg(k_compute_rhs5[i], 5, sizeof(cl_mem), &m_rho_i[i]); ecode |= clSetKernelArg(k_compute_rhs5[i], 6, sizeof(cl_mem), &m_square[i]); ecode |= clSetKernelArg(k_compute_rhs5[i], 7, sizeof(cl_mem), &m_rhs[i]); ecode |= clSetKernelArg(k_compute_rhs5[i], 8, sizeof(cl_mem), &m_cell_size[i]); ecode |= clSetKernelArg(k_compute_rhs5[i], 9, sizeof(cl_mem),&m_start[i]); ecode |= clSetKernelArg(k_compute_rhs5[i], 10, sizeof(cl_mem), &m_end[i]); ecode |= clSetKernelArg(k_compute_rhs5[i], 11, sizeof(int), &ncells); clu_CheckError(ecode, "clSetKernelArg()"); ecode = clEnqueueNDRangeKernel(cmd_queue[i], k_compute_rhs5[i], 3, NULL, compute_rhs5_gws, compute_rhs5_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); } //------------------------------------------------------------------------- //------------------------------------------------------------------------- for (i = 0; i < num_devices; i++) { size_t compute_rhs6_lws[3], compute_rhs6_gws[3], temp; if (COMPUTE_RHS6_DIM == 3) { d0_size = max_cell_size[i][1]; d1_size = max_cell_size[i][2]; d2_size = ncells; compute_rhs6_lws[0] = d0_size < work_item_sizes[0] ? 
d0_size : work_item_sizes[0]; temp = max_work_group_size / compute_rhs6_lws[0]; compute_rhs6_lws[1] = d1_size < temp ? d1_size : temp; temp = temp / compute_rhs6_lws[1]; compute_rhs6_lws[2] = d2_size < temp ? d2_size : temp; compute_rhs6_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs6_lws[0]); compute_rhs6_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs6_lws[1]); compute_rhs6_gws[2] = clu_RoundWorkSize(d2_size, compute_rhs6_lws[2]); } else { d0_size = max_cell_size[i][2]; d1_size = ncells; compute_rhs6_lws[0] = d0_size < work_item_sizes[0] ? d0_size : work_item_sizes[0]; temp = max_work_group_size / compute_rhs6_lws[0]; compute_rhs6_lws[1] = d1_size < temp ? d1_size : temp; compute_rhs6_gws[0] = clu_RoundWorkSize(d0_size, compute_rhs6_lws[0]); compute_rhs6_gws[1] = clu_RoundWorkSize(d1_size, compute_rhs6_lws[1]); } ecode = clSetKernelArg(k_compute_rhs6[i], 0, sizeof(cl_mem), &m_rhs[i]); ecode |= clSetKernelArg(k_compute_rhs6[i], 1, sizeof(cl_mem), &m_cell_size[i]); ecode |= clSetKernelArg(k_compute_rhs6[i], 2, sizeof(cl_mem),&m_start[i]); ecode |= clSetKernelArg(k_compute_rhs6[i], 3, sizeof(cl_mem), &m_end[i]); ecode |= clSetKernelArg(k_compute_rhs6[i], 4, sizeof(int), &ncells); clu_CheckError(ecode, "clSetKernelArg()"); ecode = clEnqueueNDRangeKernel(cmd_queue[i], k_compute_rhs6[i], COMPUTE_RHS6_DIM, NULL, compute_rhs6_gws, compute_rhs6_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); } //------------------------------------------------------------------------- CHECK_FINISH(); if (timeron) timer_stop(t_rhs); }
//--------------------------------------------------------------------- // this function copies the face values of a variable defined on a set // of cells to the overlap locations of the adjacent sets of cells. // Because a set of cells interfaces in each direction with exactly one // other set, we only need to fill six different buffers. We could try to // overlap communication with computation, by computing // some internal values while communicating boundary values, but this // adds so much overhead that it's not clearly useful. //--------------------------------------------------------------------- void copy_faces() { int c, i; cl_int ecode = 0; //--------------------------------------------------------------------- // exit immediately if there are no faces to be copied //--------------------------------------------------------------------- if (num_devices == 1) { compute_rhs(); return; } //--------------------------------------------------------------------- // because the difference stencil for the diagonalized scheme is // orthogonal, we do not have to perform the staged copying of faces, // but can send all face information simultaneously to the neighboring // cells in all directions //--------------------------------------------------------------------- if (timeron) timer_start(t_bpack); for (c = 0; c < ncells; c++) { for (i = 0; i < num_devices; i++) { ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_copy_faces1[i][c], COPY_FACES1_DIM, NULL, copy_faces1_gw[i][c], copy_faces1_lw[i][c], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces1"); } for (i = 0; i < num_devices; i++) { ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_copy_faces2[i][c], COPY_FACES2_DIM, NULL, copy_faces2_gw[i][c], copy_faces2_lw[i][c], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces2"); ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_copy_faces3[i][c], COPY_FACES3_DIM, NULL, copy_faces3_gw[i][c], copy_faces3_lw[i][c], 0, NULL, 
NULL); clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces3"); } for (i = 0; i < num_devices; i++) { CHECK_FINISH(i * 2); } } if (timeron) timer_stop(t_bpack); if (timeron) timer_start(t_exch); for (i = 0; i < num_devices; i++) { CHECK_FINISH(i * 2); ecode = clEnqueueCopyBuffer(cmd_queue[successor[i][0] * 2 + 1], m_out_buffer[i], m_in_buffer[successor[i][0]], start_send_east[i]*sizeof(double), start_recv_west[successor[i][0]]*sizeof(double), east_size[i]*sizeof(double), 0, NULL, NULL); CHECK_FINISH(successor[i][0] * 2 + 1); } for (i = 0; i < num_devices; i++) { ecode = clEnqueueCopyBuffer(cmd_queue[predecessor[i][0] * 2 + 1], m_out_buffer[i], m_in_buffer[predecessor[i][0]], start_send_west[i]*sizeof(double), start_recv_east[predecessor[i][0]]*sizeof(double), west_size[i]*sizeof(double), 0, NULL, NULL); CHECK_FINISH(predecessor[i][0] * 2 + 1); ecode = clEnqueueCopyBuffer(cmd_queue[successor[i][1] * 2 + 1], m_out_buffer[i], m_in_buffer[successor[i][1]], start_send_north[i]*sizeof(double), start_recv_south[successor[i][1]]*sizeof(double), north_size[i]*sizeof(double), 0, NULL, NULL); CHECK_FINISH(successor[i][1] * 2 + 1); ecode = clEnqueueCopyBuffer(cmd_queue[predecessor[i][1] * 2 + 1], m_out_buffer[i], m_in_buffer[predecessor[i][1]], start_send_south[i]*sizeof(double), start_recv_north[predecessor[i][1]]*sizeof(double), south_size[i]*sizeof(double), 0, NULL, NULL); CHECK_FINISH(predecessor[i][1] * 2 + 1); ecode = clEnqueueCopyBuffer(cmd_queue[successor[i][2] * 2 + 1], m_out_buffer[i], m_in_buffer[successor[i][2]], start_send_top[i]*sizeof(double), start_recv_bottom[successor[i][2]]*sizeof(double), top_size[i]*sizeof(double), 0, NULL, NULL); CHECK_FINISH(successor[i][2] * 2 + 1); ecode = clEnqueueCopyBuffer(cmd_queue[predecessor[i][2] * 2 + 1], m_out_buffer[i], m_in_buffer[predecessor[i][2]], start_send_bottom[i]*sizeof(double), start_recv_top[predecessor[i][2]]*sizeof(double), bottom_size[i]*sizeof(double), 0, NULL, NULL); CHECK_FINISH(predecessor[i][2] * 2 + 
1); } if (timeron) timer_stop(t_exch); //--------------------------------------------------------------------- // unpack the data that has just been received; //--------------------------------------------------------------------- if (timeron) timer_start(t_bpack); for (c = 0; c < ncells; c++) { for (i = 0; i < num_devices; i++) { if (c == 0) CHECK_FINISH(i * 2 + 1); ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_copy_faces4[i][c], COPY_FACES4_DIM, NULL, copy_faces4_gw[i][c], copy_faces4_lw[i][c], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces4"); } for (i = 0; i < num_devices; i++) { ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_copy_faces5[i][c], COPY_FACES5_DIM, NULL, copy_faces5_gw[i][c], copy_faces5_lw[i][c], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces5"); ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_copy_faces6[i][c], COPY_FACES6_DIM, NULL, copy_faces6_gw[i][c], copy_faces6_lw[i][c], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces6"); } for (i = 0; i < num_devices; i++) { CHECK_FINISH(i * 2); } } if (timeron) timer_stop(t_bpack); for (i = 0; i < num_devices; i++) CHECK_FINISH(i * 2); //--------------------------------------------------------------------- // now that we have all the data, compute the rhs //--------------------------------------------------------------------- compute_rhs(); }
void rank( int iteration ) { size_t r1_lws[1], r1_gws[1]; size_t r2_lws[1], r2_gws[1]; size_t r3_lws[1], r3_gws[1]; cl_int ecode; DTIMER_START(T_KERNEL_RANK0); // rank0 ecode = clSetKernelArg(k_rank0, 3, sizeof(cl_int), (void*)&iteration); clu_CheckError(ecode, "clSetKernelArg() for rank0: iteration"); ecode = clEnqueueTask(cmd_queue, k_rank0, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueTask() for rank0"); CHECK_FINISH(); DTIMER_STOP(T_KERNEL_RANK0); DTIMER_START(T_KERNEL_RANK1); // rank1 r1_lws[0] = RANK1_GROUP_SIZE; r1_gws[0] = RANK1_GLOBAL_SIZE; ecode = clEnqueueNDRangeKernel(cmd_queue, k_rank1, 1, NULL, r1_gws, r1_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank1"); CHECK_FINISH(); DTIMER_STOP(T_KERNEL_RANK1); DTIMER_START(T_KERNEL_RANK2); // rank2 r2_lws[0] = RANK2_GROUP_SIZE; r2_gws[0] = RANK2_GLOBAL_SIZE; ecode = clEnqueueNDRangeKernel(cmd_queue, k_rank2, 1, NULL, r2_gws, r2_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank2"); CHECK_FINISH(); DTIMER_STOP(T_KERNEL_RANK2); DTIMER_START(T_KERNEL_RANK3); // rank3 if (device_type == CL_DEVICE_TYPE_GPU) { r3_lws[0] = work_item_sizes[0]; r3_gws[0] = work_item_sizes[0] * work_item_sizes[0]; if (r3_gws[0] > MAX_KEY) r3_gws[0] = MAX_KEY; ecode = clEnqueueNDRangeKernel(cmd_queue, k_rank3_0, 1, NULL, r3_gws, r3_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3_0"); r3_lws[0] = work_item_sizes[0]; r3_gws[0] = work_item_sizes[0]; ecode = clEnqueueNDRangeKernel(cmd_queue, k_rank3_1, 1, NULL, r3_gws, r3_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3_1"); r3_lws[0] = work_item_sizes[0]; r3_gws[0] = work_item_sizes[0] * work_item_sizes[0]; if (r3_gws[0] > MAX_KEY) r3_gws[0] = MAX_KEY; ecode = clEnqueueNDRangeKernel(cmd_queue, k_rank3_2, 1, NULL, r3_gws, r3_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3_2"); } else { r3_lws[0] = RANK_GROUP_SIZE; r3_gws[0] = RANK_GLOBAL_SIZE; 
ecode = clEnqueueNDRangeKernel(cmd_queue, k_rank3, 1, NULL, r3_gws, r3_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3"); } CHECK_FINISH(); DTIMER_STOP(T_KERNEL_RANK3); // rank4 - partial verification DTIMER_START(T_KERNEL_RANK4); ecode = clSetKernelArg(k_rank4, 4, sizeof(cl_int), (void*)&iteration); clu_CheckError(ecode, "clSetKernelArg() for rank4"); ecode = clEnqueueTask(cmd_queue, k_rank4, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueTask() for rank4"); ecode = clFinish(cmd_queue); clu_CheckError(ecode, "clFinish"); DTIMER_STOP(T_KERNEL_RANK4); }
void full_verify( void ) { cl_kernel k_fv1, k_fv2; cl_mem m_j; INT_TYPE *g_j; INT_TYPE j = 0, i; size_t j_size; size_t fv1_lws[1], fv1_gws[1]; size_t fv2_lws[1], fv2_gws[1]; cl_int ecode; DTIMER_START(T_BUFFER_CREATE); // Create buffers j_size = sizeof(INT_TYPE) * (FV2_GLOBAL_SIZE / FV2_GROUP_SIZE); m_j = clCreateBuffer(context, CL_MEM_READ_WRITE, j_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer for m_j"); DTIMER_STOP(T_BUFFER_CREATE); DTIMER_START(T_OPENCL_API); // Create kernels k_fv1 = clCreateKernel(program, "full_verify1", &ecode); clu_CheckError(ecode, "clCreateKernel() for full_verify1"); k_fv2 = clCreateKernel(program, "full_verify2", &ecode); clu_CheckError(ecode, "clCreateKernel() for full_verify2"); DTIMER_STOP(T_OPENCL_API); if (device_type == CL_DEVICE_TYPE_GPU) { cl_kernel k_fv0; size_t fv0_lws[1], fv0_gws[1]; DTIMER_START(T_OPENCL_API); // Create kernels k_fv0 = clCreateKernel(program, "full_verify0", &ecode); clu_CheckError(ecode, "clCreateKernel() for full_verify0"); DTIMER_STOP(T_OPENCL_API); // Kernel execution DTIMER_START(T_KERNEL_FV0); ecode = clSetKernelArg(k_fv0, 0, sizeof(cl_mem), (void*)&m_key_array); ecode |= clSetKernelArg(k_fv0, 1, sizeof(cl_mem), (void*)&m_key_buff2); clu_CheckError(ecode, "clSetKernelArg() for full_verify0"); fv0_lws[0] = work_item_sizes[0]; fv0_gws[0] = NUM_KEYS; ecode = clEnqueueNDRangeKernel(cmd_queue, k_fv0, 1, NULL, fv0_gws, fv0_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify0"); CHECK_FINISH(); DTIMER_STOP(T_KERNEL_FV0); DTIMER_START(T_KERNEL_FV1); ecode = clSetKernelArg(k_fv1, 0, sizeof(cl_mem), (void*)&m_key_buff2); ecode |= clSetKernelArg(k_fv1, 1, sizeof(cl_mem), (void*)&m_key_buff1); ecode |= clSetKernelArg(k_fv1, 2, sizeof(cl_mem), (void*)&m_key_array); clu_CheckError(ecode, "clSetKernelArg() for full_verify1"); fv1_lws[0] = work_item_sizes[0]; fv1_gws[0] = NUM_KEYS; ecode = clEnqueueNDRangeKernel(cmd_queue, k_fv1, 1, NULL, fv1_gws, fv1_lws, 0, NULL, NULL); 
clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify1"); CHECK_FINISH(); DTIMER_STOP(T_KERNEL_FV1); DTIMER_START(T_KERNEL_FV2); ecode = clSetKernelArg(k_fv2, 0, sizeof(cl_mem), (void*)&m_key_array); ecode |= clSetKernelArg(k_fv2, 1, sizeof(cl_mem), (void*)&m_j); ecode |= clSetKernelArg(k_fv2, 2, sizeof(INT_TYPE)*FV2_GROUP_SIZE, NULL); clu_CheckError(ecode, "clSetKernelArg() for full_verify2"); fv2_lws[0] = FV2_GROUP_SIZE; fv2_gws[0] = FV2_GLOBAL_SIZE; ecode = clEnqueueNDRangeKernel(cmd_queue, k_fv2, 1, NULL, fv2_gws, fv2_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify2"); CHECK_FINISH(); DTIMER_STOP(T_KERNEL_FV2); g_j = (INT_TYPE *)malloc(j_size); DTIMER_START(T_BUFFER_READ); ecode = clEnqueueReadBuffer(cmd_queue, m_j, CL_TRUE, 0, j_size, g_j, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueReadBuffer() for m_j"); DTIMER_STOP(T_BUFFER_READ); // reduction for (i = 0; i < j_size/sizeof(INT_TYPE); i++) { j += g_j[i]; } DTIMER_START(T_RELEASE); clReleaseKernel(k_fv0); DTIMER_STOP(T_RELEASE); } else { // Kernel execution DTIMER_START(T_KERNEL_FV1); ecode = clSetKernelArg(k_fv1, 0, sizeof(cl_mem), (void*)&m_bucket_ptrs); ecode |= clSetKernelArg(k_fv1, 1, sizeof(cl_mem), (void*)&m_key_buff2); ecode |= clSetKernelArg(k_fv1, 2, sizeof(cl_mem), (void*)&m_key_buff1); ecode |= clSetKernelArg(k_fv1, 3, sizeof(cl_mem), (void*)&m_key_array); clu_CheckError(ecode, "clSetKernelArg() for full_verify1"); fv1_lws[0] = RANK_GROUP_SIZE; fv1_gws[0] = RANK_GLOBAL_SIZE; ecode = clEnqueueNDRangeKernel(cmd_queue, k_fv1, 1, NULL, fv1_gws, fv1_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify1"); CHECK_FINISH(); DTIMER_STOP(T_KERNEL_FV1); DTIMER_START(T_KERNEL_FV2); ecode = clSetKernelArg(k_fv2, 0, sizeof(cl_mem), (void*)&m_key_array); ecode |= clSetKernelArg(k_fv2, 1, sizeof(cl_mem), (void*)&m_j); ecode |= clSetKernelArg(k_fv2, 2, sizeof(INT_TYPE)*FV2_GROUP_SIZE, NULL); clu_CheckError(ecode, "clSetKernelArg() 
for full_verify2"); fv2_lws[0] = FV2_GROUP_SIZE; fv2_gws[0] = FV2_GLOBAL_SIZE; ecode = clEnqueueNDRangeKernel(cmd_queue, k_fv2, 1, NULL, fv2_gws, fv2_lws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify2"); CHECK_FINISH(); DTIMER_STOP(T_KERNEL_FV2); g_j = (INT_TYPE *)malloc(j_size); DTIMER_START(T_BUFFER_READ); ecode = clEnqueueReadBuffer(cmd_queue, m_j, CL_TRUE, 0, j_size, g_j, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueReadBuffer() for m_j"); DTIMER_STOP(T_BUFFER_READ); // reduction for (i = 0; i < j_size/sizeof(INT_TYPE); i++) { j += g_j[i]; } } DTIMER_START(T_RELEASE); free(g_j); clReleaseMemObject(m_j); clReleaseKernel(k_fv1); clReleaseKernel(k_fv2); DTIMER_STOP(T_RELEASE); if (j != 0) printf( "Full_verify: number of keys out of sort: %ld\n", (long)j ); else passed_verification++; }
void compute_rhs() { int c, i; cl_int ecode = 0; if (timeron) timer_start(t_rhs); //--------------------------------------------------------------------- // loop over all cells owned by this node //--------------------------------------------------------------------- for (c = 0; c < ncells; c++) { for (i = 0; i < num_devices; i++) { ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_compute_rhs1[i][c], COMPUTE_RHS1_DIM, NULL, compute_rhs1_gw[i][c], compute_rhs1_lw[i][c], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs1"); } for (i = 0; i < num_devices; i++) { ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_compute_rhs2[i][c], COMPUTE_RHS2_DIM, NULL, compute_rhs2_gw[i][c], compute_rhs2_lw[i][c], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs2"); ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_compute_rhs3[i][c], 2, NULL, compute_rhs3_gw[i][c], compute_rhs3_lw[i][c], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs3"); ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_compute_rhs4[i][c], 2, NULL, compute_rhs4_gw[i][c], compute_rhs4_lw[i][c], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs4"); ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_compute_rhs5[i][c], 2, NULL, compute_rhs5_gw[i][c], compute_rhs5_lw[i][c], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs5"); ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_compute_rhs6[i][c], COMPUTE_RHS6_DIM, NULL, compute_rhs6_gw[i][c], compute_rhs6_lw[i][c], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for compute_rhs6"); } for (i = 0; i < num_devices; i++) { CHECK_FINISH(i * 2); } } for (i = 0; i < num_devices; i++) CHECK_FINISH(i * 2); if (timeron) timer_stop(t_rhs); }
//--------------------------------------------------------------------- // this function performs the solution of the approximate factorization // step in the x-direction for all five matrix components // simultaneously. The Thomas algorithm is employed to solve the // systems for the x-lines. Boundary conditions are non-periodic //--------------------------------------------------------------------- void x_solve() { int stage, i, c, buffer_size; cl_int ecode = 0; //--------------------------------------------------------------------- // OK, now we know that there are multiple processors //--------------------------------------------------------------------- //--------------------------------------------------------------------- // now do a sweep on a layer-by-layer basis, i.e. sweeping through cells // on this node in the direction of increasing i for the forward sweep, // and after that reversing the direction for the backsubstitution. //--------------------------------------------------------------------- if (timeron) timer_start(t_xsolve); //--------------------------------------------------------------------- // FORWARD ELIMINATION //--------------------------------------------------------------------- for (stage = 1; stage <= ncells; stage++) { if (stage != 1) { //--------------------------------------------------------------------- // communication has already been started. 
// compute the left hand side while waiting for the msg //--------------------------------------------------------------------- for (i = 0; i < num_devices; i++) { ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_lhsx[i][slice[i][stage-1][0]], 2, NULL, lhsx_gw[i][slice[i][stage-1][0]], lhsx_lw[i][slice[i][stage-1][0]], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for lhsx"); } for (i = 0; i < num_devices; i++) { CHECK_FINISH(i * 2); } //--------------------------------------------------------------------- // wait for pending communication to complete // This waits on the current receive and on the send // from the previous stage. They always come in pairs. //--------------------------------------------------------------------- for (i = 0; i < num_devices; i++) { CHECK_FINISH(i * 2 + 1); ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_x_solve1[i][slice[i][stage-1][0]], 2, NULL, x_solve1_gw[i][slice[i][stage-1][0]], x_solve1_lw[i][slice[i][stage-1][0]], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for x_solve1"); } for (i = 0; i < num_devices; i++) { CHECK_FINISH(i * 2); } } else { //--------------------------------------------------------------------- // if this IS the first cell, we still compute the lhs //--------------------------------------------------------------------- for (i = 0; i < num_devices; i++) { ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_lhsx[i][slice[i][stage-1][0]], 2, NULL, lhsx_gw[i][slice[i][stage-1][0]], lhsx_lw[i][slice[i][stage-1][0]], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for lhsx"); } for (i = 0; i < num_devices; i++) { CHECK_FINISH(i * 2); } } for (i = 0; i < num_devices; i++) { ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_x_solve2[i][slice[i][stage-1][0]], 2, NULL, x_solve2_gw[i][slice[i][stage-1][0]], x_solve2_lw[i][slice[i][stage-1][0]], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for x_solve2"); } for (i = 0; i < num_devices; i++) { CHECK_FINISH(i * 2); } 
//--------------------------------------------------------------------- // send information to the next processor, except when this // is the last grid block //--------------------------------------------------------------------- if (stage != ncells) { for (i = 0; i < num_devices; i++) { ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_x_solve3[i][slice[i][stage-1][0]], 2, NULL, x_solve3_gw[i][slice[i][stage-1][0]], x_solve3_lw[i][slice[i][stage-1][0]], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for x_solve3"); } for (i = 0; i < num_devices; i++) { CHECK_FINISH(i * 2); } //--------------------------------------------------------------------- // send data to next phase // can't receive data yet because buffer size will be wrong //--------------------------------------------------------------------- for (i = 0; i < num_devices; i++) { CHECK_FINISH(i * 2); c = slice[i][stage-1][0]; buffer_size = (cell_size[i][c][1]-cell_start[i][c][1]-cell_end[i][c][1]) * (cell_size[i][c][2]-cell_start[i][c][2]-cell_end[i][c][2]); ecode = clEnqueueCopyBuffer(cmd_queue[successor[i][0] * 2 + 1], m_out_buffer[i], m_in_buffer[successor[i][0]], 0, 0, 22*buffer_size*sizeof(double), 0, NULL, NULL); } } } //--------------------------------------------------------------------- // now go in the reverse direction //--------------------------------------------------------------------- //--------------------------------------------------------------------- // BACKSUBSTITUTION //--------------------------------------------------------------------- for (stage = ncells; stage >= 1; stage--) { if (stage != ncells) { //--------------------------------------------------------------------- // communication has already been started // while waiting, do the block-diagonal inversion for the // cell that was just finished //--------------------------------------------------------------------- for (i = 0; i < num_devices; i++) { ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], 
k_ninvr[i][slice[i][stage][0]], NINVR_DIM, NULL, ninvr_gw[i][slice[i][stage][0]], ninvr_lw[i][slice[i][stage][0]], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for ninvr"); } for (i = 0; i < num_devices; i++) { CHECK_FINISH(i * 2); } //--------------------------------------------------------------------- // wait for pending communication to complete //--------------------------------------------------------------------- for (i = 0; i < num_devices; i++) { CHECK_FINISH(i * 2 + 1); ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_x_solve4[i][slice[i][stage-1][0]], 2, NULL, x_solve4_gw[i][slice[i][stage-1][0]], x_solve4_lw[i][slice[i][stage-1][0]], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for x_solve4"); } for (i = 0; i < num_devices; i++) { CHECK_FINISH(i * 2); } } else { for (i = 0; i < num_devices; i++) { ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_x_solve5[i][slice[i][stage-1][0]], 2, NULL, x_solve5_gw[i][slice[i][stage-1][0]], x_solve5_lw[i][slice[i][stage-1][0]], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for x_solve5"); } for (i = 0; i < num_devices; i++) { CHECK_FINISH(i * 2); } } //--------------------------------------------------------------------- // send on information to the previous processor, if needed //--------------------------------------------------------------------- if (stage != 1) { for (i = 0; i < num_devices; i++) { ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_x_solve6[i][slice[i][stage-1][0]], X_SOLVE6_DIM, NULL, x_solve6_gw[i][slice[i][stage-1][0]], x_solve6_lw[i][slice[i][stage-1][0]], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for x_solve6"); } for (i = 0; i < num_devices; i++) { CHECK_FINISH(i * 2); } //--------------------------------------------------------------------- // pack and send the buffer //--------------------------------------------------------------------- for (i = 0; i < num_devices; i++) { CHECK_FINISH(i * 2); c = slice[i][stage-1][0]; buffer_size 
= (cell_size[i][c][1]-cell_start[i][c][1]-cell_end[i][c][1]) * (cell_size[i][c][2]-cell_start[i][c][2]-cell_end[i][c][2]); ecode = clEnqueueCopyBuffer(cmd_queue[predecessor[i][0] * 2 + 1], m_out_buffer[i], m_in_buffer[predecessor[i][0]], 0, 0, 10*buffer_size*sizeof(double), 0, NULL, NULL); } } //--------------------------------------------------------------------- // If this was the last stage, do the block-diagonal inversion //--------------------------------------------------------------------- if (stage == 1) { for (i = 0; i < num_devices; i++) { ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2], k_ninvr[i][slice[i][stage-1][0]], NINVR_DIM, NULL, ninvr_gw[i][slice[i][stage-1][0]], ninvr_lw[i][slice[i][stage-1][0]], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRange() for ninvr"); } for (i = 0; i < num_devices; i++) { CHECK_FINISH(i * 2); } } } for (i = 0; i < num_devices; i++) CHECK_FINISH(i * 2); if (timeron) timer_stop(t_xsolve); }