/************************************************************************** Function: reference_jacobi This routine contains the main iteration loop for the Jacobi iteration reference implementation (no OpenCL). params: a two arrays to compute solution into max_iter maximum number of iterations size size of array for this MPI rank tolerance all differences should be les than this tolerance value mpi_ranks number of MPI ranks in each dimension rank_pos cartesian position of this rank origin origin for this rank d discretion size mpi_comm MPI communications structure **************************************************************************/ static void reference_jacobi(value_type *a[2], unsigned int max_iter, size_t size[DIMENSIONS], value_type tolerance, value_type d[DIMENSIONS]) { unsigned int rc, iter = 0; value_type max_diff, timer; /* init arrays by setting the initial value and the boundary conditions */ set_initial_solution(a[OLD], size, INITIAL_GUESS); set_initial_solution(a[NEW], size, INITIAL_GUESS); set_boundary_conditions(a[OLD], size, d); set_boundary_conditions(a[NEW], size, d); /* print the initial solution guess */ print_array("Init ", a[NEW], size, d); /* iterate until maximum difference is less than the given tolerance or number of iterations is too high */ do { /* swap array pointers for next iteration */ SWAP_PTR(a[OLD], a[NEW]); /* iterate using a[OLD] as the input and a[NEW] as the output */ max_diff = reference_jacobi_kernel(a[OLD], a[NEW], size); /* output status for user, overwrite the same line */ if (0 == iter % 100) { printf("Iteration=%5d, max difference=%0.7f, target=%0.7f\r", iter, max_diff, tolerance); fflush(stdout); } /* increment counter */ iter++; } while (max_diff > tolerance && max_iter > iter); /* do loop */ /* output final iteration count and maximum difference value */ printf("Iteration=%5d, max difference=%0.7f, execution time=%.3f seconds\n", iter, max_diff, timer); }
int invert_doublet_eo_quda(spinor * const Even_new_s, spinor * const Odd_new_s, spinor * const Even_new_c, spinor * const Odd_new_c, spinor * const Even_s, spinor * const Odd_s, spinor * const Even_c, spinor * const Odd_c, const double precision, const int max_iter, const int solver_flag, const int rel_prec, const int even_odd_flag, const SloppyPrecision sloppy_precision, CompressionType compression) { spinor ** solver_field = NULL; const int nr_sf = 4; init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); convert_eo_to_lexic(solver_field[0], Even_s, Odd_s); convert_eo_to_lexic(solver_field[1], Even_c, Odd_c); // convert_eo_to_lexic(g_spinor_field[DUM_DERI+1], Even_new, Odd_new); void *spinorIn = (void*)solver_field[0]; // source void *spinorIn_c = (void*)solver_field[1]; // charme source void *spinorOut = (void*)solver_field[2]; // solution void *spinorOut_c = (void*)solver_field[3]; // charme solution if ( rel_prec ) inv_param.residual_type = QUDA_L2_RELATIVE_RESIDUAL; else inv_param.residual_type = QUDA_L2_ABSOLUTE_RESIDUAL; inv_param.kappa = g_kappa; // IMPORTANT: use opposite TM mu-flavor since gamma5 -> -gamma5 inv_param.mu = -g_mubar /2./g_kappa; inv_param.epsilon = g_epsbar/2./g_kappa; // figure out which BC to use (theta, trivial...) set_boundary_conditions(&compression); // set the sloppy precision of the mixed prec solver set_sloppy_prec(sloppy_precision); // load gauge after setting precision _loadGaugeQuda(compression); // choose dslash type if( g_c_sw > 0.0 ) { inv_param.dslash_type = QUDA_TWISTED_CLOVER_DSLASH; inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; inv_param.solution_type = QUDA_MAT_SOLUTION; inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER; inv_param.clover_coeff = g_c_sw*g_kappa; } else { inv_param.dslash_type = QUDA_TWISTED_MASS_DSLASH; inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN_ASYMMETRIC; inv_param.solution_type = QUDA_MAT_SOLUTION; } // choose solver if(solver_flag == BICGSTAB) { if(g_proc_id == 0) {printf("# QUDA: Using BiCGstab!\n"); fflush(stdout);} inv_param.inv_type = QUDA_BICGSTAB_INVERTER; } else { /* Here we invert the hermitean operator squared */ inv_param.inv_type = QUDA_CG_INVERTER; if(g_proc_id == 0) { printf("# QUDA: Using mixed precision CG!\n"); printf("# QUDA: mu = %f, kappa = %f\n", g_mu/2./g_kappa, g_kappa); fflush(stdout); } } if( even_odd_flag ) { inv_param.solve_type = QUDA_NORMOP_PC_SOLVE; if(g_proc_id == 0) printf("# QUDA: Using preconditioning!\n"); } else { inv_param.solve_type = QUDA_NORMOP_SOLVE; if(g_proc_id == 0) printf("# QUDA: Not using preconditioning!\n"); } inv_param.tol = sqrt(precision)*0.25; inv_param.maxiter = max_iter; inv_param.twist_flavor = QUDA_TWIST_NONDEG_DOUBLET; inv_param.Ls = 2; // NULL pointers to host fields to force // construction instead of download of the clover field: if( g_c_sw > 0.0 ) loadCloverQuda(NULL, NULL, &inv_param); // reorder spinor reorder_spinor_toQuda( (double*)spinorIn, inv_param.cpu_prec, 1, (double*)spinorIn_c ); // perform the inversion invertQuda(spinorOut, spinorIn, &inv_param); if( inv_param.verbosity == QUDA_VERBOSE ) if(g_proc_id == 0) printf("# QUDA: Device memory used: Spinor: %f GiB, Gauge: %f GiB, Clover: %f GiB\n", inv_param.spinorGiB, gauge_param.gaugeGiB, inv_param.cloverGiB); if( inv_param.verbosity > QUDA_SILENT ) if(g_proc_id == 0) printf("# QUDA: Done: %i iter / %g secs = %g Gflops\n", inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs); // number of CG iterations int iteration = inv_param.iter; // reorder spinor reorder_spinor_fromQuda( (double*)spinorIn, inv_param.cpu_prec, 1, (double*)spinorIn_c ); reorder_spinor_fromQuda( (double*)spinorOut, inv_param.cpu_prec, 1, (double*)spinorOut_c ); convert_lexic_to_eo(Even_s, Odd_s, solver_field[0]); convert_lexic_to_eo(Even_c, Odd_c, solver_field[1]); convert_lexic_to_eo(Even_new_s, Odd_new_s, solver_field[2]); convert_lexic_to_eo(Even_new_c, Odd_new_c, solver_field[3]); finalize_solver(solver_field, nr_sf); freeGaugeQuda(); if(iteration >= max_iter) return(-1); return(iteration); }
int main() { Array3<double> level_set; // level-set field Array3<double> pressure; // pressure field Array3<double> u_1_velocity_old; // velocity field at old time level x1 direction Array3<double> u_2_velocity_old; // velocity field at old time level x2 direction Array3<double> u_3_velocity_old; // velocity field at old time level x3 direction Array3<double> u_1_velocity_new; // velocity field at new time level x1 direction Array3<double> u_2_velocity_new; // velocity field at new time level x2 direction Array3<double> u_3_velocity_new; // velocity field at new time level x3 direction Array3<double> momentum_source_term_u_1; // source term of the momentum equation in x1 direction // defined on all u1 points (including boundaries) Array3<double> momentum_source_term_u_2; // source term of the momentum equation in x2 direction // defined on all u1 points (including boundaries) Array3<double> momentum_source_term_u_3; // source term of the momentum equation in x3 direction // defined on all u1 points (including boundaries) Array3<double> surface_tension_body_force_x1; // source term of the momentum equation in x1 direction // defined on all u1 points (including boundaries) Array3<double> surface_tension_body_force_x2; // source term of the momentum equation in x2 direction // defined on all u1 points (including boundaries) Array3<double> surface_tension_body_force_x3; // source term of the momentum equation in x3 direction // defined on all u1 points (including boundaries) Array3<double> scaled_density_u1; // scaled density for the controlvolumes // of the momentum equation in x1 direction Array3<double> scaled_density_u2; // scaled density for the controlvolumes // of the momentum equation in x2 direction Array3<double> scaled_density_u3; // scaled density for the controlvolumes // of the momentum equation in x3 direction boundary_face boundary_faces[6]; // array with all the information // for the boundary conditions double mesh_width_x1; // grid spacing in x1 direction (uniform) double mesh_width_x2; // grid spacing in x2 direction (uniform) double mesh_width_x3; // grid spacing in x3 direction (uniform) int number_primary_cells_i; // number of primary (pressure) cells in x1 direction int number_primary_cells_j; // number of primary (pressure) cells in x2 direction int number_primary_cells_k; // number of primary (pressure) cells in x3 direction int number_matrix_connections = 7; // number of connections in momentum matrix vector gravity; // gravitational acceleration vector double actual_time_step_navier_stokes; // time step used for level-set advection // computed from all stability restrictions and // possibly subscycling double rho_plus_over_rho_minus; // ratio of the densities of the two phases double smoothing_distance_factor; // the smoothing distance is smoothing_distance_factor // times the smallest mesh width double rho_minus_over_mu_minus; // this was the 'Reynolds' number // in the original implementation of Sander double mu_plus_over_mu_minus; // ratio of the viscosities of both phases double tolerance_pressure = 1e-8; // the tolerance with which the system for the pressure is solved int maximum_iterations_allowed_pressure=1e3; // maximum number of iterations allowed for the // conjugate gradient method double tolerance_velocity=1e-7; // the tolerance with which the system for the momentum predictor is solved int maximum_iterations_allowed_velocity=1e2; // maximum number of iterations allowed for the // conjugate gradient method int continuous_surface_force_model = 1; // =1, the continuous surface force model is applied // =0, the exact interface boundary conditions are applied int source_terms_in_momentum_predictor =1; // =1, the source terms are applied in the momentum predictor // equation // =0, the source terms are applied in the momentum corrector // equation vector initial_velocity; // some initial velocity to fill the arrays int i, j, k; // local variables for loop indexing double x,y,z; // coordinates double x_length = 1; double y_length = 2; double z_length = 3; double a,b,c,d,e,f,g,h,l,m,n,o,p,q,r,s,t,u; // input coefficients double u_1, u_2, u_3; double viscosity_over_density; double output_polynomial; double scaled_density_constant = 1; initial_velocity.u1 = 0.0; initial_velocity.u2 = 0.0; initial_velocity.u3 = 0.0; smoothing_distance_factor = 2; rho_plus_over_rho_minus = 10.0; rho_minus_over_mu_minus = 1e20; mu_plus_over_mu_minus = 1; gravity.u1 = 00.0; gravity.u2 = 00.0; gravity.u3 = 00.0; actual_time_step_navier_stokes = 0.1; set_boundary_conditions(boundary_faces, initial_velocity); // set_boundary_conditions(boundary_face[6], initial_velocity); number_primary_cells_i = pow(2,1); number_primary_cells_j = pow(2,2); number_primary_cells_k = pow(2,2); mesh_width_x1 = x_length/number_primary_cells_i; mesh_width_x2 = y_length/number_primary_cells_j; mesh_width_x3 = z_length/number_primary_cells_k; viscosity_over_density = (1/scaled_density_constant)*compute_scaled_viscosity(100,mesh_width_x1,mesh_width_x2,mesh_width_x3, smoothing_distance_factor, rho_minus_over_mu_minus,mu_plus_over_mu_minus); level_set.create(number_primary_cells_i+2,number_primary_cells_j+2,number_primary_cells_k+2); set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+2, number_primary_cells_k+2, level_set, 0.1); pressure.create(number_primary_cells_i+2, number_primary_cells_j+2, number_primary_cells_k+2); set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+2, number_primary_cells_k+2, pressure, 10000.0); // velocity old u_1_velocity_old.create(number_primary_cells_i+1, number_primary_cells_j+2, number_primary_cells_k+2); u_2_velocity_old.create(number_primary_cells_i+2, number_primary_cells_j+1, number_primary_cells_k+2); u_3_velocity_old.create(number_primary_cells_i+2, number_primary_cells_j+2, number_primary_cells_k+1); set_constant_matrix2(number_primary_cells_i+1, number_primary_cells_j+2, number_primary_cells_k+2, u_1_velocity_old, initial_velocity.u1); set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+1, number_primary_cells_k+2, u_2_velocity_old, initial_velocity.u2); set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+2, number_primary_cells_k+1, u_3_velocity_old, initial_velocity.u3); // multiple input sets of parameters // a=1; b=2; c=3; d=4; e=5; f=6; g=7; h=8; l=9; m=1; n=2; o=3; p=4; q=5; r=6; s=7; t=8; u=9; // full test a=0; b=0; c=3; d=4; e=5; f=6; g=7; h=8; l=0; m=0; n=-3; o=3; p=4; q=5; r=6; s=7; t=0; u=0; // divergence free velocity input b+m+n = 0 // a=0; b=2; c=0; d=4; e=0; f=6; g=0; h=8; l=0; m=1; n=0; o=3; p=0; q=5; r=0; s=7; t=0; u=9; // linear test // a=0; b=0; c=0; d=0; e=0; f=0; g=0; h=1; l=0; m=0; n=0; o=0; p=0; q=0; r=0; s=0; t=0; u=0; // random test // a=0; b=1; c=0; d=0; e=0; f=0; g=0; h=0; l=0; m=0; n=0; o=0; p=0; q=0; r=0; s=0; t=0; u=0; // random test // a=1; b=2; c=3; d=4; e=5; f=6; g=7; h=8; l=9; m=1; n=2; o=3; p=4; q=5; r=6; s=7; t=8; u=9; // test for diffusion, only quadratic terms for(i=0;i<number_primary_cells_i+2;i++) { for(j=0;j<number_primary_cells_j+2;j++) { for(k=0;k<number_primary_cells_k+2;k++) { x = x_length * i/number_primary_cells_i -0.5*mesh_width_x1; // printf("x = %f \n",x); y = y_length * j/number_primary_cells_j -0.5*mesh_width_x2; // printf("y = %f \n",y); z = z_length * k/number_primary_cells_k -0.5*mesh_width_x3; if(i!=number_primary_cells_i+1) {x += 0.5*mesh_width_x1; // printf("x = %f \t",x); u_1_velocity_old[i][j][k]= a*x*x+b*x +c*y*y+d*y +e*z*z+f*z; // 1st test // u_1_velocity_old[i][j][k]= a*x*x+b*y*y+c*z*z +d*x*y+e*x*z +f*y*z; // 2nd test, only quadratic terms // printf("u_1 = %f \n", u_1_velocity_old[i][j][k]); x -= 0.5*mesh_width_x1;} if(j!=number_primary_cells_j+1) {y += 0.5*mesh_width_x2; u_2_velocity_old[i][j][k]= g*x*x+h*x +l*y*y+m*y +n*z*z+o*z; // 1st test // u_2_velocity_old[i][j][k]= g*x*x+h*y*y+l*z*z +m*x*y+n*x*z +o*y*z; // 2nd test, only quadratic terms y -= 0.5*mesh_width_x2;} if(k!=number_primary_cells_k+1) {z += 0.5*mesh_width_x3; u_3_velocity_old[i][j][k]= p*x*x+q*x +r*y*y+s*y +t*z*z+u*z; // 1st test // u_3_velocity_old[i][j][k]= p*x*x+q*y*y+r*z*z +s*x*y+t*x*z +u*y*z; // 2nd test, only quadratic terms z -= 0.5*mesh_width_x3;} } } } // velocity new u_1_velocity_new.create(number_primary_cells_i+1, number_primary_cells_j+2, number_primary_cells_k+2); u_2_velocity_new.create(number_primary_cells_i+2, number_primary_cells_j+1, number_primary_cells_k+2); u_3_velocity_new.create(number_primary_cells_i+2, number_primary_cells_j+2, number_primary_cells_k+1); set_constant_matrix2(number_primary_cells_i+1, number_primary_cells_j+2, number_primary_cells_k+2, u_1_velocity_new, initial_velocity.u1); set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+1, number_primary_cells_k+2, u_2_velocity_new, initial_velocity.u2); set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+2, number_primary_cells_k+1, u_3_velocity_new, initial_velocity.u3); // momentum source terms momentum_source_term_u_1.create(number_primary_cells_i+1, number_primary_cells_j+2, number_primary_cells_k+2); momentum_source_term_u_2.create(number_primary_cells_i+2, number_primary_cells_j+1, number_primary_cells_k+2); momentum_source_term_u_3.create(number_primary_cells_i+2, number_primary_cells_j+2, number_primary_cells_k+1); set_constant_matrix2(number_primary_cells_i+1, number_primary_cells_j+2, number_primary_cells_k+2, momentum_source_term_u_1, 0.0); set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+1, number_primary_cells_k+2, momentum_source_term_u_2, 0.0); set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+2, number_primary_cells_k+1, momentum_source_term_u_3, 0.0); // surface_tension_body_force surface_tension_body_force_x1.create(number_primary_cells_i+1, number_primary_cells_j+2, number_primary_cells_k+2); surface_tension_body_force_x2.create(number_primary_cells_i+2, number_primary_cells_j+1, number_primary_cells_k+2); surface_tension_body_force_x3.create(number_primary_cells_i+2, number_primary_cells_j+2, number_primary_cells_k+1); set_constant_matrix2(number_primary_cells_i+1, number_primary_cells_j+2, number_primary_cells_k+2, surface_tension_body_force_x1, 0); set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+1, number_primary_cells_k+2, surface_tension_body_force_x2, 0); set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+2, number_primary_cells_k+1, surface_tension_body_force_x3, 0); // scaled density scaled_density_u1.create(number_primary_cells_i+1, number_primary_cells_j+2, number_primary_cells_k+2); scaled_density_u2.create(number_primary_cells_i+2, number_primary_cells_j+1, number_primary_cells_k+2); scaled_density_u3.create(number_primary_cells_i+2, number_primary_cells_j+2, number_primary_cells_k+1); set_constant_matrix2(number_primary_cells_i+1, number_primary_cells_j+2, number_primary_cells_k+2, scaled_density_u1, 1); set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+1, number_primary_cells_k+2, scaled_density_u2, 1); set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+2, number_primary_cells_k+1, scaled_density_u3, 1); advance_flow_field(level_set, pressure, u_1_velocity_old, u_2_velocity_old, u_3_velocity_old, u_1_velocity_new, u_2_velocity_new, u_3_velocity_new, momentum_source_term_u_1, momentum_source_term_u_2, momentum_source_term_u_3, surface_tension_body_force_x1, surface_tension_body_force_x2, surface_tension_body_force_x3, scaled_density_u1, scaled_density_u2, scaled_density_u3, boundary_faces, mesh_width_x1, mesh_width_x2, mesh_width_x3, number_primary_cells_i, number_primary_cells_j, number_primary_cells_k, number_matrix_connections, gravity, actual_time_step_navier_stokes, rho_plus_over_rho_minus, smoothing_distance_factor, rho_minus_over_mu_minus, mu_plus_over_mu_minus, tolerance_pressure, maximum_iterations_allowed_pressure, tolerance_velocity, maximum_iterations_allowed_velocity, continuous_surface_force_model, source_terms_in_momentum_predictor ); // check for u,v,w momentum for(i=1;i<number_primary_cells_i+1;i++) { for(j=1;j<number_primary_cells_j+1;j++) { for(k=1;k<number_primary_cells_k+1;k++) { x = x_length * i/number_primary_cells_i -0.5*mesh_width_x1; y = y_length * j/number_primary_cells_j -0.5*mesh_width_x2; z = z_length * k/number_primary_cells_k -0.5*mesh_width_x3; printf("\n i=%i j=%i k=%i \n", i,j,k); if(i!=number_primary_cells_i){ x += 0.5*mesh_width_x1; u_1 = a*x*x+b*x +c*y*y+d*y +e*z*z+f*z; u_2 = g*x*x+h*x +l*y*y+m*y +n*z*z+o*z; u_3 = p*x*x+q*x +r*y*y+s*y +t*z*z+u*z; output_polynomial = 2*u_1*(2*a*x+b) + u_1*(2*y*l+m) + u_2*(2*y*c+d) + u_1*(2*z*t+u) + u_3*(2*z*e+f) +viscosity_over_density*(2*2*a+2*c+2*e); // 1st test // output_polynomial = viscosity_over_density*(4*a+2*b+m+2*c+t); // 2nd test, only quadratic terms x -= 0.5*mesh_width_x1; printf("\n u_1_velocity_old = %f \n", u_1); printf("u_1_velocity_new = %f \n", u_1_velocity_new[i][j][k]); printf("new - old = %f \n", u_1_velocity_new[i][j][k]-u_1); printf("polynomial = %f \n", output_polynomial); } if(j!=number_primary_cells_j){ y += 0.5*mesh_width_x2; u_1 = a*x*x+b*x +c*y*y+d*y +e*z*z+f*z; u_2 = g*x*x+h*x +l*y*y+m*y +n*z*z+o*z; u_3 = p*x*x+q*x +r*y*y+s*y +t*z*z+u*z; output_polynomial = u_1*(2*x*g+h)+u_2*(2*x*a+b) + 2*u_2*(2*y*l+m) + u_3*(2*z*n+o)+u_2*(2*z*t+u) +viscosity_over_density*(2*2*l+2*g+2*n); // 1st test // output_polynomial = viscosity_over_density*(4*h+2*g+d+2*l+u); // 2nd test, only quadratic terms y -= 0.5*mesh_width_x2; printf(" \n u_2_velocity_old = %f \n", u_2); printf("u_2_velocity_new = %f \n", u_2_velocity_new[i][j][k]); printf("new - old = %f \n", u_2_velocity_new[i][j][k]-u_2); printf("polynomial = %f \n", output_polynomial); } if(k!=number_primary_cells_k){ z += 0.5*mesh_width_x3; u_1 = a*x*x+b*x +c*y*y+d*y +e*z*z+f*z; u_2 = g*x*x+h*x +l*y*y+m*y +n*z*z+o*z; u_3 = p*x*x+q*x +r*y*y+s*y +t*z*z+u*z; output_polynomial = u_1*(2*x*p+q)+u_3*(2*x*a+b) + u_2*(2*y*r+s)+u_3*(2*y*l+m) + 2*u_3*(2*z*t+u) +viscosity_over_density*(2*2*t+2*p+2*r); // 1st test // output_polynomial = viscosity_over_density*(4*r+2*p+e+2*q+o); // 2nd test, only quadratic terms z -= 0.5*mesh_width_x3; printf("\n u_3_velocity_old = %f \n", u_3); printf("u_3_velocity_new = %f \n", u_3_velocity_new[i][j][k]); printf("new - old = %f \n", u_3_velocity_new[i][j][k]-u_3); printf("polynomial = %f \n", output_polynomial); } }}} // printf("u_1_velocity_new = %f \n", u_1_velocity_new[i][j][k]); // printf("u_2_velocity_old = %f \n", u_2_velocity_old[i][j][k]); // printf("u_2_velocity_new = %f \n", u_2_velocity_new[i][j][k]); // printf("u_3_velocity_old = %f \n", u_3_velocity_old[i][j][k]); // printf("u_3_velocity_new = %f \n", u_3_velocity_new[i][j][k]); // printf("pressure = %f \n", pressure[i][j][k]); printf("%s \n", "advance_flow_field_unit_test finish"); return 0; }
/************************************************************************** Function: ocl_jacobi This routine contains the main iteration loop for the Jacobi iteration using OpenCL kernel. params: a two arrays to compute solution into max_iter maximum number of iterations size size of array for this MPI rank tolerance all differences should be les than this tolerance value mpi_ranks number of MPI ranks in each dimension rank_pos cartesian position of this rank origin origin for this rank d discretion size mpi_comm MPI communications structure local_workblock_size size of local workblock for OpenCL kernel device_type OpenCL device type full_copy boolean if full buffer copy is to be done **************************************************************************/ static void ocl_jacobi(value_type *a[2], unsigned int max_iter, size_t size[DIMENSIONS], value_type tolerance, value_type d[DIMENSIONS], size_t local_workblock_size[DIMENSIONS], cl_device_type device_type, unsigned int full_copy) { size_t array_size; unsigned int i, j, rc, iter = 0; size_t delta_buffer_size, delta_size[DIMENSIONS]; size_t tile_delta_size, tile_cache_size; value_type max_diff, timer; icl_device* device_id; icl_kernel* kernel; cl_int err; icl_buffer *a_buf[2], *delta_buf; value_type *delta; /* convenience for y stride in array */ cl_uint ystride = size[Y]+2*GHOST_CELL_WIDTH; /* init devices */ icl_init_devices(device_type); /* find OpenCL device */ device_id = icl_get_device(0); /* build the kernel and verify the kernel */ kernel = icl_create_kernel(device_id, "jacsolver_kernel.cl", "ocl_jacobi_local_copy", "", ICL_SOURCE); /* calculate size of kernel local memory - also used later for kernel params */ tile_delta_size = local_workblock_size[X] * local_workblock_size[Y]; tile_cache_size = (local_workblock_size[X]+2*GHOST_CELL_WIDTH) * (local_workblock_size[Y]+2*GHOST_CELL_WIDTH); /* verify the device has enough resources for this device */ /* I'm an optimist, we just hope for the best if ((cluGetAvailableLocalMem(device_id, kernel) < tile_delta_size + tile_cache_size) || (! cluCheckLocalWorkgroupSize(device_id, kernel, DIMENSIONS, local_workblock_size))) { local_workblock_size[X] = 1; local_workblock_size[Y] = 1; } */ printf("Estimating solution using OpenCL Jacobi iteration with %d x %d workblock.\n", (int)local_workblock_size[X], (int)local_workblock_size[Y]); fflush(stdout); /* init arrays by setting the initial value and the boundary conditions */ set_initial_solution(a[OLD], size, INITIAL_GUESS); set_initial_solution(a[NEW], size, INITIAL_GUESS); set_boundary_conditions(a[OLD], size, d); set_boundary_conditions(a[NEW], size, d); /* print the initial solution guess */ print_array("Init ", a[NEW], size, d); /* allocate memory for differences */ delta_size[X] = size[X] / local_workblock_size[X]; delta_size[Y] = size[Y] / local_workblock_size[Y]; delta_buffer_size = delta_size[X] * delta_size[Y]; delta = (value_type *)malloc(sizeof(value_type) * delta_buffer_size); /* initialize deltas so that first execution of kernel with overlapping * reduction on the host will work correctly and not prematurely exit */ for (i=0; i<delta_size[X]; ++i) { for (j=0; j<delta_size[Y]; ++j) { delta[i * delta_size[Y] + j] = 1.0; } } /* create buffers for OpenCL device using host memory */ array_size = (size[X]+2*GHOST_CELL_WIDTH) * ystride; a_buf[OLD] = icl_create_buffer(device_id, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(value_type) * array_size); a_buf[NEW] = icl_create_buffer(device_id, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(value_type) * array_size); delta_buf = icl_create_buffer(device_id, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(value_type) * delta_buffer_size); /* copy over buffers to device */ icl_write_buffer(a_buf[OLD], CL_TRUE, sizeof(value_type) * array_size, a[OLD], NULL, NULL); icl_write_buffer(a_buf[NEW], CL_TRUE, sizeof(value_type) * array_size, a[NEW], NULL, NULL); /* set the kernel execution type - data parallel */ // cluSetKernelNDRange(clu, kernel, DIMENSIONS, NULL, size, local_workblock_size); /* iterate until maximum difference is less than the given tolerance or number of iterations is too high */ do { /* swap array pointers for next iteration */ SWAP_PTR(a[OLD], a[NEW]); SWAP_BUF(a_buf[OLD], a_buf[NEW]); icl_run_kernel(kernel, DIMENSIONS, size, local_workblock_size, NULL, NULL, 6, (size_t)0,(void *) a_buf[OLD], (size_t)0, (void *) a_buf[NEW], sizeof(value_type) * tile_delta_size, NULL, sizeof(value_type) * tile_cache_size, NULL, (size_t)0, (void *) delta_buf, sizeof(cl_uint), (void *) &ystride); /* while the kernel is running, calculate the reduction for the previous iteration */ max_diff = ocl_jacobi_reduce(delta, delta_size); /* enqueue a synchronous copy of the delta. This will not occur until the kernel * has finished. The deltas for each workgroup is a much smaller array to process */ icl_read_buffer(delta_buf, CL_TRUE, sizeof(value_type) * delta_buffer_size, delta, NULL, NULL); // clEnqueueReadBuffer(queue, a_buf[NEW], CL_TRUE, 0, sizeof(value_type) * array_size, a[NEW], 0, NULL, NULL)); /* output status for user, overwrite the same line */ if ((0 == iter % 100)) { printf("Iteration=%5d, max difference=%0.7f, target=%0.7f\r", iter, max_diff, tolerance); fflush(stdout); } /* increment the iteration counter */ iter++; } while (max_diff > tolerance && max_iter >= iter); /* do loop */ /* read back the final result */ icl_read_buffer(a_buf[NEW], CL_TRUE, sizeof(value_type) * array_size, a[NEW], NULL, NULL); /* output final iteration count and maximum difference value */ printf("Iteration=%5d, max difference=%0.7f, execution time=%.3f seconds\n", iter-1, max_diff, timer); fflush(stdout); /* finish usage of OpenCL device */ icl_release_buffers(3, a_buf[OLD], a_buf[NEW], delta_buf); icl_release_kernel(kernel); free(delta); }