// host stub function
// OPS-generated host stub for parallel loop "advec_mom_kernel_post_pre_advec_z"
// (kernel index 136). Unpacks the kernel descriptor, derives the local
// iteration range, sets up base-offset data pointers, then runs an OpenMP
// collapse(2) triple loop that forms node_mass_post as a 0.125-weighted
// 8-point density1*post_vol stencil and node_mass_pre from the z-direction
// node_flux difference. Timing / transfer stats go to OPS_kernels[136]
// when OPS_diags > 1.
void ops_par_loop_advec_mom_kernel_post_pre_advec_z_execute(
    ops_kernel_descriptor *desc) {
  ops_block block = desc->block;
  int dim = desc->dim;
  int *range = desc->range;
  ops_arg arg0 = desc->args[0];
  ops_arg arg1 = desc->args[1];
  ops_arg arg2 = desc->args[2];
  ops_arg arg3 = desc->args[3];
  ops_arg arg4 = desc->args[4];

  // Timing
  double t1, t2, c1, c2;

  ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4};

#ifdef CHECKPOINTING
  if (!ops_checkpointing_before(args, 5, range, 136))
    return;
#endif

  if (OPS_diags > 1) {
    OPS_kernels[136].count++;
    ops_timers_core(&c2, &t2);
  }

  // compute locally allocated range for the sub-block
  int start[3];
  int end[3];
  for (int n = 0; n < 3; n++) {
    start[n] = range[2 * n];
    end[n] = range[2 * n + 1];
  }

#ifdef OPS_DEBUG
  ops_register_args(args, "advec_mom_kernel_post_pre_advec_z");
#endif

  // set up initial pointers and exchange halos if necessary
  int base0 = args[0].dat->base_offset;
  double *__restrict__ node_mass_post = (double *)(args[0].data + base0);

  int base1 = args[1].dat->base_offset;
  const double *__restrict__ post_vol = (double *)(args[1].data + base1);

  int base2 = args[2].dat->base_offset;
  const double *__restrict__ density1 = (double *)(args[2].data + base2);

  int base3 = args[3].dat->base_offset;
  double *__restrict__ node_mass_pre = (double *)(args[3].data + base3);

  int base4 = args[4].dat->base_offset;
  const double *__restrict__ node_flux = (double *)(args[4].data + base4);

  // initialize global variable with the dimension of dats
  int xdim0_advec_mom_kernel_post_pre_advec_z = args[0].dat->size[0];
  int ydim0_advec_mom_kernel_post_pre_advec_z = args[0].dat->size[1];
  int xdim1_advec_mom_kernel_post_pre_advec_z = args[1].dat->size[0];
  int ydim1_advec_mom_kernel_post_pre_advec_z = args[1].dat->size[1];
  int xdim2_advec_mom_kernel_post_pre_advec_z = args[2].dat->size[0];
  int ydim2_advec_mom_kernel_post_pre_advec_z = args[2].dat->size[1];
  int xdim3_advec_mom_kernel_post_pre_advec_z = args[3].dat->size[0];
  int ydim3_advec_mom_kernel_post_pre_advec_z = args[3].dat->size[1];
  int xdim4_advec_mom_kernel_post_pre_advec_z = args[4].dat->size[0];
  int ydim4_advec_mom_kernel_post_pre_advec_z = args[4].dat->size[1];

  if (OPS_diags > 1) {
    ops_timers_core(&c1, &t1);
    OPS_kernels[136].mpi_time += t1 - t2;
  }

#pragma omp parallel for collapse(2)
  for (int n_z = start[2]; n_z < end[2]; n_z++) {
    for (int n_y = start[1]; n_y < end[1]; n_y++) {
#ifdef intel
#pragma loop_count(10000)
#pragma omp simd aligned(node_mass_post, post_vol, density1, node_mass_pre,    \
                         node_flux)
#else
#pragma simd
#endif
      for (int n_x = start[0]; n_x < end[0]; n_x++) {
        // 8-point corner average of density*volume around the node
        // (OPS_ACC* are the OPS-generated indexing macros, defined elsewhere).
        node_mass_post[OPS_ACC0(0, 0, 0)] =
            0.125 *
            (density1[OPS_ACC2(0, -1, 0)] * post_vol[OPS_ACC1(0, -1, 0)] +
             density1[OPS_ACC2(0, 0, 0)] * post_vol[OPS_ACC1(0, 0, 0)] +
             density1[OPS_ACC2(-1, -1, 0)] * post_vol[OPS_ACC1(-1, -1, 0)] +
             density1[OPS_ACC2(-1, 0, 0)] * post_vol[OPS_ACC1(-1, 0, 0)] +
             density1[OPS_ACC2(0, -1, -1)] * post_vol[OPS_ACC1(0, -1, -1)] +
             density1[OPS_ACC2(0, 0, -1)] * post_vol[OPS_ACC1(0, 0, -1)] +
             density1[OPS_ACC2(-1, -1, -1)] * post_vol[OPS_ACC1(-1, -1, -1)] +
             density1[OPS_ACC2(-1, 0, -1)] * post_vol[OPS_ACC1(-1, 0, -1)]);
        // pre-advection nodal mass = post mass minus net z-direction flux
        node_mass_pre[OPS_ACC3(0, 0, 0)] = node_mass_post[OPS_ACC0(0, 0, 0)] -
                                           node_flux[OPS_ACC4(0, 0, -1)] +
                                           node_flux[OPS_ACC4(0, 0, 0)];
      }
    }
  }
  if (OPS_diags > 1) {
    ops_timers_core(&c2, &t2);
    OPS_kernels[136].time += t2 - t1;
  }

  if (OPS_diags > 1) {
    // Update kernel record
    ops_timers_core(&c1, &t1);
    OPS_kernels[136].mpi_time += t1 - t2;
    OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg0);
    OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg1);
    OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg2);
    OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg3);
    OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg4);
  }
}
// Halo update for x-velocity on the +x (right) face: each halo point takes
// the negated value of the point four cells inward in x. Each field is
// updated only when its flag in `fields` is set.
// NOTE(review): OPS_ACC0/OPS_ACC1 are OPS-generated index macros defined
// elsewhere in the generated code.
inline void update_halo_kernel2_xvel_minus_4_right(double *xvel0, double *xvel1,
                                                   const int *fields) {
  if (fields[FIELD_XVEL0] == 1) {
    xvel0[OPS_ACC0(0, 0, 0)] = -xvel0[OPS_ACC0(-4, 0, 0)];
  }
  if (fields[FIELD_XVEL1] == 1) {
    xvel1[OPS_ACC1(0, 0, 0)] = -xvel1[OPS_ACC1(-4, 0, 0)];
  }
}
// 2D halo update for the x-direction fluxes (bottom/"b" variant): each halo
// point takes the negated value of the point two cells inward in x.
// Each flux field is updated only when its flag in `fields` is set.
inline void update_halo_kernel3_minus_2_b(double *vol_flux_x,
                                          double *mass_flux_x,
                                          const int *fields) {
  if (fields[FIELD_VOL_FLUX_X] == 1) {
    vol_flux_x[OPS_ACC0(0, 0)] = -(vol_flux_x[OPS_ACC0(-2, 0)]);
  }
  if (fields[FIELD_MASS_FLUX_X] == 1) {
    mass_flux_x[OPS_ACC1(0, 0)] = -(mass_flux_x[OPS_ACC1(-2, 0)]);
  }
}
// user function
// Resets the start-of-step velocities: copies xvel1 -> xvel0 and
// yvel1 -> yvel0 at the current point (2D variant).
inline void reset_field_kernel2(double *xvel0, const double *xvel1,
                                double *yvel0, const double *yvel1) {
  xvel0[OPS_ACC0(0, 0)] = xvel1[OPS_ACC1(0, 0)];
  yvel0[OPS_ACC2(0, 0)] = yvel1[OPS_ACC3(0, 0)];
}
//user function
// 3D chunk initialiser. Seeds every point with state 0's energy/density and
// velocities, then for each additional state overwrites points that fall
// inside the state's geometry: axis-aligned cube (g_cube), sphere (g_sphe),
// or exact vertex match (g_point). Velocities are written to the 2x2x2 block
// of nodes surrounding the matched cell.
// Reads the globals `states` and `number_of_states` (declared elsewhere);
// OPS_ACC* are OPS-generated index macros.
inline void generate_chunk_kernel(const double *vertexx, const double *vertexy,
                                  const double *vertexz, double *energy0,
                                  double *density0, double *xvel0,
                                  double *yvel0, double *zvel0,
                                  const double *cellx, const double *celly,
                                  const double *cellz) {
  double radius, x_cent, y_cent, z_cent;

  // State 0 provides the background values.
  energy0[OPS_ACC3(0, 0, 0)] = states[0].energy;
  density0[OPS_ACC4(0, 0, 0)] = states[0].density;
  xvel0[OPS_ACC5(0, 0, 0)] = states[0].xvel;
  yvel0[OPS_ACC6(0, 0, 0)] = states[0].yvel;
  zvel0[OPS_ACC7(0, 0, 0)] = states[0].zvel;

  for (int i = 1; i < number_of_states; i++) {
    // Geometry reference point is the state's minimum corner.
    x_cent = states[i].xmin;
    y_cent = states[i].ymin;
    z_cent = states[i].zmin;

    if (states[i].geometry == g_cube) {
      // Cell is inside the cube when its vertex interval overlaps the
      // state's [min, max) range in every dimension.
      if (vertexx[OPS_ACC0(1, 0, 0)] >= states[i].xmin &&
          vertexx[OPS_ACC0(0, 0, 0)] < states[i].xmax) {
        if (vertexy[OPS_ACC1(0, 1, 0)] >= states[i].ymin &&
            vertexy[OPS_ACC1(0, 0, 0)] < states[i].ymax) {
          if (vertexz[OPS_ACC2(0, 0, 1)] >= states[i].zmin &&
              vertexz[OPS_ACC2(0, 0, 0)] < states[i].zmax) {
            energy0[OPS_ACC3(0, 0, 0)] = states[i].energy;
            density0[OPS_ACC4(0, 0, 0)] = states[i].density;
            for (int ix = 0; ix < 2; ix++) {
              for (int iy = 0; iy < 2; iy++) {
                for (int iz = 0; iz < 2; iz++) {
                  xvel0[OPS_ACC5(ix, iy, iz)] = states[i].xvel;
                  yvel0[OPS_ACC6(ix, iy, iz)] = states[i].yvel;
                  zvel0[OPS_ACC7(ix, iy, iz)] = states[i].zvel;
                }
              }
            }
          }
        }
      }
    } else if (states[i].geometry == g_sphe) {
      // Distance from the cell centre to the state's reference point.
      radius = sqrt((cellx[OPS_ACC8(0, 0, 0)] - x_cent) *
                        (cellx[OPS_ACC8(0, 0, 0)] - x_cent) +
                    (celly[OPS_ACC9(0, 0, 0)] - y_cent) *
                        (celly[OPS_ACC9(0, 0, 0)] - y_cent) +
                    (cellz[OPS_ACC10(0, 0, 0)] - z_cent) *
                        (cellz[OPS_ACC10(0, 0, 0)] - z_cent));
      if (radius <= states[i].radius) {
        energy0[OPS_ACC3(0, 0, 0)] = states[i].energy;
        density0[OPS_ACC4(0, 0, 0)] = states[i].density;
        for (int ix = 0; ix < 2; ix++) {
          for (int iy = 0; iy < 2; iy++) {
            for (int iz = 0; iz < 2; iz++) {
              xvel0[OPS_ACC5(ix, iy, iz)] = states[i].xvel;
              yvel0[OPS_ACC6(ix, iy, iz)] = states[i].yvel;
              zvel0[OPS_ACC7(ix, iy, iz)] = states[i].zvel;
            }
          }
        }
      }
    } else if (states[i].geometry == g_point) {
      // Exact vertex coincidence with the reference point.
      if (vertexx[OPS_ACC0(0, 0, 0)] == x_cent &&
          vertexy[OPS_ACC1(0, 0, 0)] == y_cent &&
          vertexz[OPS_ACC2(0, 0, 0)] == z_cent) {
        energy0[OPS_ACC3(0, 0, 0)] = states[i].energy;
        density0[OPS_ACC4(0, 0, 0)] = states[i].density;
        for (int ix = 0; ix < 2; ix++) {
          for (int iy = 0; iy < 2; iy++) {
            for (int iz = 0; iz < 2; iz++) {
              xvel0[OPS_ACC5(ix, iy, iz)] = states[i].xvel;
              yvel0[OPS_ACC6(ix, iy, iz)] = states[i].yvel;
              zvel0[OPS_ACC7(ix, iy, iz)] = states[i].zvel;
            }
          }
        }
      }
    }
  }
}
// Halo update for the z-direction fluxes on the front face: each halo point
// takes the negated value of the point four cells inward in z. Each flux
// field is updated only when its flag in `fields` is set.
inline void update_halo_kernel5_minus_4_front(double *vol_flux_z,
                                              double *mass_flux_z,
                                              const int *fields) {
  if (fields[FIELD_VOL_FLUX_Z] == 1) {
    vol_flux_z[OPS_ACC0(0, 0, 0)] = -vol_flux_z[OPS_ACC0(0, 0, -4)];
  }
  if (fields[FIELD_MASS_FLUX_Z] == 1) {
    mass_flux_z[OPS_ACC1(0, 0, 0)] = -mass_flux_z[OPS_ACC1(0, 0, -4)];
  }
}
const double *restrict pressure, const double *restrict density0, double *restrict density1, const double *restrict viscosity, const double *restrict energy0, double *restrict energy1, const double *restrict zarea, const double *restrict zvel0, int x_size, int y_size, int z_size) { #pragma omp parallel for for (int n_z = 0; n_z < z_size; n_z++) { for (int n_y = 0; n_y < y_size; n_y++) { for (int n_x = 0; n_x < x_size; n_x++) { double recip_volume, energy_change; double right_flux, left_flux, top_flux, bottom_flux, back_flux, front_flux, total_flux; left_flux = (xarea[OPS_ACC0(0, 0, 0)] * (xvel0[OPS_ACC1(0, 0, 0)] + xvel0[OPS_ACC1(0, 1, 0)] + xvel0[OPS_ACC1(0, 0, 1)] + xvel0[OPS_ACC1(0, 1, 1)] + xvel0[OPS_ACC1(0, 0, 0)] + xvel0[OPS_ACC1(0, 1, 0)] + xvel0[OPS_ACC1(0, 0, 1)] + xvel0[OPS_ACC1(0, 1, 1)])) * 0.125 * dt * 0.5; right_flux = (xarea[OPS_ACC0(1, 0, 0)] * (xvel0[OPS_ACC1(1, 0, 0)] + xvel0[OPS_ACC1(1, 1, 0)] + xvel0[OPS_ACC1(1, 0, 1)] + xvel0[OPS_ACC1(1, 1, 1)] + xvel0[OPS_ACC1(1, 0, 0)] + xvel0[OPS_ACC1(1, 1, 0)] + xvel0[OPS_ACC1(1, 0, 1)] + xvel0[OPS_ACC1(1, 1, 1)])) * 0.125 * dt * 0.5; bottom_flux = (yarea[OPS_ACC2(0, 0, 0)] * (yvel0[OPS_ACC3(0, 0, 0)] + yvel0[OPS_ACC3(1, 0, 0)] + yvel0[OPS_ACC3(0, 0, 1)] + yvel0[OPS_ACC3(1, 0, 1)] + yvel0[OPS_ACC3(0, 0, 0)] + yvel0[OPS_ACC3(1, 0, 0)] +
xdim5_reset_field_kernel2 * (y) + \
   xdim5_reset_field_kernel2 * ydim5_reset_field_kernel2 * (z))
// user function
// MPI/OpenMP wrapper for reset_field_kernel2 (3D): copies all three
// end-of-step velocity fields back to the start-of-step fields
// (xvel1->xvel0, yvel1->yvel0, zvel1->zvel0) over the full local range,
// parallelised over the outer z loop. The OPS_ACC0..5 macros (partially
// visible above; defined earlier in the file) flatten (n_x,n_y,n_z) plus a
// stencil offset into a linear index per dat.
void reset_field_kernel2_c_wrapper(double *restrict xvel0,
                                   const double *restrict xvel1,
                                   double *restrict yvel0,
                                   const double *restrict yvel1,
                                   double *restrict zvel0,
                                   const double *restrict zvel1, int x_size,
                                   int y_size, int z_size) {
#pragma omp parallel for
  for (int n_z = 0; n_z < z_size; n_z++) {
    for (int n_y = 0; n_y < y_size; n_y++) {
      for (int n_x = 0; n_x < x_size; n_x++) {
        xvel0[OPS_ACC0(0, 0, 0)] = xvel1[OPS_ACC1(0, 0, 0)];
        yvel0[OPS_ACC2(0, 0, 0)] = yvel1[OPS_ACC3(0, 0, 0)];
        zvel0[OPS_ACC4(0, 0, 0)] = zvel1[OPS_ACC5(0, 0, 0)];
      }
    }
  }
}
#undef OPS_ACC0
#undef OPS_ACC1
#undef OPS_ACC2
#undef OPS_ACC3
#undef OPS_ACC4
#undef OPS_ACC5
ydim4_advec_mom_kernel_post_pre_advec_z * (z))
// user function
// MPI/OpenMP wrapper for advec_mom_kernel_post_pre_advec_z: forms
// node_mass_post as a 0.125-weighted 8-corner average of density1*post_vol
// and node_mass_pre from the z-direction node_flux difference, over the full
// local range, parallelised over the outer z loop. OPS_ACC0..4 are the
// per-dat index macros defined immediately above (tail visible on the first
// line of this span).
void advec_mom_kernel_post_pre_advec_z_c_wrapper(
    double *restrict node_mass_post, const double *restrict post_vol,
    const double *restrict density1, double *restrict node_mass_pre,
    const double *restrict node_flux, int x_size, int y_size, int z_size) {
#pragma omp parallel for
  for (int n_z = 0; n_z < z_size; n_z++) {
    for (int n_y = 0; n_y < y_size; n_y++) {
      for (int n_x = 0; n_x < x_size; n_x++) {
        node_mass_post[OPS_ACC0(0, 0, 0)] =
            0.125 *
            (density1[OPS_ACC2(0, -1, 0)] * post_vol[OPS_ACC1(0, -1, 0)] +
             density1[OPS_ACC2(0, 0, 0)] * post_vol[OPS_ACC1(0, 0, 0)] +
             density1[OPS_ACC2(-1, -1, 0)] * post_vol[OPS_ACC1(-1, -1, 0)] +
             density1[OPS_ACC2(-1, 0, 0)] * post_vol[OPS_ACC1(-1, 0, 0)] +
             density1[OPS_ACC2(0, -1, -1)] * post_vol[OPS_ACC1(0, -1, -1)] +
             density1[OPS_ACC2(0, 0, -1)] * post_vol[OPS_ACC1(0, 0, -1)] +
             density1[OPS_ACC2(-1, -1, -1)] * post_vol[OPS_ACC1(-1, -1, -1)] +
             density1[OPS_ACC2(-1, 0, -1)] * post_vol[OPS_ACC1(-1, 0, -1)]);
        node_mass_pre[OPS_ACC3(0, 0, 0)] = node_mass_post[OPS_ACC0(0, 0, 0)] -
                                           node_flux[OPS_ACC4(0, 0, -1)] +
                                           node_flux[OPS_ACC4(0, 0, 0)];
      }
    }
  }
}
// Halo update for the y-direction fluxes ("a"/lower face): each halo point
// takes the negated value of the point four cells inward in +y. Each flux
// field is updated only when its flag in `fields` is set.
inline void update_halo_kernel4_minus_4_a(double *vol_flux_y,
                                          double *mass_flux_y,
                                          const int *fields) {
  if (fields[FIELD_VOL_FLUX_Y] == 1) {
    vol_flux_y[OPS_ACC0(0, 0, 0)] = -(vol_flux_y[OPS_ACC0(0, 4, 0)]);
  }
  if (fields[FIELD_MASS_FLUX_Y] == 1) {
    mass_flux_y[OPS_ACC1(0, 0, 0)] = -(mass_flux_y[OPS_ACC1(0, 4, 0)]);
  }
}
int xdim3_revert_kernel;
// Per-dat 2D index macros: flatten (n_x, n_y) plus a stencil offset (x, y)
// into a linear index using each dat's x-dimension pitch.
// NOTE(review): xdim0/xdim1/xdim2_revert_kernel appear to be declared
// earlier in the file (only xdim3 is declared here) — confirm.
#define OPS_ACC0(x, y)                                                         \
  (n_x * 1 + n_y * xdim0_revert_kernel * 1 + x + xdim0_revert_kernel * (y))
#define OPS_ACC1(x, y)                                                         \
  (n_x * 1 + n_y * xdim1_revert_kernel * 1 + x + xdim1_revert_kernel * (y))
#define OPS_ACC2(x, y)                                                         \
  (n_x * 1 + n_y * xdim2_revert_kernel * 1 + x + xdim2_revert_kernel * (y))
#define OPS_ACC3(x, y)                                                         \
  (n_x * 1 + n_y * xdim3_revert_kernel * 1 + x + xdim3_revert_kernel * (y))

// user function
// Revert step: copies the start-of-step density/energy fields over the
// updated ones (density0 -> density1, energy0 -> energy1) across the full
// local 2D range, parallelised over rows.
void revert_kernel_c_wrapper(const double *restrict density0,
                             double *restrict density1,
                             const double *restrict energy0,
                             double *restrict energy1, int x_size, int y_size) {
#pragma omp parallel for
  for (int n_y = 0; n_y < y_size; n_y++) {
    for (int n_x = 0; n_x < x_size; n_x++) {
      density1[OPS_ACC1(0, 0)] = density0[OPS_ACC0(0, 0)];
      energy1[OPS_ACC3(0, 0)] = energy0[OPS_ACC2(0, 0)];
    }
  }
}
#undef OPS_ACC0
#undef OPS_ACC1
#undef OPS_ACC2
#undef OPS_ACC3
// host stub function
// OPS-generated host stub for parallel loop "advec_mom_kernel1_z_nonvector"
// (kernel index 137). Unpacks the descriptor, derives the local range, sets
// up base-offset data pointers, then computes the z-direction momentum
// advection flux per point: an upwind/donor-cell selection based on the
// sign of node_flux, a van-Leer-style limited velocity difference, and
// mom_flux = advected velocity * node_flux. Timing / transfer stats go to
// OPS_kernels[137] when OPS_diags > 1.
void ops_par_loop_advec_mom_kernel1_z_nonvector_execute(
    ops_kernel_descriptor *desc) {
  ops_block block = desc->block;
  int dim = desc->dim;
  int *range = desc->range;
  ops_arg arg0 = desc->args[0];
  ops_arg arg1 = desc->args[1];
  ops_arg arg2 = desc->args[2];
  ops_arg arg3 = desc->args[3];
  ops_arg arg4 = desc->args[4];

  // Timing
  double t1, t2, c1, c2;

  ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4};

#ifdef CHECKPOINTING
  if (!ops_checkpointing_before(args, 5, range, 137))
    return;
#endif

  if (OPS_diags > 1) {
    OPS_kernels[137].count++;
    ops_timers_core(&c2, &t2);
  }

  // compute locally allocated range for the sub-block
  int start[3];
  int end[3];
  for (int n = 0; n < 3; n++) {
    start[n] = range[2 * n];
    end[n] = range[2 * n + 1];
  }

#ifdef OPS_DEBUG
  ops_register_args(args, "advec_mom_kernel1_z_nonvector");
#endif

  // set up initial pointers and exchange halos if necessary
  int base0 = args[0].dat->base_offset;
  const double *__restrict__ node_flux = (double *)(args[0].data + base0);

  int base1 = args[1].dat->base_offset;
  const double *__restrict__ node_mass_pre = (double *)(args[1].data + base1);

  int base2 = args[2].dat->base_offset;
  double *__restrict__ mom_flux = (double *)(args[2].data + base2);

  int base3 = args[3].dat->base_offset;
  const double *__restrict__ celldz = (double *)(args[3].data + base3);

  int base4 = args[4].dat->base_offset;
  const double *__restrict__ vel1 = (double *)(args[4].data + base4);

  // initialize global variable with the dimension of dats
  int xdim0_advec_mom_kernel1_z_nonvector = args[0].dat->size[0];
  int ydim0_advec_mom_kernel1_z_nonvector = args[0].dat->size[1];
  int xdim1_advec_mom_kernel1_z_nonvector = args[1].dat->size[0];
  int ydim1_advec_mom_kernel1_z_nonvector = args[1].dat->size[1];
  int xdim2_advec_mom_kernel1_z_nonvector = args[2].dat->size[0];
  int ydim2_advec_mom_kernel1_z_nonvector = args[2].dat->size[1];
  int xdim3_advec_mom_kernel1_z_nonvector = args[3].dat->size[0];
  int ydim3_advec_mom_kernel1_z_nonvector = args[3].dat->size[1];
  int xdim4_advec_mom_kernel1_z_nonvector = args[4].dat->size[0];
  int ydim4_advec_mom_kernel1_z_nonvector = args[4].dat->size[1];

  if (OPS_diags > 1) {
    ops_timers_core(&c1, &t1);
    OPS_kernels[137].mpi_time += t1 - t2;
  }

#pragma omp parallel for collapse(2)
  for (int n_z = start[2]; n_z < end[2]; n_z++) {
    for (int n_y = start[1]; n_y < end[1]; n_y++) {
#ifdef intel
#pragma loop_count(10000)
#pragma omp simd aligned(node_flux, node_mass_pre, mom_flux, celldz, vel1)
#else
#pragma simd
#endif
      for (int n_x = start[0]; n_x < end[0]; n_x++) {
        double sigma, wind, width;
        double vdiffuw, vdiffdw, auw, adw, limiter;
        int upwind, donor, downwind, dif;
        double advec_vel_temp;

        // Select upwind/donor/downwind cells in z from the flux direction.
        if ((node_flux[OPS_ACC0(0, 0, 0)]) < 0.0) {
          upwind = 2;
          donor = 1;
          downwind = 0;
          dif = donor;
        } else {
          upwind = -1;
          donor = 0;
          downwind = 1;
          dif = upwind;
        }

        // CFL-like ratio of advected mass to donor-cell mass.
        sigma = fabs(node_flux[OPS_ACC0(0, 0, 0)]) /
                node_mass_pre[OPS_ACC1(0, 0, donor)];
        width = celldz[OPS_ACC3(0, 0, 0)];
        vdiffuw = vel1[OPS_ACC4(0, 0, donor)] - vel1[OPS_ACC4(0, 0, upwind)];
        vdiffdw = vel1[OPS_ACC4(0, 0, downwind)] - vel1[OPS_ACC4(0, 0, donor)];
        limiter = 0.0;
        // Apply the limiter only when both one-sided differences agree in
        // sign (monotone region).
        if (vdiffuw * vdiffdw > 0.0) {
          auw = fabs(vdiffuw);
          adw = fabs(vdiffdw);
          wind = 1.0;
          if (vdiffdw <= 0.0)
            wind = -1.0;
          limiter =
              wind *
              MIN(width * ((2.0 - sigma) * adw / width +
                           (1.0 + sigma) * auw / celldz[OPS_ACC3(0, 0, dif)]) /
                      6.0,
                  MIN(auw, adw));
        }
        advec_vel_temp = vel1[OPS_ACC4(0, 0, donor)] + (1.0 - sigma) * limiter;
        mom_flux[OPS_ACC2(0, 0, 0)] =
            advec_vel_temp * node_flux[OPS_ACC0(0, 0, 0)];
      }
    }
  }
  if (OPS_diags > 1) {
    ops_timers_core(&c2, &t2);
    OPS_kernels[137].time += t2 - t1;
  }

  if (OPS_diags > 1) {
    // Update kernel record
    ops_timers_core(&c1, &t1);
    OPS_kernels[137].mpi_time += t1 - t2;
    OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg0);
    OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg1);
    OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg2);
    OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg3);
    OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg4);
  }
}
// Halo update for y-velocity on the bottom face: each halo point takes the
// negated value of the point four cells inward in +y. Each field is updated
// only when its flag in `fields` is set.
inline void update_halo_kernel2_yvel_minus_4_bot(double *yvel0, double *yvel1,
                                                 const int *fields) {
  if (fields[FIELD_YVEL0] == 1) {
    yvel0[OPS_ACC0(0, 0, 0)] = -yvel0[OPS_ACC0(0, 4, 0)];
  }
  if (fields[FIELD_YVEL1] == 1) {
    yvel1[OPS_ACC1(0, 0, 0)] = -yvel1[OPS_ACC1(0, 4, 0)];
  }
}
// Halo update for y-velocity on the front face: each halo point copies the
// value two cells inward in z (no sign flip — velocity is tangential to this
// face). Each field is updated only when its flag in `fields` is set.
inline void update_halo_kernel2_yvel_plus_2_front(double *yvel0, double *yvel1,
                                                  const int *fields) {
  if (fields[FIELD_YVEL0] == 1) {
    yvel0[OPS_ACC0(0, 0, 0)] = yvel0[OPS_ACC0(0, 0, -2)];
  }
  if (fields[FIELD_YVEL1] == 1) {
    yvel1[OPS_ACC1(0, 0, 0)] = yvel1[OPS_ACC1(0, 0, -2)];
  }
}
// user function
// AXPY update at a single point: u <- u + alpha * p, where alpha is a
// scalar passed by pointer (OPS "gbl" argument convention).
inline void tea_leaf_axpy_kernel(double *u, const double *p,
                                 const double *alpha) {
  u[OPS_ACC0(0, 0)] = u[OPS_ACC0(0, 0)] + (*alpha) * p[OPS_ACC1(0, 0)];
}
// Halo update for x-velocity on the top face: each halo point copies the
// value two cells inward in -y (no sign flip — velocity is tangential to
// this face). Each field is updated only when its flag in `fields` is set.
inline void update_halo_kernel2_xvel_plus_2_top(double *xvel0, double *xvel1,
                                                const int *fields) {
  if (fields[FIELD_XVEL0] == 1) {
    xvel0[OPS_ACC0(0, 0, 0)] = xvel0[OPS_ACC0(0, -2, 0)];
  }
  if (fields[FIELD_XVEL1] == 1) {
    xvel1[OPS_ACC1(0, 0, 0)] = xvel1[OPS_ACC1(0, -2, 0)];
  }
}
// Halo update for the y-direction fluxes on the back face: each halo point
// copies the value two cells inward in +z (no sign flip). Each flux field is
// updated only when its flag in `fields` is set.
inline void update_halo_kernel4_plus_2_back(double *vol_flux_y,
                                            double *mass_flux_y,
                                            const int *fields) {
  if (fields[FIELD_VOL_FLUX_Y] == 1) {
    vol_flux_y[OPS_ACC0(0, 0, 0)] = vol_flux_y[OPS_ACC0(0, 0, 2)];
  }
  if (fields[FIELD_MASS_FLUX_Y] == 1) {
    mass_flux_y[OPS_ACC1(0, 0, 0)] = mass_flux_y[OPS_ACC1(0, 0, 2)];
  }
}
// user function
// 2D chunk initialiser. Seeds the point with state 0's energy/density and
// velocities, then for each additional state overwrites values when the
// point matches the state's geometry: rectangle (g_rect), circle (g_circ),
// or exact vertex match (g_point). Two flags are computed per state:
// is_in  — any of the four surrounding vertex offsets (i1,j1 in {-1,0})
//          matches, gating the velocity update;
// is_in2 — the cell itself matches, gating the energy/density update.
// Reads the globals `states` and `number_of_states` (declared elsewhere).
inline void generate_chunk_kernel(const double *vertexx, const double *vertexy,
                                  double *energy0, double *density0,
                                  double *xvel0, double *yvel0,
                                  const double *cellx, const double *celly) {
  double radius, x_cent, y_cent;
  int is_in = 0;
  int is_in2 = 0;

  // State 0 provides the background values.
  energy0[OPS_ACC2(0, 0)] = states[0].energy;
  density0[OPS_ACC3(0, 0)] = states[0].density;
  xvel0[OPS_ACC4(0, 0)] = states[0].xvel;
  yvel0[OPS_ACC5(0, 0)] = states[0].yvel;

  for (int i = 1; i < number_of_states; i++) {
    x_cent = states[i].xmin;
    y_cent = states[i].ymin;
    is_in = 0;
    is_in2 = 0;

    if (states[i].geometry == g_rect) {
      for (int i1 = -1; i1 <= 0; i1++) {
        for (int j1 = -1; j1 <= 0; j1++) {
          if (vertexx[OPS_ACC0(1 + i1, 0)] >= states[i].xmin &&
              vertexx[OPS_ACC0(0 + i1, 0)] < states[i].xmax) {
            if (vertexy[OPS_ACC1(0, 1 + j1)] >= states[i].ymin &&
                vertexy[OPS_ACC1(0, 0 + j1)] < states[i].ymax) {
              is_in = 1;
            }
          }
        }
      }
      if (vertexx[OPS_ACC0(1, 0)] >= states[i].xmin &&
          vertexx[OPS_ACC0(0, 0)] < states[i].xmax) {
        if (vertexy[OPS_ACC1(0, 1)] >= states[i].ymin &&
            vertexy[OPS_ACC1(0, 0)] < states[i].ymax) {
          is_in2 = 1;
        }
      }
      if (is_in2) {
        energy0[OPS_ACC2(0, 0)] = states[i].energy;
        density0[OPS_ACC3(0, 0)] = states[i].density;
      }
      if (is_in) {
        xvel0[OPS_ACC4(0, 0)] = states[i].xvel;
        yvel0[OPS_ACC5(0, 0)] = states[i].yvel;
      }
    } else if (states[i].geometry == g_circ) {
      for (int i1 = -1; i1 <= 0; i1++) {
        for (int j1 = -1; j1 <= 0; j1++) {
          radius = sqrt((cellx[OPS_ACC6(i1, 0)] - x_cent) *
                            (cellx[OPS_ACC6(i1, 0)] - x_cent) +
                        (celly[OPS_ACC7(0, j1)] - y_cent) *
                            (celly[OPS_ACC7(0, j1)] - y_cent));
          if (radius <= states[i].radius) {
            is_in = 1;
          }
        }
      }
      // NOTE(review): this test reuses `radius` from the final loop
      // iteration (i1 = 0, j1 = 0), i.e. the cell's own centre — appears
      // intentional in the generated code, but confirm against upstream.
      if (radius <= states[i].radius)
        is_in2 = 1;
      if (is_in2) {
        energy0[OPS_ACC2(0, 0)] = states[i].energy;
        density0[OPS_ACC3(0, 0)] = states[i].density;
      }
      if (is_in) {
        xvel0[OPS_ACC4(0, 0)] = states[i].xvel;
        yvel0[OPS_ACC5(0, 0)] = states[i].yvel;
      }
    } else if (states[i].geometry == g_point) {
      for (int i1 = -1; i1 <= 0; i1++) {
        for (int j1 = -1; j1 <= 0; j1++) {
          if (vertexx[OPS_ACC0(i1, 0)] == x_cent &&
              vertexy[OPS_ACC1(0, j1)] == y_cent) {
            is_in = 1;
          }
        }
      }
      if (vertexx[OPS_ACC0(0, 0)] == x_cent &&
          vertexy[OPS_ACC1(0, 0)] == y_cent)
        is_in2 = 1;
      if (is_in2) {
        energy0[OPS_ACC2(0, 0)] = states[i].energy;
        density0[OPS_ACC3(0, 0)] = states[i].density;
      }
      if (is_in) {
        xvel0[OPS_ACC4(0, 0)] = states[i].xvel;
        yvel0[OPS_ACC5(0, 0)] = states[i].yvel;
      }
    }
  }
}
// Halo update for the x-direction fluxes on the front face: each halo point
// copies the value four cells inward in z (no sign flip). Each flux field is
// updated only when its flag in `fields` is set.
inline void update_halo_kernel3_plus_4_front(double *vol_flux_x,
                                             double *mass_flux_x,
                                             const int *fields) {
  if (fields[FIELD_VOL_FLUX_X] == 1) {
    vol_flux_x[OPS_ACC0(0, 0, 0)] = vol_flux_x[OPS_ACC0(0, 0, -4)];
  }
  if (fields[FIELD_MASS_FLUX_X] == 1) {
    mass_flux_x[OPS_ACC1(0, 0, 0)] = mass_flux_x[OPS_ACC1(0, 0, -4)];
  }
}
// Second-order y-direction cell advection fluxes with a van-Leer-style
// limiter: selects upwind/donor/downwind cells from the sign of vol_flux_y
// (with a boundary clamp using the global `field.y_max`), limits the density
// and energy differences, and writes mass_flux_y and ener_flux.
// Statement order matters: mass_flux_y is written before sigmam reads it.
inline void advec_cell_kernel3_ydir(const double *vol_flux_y,
                                    const double *pre_vol, const int *yy,
                                    const double *vertexdy,
                                    const double *density1,
                                    const double *energy1, double *mass_flux_y,
                                    double *ener_flux) {
  double sigmat, sigmav, sigmam, sigma3, sigma4;
  double diffuw, diffdw, limiter;
  double one_by_six = 1.0 / 6.0;

  int y_max = field.y_max;

  int upwind, donor, downwind, dif;

  // Donor-cell selection from the flux direction; the third branch clamps
  // the stencil at the upper y boundary.
  if (vol_flux_y[OPS_ACC0(0, 0)] > 0.0) {
    upwind = -2;
    donor = -1;
    downwind = 0;
    dif = donor;
  } else if (yy[OPS_ACC2(0, 1)] < y_max + 2 - 2) {
    upwind = 1;
    donor = 0;
    downwind = -1;
    dif = upwind;
  } else {
    upwind = 0;
    donor = 0;
    downwind = -1;
    dif = upwind;
  }

  // Fraction of the donor cell's volume advected this step.
  sigmat = fabs(vol_flux_y[OPS_ACC0(0, 0)]) / pre_vol[OPS_ACC1(0, donor)];
  sigma3 = (1.0 + sigmat) * (vertexdy[OPS_ACC3(0, 0)] /
                             vertexdy[OPS_ACC3(0, dif)]);
  sigma4 = 2.0 - sigmat;

  sigmav = sigmat;

  // Limited density difference -> mass flux.
  diffuw = density1[OPS_ACC4(0, donor)] - density1[OPS_ACC4(0, upwind)];
  diffdw = density1[OPS_ACC4(0, downwind)] - density1[OPS_ACC4(0, donor)];
  if ((diffuw * diffdw) > 0.0)
    limiter = (1.0 - sigmav) * SIGN(1.0, diffdw) *
              MIN(MIN(fabs(diffuw), fabs(diffdw)),
                  one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw)));
  else
    limiter = 0.0;
  mass_flux_y[OPS_ACC6(0, 0)] =
      (vol_flux_y[OPS_ACC0(0, 0)]) * (density1[OPS_ACC4(0, donor)] + limiter);

  // Mass fraction for the energy limiter (uses the mass flux just written).
  sigmam = fabs(mass_flux_y[OPS_ACC6(0, 0)]) /
           (density1[OPS_ACC4(0, donor)] * pre_vol[OPS_ACC1(0, donor)]);
  diffuw = energy1[OPS_ACC5(0, donor)] - energy1[OPS_ACC5(0, upwind)];
  diffdw = energy1[OPS_ACC5(0, downwind)] - energy1[OPS_ACC5(0, donor)];
  if ((diffuw * diffdw) > 0.0)
    limiter = (1.0 - sigmam) * SIGN(1.0, diffdw) *
              MIN(MIN(fabs(diffuw), fabs(diffdw)),
                  one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw)));
  else
    limiter = 0.0;

  ener_flux[OPS_ACC7(0, 0)] =
      mass_flux_y[OPS_ACC6(0, 0)] * (energy1[OPS_ACC5(0, donor)] + limiter);
}
// host stub function
// OPS-generated host stub for parallel loop "ideal_gas_kernel" (kernel
// index 8, 2D). Computes pressure and sound speed from density and energy
// using a hard-coded gamma of 1.4 (ideal-gas EOS). Timing / transfer stats
// go to OPS_kernels[8] when OPS_diags > 1.
void ops_par_loop_ideal_gas_kernel_execute(ops_kernel_descriptor *desc) {
  ops_block block = desc->block;
  int dim = desc->dim;
  int *range = desc->range;
  ops_arg arg0 = desc->args[0];
  ops_arg arg1 = desc->args[1];
  ops_arg arg2 = desc->args[2];
  ops_arg arg3 = desc->args[3];

  // Timing
  double t1, t2, c1, c2;

  ops_arg args[4] = {arg0, arg1, arg2, arg3};

#ifdef CHECKPOINTING
  if (!ops_checkpointing_before(args, 4, range, 8))
    return;
#endif

  if (OPS_diags > 1) {
    OPS_kernels[8].count++;
    ops_timers_core(&c2, &t2);
  }

  // compute locally allocated range for the sub-block
  int start[2];
  int end[2];
  for (int n = 0; n < 2; n++) {
    start[n] = range[2 * n];
    end[n] = range[2 * n + 1];
  }

#ifdef OPS_DEBUG
  ops_register_args(args, "ideal_gas_kernel");
#endif

  // set up initial pointers and exchange halos if necessary
  int base0 = args[0].dat->base_offset;
  const double *__restrict__ density = (double *)(args[0].data + base0);

  int base1 = args[1].dat->base_offset;
  const double *__restrict__ energy = (double *)(args[1].data + base1);

  int base2 = args[2].dat->base_offset;
  double *__restrict__ pressure = (double *)(args[2].data + base2);

  int base3 = args[3].dat->base_offset;
  double *__restrict__ soundspeed = (double *)(args[3].data + base3);

  // initialize global variable with the dimension of dats
  int xdim0_ideal_gas_kernel = args[0].dat->size[0];
  int xdim1_ideal_gas_kernel = args[1].dat->size[0];
  int xdim2_ideal_gas_kernel = args[2].dat->size[0];
  int xdim3_ideal_gas_kernel = args[3].dat->size[0];

  if (OPS_diags > 1) {
    ops_timers_core(&c1, &t1);
    OPS_kernels[8].mpi_time += t1 - t2;
  }

#pragma omp parallel for
  for (int n_y = start[1]; n_y < end[1]; n_y++) {
#ifdef intel
#pragma loop_count(10000)
#pragma omp simd aligned(density, energy, pressure, soundspeed)
#else
#pragma simd
#endif
    for (int n_x = start[0]; n_x < end[0]; n_x++) {
      double sound_speed_squared, v, pressurebyenergy, pressurebyvolume;
      v = 1.0 / density[OPS_ACC0(0, 0)]; // specific volume
      // p = (gamma - 1) * rho * e, with gamma = 1.4
      pressure[OPS_ACC2(0, 0)] =
          (1.4 - 1.0) * density[OPS_ACC0(0, 0)] * energy[OPS_ACC1(0, 0)];
      pressurebyenergy = (1.4 - 1.0) * density[OPS_ACC0(0, 0)];
      pressurebyvolume =
          -1 * density[OPS_ACC0(0, 0)] * pressure[OPS_ACC2(0, 0)];
      sound_speed_squared =
          v * v *
          (pressure[OPS_ACC2(0, 0)] * pressurebyenergy - pressurebyvolume);
      soundspeed[OPS_ACC3(0, 0)] = sqrt(sound_speed_squared);
    }
  }
  if (OPS_diags > 1) {
    ops_timers_core(&c2, &t2);
    OPS_kernels[8].time += t2 - t1;
  }

  if (OPS_diags > 1) {
    // Update kernel record
    ops_timers_core(&c1, &t1);
    OPS_kernels[8].mpi_time += t1 - t2;
    OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0);
    OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1);
    OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2);
    OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg3);
  }
}
const double *restrict mass_flux_y, const double *restrict vol_flux_y, const double *restrict pre_vol, const double *restrict post_vol, double *restrict pre_mass, double *restrict post_mass, double *restrict advec_vol, double *restrict post_ener, const double *restrict ener_flux, int x_size, int y_size) { #pragma omp parallel for for (int n_y = 0; n_y < y_size; n_y++) { for (int n_x = 0; n_x < x_size; n_x++) { pre_mass[OPS_ACC6(0, 0)] = density1[OPS_ACC0(0, 0)] * pre_vol[OPS_ACC4(0, 0)]; post_mass[OPS_ACC7(0, 0)] = pre_mass[OPS_ACC6(0, 0)] + mass_flux_y[OPS_ACC2(0, 0)] - mass_flux_y[OPS_ACC2(0, 1)]; post_ener[OPS_ACC9(0, 0)] = (energy1[OPS_ACC1(0, 0)] * pre_mass[OPS_ACC6(0, 0)] + ener_flux[OPS_ACC10(0, 0)] - ener_flux[OPS_ACC10(0, 1)]) / post_mass[OPS_ACC7(0, 0)]; advec_vol[OPS_ACC8(0, 0)] = pre_vol[OPS_ACC4(0, 0)] + vol_flux_y[OPS_ACC3(0, 0)] - vol_flux_y[OPS_ACC3(0, 1)]; density1[OPS_ACC0(0, 0)] = post_mass[OPS_ACC7(0, 0)] / advec_vol[OPS_ACC8(0, 0)]; energy1[OPS_ACC1(0, 0)] = post_ener[OPS_ACC9(0, 0)]; } } } #undef OPS_ACC0 #undef OPS_ACC1 #undef OPS_ACC2 #undef OPS_ACC3
// user function
// Accumulates the squared pointwise error between the computed solution `u`
// and the reference `ref` into the reduction variable `err`.
inline void poisson_kernel_error(const double *u, const double *ref,
                                 double *err) {
  double diff = u[OPS_ACC0(0, 0)] - ref[OPS_ACC1(0, 0)];
  *err = *err + diff * diff;
}
// user function
// CG residual update with fused reduction: r <- r - alpha * w, then the new
// residual's square is accumulated into `rnn` (for the r.r dot product).
inline void tea_leaf_cg_calc_ur_r_reduce_kernel(double *r, const double *w,
                                                const double *alpha,
                                                double *rnn) {
  double r_new = r[OPS_ACC0(0, 0)] - (*alpha) * w[OPS_ACC1(0, 0)];
  r[OPS_ACC0(0, 0)] = r_new;
  *rnn = *rnn + r_new * r_new;
}
// user function
// Copies the next-iterate field u2 into u at the current point.
inline void poisson_kernel_update(const double *u2, double *u) {
  u[OPS_ACC1(0, 0)] = u2[OPS_ACC0(0, 0)];
}
// Halo update for z-velocity on the back face: each halo point takes the
// negated value of the point two cells inward in +z. Each field is updated
// only when its flag in `fields` is set.
inline void update_halo_kernel2_zvel_minus_2_back(double *zvel0, double *zvel1,
                                                  const int *fields) {
  if (fields[FIELD_ZVEL0] == 1) {
    zvel0[OPS_ACC0(0, 0, 0)] = -zvel0[OPS_ACC0(0, 0, 2)];
  }
  if (fields[FIELD_ZVEL1] == 1) {
    zvel1[OPS_ACC1(0, 0, 0)] = -zvel1[OPS_ACC1(0, 0, 2)];
  }
}
n_z * xdim3_flux_calc_kernelx * ydim3_flux_calc_kernelx * 1 + x + \
   xdim3_flux_calc_kernelx * (y) + \
   xdim3_flux_calc_kernelx * ydim3_flux_calc_kernelx * (z))
// user function
// MPI/OpenMP wrapper for flux_calc_kernelx: computes the x-direction volume
// flux as 0.125 * dt * xarea * (sum of the four face-node velocities from
// xvel0 and xvel1), over the full local range, parallelised over the outer
// z loop. Reads the global timestep `dt` (declared elsewhere); OPS_ACC0..3
// are the per-dat index macros defined immediately above (tail visible on
// the first lines of this span).
void flux_calc_kernelx_c_wrapper(double *restrict vol_flux_x,
                                 const double *restrict xarea,
                                 const double *restrict xvel0,
                                 const double *restrict xvel1, int x_size,
                                 int y_size, int z_size) {
#pragma omp parallel for
  for (int n_z = 0; n_z < z_size; n_z++) {
    for (int n_y = 0; n_y < y_size; n_y++) {
      for (int n_x = 0; n_x < x_size; n_x++) {
        vol_flux_x[OPS_ACC0(0, 0, 0)] =
            0.125 * dt * (xarea[OPS_ACC1(0, 0, 0)]) *
            (xvel0[OPS_ACC2(0, 0, 0)] + xvel0[OPS_ACC2(0, 1, 0)] +
             xvel0[OPS_ACC2(0, 0, 1)] + xvel0[OPS_ACC2(0, 1, 1)] +
             xvel1[OPS_ACC3(0, 0, 0)] + xvel1[OPS_ACC3(0, 1, 0)] +
             xvel1[OPS_ACC3(0, 0, 1)] + xvel1[OPS_ACC3(0, 1, 1)]);
      }
    }
  }
}
#undef OPS_ACC0
#undef OPS_ACC1
#undef OPS_ACC2
#undef OPS_ACC3
// Final y-direction momentum advection update: the new velocity is the
// pre-advection momentum (vel1 * node_mass_pre) plus the net incoming
// momentum flux, divided by the post-advection nodal mass.
inline void advec_mom_kernel2_y(double *vel1, const double *node_mass_post,
                                const double *node_mass_pre,
                                const double *mom_flux) {
  double momentum = vel1[OPS_ACC0(0, 0, 0)] * node_mass_pre[OPS_ACC2(0, 0, 0)] +
                    mom_flux[OPS_ACC3(0, -1, 0)] - mom_flux[OPS_ACC3(0, 0, 0)];
  vel1[OPS_ACC0(0, 0, 0)] = momentum / node_mass_post[OPS_ACC1(0, 0, 0)];
}
// host stub function
// OPS-generated host stub for parallel loop "update_halo_kernel5_plus_4_a"
// (kernel index 84, 3D). Copies the z-direction fluxes from four cells
// inward in +y into the halo, per field, when the matching `fields` flag is
// set. Timing / transfer stats go to OPS_kernels[84] when OPS_diags > 1.
void ops_par_loop_update_halo_kernel5_plus_4_a_execute(
    ops_kernel_descriptor *desc) {
  ops_block block = desc->block;
  int dim = desc->dim;
  int *range = desc->range;
  ops_arg arg0 = desc->args[0];
  ops_arg arg1 = desc->args[1];
  ops_arg arg2 = desc->args[2];

  // Timing
  double t1, t2, c1, c2;

  ops_arg args[3] = {arg0, arg1, arg2};

#ifdef CHECKPOINTING
  if (!ops_checkpointing_before(args, 3, range, 84))
    return;
#endif

  if (OPS_diags > 1) {
    OPS_kernels[84].count++;
    ops_timers_core(&c2, &t2);
  }

  // compute locally allocated range for the sub-block
  int start[3];
  int end[3];
  for (int n = 0; n < 3; n++) {
    start[n] = range[2 * n];
    end[n] = range[2 * n + 1];
  }

#ifdef OPS_DEBUG
  ops_register_args(args, "update_halo_kernel5_plus_4_a");
#endif

  // set up initial pointers and exchange halos if necessary
  int base0 = args[0].dat->base_offset;
  double *__restrict__ vol_flux_z = (double *)(args[0].data + base0);

  int base1 = args[1].dat->base_offset;
  double *__restrict__ mass_flux_z = (double *)(args[1].data + base1);

  // arg2 is a global (gbl) argument: the per-field enable flags.
  const int *__restrict__ fields = (int *)args[2].data;

  // initialize global variable with the dimension of dats
  int xdim0_update_halo_kernel5_plus_4_a = args[0].dat->size[0];
  int ydim0_update_halo_kernel5_plus_4_a = args[0].dat->size[1];
  int xdim1_update_halo_kernel5_plus_4_a = args[1].dat->size[0];
  int ydim1_update_halo_kernel5_plus_4_a = args[1].dat->size[1];

  if (OPS_diags > 1) {
    ops_timers_core(&c1, &t1);
    OPS_kernels[84].mpi_time += t1 - t2;
  }

#pragma omp parallel for collapse(2)
  for (int n_z = start[2]; n_z < end[2]; n_z++) {
    for (int n_y = start[1]; n_y < end[1]; n_y++) {
#ifdef intel
#pragma loop_count(10000)
#pragma omp simd aligned(vol_flux_z, mass_flux_z)
#else
#pragma simd
#endif
      for (int n_x = start[0]; n_x < end[0]; n_x++) {
        if (fields[FIELD_VOL_FLUX_Z] == 1)
          vol_flux_z[OPS_ACC0(0, 0, 0)] = vol_flux_z[OPS_ACC0(0, 4, 0)];
        if (fields[FIELD_MASS_FLUX_Z] == 1)
          mass_flux_z[OPS_ACC1(0, 0, 0)] = mass_flux_z[OPS_ACC1(0, 4, 0)];
      }
    }
  }
  if (OPS_diags > 1) {
    ops_timers_core(&c2, &t2);
    OPS_kernels[84].time += t2 - t1;
  }

  if (OPS_diags > 1) {
    // Update kernel record
    ops_timers_core(&c1, &t1);
    OPS_kernels[84].mpi_time += t1 - t2;
    OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg0);
    OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg1);
  }
}
// host stub function
// OPS-generated host stub for parallel loop "initialise_chunk_kernel_cellx"
// (kernel index 12, 2D). Sets cellx to the midpoint of adjacent vertexx
// values and celldx to the uniform spacing derived from the global `grid`
// extents (declared elsewhere). Timing / transfer stats go to
// OPS_kernels[12] when OPS_diags > 1.
void ops_par_loop_initialise_chunk_kernel_cellx_execute(
    ops_kernel_descriptor *desc) {
  ops_block block = desc->block;
  int dim = desc->dim;
  int *range = desc->range;
  ops_arg arg0 = desc->args[0];
  ops_arg arg1 = desc->args[1];
  ops_arg arg2 = desc->args[2];

  // Timing
  double t1, t2, c1, c2;

  ops_arg args[3] = {arg0, arg1, arg2};

#ifdef CHECKPOINTING
  if (!ops_checkpointing_before(args, 3, range, 12))
    return;
#endif

  if (OPS_diags > 1) {
    OPS_kernels[12].count++;
    ops_timers_core(&c2, &t2);
  }

  // compute locally allocated range for the sub-block
  int start[2];
  int end[2];
  for (int n = 0; n < 2; n++) {
    start[n] = range[2 * n];
    end[n] = range[2 * n + 1];
  }

#ifdef OPS_DEBUG
  ops_register_args(args, "initialise_chunk_kernel_cellx");
#endif

  // set up initial pointers and exchange halos if necessary
  int base0 = args[0].dat->base_offset;
  const double *__restrict__ vertexx = (double *)(args[0].data + base0);

  int base1 = args[1].dat->base_offset;
  double *__restrict__ cellx = (double *)(args[1].data + base1);

  int base2 = args[2].dat->base_offset;
  double *__restrict__ celldx = (double *)(args[2].data + base2);

  // initialize global variable with the dimension of dats
  int xdim0_initialise_chunk_kernel_cellx = args[0].dat->size[0];
  int xdim1_initialise_chunk_kernel_cellx = args[1].dat->size[0];
  int xdim2_initialise_chunk_kernel_cellx = args[2].dat->size[0];

  if (OPS_diags > 1) {
    ops_timers_core(&c1, &t1);
    OPS_kernels[12].mpi_time += t1 - t2;
  }

#pragma omp parallel for
  for (int n_y = start[1]; n_y < end[1]; n_y++) {
#ifdef intel
#pragma loop_count(10000)
#pragma omp simd aligned(vertexx, cellx, celldx)
#else
#pragma simd
#endif
    for (int n_x = start[0]; n_x < end[0]; n_x++) {
      double d_x;
      // Uniform cell width from the global grid definition.
      d_x = (grid.xmax - grid.xmin) / (double)grid.x_cells;

      cellx[OPS_ACC1(0, 0)] =
          0.5 * (vertexx[OPS_ACC0(0, 0)] + vertexx[OPS_ACC0(1, 0)]);
      celldx[OPS_ACC2(0, 0)] = d_x;
    }
  }
  if (OPS_diags > 1) {
    ops_timers_core(&c2, &t2);
    OPS_kernels[12].time += t2 - t1;
  }

  if (OPS_diags > 1) {
    // Update kernel record
    ops_timers_core(&c1, &t1);
    OPS_kernels[12].mpi_time += t1 - t2;
    OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0);
    OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1);
    OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2);
  }
}