// Calculates w void cg_calc_w( const int x, const int y, const int halo_depth, double* pw, double* p, double* w, double* kx, double* ky) { double pw_temp = 0.0; #pragma omp target #pragma omp parallel for reduction(+:pw_temp) for(int jj = halo_depth; jj < y-halo_depth; ++jj) { for(int kk = halo_depth; kk < x-halo_depth; ++kk) { const int index = kk + jj*x; const double smvp = SMVP(p); w[index] = smvp; pw_temp += w[index]*p[index]; } } *pw += pw_temp; }
// Entry point for calculating w void ext_cg_calc_w_( const int* chunk, double* p, double* w, double* kx, double* ky, double* kz, double* pw) { START_PROFILING; double pwTemp = 0.0; #pragma omp target if(_chunk.is_offload) device(_chunk.device_id) #pragma omp parallel for reduction(+:pwTemp) for(int ii = HALO_PAD; ii < _chunk.z-HALO_PAD; ++ii) { for(int jj = HALO_PAD; jj < _chunk.y-HALO_PAD; ++jj) { for(int kk = HALO_PAD; kk < _chunk.x-HALO_PAD; ++kk) { const int index = ii*_chunk.x*_chunk.y+jj*_chunk.x+kk; const double smvp = SMVP(p); w[index] = smvp; pwTemp += w[index]*p[index]; } } } *pw = pwTemp; STOP_PROFILING(__func__); }
// Initialises the Chebyshev solver. void TeaLeafChunk::ChebyInit( double* alphas, double* betas, const double theta, const bool preconditionerOn) { preconditioner = preconditionerOn; this->alphas = alphas; this->betas = betas; START_PROFILING; #pragma omp parallel for for(int jj = HALO_PAD; jj < yCells-HALO_PAD; ++jj) { for(int kk = HALO_PAD; kk < xCells-HALO_PAD; ++kk) { const int index = jj*xCells+kk; const double smvp = SMVP(u); w[index] = smvp; r[index] = u0[index]-w[index]; p[index] = (preconditioner ? mi[index]*r[index] : r[index])/theta; } } STOP_PROFILING("Cheby Init"); ChebyCalcU(); }
// The main chebyshev iteration void cheby_iterate( const int x, const int y, const int halo_depth, double alpha, double beta, double* u, double* u0, double* p, double* r, double* w, double* kx, double* ky) { #pragma omp parallel for for(int jj = halo_depth; jj < y-halo_depth; ++jj) { for(int kk = halo_depth; kk < x-halo_depth; ++kk) { const int index = kk + jj*x; const double smvp = SMVP(u); w[index] = smvp; r[index] = u0[index]-w[index]; p[index] = alpha*p[index] + beta*r[index]; } } cheby_calc_u(x, y, halo_depth, u, p); }
KOKKOS_INLINE_FUNCTION void operator()(const int index) const { KOKKOS_INDICES; if(INDEX_IN_INNER_DOMAIN) { const double smvp = SMVP(sd); r[index] -= smvp; u[index] += sd[index]; } }
KOKKOS_INLINE_FUNCTION void operator()(const int index, value_type& rro) const { const int kk = index % x; const int jj = index / x; if(kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { const double smvp = SMVP(u); w(index) = smvp; r(index) = u(index)-w(index); p(index) = r(index); rro += r(index)*p(index); } }
// Entry point for the main PPCG step. void ext_ppcg_inner_( const int* chunk, double* u, double* r, double* kx, double* ky, double* sd, double* mi, double* alphas, double* betas, int* step, int* maxSteps) { START_PROFILING; double alpha = alphas[*step]; double beta = betas[*step]; #pragma omp target if(_chunk.is_offload) device(_chunk.device_id) #pragma omp parallel for for(int jj = HALO_PAD; jj < _chunk.y-HALO_PAD; ++jj) { for(int kk = HALO_PAD; kk < _chunk.x-HALO_PAD; ++kk) { const int index = jj*_chunk.x+kk; const double smvp = SMVP(sd); r[index] -= smvp; u[index] += sd[index]; } } #pragma omp target if(_chunk.is_offload) device(_chunk.device_id) #pragma omp parallel for for(int jj = HALO_PAD; jj < _chunk.y-HALO_PAD; ++jj) { for(int kk = HALO_PAD; kk < _chunk.x-HALO_PAD; ++kk) { const int index = jj*_chunk.x+kk; sd[index] = alpha*sd[index]+beta* r[index]; } } STOP_PROFILING(__func__); }
// The PPCG inner iteration void ppcg_inner_iteration( const int x, const int y, const int z, const int halo_depth, double alpha, double beta, double* vec_u, double* vec_r, double* vec_kx, double* vec_ky, double* vec_kz, double* vec_sd) { #pragma omp target teams distribute //#pragma omp parallel for for(int ii = halo_depth; ii < z-halo_depth; ++ii) { for(int jj = halo_depth; jj < y-halo_depth; ++jj) { for(int kk = halo_depth; kk < x-halo_depth; ++kk) { const int index = ii*x*y+jj*x+kk; const double smvp = SMVP(vec_sd); vec_r[index] -= smvp; vec_u[index] += vec_sd[index]; } } } #pragma omp target teams distribute //#pragma omp parallel for for(int ii = halo_depth; ii < z-halo_depth; ++ii) { for(int jj = halo_depth; jj < y-halo_depth; ++jj) { for(int kk = halo_depth; kk < x-halo_depth; ++kk) { const int index = ii*x*y+jj*x+kk; vec_sd[index] = alpha*vec_sd[index] + beta*vec_r[index]; } } } }
KOKKOS_INLINE_FUNCTION void operator()(const team_member& team, value_type& pw) const { double pw_team = 0.0; const int team_offset = (team.league_rank() + halo_depth)*y; Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, halo_depth, y-halo_depth), [&] (const int &j, double& pw_thread) { const int index = team_offset + j; const double smvp = SMVP(p); w(index) = smvp; pw_thread += smvp*p(index); }, pw_team); Kokkos::single(Kokkos::PerTeam(team), [&] () { pw += pw_team; }); }
// The main Chebyshev solver iteration. void TeaLeafChunk::ChebyIterate( const int step) { START_PROFILING; #pragma omp parallel for for(int jj = HALO_PAD; jj < yCells-HALO_PAD; ++jj) { for(int kk = HALO_PAD; kk < xCells-HALO_PAD; ++kk) { const int index = jj*xCells+kk; const double smvp = SMVP(u); w[index] = smvp; r[index] = u0[index]-w[index]; p[index] = alphas[step]*p[index] + betas[step] * (preconditioner ? mi[index]*r[index] : r[index]); } } STOP_PROFILING("Cheby Iterate"); ChebyCalcU(); }
// Entry point for calculating residual. void ext_calculate_residual_( const int* chunk, double* u, double* u0, double* r, double* kx, double* ky) { START_PROFILING; #pragma omp target device(_chunk.device_id) #pragma omp parallel for for(int jj = HALO_PAD; jj < _chunk.y-HALO_PAD; ++jj) { for(int kk = HALO_PAD; kk < _chunk.x-HALO_PAD; ++kk) { int index = jj*_chunk.x+kk; const double smvp = SMVP(u); r[index] = u0[index] - smvp; } } STOP_PROFILING(__func__); }
// Initialises the CG solver void cg_init( const int x, const int y, const int halo_depth, const int coefficient, double rx, double ry, double* rro, double* density, double* energy, double* u, double* p, double* r, double* w, double* kx, double* ky) { if(coefficient != CONDUCTIVITY && coefficient != RECIP_CONDUCTIVITY) { die(__LINE__, __FILE__, "Coefficient %d is not valid.\n", coefficient); } #pragma omp target #pragma omp parallel for for(int jj = 0; jj < y; ++jj) { for(int kk = 0; kk < x; ++kk) { const int index = kk + jj*x; p[index] = 0.0; r[index] = 0.0; u[index] = energy[index]*density[index]; } } #pragma omp target #pragma omp parallel for for(int jj = 1; jj < y-1; ++jj) { for(int kk = 1; kk < x-1; ++kk) { const int index = kk + jj*x; w[index] = (coefficient == CONDUCTIVITY) ? density[index] : 1.0/density[index]; } } #pragma omp target #pragma omp parallel for for(int jj = halo_depth; jj < y-1; ++jj) { for(int kk = halo_depth; kk < x-1; ++kk) { const int index = kk + jj*x; kx[index] = rx*(w[index-1]+w[index]) / (2.0*w[index-1]*w[index]); ky[index] = ry*(w[index-x]+w[index]) / (2.0*w[index-x]*w[index]); } } double rro_temp = 0.0; #pragma omp target #pragma omp parallel for reduction(+:rro_temp) for(int jj = halo_depth; jj < y-halo_depth; ++jj) { for(int kk = halo_depth; kk < x-halo_depth; ++kk) { const int index = kk + jj*x; const double smvp = SMVP(u); w[index] = smvp; r[index] = u[index]-w[index]; p[index] = r[index]; rro_temp += r[index]*p[index]; } } // Sum locally *rro += rro_temp; }
// Entry point for CG initialisation. void ext_cg_solver_init_( const int* chunk, double* density, double* energy, double* u, double* p, double* r, double* mi, double* w, double* z, double* kx, double* ky, double* kz, const int* coefficient, const int* preconditioner, double* dt, double* rx, double* ry, double* rz, double* rro) { START_PROFILING; if(*coefficient < CONDUCTIVITY && *coefficient < RECIP_CONDUCTIVITY) { panic(__LINE__, __FILE__, "Coefficient %d is not valid.\n", *coefficient); } #pragma omp target if(_chunk.is_offload) device(_chunk.device_id) #pragma omp parallel for for(int ii = 0; ii < _chunk.z; ++ii) { for(int jj = 0; jj < _chunk.y; ++jj) { for(int kk = 0; kk < _chunk.x; ++kk) { const int index = ii*_chunk.y*_chunk.x+jj*_chunk.x+kk; p[index] = 0.0; r[index] = 0.0; u[index] = energy[index]*density[index]; } } } #pragma omp target if(_chunk.is_offload) device(_chunk.device_id) #pragma omp parallel for for(int ii = 1; ii < _chunk.z-1; ++ii) { for(int jj = 1; jj < _chunk.y-1; ++jj) { for(int kk = 1; kk < _chunk.x-1; ++kk) { const int index = ii*_chunk.y*_chunk.x+jj*_chunk.x+kk; w[index] = (*coefficient == CONDUCTIVITY) ? density[index] : 1.0/density[index]; } } } #pragma omp target if(_chunk.is_offload) device(_chunk.device_id) #pragma omp parallel for for(int ii = HALO_PAD; ii < _chunk.z-1; ++ii) { for(int jj = HALO_PAD; jj < _chunk.y-1; ++jj) { for(int kk = HALO_PAD; kk < _chunk.x-1; ++kk) { const int index = ii*_chunk.y*_chunk.x+jj*_chunk.x+kk; kx[index] = *rx*(w[index-1]+w[index])/(2.0*w[index-1]*w[index]); ky[index] = *ry*(w[index-_chunk.x]+w[index])/(2.0*w[index-_chunk.x]*w[index]); kz[index] = *rz*(w[index-_chunk.page]+w[index])/(2.0*w[index-_chunk.page]*w[index]); } } } double rroTemp = 0.0; #pragma omp target if(_chunk.is_offload) device(_chunk.device_id) #pragma omp parallel for reduction(+:rroTemp) for(int ii = HALO_PAD; ii < _chunk.z-HALO_PAD; ++ii) { for(int jj = HALO_PAD; jj < _chunk.y-HALO_PAD; ++jj) { for(int kk = HALO_PAD; kk < _chunk.x-HALO_PAD; ++kk) { const int index = ii*_chunk.y*_chunk.x+jj*_chunk.x+kk; const double smvp = SMVP(u); w[index] = smvp; r[index] = u[index]-w[index]; p[index] = r[index]; rroTemp += r[index]*p[index]; } } } *rro = rroTemp; STOP_PROFILING(__func__); }