/** Check some common input. */ int Ewald::CheckInput(Box const& boxIn, int debugIn, double cutoffIn, double dsumTolIn, double ew_coeffIn, double lw_coeffIn, double switch_widthIn, double erfcTableDxIn, double skinnbIn) { debug_ = debugIn; cutoff_ = cutoffIn; dsumTol_ = dsumTolIn; ew_coeff_ = ew_coeffIn; lw_coeff_ = lw_coeffIn; switch_width_ = switch_widthIn; erfcTableDx_ = erfcTableDxIn; // Check input if (cutoff_ < Constants::SMALL) { mprinterr("Error: Direct space cutoff (%g) is too small.\n", cutoff_); return 1; } char dir[3] = {'X', 'Y', 'Z'}; for (int i = 0; i < 3; i++) { if (cutoff_ > boxIn[i]/2.0) { mprinterr("Error: Cutoff must be less than half the box length (%g > %g, %c)\n", cutoff_, boxIn[i]/2.0, dir[i]); return 1; } } if (skinnbIn < 0.0) { mprinterr("Error: skinnb is less than 0.0\n"); return 1; } if (switch_width_ < 0.0) switch_width_ = 0.0; if (switch_width_ > cutoff_) { mprinterr("Error: Switch width must be less than the cutoff.\n"); return 1; } // Set defaults if necessary if (dsumTol_ < Constants::SMALL) dsumTol_ = 1E-5; if (DABS(ew_coeff_) < Constants::SMALL) ew_coeff_ = FindEwaldCoefficient( cutoff_, dsumTol_ ); if (erfcTableDx_ <= 0.0) erfcTableDx_ = 1.0 / 5000; // TODO make this optional FillErfcTable( cutoff_, ew_coeff_ ); // TODO do for C6 as well // TODO for C6 correction term if (lw_coeff_ < 0.0) lw_coeff_ = 0.0; else if (DABS(lw_coeff_) < Constants::SMALL) lw_coeff_ = ew_coeff_; // Calculate some common factors. cut2_ = cutoff_ * cutoff_; double cut0 = cutoff_ - switch_width_; cut2_0_ = cut0 * cut0; return 0; }
// Raise *cournox / *cournoy to the largest value of c[s][i] + DABS(q[IU][s][i]) and
// c[s][i] + DABS(q[IV][s][i]) respectively, over s in [0, slices) and i in [0, Hnx).
// The incoming values of *cournox / *cournoy take part in the maximum.
//
// Hnxyt, Hnvar and Hstep are only used as the dimensions of c and q.
// tmpm1 / tmpm2 are only named in the OpenMP shared() clause and are never dereferenced here.
inline void courantOnXY(real_t *cournox, real_t *cournoy, const int Hnx, const int Hnxyt, const int Hnvar, const int slices, const int Hstep, real_t c[Hstep][Hnxyt], real_t q[Hnvar][Hstep][Hnxyt], real_t *tmpm1, real_t *tmpm2)
{
#ifdef WOMP
  // OpenMP build: reduce into two local maxima, then publish them once.
  int s, i;
  real_t maxOnX = *cournox;
  real_t maxOnY = *cournoy;

#pragma omp parallel for shared(tmpm1, tmpm2) private(s,i) reduction(max:maxOnX) reduction(max:maxOnY)
  for (s = 0; s < slices; s++) {
    for (i = 0; i < Hnx; i++) {
      maxOnX = MAX(maxOnX, c[s][i] + DABS(q[IU][s][i]));
      maxOnY = MAX(maxOnY, c[s][i] + DABS(q[IV][s][i]));
    }
  }

  *cournox = maxOnX;
  *cournoy = maxOnY;

  {
    int nops = (slices) * Hnx;
    FLOPS(2 * nops, 0 * nops, 2 * nops, 0 * nops);
  }
#else
  // Serial build: update the caller's maxima directly, cell by cell.
  int s, i;

  for (s = 0; s < slices; s++) {
    for (i = 0; i < Hnx; i++) {
      // Both candidates are formed before either output is touched.
      const real_t candX = c[s][i] + DABS(q[IU][s][i]);
      const real_t candY = c[s][i] + DABS(q[IV][s][i]);
      *cournox = MAX(*cournox, candX);
      *cournoy = MAX(*cournoy, candY);
    }
  }

  {
    int nops = (slices) * Hnx;
    FLOPS(2 * nops, 0 * nops, 5 * nops, 0 * nops);
  }
#endif
#undef IHVW
}
// Raise *cournox / *cournoy to the largest value of
//   c[IDXE(s,i)] + DABS(q[IDX(IU,s,i)])   and   c[IDXE(s,i)] + DABS(q[IDX(IV,s,i)])
// over s in [0, slices) and i in [0, Hnx). The incoming values of *cournox / *cournoy
// seed the reductions, so they take part in the maximum.
//
// c and q are flat arrays that must already be present on the accelerator
// (see the present() clause); their extents are Hstep*Hnxyt and Hnvar*Hstep*Hnxyt.
static void
courantOnXY (hydro_real_t *cournox, hydro_real_t *cournoy, const int Hnx, const int Hnxyt, const int Hnvar, const int slices, const int Hstep, hydro_real_t *c, hydro_real_t *q)
{
  int i, s;

  // Reduce into local scalars; the results are written back once after the region.
  hydro_real_t _cournox = *cournox;
  hydro_real_t _cournoy = *cournoy;

#pragma acc kernels present(q[0:Hnvar*Hstep*Hnxyt],c[0:Hstep*Hnxyt])
  {
#pragma acc loop independent reduction(max:_cournox) reduction(max:_cournoy) gang(128)
    for (s = 0; s < slices; s++) {
#pragma acc loop independent reduction(max:_cournox) reduction(max:_cournoy) worker(64)
      for (i = 0; i < Hnx; i++) {
        // Declared inside the loop body so each iteration owns its temporaries.
        // They used to be function-scope scalars written from loops marked
        // "independent", which relied on the compiler privatizing them.
        const hydro_real_t tmp1 = c[IDXE(s,i)] + DABS (q[IDX(IU,s,i)]);
        const hydro_real_t tmp2 = c[IDXE(s,i)] + DABS (q[IDX(IV,s,i)]);
        _cournox = MAX (_cournox, tmp1);
        _cournoy = MAX (_cournoy, tmp2);
      }
    }
  }

  *cournox = _cournox;
  *cournoy = _cournoy;
#undef IHVW
}
/** Complementary error function: 2/sqrt(PI) * SUM[exp(-t^2)*dt]
  * Original code: SANDER: erfcfun.F90
  *
  * Piecewise evaluation:
  *  - xIn > 26.0  : returns 0.0
  *  - xIn < -5.5  : returns 2.0
  *  - |xIn| <= 0.5: 1 - erf(x), with erf from a rational function in x^2
  *  - |xIn| < 4.0 : exp(-x^2) times a rational function in |x|
  *  - otherwise   : exp(-x^2) times a rational function in 1/x^2, divided by |x|
  * In the last two ranges a non-positive argument uses 2*exp(x^2) minus the
  * rational value before the exp(-x^2) scaling is applied.
  */
double Ewald::erfc_func(double xIn) {
  // Saturated tails need no arithmetic at all.
  if (xIn > 26.0) return 0.0;
  if (xIn < -5.5) return 2.0;

  const double absx = DABS( xIn );

  if (absx <= 0.5) {
    // Small argument: evaluate erf directly and subtract from one.
    const double xsq = xIn * xIn;
    const double numer = ((-0.356098437018154E-1*xsq+0.699638348861914E1)*xsq +
                          0.219792616182942E2) * xsq + 0.242667955230532E3;
    const double denom = ((xsq+0.150827976304078E2)*xsq+0.911649054045149E2)*xsq +
                         0.215058875869861E3;
    const double erf = xIn * numer/denom;
    return 1.0 - erf;
  }

  // Remaining ranges: compute the rational value that multiplies exp(-x^2).
  double ratio;
  if (absx < 4.0) {
    const double numer = ((((((-0.136864857382717E-6*absx+0.564195517478974)*absx+
                              0.721175825088309E1)*absx+0.431622272220567E2)*absx+
                            0.152989285046940E3)*absx+0.339320816734344E3)*absx+
                          0.451918953711873E3)*absx+0.300459261020162E3;
    const double denom = ((((((absx+0.127827273196294E2)*absx+0.770001529352295E2)*absx+
                             0.277585444743988E3)*absx+0.638980264465631E3)*absx+
                           0.931354094850610E3)*absx+0.790950925327898E3)*absx+
                         0.300459260956983E3;
    ratio = numer/denom;
  } else {
    const double invsq = 1.0/(xIn*xIn);
    const double numer = (((0.223192459734185E-1*invsq+0.278661308609648)*invsq+
                           0.226956593539687)*invsq+0.494730910623251E-1)*invsq+
                         0.299610707703542E-2;
    const double denom = (((invsq+0.198733201817135E1)*invsq+0.105167510706793E1)*invsq+
                          0.191308926107830)*invsq+0.106209230528468E-1;
    ratio = (-invsq*numer/denom + 0.564189583547756)/absx;
  }

  // Non-positive arguments are reflected before the exp(-x^2) scaling.
  double nonexperfc;
  if ( xIn > 0.0 )
    nonexperfc = ratio;
  else
    nonexperfc = 2.0*exp(xIn*xIn) - ratio;

  return exp(-absx*absx)*nonexperfc;
}
/*
 * riemann: for every cell (s, i), s in [0, slices), i in [0, narray), combine the
 * left state (qleft) and the right state (qright) into a single state written to
 * qgdnv, and record in sgnm which side was selected (+1 left, -1 right).
 *
 *   narray          number of cells processed per slice (inner loop bound)
 *   Hsmallr         lower bound applied to the ID entries read from qleft/qright and to rstar
 *   Hsmallc         lower bound applied to co and cstar; Hsmallc^2 / Hgamma also
 *                   gives the factor used for the pressure floor
 *   Hgamma          multiplies pressure*density / pressure/density in the speed formulas
 *   Hniter_riemann  maximum number of Newton-Raphson iterations per cell
 *   Hnvar           number of variables; those with index above IP are copied as passive
 *   Hnxyt, Hstep    appear in the array-section sizes of the acc data clauses
 *                   (NOTE(review): presumably also used by the IDX/IDXE macros - confirm)
 *   qleft, qright   inputs, indexed with IDX(var, s, i)
 *   qgdnv           output, indexed with IDX(var, s, i)
 *   sgnm            output, indexed with IDXE(s, i); written in the first loop nest,
 *                   read again in the passive-variable loop nest
 */
void riemann(int narray, const double Hsmallr, const double Hsmallc, const double Hgamma, const int Hniter_riemann, const int Hnvar, const int Hnxyt, const int slices, const int Hstep, double *qleft, double *qright, double *qgdnv, int *sgnm) {
  // Multi-dimensional view of the flat arguments, kept for reference:
  //   double qleft[Hnvar][Hstep][Hnxyt],
  //   double qright[Hnvar][Hstep][Hnxyt],
  //   double qgdnv[Hnvar][Hstep][Hnxyt],
  //   int sgnm[Hstep][Hnxyt]
  // #define IHVW(i, v) ((i) + (v) * Hnxyt)
  int i, s;
  // Constants that do not depend on the cell; computed once before the loops.
  double smallp_ = Square(Hsmallc) / Hgamma;
  double gamma6_ = (Hgamma + one) / (two * Hgamma);
  double smallpp_ = Hsmallr * smallp_;

  // Pressure, density and velocity
#pragma acc parallel pcopy(qleft[0:Hnvar*Hstep*Hnxyt], qright[0:Hnvar*Hstep*Hnxyt]) pcopyout(qgdnv[0:Hnvar*Hstep*Hnxyt], sgnm[0:Hstep*Hnxyt])
#pragma acc loop gang
  for (s = 0; s < slices; s++) {
#pragma acc loop vector
    for (i = 0; i < narray; i++) {
      // Per-iteration copies of the precomputed constants.
      double smallp = smallp_;
      double gamma6 = gamma6_;
      double smallpp = smallpp_;
      // Left and right states, with floors on density (Hsmallr) and pressure (density * smallp).
      double rl_i = MAX(qleft[IDX(ID,s,i)], Hsmallr);
      double ul_i = qleft[IDX(IU,s,i)];
      double pl_i = MAX(qleft[IDX(IP,s,i)], (double) (rl_i * smallp));
      double rr_i = MAX(qright[IDX(ID,s,i)], Hsmallr);
      double ur_i = qright[IDX(IU,s,i)];
      double pr_i = MAX(qright[IDX(IP,s,i)], (double) (rr_i * smallp));
      CFLOPS(2);
      // Lagrangian sound speed
      double cl_i = Hgamma * pl_i * rl_i;
      double cr_i = Hgamma * pr_i * rr_i;
      CFLOPS(4);
      // First guess (clamped so that pstar is never negative)
      double wl_i = sqrt(cl_i);
      double wr_i = sqrt(cr_i);
      double pstar_i = MAX(((wr_i * pl_i + wl_i * pr_i) + wl_i * wr_i * (ul_i - ur_i)) / (wl_i + wr_i), 0.0);
      CFLOPS(9);
      // Newton-Raphson iterations to find pstar at the required accuracy
      {
        int iter;
        // goon stays 1 while the last update was larger than PRECISION; once it drops
        // to 0 the remaining iterations do nothing.
        int goon = 1;
        for (iter = 0; iter < Hniter_riemann; iter++) {
          if (goon) {
            double wwl, wwr;
            wwl = sqrt(cl_i * (one + gamma6 * (pstar_i - pl_i) / pl_i));
            wwr = sqrt(cr_i * (one + gamma6 * (pstar_i - pr_i) / pr_i));
            double ql = two * wwl * Square(wwl) / (Square(wwl) + cl_i);
            double qr = two * wwr * Square(wwr) / (Square(wwr) + cr_i);
            double usl = ul_i - (pstar_i - pl_i) / wwl;
            double usr = ur_i + (pstar_i - pr_i) / wwr;
            // The update is bounded below by -pstar so that pstar + delp stays >= 0.
            double delp_i = MAX((qr * ql / (qr + ql) * (usl - usr)), (-pstar_i));
            CFLOPS(38);
            // PRINTARRAY(delp, narray, "delp", H);
            pstar_i = pstar_i + delp_i;
            CFLOPS(1);
            // Convergence indicator (this uo_i is local to the iteration block)
            double uo_i = DABS(delp_i / (pstar_i + smallpp));
            CFLOPS(2);
            goon = uo_i > PRECISION;
          }
        }                       // iter_riemann
      }
      // Recompute wr/wl with the final pstar. The guard on wr_i carries the original
      // note "Bug CUDA !!"; it is kept exactly as written.
      if (wr_i) {               // Bug CUDA !!
        wr_i = sqrt(cr_i * (one + gamma6 * (pstar_i - pr_i) / pr_i));
        wl_i = sqrt(cl_i * (one + gamma6 * (pstar_i - pl_i) / pl_i));
        CFLOPS(10);
      }
      double ustar_i = half * (ul_i + (pl_i - pstar_i) / wl_i + ur_i - (pr_i - pstar_i) / wr_i);
      CFLOPS(8);
      // Pick the side used for the "o" state from the sign of ustar, and record it in sgnm.
      int left = ustar_i > 0;
      double ro_i, uo_i, po_i, wo_i;
      if (left) {
        sgnm[IDXE(s,i)] = 1;
        ro_i = rl_i;
        uo_i = ul_i;
        po_i = pl_i;
        wo_i = wl_i;
      } else {
        sgnm[IDXE(s,i)] = -1;
        ro_i = rr_i;
        uo_i = ur_i;
        po_i = pr_i;
        wo_i = wr_i;
      }
      // Speeds of the "o" state and of the star state, both floored at Hsmallc.
      double co_i = sqrt(DABS(Hgamma * po_i / ro_i));
      co_i = MAX(Hsmallc, co_i);
      CFLOPS(2);
      double rstar_i = ro_i / (one + ro_i * (po_i - pstar_i) / Square(wo_i));
      rstar_i = MAX(rstar_i, Hsmallr);
      CFLOPS(6);
      double cstar_i = sqrt(DABS(Hgamma * pstar_i / rstar_i));
      cstar_i = MAX(Hsmallc, cstar_i);
      CFLOPS(2);
      double spout_i = co_i - sgnm[IDXE(s,i)] * uo_i;
      double spin_i = cstar_i - sgnm[IDXE(s,i)] * ustar_i;
      double ushock_i = wo_i / ro_i - sgnm[IDXE(s,i)] * uo_i;
      CFLOPS(7);
      // When pstar >= po both speeds collapse onto ushock.
      if (pstar_i >= po_i) {
        spin_i = ushock_i;
        spout_i = ushock_i;
      }
      double scr_i = MAX((double) (spout_i - spin_i), (double) (Hsmallc + DABS(spout_i + spin_i)));
      CFLOPS(3);
      // Interpolation weight between the "o" state and the star state, clamped to [zero, one].
      double frac_i = (one + (spout_i + spin_i) / scr_i) * half;
      frac_i = MAX(zero, (double) (MIN(one, frac_i)));
      CFLOPS(4);
      int addSpout = spout_i < zero;
      int addSpin = spin_i > zero;
      // double originalQgdnv = !addSpout & !addSpin;
      // Result: the "o" state, the star state, or the frac-weighted mix of the two.
      double qgdnv_ID, qgdnv_IU, qgdnv_IP;
      if (addSpout) {
        qgdnv_ID = ro_i;
        qgdnv_IU = uo_i;
        qgdnv_IP = po_i;
      } else if (addSpin) {
        qgdnv_ID = rstar_i;
        qgdnv_IU = ustar_i;
        qgdnv_IP = pstar_i;
      } else {
        qgdnv_ID = (frac_i * rstar_i + (one - frac_i) * ro_i);
        qgdnv_IU = (frac_i * ustar_i + (one - frac_i) * uo_i);
        qgdnv_IP = (frac_i * pstar_i + (one - frac_i) * po_i);
      }
      qgdnv[IDX(ID,s,i)] = qgdnv_ID;
      qgdnv[IDX(IU,s,i)] = qgdnv_IU;
      qgdnv[IDX(IP,s,i)] = qgdnv_IP;
      // transverse velocity: copied unchanged from the selected side
      if (left) {
        qgdnv[IDX(IV,s,i)] = qleft[IDX(IV,s,i)];
      } else {
        qgdnv[IDX(IV,s,i)] = qright[IDX(IV,s,i)];
      }
    }
  }

  // other passive variables: every variable with index above IP is copied from the
  // side recorded in sgnm by the loop nest above
  if (Hnvar > IP) {
    int invar;
#pragma acc parallel pcopy(qleft[0:Hnvar*Hstep*Hnxyt], qright[0:Hnvar*Hstep*Hnxyt], sgnm[0:Hstep*Hnxyt]) pcopyout(qgdnv[0:Hnvar*Hstep*Hnxyt])
#pragma acc loop gang collapse(2)
    for (invar = IP + 1; invar < Hnvar; invar++) {
      for (s = 0; s < slices; s++) {
#pragma acc loop vector
        for (i = 0; i < narray; i++) {
          // Branch-free select: exactly one of left / !left is 1.
          int left = (sgnm[IDXE(s,i)] == 1);
          qgdnv[IDX(invar,s,i)] = qleft[IDX(invar,s,i)] * left + qright[IDX(invar,s,i)] * !left;
        }
      }
    }
  }
}                               // riemann
// // compute gamma's and diGamma's including optional error checking // void computeGammas(struct stepStruct *step, double *pi, double **A, double **B, int N, int T) { int i, j, t; double denom; #ifdef CHECK_GAMMAS double ftemp, ftemp2; #endif // CHECK_GAMMAS // compute gamma's and diGamma's for(t = 0; t < T - 1; ++t) { denom = 0.0; for(i = 0; i < N; ++i) { for(j = 0; j < N; ++j) { denom += step[t].alpha[i] * A[i][j] * B[j][step[t + 1].obs] * step[t + 1].beta[j]; } } #ifdef CHECK_GAMMAS ftemp2 = 0.0; #endif // CHECK_GAMMAS for(i = 0; i < N; ++i) { step[t].gamma[i] = 0.0; for(j = 0; j < N; ++j) { step[t].diGamma[i][j] = (step[t].alpha[i] * A[i][j] * B[j][step[t + 1].obs] * step[t + 1].beta[j]) / denom; step[t].gamma[i] += step[t].diGamma[i][j]; } #ifdef CHECK_GAMMAS // verify that gamma[i] == alpha[i]*beta[i] / sum(alpha[j]*beta[j]) ftemp2 += step[t].gamma[i]; ftemp = 0.0; for(j = 0; j < N; ++j) { ftemp += step[t].alpha[j] * step[t].beta[j]; } ftemp = (step[t].alpha[i] * step[t].beta[i]) / ftemp; if(DABS(ftemp - step[t].gamma[i]) > EPSILON) { printf("gamma[%d] = %f (%f) ", i, step[t].gamma[i], ftemp); printf("********** Error !!!\n"); } #endif // CHECK_GAMMAS }// next i #ifdef CHECK_GAMMAS if(DABS(1.0 - ftemp2) > EPSILON) { printf("sum of gamma's = %f (should sum to 1.0)\n", ftemp2); } #endif // CHECK_GAMMAS }// next t }// end computeGammas
// Compute a limited slope dq for every variable nbv in [0, Hnvar), every slice s in
// [0, slices) and every interior cell i in [1, n-1). Cells 0 and n-1 are not written.
//
// For each cell:
//   dlft = Hslope_type * (q[i] - q[i-1])
//   drgt = Hslope_type * (q[i+1] - q[i])
//   dcen = half * (dlft + drgt) / Hslope_type
//   dq   = sign(dcen) * MIN(dlim, |dcen|)
// where dlim is MIN(|dlft|, |drgt|), or zero when dlft * drgt <= zero.
//
// q and dq are flat arrays addressed through IDX(nbv, s, i) and must already be
// present on the accelerator (see the present() clause).
void
slope (const int n, const int Hnvar, const int Hnxyt, const hydro_real_t Hslope_type, const int slices, const int Hstep, hydro_real_t *q, hydro_real_t *dq)
{
  WHERE ("slope");

#pragma acc kernels present(q[0: Hnvar * Hstep * Hnxyt], dq[0:Hnvar * Hstep * Hnxyt])
  {
    // With GRIDIFY the HMPP gridify pragma drives the loop nest; otherwise each loop
    // is marked independent for OpenACC. The loop index names nbv, s and i are
    // referenced by the gridify pragma.
#ifdef GRIDIFY
#ifdef GRIDIFY_TUNE_PHI
#pragma hmppcg gridify(nbv*s,i), blocksize 512x1
#else
#pragma hmppcg gridify(nbv*s,i)
#endif
#else
#pragma acc loop independent
#endif /* GRIDIFY */
    for (int nbv = 0; nbv < Hnvar; nbv++) {
#ifndef GRIDIFY
#pragma acc loop independent
#endif /* !GRIDIFY */
      for (int s = 0; s < slices; s++) {
#ifndef GRIDIFY
#pragma acc loop independent
#endif /* !GRIDIFY */
        for (int i = 1; i < n - 1; i++) {
          const hydro_real_t here = q[IDX (nbv, s, i)];

          // One-sided differences and their average.
          const hydro_real_t dlft = Hslope_type * (here - q[IDX (nbv, s, i - 1)]);
          const hydro_real_t drgt = Hslope_type * (q[IDX (nbv, s, i + 1)] - here);
          const hydro_real_t dcen = half * (dlft + drgt) / Hslope_type;

          // sign(one, dcen)
          const hydro_real_t dsgn = (dcen > zero) ? one : -one;

          // The limiter drops to zero when the one-sided differences disagree in sign
          // (or one of them vanishes).
          const hydro_real_t slop = MIN (DABS (dlft), DABS (drgt));
          const hydro_real_t dlim = ((dlft * drgt) <= zero) ? zero : slop;

          dq[IDX (nbv, s, i)] = dsgn * MIN (dlim, DABS (dcen));
#ifdef FLOPS
          flops += 8;
#endif
        }
      }
    }
  }                             // kernels region
}                               // slope