gmx_bool pme_load_balance(pme_load_balancing_t  pme_lb,
                          t_commrec            *cr,
                          FILE                 *fp_err,
                          FILE                 *fp_log,
                          t_inputrec           *ir,
                          t_state              *state,
                          double                cycles,
                          interaction_const_t  *ic,
                          struct nonbonded_verlet_t *nbv,
                          struct gmx_pme_t    **pmedata,
                          gmx_int64_t           step)
{
    gmx_bool     OK;
    pme_setup_t *set;
    double       cycles_fast;
    char         buf[STRLEN], sbuf[22];
    real         rtab;
    gmx_bool     bUsesSimpleTables = TRUE;

    if (pme_lb->stage == pme_lb->nstage)
    {
        return FALSE;
    }

    if (PAR(cr))
    {
        gmx_sumd(1, &cycles, cr);
        cycles /= cr->nnodes;
    }

    set = &pme_lb->setup[pme_lb->cur];
    set->count++;

    rtab = ir->rlistlong + ir->tabext;

    if (set->count % 2 == 1)
    {
        /* Skip the first cycle, because the first step after a switch
         * is much slower due to allocation and/or caching effects.
         */
        return TRUE;
    }

    sprintf(buf, "step %4s: ", gmx_step_str(step, sbuf));
    print_grid(fp_err, fp_log, buf, "timed with", set, cycles);

    if (set->count <= 2)
    {
        set->cycles = cycles;
    }
    else
    {
        if (cycles*PME_LB_ACCEL_TOL < set->cycles &&
            pme_lb->stage == pme_lb->nstage - 1)
        {
            /* The performance went up a lot (due to e.g. DD load balancing).
             * Add a stage, keep the minima, but rescan all setups.
             */
            pme_lb->nstage++;

            if (debug)
            {
                fprintf(debug, "The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this is more than %f\n"
                        "Increased the number of stages to %d"
                        " and ignoring the previous performance\n",
                        set->grid[XX], set->grid[YY], set->grid[ZZ],
                        cycles*1e-6, set->cycles*1e-6, PME_LB_ACCEL_TOL,
                        pme_lb->nstage);
            }
        }
        set->cycles = min(set->cycles, cycles);
    }

    if (set->cycles < pme_lb->setup[pme_lb->fastest].cycles)
    {
        pme_lb->fastest = pme_lb->cur;

        if (DOMAINDECOMP(cr))
        {
            /* We found a new fastest setting, ensure that with subsequent
             * shorter cut-offs the dynamic load balancing does not make
             * the use of the current cut-off impossible. This solution is
             * a trade-off, as the PME load balancing and DD domain size
             * load balancing can interact in complex ways.
             * With the Verlet kernels, DD load imbalance will usually be
             * mainly due to bonded interaction imbalance, which will often
             * quickly push the domain boundaries beyond the limit for the
             * optimal, PME load balanced, cut-off. But it could be that
             * better overall performance can be obtained with a slightly
             * shorter cut-off and better DD load balancing.
             */
            change_dd_dlb_cutoff_limit(cr);
        }
    }
    cycles_fast = pme_lb->setup[pme_lb->fastest].cycles;

    /* Check in stage 0 if we should stop scanning grids.
     * Stop when the time is more than SLOW_FAC longer than the fastest.
     */
    if (pme_lb->stage == 0 && pme_lb->cur > 0 &&
        cycles > pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC)
    {
        pme_lb->n = pme_lb->cur + 1;
        /* Done with scanning, go to stage 1 */
        switch_to_stage1(pme_lb);
    }

    if (pme_lb->stage == 0)
    {
        int gridsize_start;

        gridsize_start = set->grid[XX]*set->grid[YY]*set->grid[ZZ];

        do
        {
            if (pme_lb->cur+1 < pme_lb->n)
            {
                /* We had already generated the next setup */
                OK = TRUE;
            }
            else
            {
                /* Find the next setup */
                OK = pme_loadbal_increase_cutoff(pme_lb, ir->pme_order, cr->dd);

                if (!OK)
                {
                    pme_lb->elimited = epmelblimPMEGRID;
                }
            }

            if (OK && ir->ePBC != epbcNONE)
            {
                OK = (sqr(pme_lb->setup[pme_lb->cur+1].rlistlong)
                      <= max_cutoff2(ir->ePBC, state->box));
                if (!OK)
                {
                    pme_lb->elimited = epmelblimBOX;
                }
            }

            if (OK)
            {
                pme_lb->cur++;

                if (DOMAINDECOMP(cr))
                {
                    OK = change_dd_cutoff(cr, state, ir,
                                          pme_lb->setup[pme_lb->cur].rlistlong);
                    if (!OK)
                    {
                        /* Failed: do not use this setup */
                        pme_lb->cur--;
                        pme_lb->elimited = epmelblimDD;
                    }
                }
            }
            if (!OK)
            {
                /* We hit the upper limit for the cut-off,
                 * the setup should not go further than cur.
                 */
                pme_lb->n = pme_lb->cur + 1;
                print_loadbal_limited(fp_err, fp_log, step, pme_lb);
                /* Switch to the next stage */
                switch_to_stage1(pme_lb);
            }
        }
        while (OK &&
               !(pme_lb->setup[pme_lb->cur].grid[XX]*
                 pme_lb->setup[pme_lb->cur].grid[YY]*
                 pme_lb->setup[pme_lb->cur].grid[ZZ] <
                 gridsize_start*PME_LB_GRID_SCALE_FAC
                 &&
                 pme_lb->setup[pme_lb->cur].grid_efficiency <
                 pme_lb->setup[pme_lb->cur-1].grid_efficiency*PME_LB_GRID_EFFICIENCY_REL_FAC));
    }

    if (pme_lb->stage > 0 && pme_lb->end == 1)
    {
        pme_lb->cur   = 0;
        pme_lb->stage = pme_lb->nstage;
    }
    else if (pme_lb->stage > 0 && pme_lb->end > 1)
    {
        /* If stage = nstage-1:
         *   scan over all setups, rerunning only those setups
         *   which are not much slower than the fastest
         * else:
         *   use the next setup
         */
        do
        {
            pme_lb->cur++;
            if (pme_lb->cur == pme_lb->end)
            {
                pme_lb->stage++;
                pme_lb->cur = pme_lb->start;
            }
        }
        while (pme_lb->stage == pme_lb->nstage - 1 &&
               pme_lb->setup[pme_lb->cur].count > 0 &&
               pme_lb->setup[pme_lb->cur].cycles > cycles_fast*PME_LB_SLOW_FAC);

        if (pme_lb->stage == pme_lb->nstage)
        {
            /* We are done optimizing, use the fastest setup we found */
            pme_lb->cur = pme_lb->fastest;
        }
    }

    if (DOMAINDECOMP(cr) && pme_lb->stage > 0)
    {
        OK = change_dd_cutoff(cr, state, ir, pme_lb->setup[pme_lb->cur].rlistlong);
        if (!OK)
        {
            /* Failsafe solution */
            if (pme_lb->cur > 1 && pme_lb->stage == pme_lb->nstage)
            {
                pme_lb->stage--;
            }
            pme_lb->fastest  = 0;
            pme_lb->start    = 0;
            pme_lb->end      = pme_lb->cur;
            pme_lb->cur      = pme_lb->start;
            pme_lb->elimited = epmelblimDD;
            print_loadbal_limited(fp_err, fp_log, step, pme_lb);
        }
    }

    /* Change the Coulomb cut-off and the PME grid */

    set = &pme_lb->setup[pme_lb->cur];

    ic->rcoulomb     = set->rcut_coulomb;
    ic->rlist        = set->rlist;
    ic->rlistlong    = set->rlistlong;
    ir->nstcalclr    = set->nstcalclr;
    ic->ewaldcoeff_q = set->ewaldcoeff_q;
    /* TODO: centralize the code that sets the potential shifts */
    if (ic->coulomb_modifier == eintmodPOTSHIFT)
    {
        ic->sh_ewald = gmx_erfc(ic->ewaldcoeff_q*ic->rcoulomb);
    }
    if (EVDW_PME(ic->vdwtype))
    {
        /* We have PME for both Coulomb and VdW, set rvdw equal to rcoulomb */
        ic->rvdw          = set->rcut_coulomb;
        ic->ewaldcoeff_lj = set->ewaldcoeff_lj;
        if (ic->vdw_modifier == eintmodPOTSHIFT)
        {
            real crc2;

            ic->dispersion_shift.cpot = -pow(ic->rvdw, -6.0);
            ic->repulsion_shift.cpot  = -pow(ic->rvdw, -12.0);
            ic->sh_invrc6             = -ic->dispersion_shift.cpot;
            crc2                      = sqr(ic->ewaldcoeff_lj*ic->rvdw);
            ic->sh_lj_ewald           = (exp(-crc2)*(1 + crc2 + 0.5*crc2*crc2) - 1)*pow(ic->rvdw, -6.0);
        }
    }

    bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
    nbnxn_gpu_pme_loadbal_update_param(nbv, ic);

    /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
     * also sharing texture references. To keep the code simple, we don't
     * treat texture references as shared resources, but this means that
     * the coulomb_tab texture ref will get updated by multiple threads.
     * Hence, to ensure that the non-bonded kernels don't start before all
     * texture binding operations are finished, we need to wait for all ranks
     * to arrive here before continuing.
     *
     * Note that we could omit this barrier if GPUs are not shared (or
     * texture objects are used), but as this is initialization code, there
     * is no point in complicating things.
     */
#ifdef GMX_THREAD_MPI
    if (PAR(cr) && use_GPU(nbv))
    {
        gmx_barrier(cr);
    }
#endif  /* GMX_THREAD_MPI */

    /* Usually we won't need the simple tables with GPUs.
     * But we do with hybrid acceleration and with free energy.
     * To avoid bugs, we always re-initialize the simple tables here.
     */
    init_interaction_const_tables(NULL, ic, bUsesSimpleTables, rtab);

    if (cr->duty & DUTY_PME)
    {
        if (pme_lb->setup[pme_lb->cur].pmedata == NULL)
        {
            /* Generate a new PME data structure,
             * copying part of the old pointers.
             */
            gmx_pme_reinit(&set->pmedata,
                           cr, pme_lb->setup[0].pmedata, ir,
                           set->grid);
        }
        *pmedata = set->pmedata;
    }
    else
    {
        /* Tell our PME-only node to switch grid */
        gmx_pme_send_switchgrid(cr, set->grid, set->ewaldcoeff_q, set->ewaldcoeff_lj);
    }

    if (debug)
    {
        print_grid(NULL, debug, "", "switched to", set, -1);
    }

    if (pme_lb->stage == pme_lb->nstage)
    {
        print_grid(fp_err, fp_log, "", "optimal", set, -1);
    }

    return TRUE;
}
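
/* Illustrative sketch (not part of the original sources): pme_load_balance()
 * is meant to be called repeatedly from the MD loop with the cycle count
 * measured since the previous call.  Note that every other call only bumps
 * set->count and returns, so each setup is effectively timed on the second
 * of two consecutive intervals.  The caller keeps invoking it until it
 * returns FALSE (all stages done).  The variable names used below
 * (bPMETuning, pme_loadbal, fp_err, fplog, cycles_pme, fr) are assumptions
 * made only for this example.
 */
#if 0   /* example driver, deliberately kept out of compilation */
    if (bPMETuning)
    {
        bPMETuning = pme_load_balance(pme_loadbal, cr, fp_err, fplog,
                                      ir, state, cycles_pme,
                                      fr->ic, fr->nbv, &fr->pmedata,
                                      step);
        /* pme_load_balance() may have changed ic->rcoulomb, the PME grid,
         * etc., so the caller should pick up the new parameters from fr->ic
         * at this point.
         */
    }
#endif
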
int init_mmcg(int            nfile,       // number of files
              const t_filenm fnm[],       // file names
              t_inputrec    *ir,          // input record and box stuff
              gmx_mtop_t    *top_global,  // global topology
              int           *allcgnr,     // number of monitored charge groups
              int           *allcgid[],   // ids of monitored charge groups
              int           *allsolnr,    // number of solvent (water) charge groups
              int           *allsolid[],  // ids of solvent (water) charge groups
              t_commrec     *cr,          // communicators
              FILE          *log)         // log file (same role as in do_mmcg)
{
    char    *mmcgf, *topf, line[STRLEN+1], strtmp[STRLEN+1], *pos;
    FILE    *mmcgfp, *topfp;
    int      k0, k = 0, i, resnr;
    int      icg, cg0, cg1;
    int      allatresnr = 0, *allatresid;
    t_block  all_cgs;

    all_cgs = gmx_mtop_global_cgs(top_global);  // charge-group index for the whole system

    mmcgf = ftp2fn(efGMX, nfile, fnm);
    if ((mmcgfp = fopen(mmcgf, "r")) == NULL)
    {
        // opening the mmcg-data file failed
        fprintf(stderr, "ERROR: cannot open the mmcg-data file\n");
        return 1;
    }

    // counting lines in the mmcg-data file
    while (!feof(mmcgfp))
    {
        if (fgets(line, STRLEN+1, mmcgfp))
        {
            allatresnr++;
        }
    }

    // reading mmcg data
    snew(allatresid, allatresnr);
    rewind(mmcgfp);
    for (i = 0, k = 0; i < allatresnr; i++)
    {
        fgets(line, STRLEN+1, mmcgfp);
        if (line[0] == '#')
        {
            if (strstr(line, "nstwtlist"))
            {
                // number of steps between water-list regenerations
                pos = strchr(line, '=');
                pos++;
                ir->mmcg.nstwtlist = atoi(pos);
            }
            else if (strstr(line, "shell1wt"))
            {
                // inner limit of the water droplet
                pos = strchr(line, '=');
                pos++;
                ir->mmcg.shell1wt = atoi(pos);
            }
            else if (strstr(line, "shell2wt"))
            {
                // outer limit of the water droplet
                pos = strchr(line, '=');
                pos++;
                ir->mmcg.shell2wt = atoi(pos);
            }
            continue;
        }
        // reading all-atom resids
        allatresid[k] = atoi(line);
        if (allatresid[k] != 0)
        {
            k++;
        }
    }   // end for, mmcg file scanning
    allatresnr = k;

    if (cr->nnodes != 1)
    {
        gmx_barrier(cr);
    }
    fclose(mmcgfp);

    // printing the all-atom resids in the log file
    fprintf(log, "MM/CG : gmx_resids: %d\n", allatresnr);
    for (i = 0; i < allatresnr; i++)
    {
        fprintf(log, " %d\n", allatresid[i]);
    }
    fprintf(log, "\n");

    // shift to internal (0-based) enumeration
    for (i = 0; i < allatresnr; i++)
    {
        (allatresid[i])--;
    }

    // allocating charge groups and solvent
    snew(*allcgid, ncg_mtop(top_global));

    int maxsol;
    // It seems that GROMACS does not store the correct number of solvent
    // molecules in top_global... At this point the value of nmol is more than
    // 10 million!  => srenew() at the end of the routine once allsolnr is known.
    int moltype_id = 0;
    while (strcmp(*(top_global->moltype[moltype_id].name), "SOL"))
    {
        moltype_id++;
    }
    // assumes the "SOL" molecule block has the same index as its molecule type
    maxsol = top_global->molblock[moltype_id].nmol;
    MPI_Bcast(&maxsol, 1, MPI_INT, 0, cr->mpi_comm_mysim);
    snew(*allsolid, maxsol);

    // charge groups to monitor, first and last indices
    cg0 = 0;
    cg1 = ncg_mtop(top_global);

    char *atomname_ = NULL, *resname_ = NULL;  // needed to query atom information,
                                               // cf. mtop_util.h
    *allcgnr  = 0;  // counters for monitored charge groups and waters
    *allsolnr = 0;
    for (icg = cg0; icg < cg1; icg++)
    {
        // get residue number and name of the cg's first atom
        k0 = all_cgs.index[icg];
        gmx_mtop_atominfo_global(top_global, k0, &atomname_, &resnr, &resname_);
        if (!strcmp(resname_, "SOL"))
        {
            // Water
            (*allsolid)[*allsolnr] = icg;
            (*allsolnr)++;          // counting waters
        }
        for (i = 0; i < allatresnr; i++)
        {
            if (resnr == allatresid[i])
            {
                // touché !
                (*allcgid)[*allcgnr] = icg;
                (*allcgnr)++;       // counting monitored charge groups
            }
        }
    }

    if (cr->nnodes != 1)
    {
        gmx_barrier(cr);
    }

    return 0;
}
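
/* Example of an mmcg-data file as accepted by the parser in init_mmcg()
 * above (the numbers are illustrative only).  Lines starting with '#'
 * carry "keyword = value" settings; every other non-empty line gives one
 * residue number, in the topology's 1-based numbering, whose charge groups
 * are to be monitored (init_mmcg() shifts these to internal 0-based
 * numbering itself):
 *
 *   # nstwtlist = 10
 *   # shell1wt = 6
 *   # shell2wt = 10
 *   15
 *   16
 *   42
 */
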
void init_gamess(t_commrec *cr, t_QMrec *qm, t_MMrec *mm)
{
    /* it works hopelessly complicated :-)
     * first a file is written. Then the standard gamess input/output
     * routine is called (no system()!) to set up all fortran arrays.
     * this routine writes a punch file, like in a normal gamess run.
     * via this punch file the other gamess routines, needed for gradient
     * and energy evaluations, are called. This setup works fine for
     * dynamics simulations. 7-6-2002 (London)
     */
    int   i, j, rank;
    FILE *out;
    char  periodic_system[37][3] = {
        "XX", "H ", "He", "Li", "Be", "B ", "C ", "N ",
        "O ", "F ", "Ne", "Na", "Mg", "Al", "Si", "P ",
        "S ", "Cl", "Ar", "K ", "Ca", "Sc", "Ti", "V ",
        "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga",
        "Ge", "As", "Se", "Br", "Kr"
    };

    if (PAR(cr))
    {
        if (MASTER(cr))
        {
            out = fopen("FOR009", "w");
            /* of these options I am not completely sure.... the overall
             * performance on more than 4 cpus is rather poor at the moment.
             */
            fprintf(out, "memory 48000000\nPARALLEL IOMODE SCREENED\n");
            fprintf(out, "ELEC %d\nMULT %d\nSUPER ON\nNOSYM\nGEOMETRY ANGSTROM\n",
                    qm->nelectrons, qm->multiplicity);
            for (i = 0; i < qm->nrQMatoms; i++)
            {
#ifdef DOUBLE
                fprintf(out, "%10.7lf %10.7lf %10.7lf %5.3lf %2s\n",
                        i/2., i/3., i/4.,
                        qm->atomicnumberQM[i]*1.0,
                        periodic_system[qm->atomicnumberQM[i]]);
#else
                fprintf(out, "%10.7f %10.7f %10.7f %5.3f %2s\n",
                        i/2., i/3., i/4.,
                        qm->atomicnumberQM[i]*1.0,
                        periodic_system[qm->atomicnumberQM[i]]);
#endif
            }
            if (mm->nrMMatoms)
            {
                for (j = i; j < i+2; j++)
                {
#ifdef DOUBLE
                    fprintf(out, "%10.7lf %10.7lf %10.7lf %5.3lf BQ\n",
                            j/5., j/6., j/7., 1.0);
#else
                    fprintf(out, "%10.7f %10.7f %10.7f %5.3f BQ\n",
                            j/5., j/6., j/7., 2.0);
#endif
                }
            }
            if (!qm->bTS)
            {
                fprintf(out, "END\nBASIS %s\nRUNTYPE GRADIENT\nSCFTYPE %s\n",
                        eQMbasis_names[qm->QMbasis],
                        eQMmethod_names[qm->QMmethod]); /* see enum.h */
            }
            else
            {
                fprintf(out, "END\nBASIS %s\nRUNTYPE SADDLE\nSCFTYPE %s\n",
                        eQMbasis_names[qm->QMbasis],
                        eQMmethod_names[qm->QMmethod]); /* see enum.h */
            }
            fclose(out);
        }
        gmx_barrier(cr);
        F77_FUNC(inigms, IMIGMS) ();
    }
    else /* normal serial run */
    {
        out = fopen("FOR009", "w");
        /* of these options I am not completely sure.... the overall
         * performance on more than 4 cpus is rather poor at the moment.
         */
        fprintf(out, "ELEC %d\nMULT %d\nSUPER ON\nNOSYM\nGEOMETRY ANGSTROM\n",
                qm->nelectrons, qm->multiplicity);
        for (i = 0; i < qm->nrQMatoms; i++)
        {
#ifdef DOUBLE
            fprintf(out, "%10.7lf %10.7lf %10.7lf %5.3lf %2s\n",
                    i/2., i/3., i/4.,
                    qm->atomicnumberQM[i]*1.0,
                    periodic_system[qm->atomicnumberQM[i]]);
#else
            fprintf(out, "%10.7f %10.7f %10.7f %5.3f %2s\n",
                    i/2., i/3., i/4.,
                    qm->atomicnumberQM[i]*1.0,
                    periodic_system[qm->atomicnumberQM[i]]);
#endif
        }
        if (mm->nrMMatoms)
        {
            for (j = i; j < i+2; j++)
            {
#ifdef DOUBLE
                fprintf(out, "%10.7lf %10.7lf %10.7lf %5.3lf BQ\n",
                        j/5., j/6., j/7., 1.0);
#else
                fprintf(out, "%10.7f %10.7f %10.7f %5.3f BQ\n",
                        j/5., j/6., j/7., 2.0);
#endif
            }
        }
        if (!qm->bTS)
        {
            fprintf(out, "END\nBASIS %s\nRUNTYPE GRADIENT\nSCFTYPE %s\n",
                    eQMbasis_names[qm->QMbasis],
                    eQMmethod_names[qm->QMmethod]); /* see enum.h */
        }
        else
        {
            fprintf(out, "END\nBASIS %s\nRUNTYPE SADDLE\nSCFTYPE %s\n",
                    eQMbasis_names[qm->QMbasis],
                    eQMmethod_names[qm->QMmethod]); /* see enum.h */
        }
        fclose(out);  /* close (and flush) the input file before GAMESS reads it */
        F77_FUNC(inigms, IMIGMS) ();
    }
}
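
/* For reference, the FOR009 file written by the parallel branch above looks
 * roughly like this (electron count, multiplicity, basis and SCF type are
 * illustrative values; the coordinates and charges are only the dummy
 * i/2, i/3, i/4 numbers used to size the fortran arrays, and two extra BQ
 * point-charge lines are appended when MM atoms are present):
 *
 *   memory 48000000
 *   PARALLEL IOMODE SCREENED
 *   ELEC 10
 *   MULT 1
 *   SUPER ON
 *   NOSYM
 *   GEOMETRY ANGSTROM
 *    0.0000000  0.0000000  0.0000000 8.000 O
 *    0.5000000  0.3333333  0.2500000 1.000 H
 *   END
 *   BASIS 3-21G
 *   RUNTYPE GRADIENT
 *   SCFTYPE RHF
 */
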
void do_mmcg(int         natoms,    // number of atoms in simulation
             t_inputrec *inputrec,  // input record and box stuff
             t_mdatoms  *md,        // the atoms
             t_state    *state,     // positions & velocities
             gmx_mtop_t *top,       // global topology
             t_commrec  *cr,        // communicators
             rvec       *cg_cm,     // centre of mass of charge groups
             int        *allcgid,   // monitored charge group ids
             int         allcgnr,   // number of monitored charge groups
             int        *allsolid,  // solvent (water) charge group ids
             int         allsolnr,  // number of solvent (water) charge groups
             FILE       *log)       // logfile
{
    int  i, j, p, q, qmin;
    real dvmod, shell2w2, dmin, d;
    rvec vecdist, dv, *v;

    shell2w2 = pow(inputrec->mmcg.shell2wt, 2.0);  // squared outer-shell threshold
    v        = state->v;                           // velocities

    t_block cgs;                                   // charge groups
    cgs = gmx_mtop_global_cgs(top);

    rvec *all_cg_cm = NULL;
    snew(all_cg_cm, allcgnr);

    // For DD we need the cg_cm of all monitored charge groups (fr->cg_cm is
    // local): the one nearest to a monitored water may be more than one DD
    // cell away.
    if (DOMAINDECOMP(cr))
    {
        if (cr->nnodes != 1)
        {
            gmx_barrier(cr);
        }
        for (i = 0; i < allcgnr; i++)
        {
            int sender = 0, senderf, k;

            for (j = 0; j < cr->dd->ncg_home; j++)
            {
                // is the cg a home cg ?
                if (cr->dd->index_gl[j] == allcgid[i])
                {
                    sender = cr->sim_nodeid;
                    for (k = 0; k < 3; k++)
                    {
                        all_cg_cm[i][k] = cg_cm[j][k];  // FIXME improve! compilation error when done with pointers
                    }
                }
            }
            MPI_Allreduce(&sender, &senderf, 1, MPI_INT, MPI_SUM, cr->dd->mpi_comm_all);
            for (k = 0; k < 3; k++)
            {
                MPI_Bcast(&all_cg_cm[i][k], sizeof(all_cg_cm[i][k]), MPI_BYTE,
                          senderf, cr->dd->mpi_comm_all);  // FIXME again!
            }
        }
    }

    for (i = 0; i < allsolnr; i++)   // Loop over waters - START
    {
        p = allsolid[i];

        if (PARTDECOMP(cr))
        {
            // water i is on this node
            if ((cgs.index[p] >= md->start) && (cgs.index[p] < (md->start + md->homenr)))
            {
                for (j = 0; j < allcgnr; j++)
                {
                    // looking for the minimal distance (dmin)
                    // and for the nearest charge group (qmin)
                    q = allcgid[j];
                    d = distance2(cg_cm[p], cg_cm[q]);
                    if (!j)
                    {
                        dmin = d;
                        qmin = q;
                    }
                    else if (d < dmin)
                    {
                        dmin = d;
                        qmin = q;
                    }
                }
                if (dmin >= shell2w2)
                {
                    // Modifying velocity: reflect the outgoing component
                    rvec_sub(cg_cm[p], cg_cm[qmin], vecdist);
                    unitv(vecdist, vecdist);
                    for (j = cgs.index[p]; j < (3 + cgs.index[p]); j++)
                    {
                        dvmod = iprod(v[j], vecdist);
                        if (dvmod <= 0)
                        {
                            continue;
                        }
                        svmul(2.0*dvmod, vecdist, dv);
                        rvec_dec(v[j], dv);   // v <- v - 2 (v.n) n
                    }
                }
            }
        }

        if (DOMAINDECOMP(cr))
        {
            int g_atnr;  // global atom ID
            int l_atnr;  // local atom ID in the DD cell
            int l_cgnr;  // local charge group number

            for (g_atnr = cgs.index[p]; g_atnr <= cgs.index[p]+2; g_atnr++)
            {
                // search the global-to-local lookup table: proceed only if
                // this water atom is a home atom, and get its local atom number
                if (ga2la_get_home(cr->dd->ga2la, g_atnr, &l_atnr))
                {
                    l_cgnr = cr->dd->la2lc[l_atnr];  // get local charge group number
                    for (j = 0; j < allcgnr; j++)
                    {
                        // looking for the minimal distance (dmin)
                        // and for the nearest charge group (qmin)
                        d = distance2(cg_cm[l_cgnr], all_cg_cm[j]);
                        if (!j)
                        {
                            dmin = d;
                            qmin = j;
                        }
                        else if (d < dmin)
                        {
                            dmin = d;
                            qmin = j;
                        }
                    }
                    if (dmin >= shell2w2)
                    {
                        // Modifying velocity: reflect the outgoing component
                        rvec_sub(cg_cm[l_cgnr], all_cg_cm[qmin], vecdist);
                        unitv(vecdist, vecdist);
                        dvmod = iprod(v[l_atnr], vecdist);
                        if (dvmod <= 0)
                        {
                            continue;
                        }
                        svmul(2.0*dvmod, vecdist, dv);
                        rvec_dec(v[l_atnr], dv);   // v <- v - 2 (v.n) n
                    }
                }
            }
        }
    }   // Loop over waters - END

    sfree(all_cg_cm);

    return;
}
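
/* Stand-alone sketch of the velocity reflection applied in do_mmcg() above.
 * When a monitored water has drifted beyond shell2wt from every monitored
 * charge group, each of its atoms has the outgoing component of its velocity
 * flipped: with n the unit vector from the nearest charge group towards the
 * water, v' = v - 2 (v.n) n whenever v.n > 0.  The helper name
 * reflect_outgoing_velocity is hypothetical and only makes the arithmetic
 * explicit; do_mmcg() performs the same operation in place with
 * iprod/svmul/rvec_dec.
 */
static void reflect_outgoing_velocity(rvec v, const rvec n_unit)
{
    real proj = iprod(v, n_unit);    /* component of v along the outward normal */

    if (proj > 0)
    {
        rvec dv;

        svmul(2.0*proj, n_unit, dv); /* dv = 2 (v.n) n    */
        rvec_dec(v, dv);             /* v <- v - 2 (v.n) n */
    }
}
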