Exemple #1
0
void init_single(FILE *fplog,t_inputrec *inputrec,
		 char *tpxfile,gmx_mtop_t *mtop, 
                 t_state *state)
{
  int         step;
  real        t;
  
  read_tpx_state(tpxfile,&step,&t,inputrec,state,NULL,mtop);
  set_state_entries(state,inputrec,1);

  if (fplog)
    pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE);
}
Exemple #2
0
void init_parallel(FILE *log,char *tpxfile,t_commrec *cr,
		   t_inputrec *inputrec,gmx_mtop_t *mtop,
		   t_state *state,
		   int list)
{
  int  step;
  real t;
  char buf[256];
  
  if (MASTER(cr)) {
    init_inputrec(inputrec);
    read_tpx_state(tpxfile,&step,&t,inputrec,state,NULL,mtop);
    /* When we will be doing domain decomposition with separate PME nodes
     * the rng entries will be too large, we correct for this later.
     */
    set_state_entries(state,inputrec,cr->nnodes);
  }
  bcast_ir_mtop(cr,inputrec,mtop);

  if (inputrec->eI == eiBD || EI_SD(inputrec->eI)) {
    /* Make sure the random seeds are different on each node */
    inputrec->ld_seed += cr->nodeid;
  }
  
  /* Printing */
  if (list!=0 && log!=NULL) 
  {
	  if (list&LIST_INPUTREC)
		  pr_inputrec(log,0,"parameters of the run",inputrec,FALSE);
	  if (list&LIST_X)
		  pr_rvecs(log,0,"box",state->box,DIM);
	  if (list&LIST_X)
		  pr_rvecs(log,0,"box_rel",state->box_rel,DIM);
	  if (list&LIST_V)
		  pr_rvecs(log,0,"boxv",state->boxv,DIM);
	  if (list&LIST_X)
		  pr_rvecs(log,0,int_title("x",0,buf,255),state->x,state->natoms);
	  if (list&LIST_V)
		  pr_rvecs(log,0,int_title("v",0,buf,255),state->v,state->natoms);
	  if (list&LIST_TOP)
		  pr_mtop(log,0,int_title("topology",cr->nodeid,buf,255),mtop,TRUE);
	  fflush(log);
  }
}
Exemple #3
0
static void list_tpx(const char *fn, gmx_bool bShowNumbers, const char *mdpfn,
                     gmx_bool bSysTop)
{
    FILE         *gp;
    int           fp, indent, i, j, **gcount, atot;
    t_state       state;
    rvec         *f = NULL;
    t_inputrec    ir;
    t_tpxheader   tpx;
    gmx_mtop_t    mtop;
    gmx_groups_t *groups;
    t_topology    top;

    read_tpxheader(fn, &tpx, TRUE, NULL, NULL);

    read_tpx_state(fn,
                   tpx.bIr  ? &ir : NULL,
                   &state, tpx.bF ? f : NULL,
                   tpx.bTop ? &mtop : NULL);

    if (mdpfn && tpx.bIr)
    {
        gp = gmx_fio_fopen(mdpfn, "w");
        pr_inputrec(gp, 0, NULL, &(ir), TRUE);
        gmx_fio_fclose(gp);
    }

    if (!mdpfn)
    {
        if (bSysTop)
        {
            top = gmx_mtop_t_to_t_topology(&mtop);
        }

        if (available(stdout, &tpx, 0, fn))
        {
            indent = 0;
            indent = pr_title(stdout, indent, fn);
            pr_inputrec(stdout, 0, "inputrec", tpx.bIr ? &(ir) : NULL, FALSE);

            indent = 0;
            pr_header(stdout, indent, "header", &(tpx));

            if (!bSysTop)
            {
                pr_mtop(stdout, indent, "topology", &(mtop), bShowNumbers);
            }
            else
            {
                pr_top(stdout, indent, "topology", &(top), bShowNumbers);
            }

            pr_rvecs(stdout, indent, "box", tpx.bBox ? state.box : NULL, DIM);
            pr_rvecs(stdout, indent, "box_rel", tpx.bBox ? state.box_rel : NULL, DIM);
            pr_rvecs(stdout, indent, "boxv", tpx.bBox ? state.boxv : NULL, DIM);
            pr_rvecs(stdout, indent, "pres_prev", tpx.bBox ? state.pres_prev : NULL, DIM);
            pr_rvecs(stdout, indent, "svir_prev", tpx.bBox ? state.svir_prev : NULL, DIM);
            pr_rvecs(stdout, indent, "fvir_prev", tpx.bBox ? state.fvir_prev : NULL, DIM);
            /* leave nosehoover_xi in for now to match the tpr version */
            pr_doubles(stdout, indent, "nosehoover_xi", state.nosehoover_xi, state.ngtc);
            /*pr_doubles(stdout,indent,"nosehoover_vxi",state.nosehoover_vxi,state.ngtc);*/
            /*pr_doubles(stdout,indent,"therm_integral",state.therm_integral,state.ngtc);*/
            pr_rvecs(stdout, indent, "x", tpx.bX ? state.x : NULL, state.natoms);
            pr_rvecs(stdout, indent, "v", tpx.bV ? state.v : NULL, state.natoms);
            if (tpx.bF)
            {
                pr_rvecs(stdout, indent, "f", f, state.natoms);
            }
        }

        groups = &mtop.groups;

        snew(gcount, egcNR);
        for (i = 0; (i < egcNR); i++)
        {
            snew(gcount[i], groups->grps[i].nr);
        }

        for (i = 0; (i < mtop.natoms); i++)
        {
            for (j = 0; (j < egcNR); j++)
            {
                gcount[j][ggrpnr(groups, j, i)]++;
            }
        }
        printf("Group statistics\n");
        for (i = 0; (i < egcNR); i++)
        {
            atot = 0;
            printf("%-12s: ", gtypes[i]);
            for (j = 0; (j < groups->grps[i].nr); j++)
            {
                printf("  %5d", gcount[i][j]);
                atot += gcount[i][j];
            }
            printf("  (total %d atoms)\n", atot);
            sfree(gcount[i]);
        }
        sfree(gcount);
    }
    done_state(&state);
    sfree(f);
}
Exemple #4
0
int main(int argc,char *argv[])
{
  static char *desc[] = {
    "The [TT]pmetest[tt] program tests the scaling of the PME code. When only given",
    "a [TT].tpr[tt] file it will compute PME for one frame. When given a trajectory",
    "it will do so for all the frames in the trajectory. Before the PME",
    "routine is called the coordinates are sorted along the X-axis.[PAR]",
    "As an extra service to the public the program can also compute",
    "long-range Coulomb energies for components of the system. When the",
    "[TT]-groups[tt] flag is given to the program the energy groups",
    "from the [TT].tpr[tt] file will be read, and half an energy matrix computed."
  };
  t_commrec    *cr,*mcr;
  static t_filenm fnm[] = {
    { efTPX, NULL,      NULL,       ffREAD  },
    { efTRN, "-o",      NULL,       ffWRITE },
    { efLOG, "-g",      "pme",      ffWRITE },
    { efTRX, "-f",      NULL,       ffOPTRD },
    { efXVG, "-x",      "ener-pme", ffWRITE }
  };
#define NFILE asize(fnm)

  /* Command line options ! */
  static gmx_bool bVerbose=FALSE;
  static gmx_bool bOptFFT=FALSE;
  static gmx_bool bSort=FALSE;
  static int  ewald_geometry=eewg3D;
  static int  nnodes=1;
  static int  pme_order=0;
  static rvec grid = { -1, -1, -1 };
  static real rc   = 0.0;
  static real dtol = 0.0;
  static gmx_bool bGroups = FALSE;
  static t_pargs pa[] = {
    { "-np",      FALSE, etINT, {&nnodes},
      "Number of nodes, must be the same as used for [TT]grompp[tt]" },
    { "-v",       FALSE, etBOOL,{&bVerbose},  
      "Be loud and noisy" },
    { "-sort",    FALSE, etBOOL,{&bSort},  
      "Sort coordinates. Crucial for domain decomposition." },
    { "-grid",    FALSE, etRVEC,{&grid},
      "Number of grid cells in X, Y, Z dimension (if -1 use from [TT].tpr[tt])" },
    { "-order",   FALSE, etINT, {&pme_order},
      "Order of the PME spreading algorithm" },
    { "-groups",  FALSE, etBOOL, {&bGroups},
      "Compute half an energy matrix based on the energy groups in your [TT].tpr[tt] file" },
    { "-rc",      FALSE, etREAL, {&rc},
      "Rcoulomb for Ewald summation" },
    { "-tol",     FALSE, etREAL, {&dtol},
      "Tolerance for Ewald summation" }
  };
  FILE        *fp;
  t_inputrec  *ir;
  t_topology  top;
  t_tpxheader tpx;
  t_nrnb      nrnb;
  t_nsborder  *nsb;
  t_forcerec  *fr;
  t_mdatoms   *mdatoms;
  char        title[STRLEN];
  int         natoms,step,status,i,ncg,root;
  real        t,lambda,ewaldcoeff,qtot;
  rvec        *x,*f,*xbuf;
  int         *index;
  gmx_bool        bCont;
  real        *charge,*qbuf,*qqbuf;
  matrix      box;
  
  /* Start the actual parallel code if necessary */
  cr   = init_par(&argc,&argv);
  root = 0;
  
  if (MASTER(cr)) 
    CopyRight(stderr,argv[0]);
  
  /* Parse command line on all processors, arguments are passed on in 
   * init_par (see above)
   */
  parse_common_args(&argc,argv,
		    PCA_KEEP_ARGS | PCA_NOEXIT_ON_ARGS | PCA_BE_NICE |
		    PCA_CAN_SET_DEFFNM | (MASTER(cr) ? 0 : PCA_QUIET),
		    NFILE,fnm,asize(pa),pa,asize(desc),desc,0,NULL);
  
#ifndef GMX_MPI
  if (nnodes > 1) 
    gmx_fatal(FARGS,"GROMACS compiled without MPI support - can't do parallel runs");
#endif

  /* Open log files on all processors */
  open_log(ftp2fn(efLOG,NFILE,fnm),cr);
  snew(ir,1);
  
  if (MASTER(cr)) {
    /* Read tpr file etc. */
    read_tpxheader(ftp2fn(efTPX,NFILE,fnm),&tpx,FALSE,NULL,NULL);
    snew(x,tpx.natoms);
    read_tpx(ftp2fn(efTPX,NFILE,fnm),&step,&t,&lambda,ir,
	     box,&natoms,x,NULL,NULL,&top);
    /* Charges */
    qtot = 0;
    snew(charge,natoms);
    for(i=0; (i<natoms); i++) {
      charge[i] = top.atoms.atom[i].q;
      qtot += charge[i];
    }
  
    /* Grid stuff */
    if (opt2parg_bSet("-grid",asize(pa),pa)) {
      ir->nkx = grid[XX];
      ir->nky = grid[YY];
      ir->nkz = grid[ZZ];
    }
    /* Check command line parameters for consistency */
    if ((ir->nkx <= 0) || (ir->nky <= 0) || (ir->nkz <= 0))
      gmx_fatal(FARGS,"PME grid = %d %d %d",ir->nkx,ir->nky,ir->nkz);
    if (opt2parg_bSet("-rc",asize(pa),pa)) 
      ir->rcoulomb = rc;
    if (ir->rcoulomb <= 0)
      gmx_fatal(FARGS,"rcoulomb should be > 0 (not %f)",ir->rcoulomb);
    if (opt2parg_bSet("-order",asize(pa),pa)) 
      ir->pme_order = pme_order;
    if (ir->pme_order <= 0)
      gmx_fatal(FARGS,"pme_order should be > 0 (not %d)",ir->pme_order);
    if (opt2parg_bSet("-tol",asize(pa),pa))
      ir->ewald_rtol = dtol;
    if (ir->ewald_rtol <= 0)
      gmx_fatal(FARGS,"ewald_tol should be > 0 (not %f)",ir->ewald_rtol);
  }
  else {
    init_top(&top);
  }

  /* Add parallellization code here */
  snew(nsb,1);
  if (MASTER(cr)) {
    ncg = top.blocks[ebCGS].multinr[0];
    for(i=0; (i<cr->nnodes-1); i++)
      top.blocks[ebCGS].multinr[i] = min(ncg,(ncg*(i+1))/cr->nnodes);
    for( ; (i<MAXNODES); i++)
      top.blocks[ebCGS].multinr[i] = ncg;
  }
  if (PAR(cr)) {
    /* Set some variables to zero to avoid core dumps */
    ir->opts.ngtc = ir->opts.ngacc = ir->opts.ngfrz = ir->opts.ngener = 0;
#ifdef GMX_MPI
    /* Distribute the data over processors */
    MPI_Bcast(&natoms,1,MPI_INT,root,MPI_COMM_WORLD);
    MPI_Bcast(ir,sizeof(*ir),MPI_BYTE,root,MPI_COMM_WORLD);
    MPI_Bcast(&qtot,1,GMX_MPI_REAL,root,MPI_COMM_WORLD);
#endif

    /* Call some dedicated communication routines, master sends n-1 times */
    if (MASTER(cr)) {
      for(i=1; (i<cr->nnodes); i++) {
	mv_block(i,&(top.blocks[ebCGS]));
	mv_block(i,&(top.atoms.excl));
      }
    }
    else {
      ld_block(root,&(top.blocks[ebCGS]));
      ld_block(root,&(top.atoms.excl));
    }
    if (!MASTER(cr)) {
      snew(charge,natoms);
      snew(x,natoms);
    }
#ifdef GMX_MPI
    MPI_Bcast(charge,natoms,GMX_MPI_REAL,root,MPI_COMM_WORLD);
#endif
  }
  ewaldcoeff = calc_ewaldcoeff(ir->rcoulomb,ir->ewald_rtol);
  
  
  if (bVerbose)
    pr_inputrec(stdlog,0,"Inputrec",ir);

  /* Allocate memory for temp arrays etc. */
  snew(xbuf,natoms);
  snew(f,natoms);
  snew(qbuf,natoms);
  snew(qqbuf,natoms);
  snew(index,natoms);

  /* Initialize the PME code */  
  init_pme(stdlog,cr,ir->nkx,ir->nky,ir->nkz,ir->pme_order,
	   natoms,FALSE,bOptFFT,ewald_geometry);
	   
  /* MFlops accounting */
  init_nrnb(&nrnb);
  
  /* Initialize the work division */
  calc_nsb(stdlog,&(top.blocks[ebCGS]),cr->nnodes,nsb,0);
  nsb->nodeid = cr->nodeid;
  print_nsb(stdlog,"pmetest",nsb);  

  /* Initiate forcerec */
  mdatoms = atoms2md(stdlog,&top.atoms,ir->opts.nFreeze,ir->eI,
		     ir->delta_t,0,ir->opts.tau_t,FALSE,FALSE);
  snew(fr,1);
  init_forcerec(stdlog,fr,ir,&top,cr,mdatoms,nsb,box,FALSE,NULL,NULL,FALSE);
  
  /* First do PME based on coordinates in tpr file, send them to
   * other processors if needed.
   */
  if (MASTER(cr))
    fprintf(stdlog,"-----\n"
	    "Results based on tpr file %s\n",ftp2fn(efTPX,NFILE,fnm));
#ifdef GMX_MPI
  if (PAR(cr)) {
    MPI_Bcast(x[0],natoms*DIM,GMX_MPI_REAL,root,MPI_COMM_WORLD);
    MPI_Bcast(box[0],DIM*DIM,GMX_MPI_REAL,root,MPI_COMM_WORLD);
    MPI_Bcast(&t,1,GMX_MPI_REAL,root,MPI_COMM_WORLD);
  }
#endif
  do_my_pme(stdlog,0,bVerbose,ir,x,xbuf,f,charge,qbuf,qqbuf,box,bSort,
	    cr,nsb,&nrnb,&(top.atoms.excl),qtot,fr,index,NULL,
	    bGroups ? ir->opts.ngener : 1,mdatoms->cENER);

  /* If we have a trajectry file, we will read the frames in it and compute
   * the PME energy.
   */
  if (ftp2bSet(efTRX,NFILE,fnm)) {
    fprintf(stdlog,"-----\n"
	    "Results based on trx file %s\n",ftp2fn(efTRX,NFILE,fnm));
    if (MASTER(cr)) {
      sfree(x);
      natoms = read_first_x(&status,ftp2fn(efTRX,NFILE,fnm),&t,&x,box); 
      if (natoms != top.atoms.nr)
	gmx_fatal(FARGS,"natoms in trx = %d, in tpr = %d",natoms,top.atoms.nr);
      fp = xvgropen(ftp2fn(efXVG,NFILE,fnm),"PME Energy","Time (ps)","E (kJ/mol)");
    }
    else
      fp = NULL;
    do {
      /* Send coordinates, box and time to the other nodes */
#ifdef GMX_MPI
      if (PAR(cr)) {
	MPI_Bcast(x[0],natoms*DIM,GMX_MPI_REAL,root,MPI_COMM_WORLD);
	MPI_Bcast(box[0],DIM*DIM,GMX_MPI_REAL,root,MPI_COMM_WORLD);
	MPI_Bcast(&t,1,GMX_MPI_REAL,root,MPI_COMM_WORLD);
      }
#endif
      rm_pbc(&top.idef,nsb->natoms,box,x,x);
      /* Call the PME wrapper function */
      do_my_pme(stdlog,t,bVerbose,ir,x,xbuf,f,charge,qbuf,qqbuf,box,bSort,cr,
		nsb,&nrnb,&(top.atoms.excl),qtot,fr,index,fp,
		bGroups ? ir->opts.ngener : 1,mdatoms->cENER);
      /* Only the master processor reads more data */
      if (MASTER(cr))
          bCont = read_next_x(status,&t,natoms,x,box);
      /* Check whether we need to continue */
#ifdef GMX_MPI
      if (PAR(cr))
          MPI_Bcast(&bCont,1,MPI_INT,root,MPI_COMM_WORLD);
#endif
      
    } while (bCont);
    
    /* Finish I/O, close files */
    if (MASTER(cr)) {
      close_trx(status);
      ffclose(fp);
    }
  }
  
  if (bVerbose) {
    /* Do some final I/O about performance, might be useful in debugging */
    fprintf(stdlog,"-----\n");
    print_nrnb(stdlog,&nrnb);
  }
  
  /* Finish the parallel stuff */  
  if (gmx_parallel_env_initialized())
    gmx_finalize(cr);

  /* Thank the audience, as usual */
  if (MASTER(cr)) 
    thanx(stderr);

  return 0;
}
Exemple #5
0
int mdrunner(gmx_hw_opt_t *hw_opt,
             FILE *fplog, t_commrec *cr, int nfile,
             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
             gmx_bool bCompact, int nstglobalcomm,
             ivec ddxyz, int dd_node_order, real rdd, real rconstr,
             const char *dddlb_opt, real dlb_scale,
             const char *ddcsx, const char *ddcsy, const char *ddcsz,
             const char *nbpu_opt, int nstlist_cmdline,
             gmx_int64_t nsteps_cmdline, int nstepout, int resetstep,
             int gmx_unused nmultisim, int repl_ex_nst, int repl_ex_nex,
             int repl_ex_seed, real pforce, real cpt_period, real max_hours,
             int imdport, unsigned long Flags)
{
    gmx_bool                  bForceUseGPU, bTryUseGPU, bRerunMD;
    t_inputrec               *inputrec;
    t_state                  *state = NULL;
    matrix                    box;
    gmx_ddbox_t               ddbox = {0};
    int                       npme_major, npme_minor;
    t_nrnb                   *nrnb;
    gmx_mtop_t               *mtop          = NULL;
    t_mdatoms                *mdatoms       = NULL;
    t_forcerec               *fr            = NULL;
    t_fcdata                 *fcd           = NULL;
    real                      ewaldcoeff_q  = 0;
    real                      ewaldcoeff_lj = 0;
    struct gmx_pme_t        **pmedata       = NULL;
    gmx_vsite_t              *vsite         = NULL;
    gmx_constr_t              constr;
    int                       nChargePerturbed = -1, nTypePerturbed = 0, status;
    gmx_wallcycle_t           wcycle;
    gmx_bool                  bReadEkin;
    gmx_walltime_accounting_t walltime_accounting = NULL;
    int                       rc;
    gmx_int64_t               reset_counters;
    gmx_edsam_t               ed           = NULL;
    int                       nthreads_pme = 1;
    int                       nthreads_pp  = 1;
    gmx_membed_t              membed       = NULL;
    gmx_hw_info_t            *hwinfo       = NULL;
    /* The master rank decides early on bUseGPU and broadcasts this later */
    gmx_bool                  bUseGPU      = FALSE;

    /* CAUTION: threads may be started later on in this function, so
       cr doesn't reflect the final parallel state right now */
    snew(inputrec, 1);
    snew(mtop, 1);

    if (Flags & MD_APPENDFILES)
    {
        fplog = NULL;
    }

    bRerunMD     = (Flags & MD_RERUN);
    bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
    bTryUseGPU   = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;

    /* Detect hardware, gather information. This is an operation that is
     * global for this process (MPI rank). */
    hwinfo = gmx_detect_hardware(fplog, cr, bTryUseGPU);

    gmx_print_detected_hardware(fplog, cr, hwinfo);

    if (fplog != NULL)
    {
        /* Print references after all software/hardware printing */
        please_cite(fplog, "Abraham2015");
        please_cite(fplog, "Pall2015");
        please_cite(fplog, "Pronk2013");
        please_cite(fplog, "Hess2008b");
        please_cite(fplog, "Spoel2005a");
        please_cite(fplog, "Lindahl2001a");
        please_cite(fplog, "Berendsen95a");
    }

    snew(state, 1);
    if (SIMMASTER(cr))
    {
        /* Read (nearly) all data required for the simulation */
        read_tpx_state(ftp2fn(efTPR, nfile, fnm), inputrec, state, NULL, mtop);

        if (inputrec->cutoff_scheme == ecutsVERLET)
        {
            /* Here the master rank decides if all ranks will use GPUs */
            bUseGPU = (hwinfo->gpu_info.n_dev_compatible > 0 ||
                       getenv("GMX_EMULATE_GPU") != NULL);

            /* TODO add GPU kernels for this and replace this check by:
             * (bUseGPU && (ir->vdwtype == evdwPME &&
             *               ir->ljpme_combination_rule == eljpmeLB))
             * update the message text and the content of nbnxn_acceleration_supported.
             */
            if (bUseGPU &&
                !nbnxn_gpu_acceleration_supported(fplog, cr, inputrec, bRerunMD))
            {
                /* Fallback message printed by nbnxn_acceleration_supported */
                if (bForceUseGPU)
                {
                    gmx_fatal(FARGS, "GPU acceleration requested, but not supported with the given input settings");
                }
                bUseGPU = FALSE;
            }

            prepare_verlet_scheme(fplog, cr,
                                  inputrec, nstlist_cmdline, mtop, state->box,
                                  bUseGPU);
        }
        else
        {
            if (nstlist_cmdline > 0)
            {
                gmx_fatal(FARGS, "Can not set nstlist with the group cut-off scheme");
            }

            if (hwinfo->gpu_info.n_dev_compatible > 0)
            {
                md_print_warn(cr, fplog,
                              "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
                              "      To use a GPU, set the mdp option: cutoff-scheme = Verlet\n");
            }

            if (bForceUseGPU)
            {
                gmx_fatal(FARGS, "GPU requested, but can't be used without cutoff-scheme=Verlet");
            }

#ifdef GMX_TARGET_BGQ
            md_print_warn(cr, fplog,
                          "NOTE: There is no SIMD implementation of the group scheme kernels on\n"
                          "      BlueGene/Q. You will observe better performance from using the\n"
                          "      Verlet cut-off scheme.\n");
#endif
        }

        if (inputrec->eI == eiSD2)
        {
            md_print_warn(cr, fplog, "The stochastic dynamics integrator %s is deprecated, since\n"
                          "it is slower than integrator %s and is slightly less accurate\n"
                          "with constraints. Use the %s integrator.",
                          ei_names[inputrec->eI], ei_names[eiSD1], ei_names[eiSD1]);
        }
    }

    /* Check and update the hardware options for internal consistency */
    check_and_update_hw_opt_1(hw_opt, cr);

    /* Early check for externally set process affinity. */
    gmx_check_thread_affinity_set(fplog, cr,
                                  hw_opt, hwinfo->nthreads_hw_avail, FALSE);

#ifdef GMX_THREAD_MPI
    if (SIMMASTER(cr))
    {
        if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
        {
            gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME ranks");
        }

        /* Since the master knows the cut-off scheme, update hw_opt for this.
         * This is done later for normal MPI and also once more with tMPI
         * for all tMPI ranks.
         */
        check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme);

        /* NOW the threads will be started: */
        hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
                                                 hw_opt,
                                                 inputrec, mtop,
                                                 cr, fplog, bUseGPU);

        if (hw_opt->nthreads_tmpi > 1)
        {
            t_commrec *cr_old       = cr;
            /* now start the threads. */
            cr = mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
                                        oenv, bVerbose, bCompact, nstglobalcomm,
                                        ddxyz, dd_node_order, rdd, rconstr,
                                        dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
                                        nbpu_opt, nstlist_cmdline,
                                        nsteps_cmdline, nstepout, resetstep, nmultisim,
                                        repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
                                        cpt_period, max_hours,
                                        Flags);
            /* the main thread continues here with a new cr. We don't deallocate
               the old cr because other threads may still be reading it. */
            if (cr == NULL)
            {
                gmx_comm("Failed to spawn threads");
            }
        }
    }
#endif
    /* END OF CAUTION: cr is now reliable */

    /* g_membed initialisation *
     * Because we change the mtop, init_membed is called before the init_parallel *
     * (in case we ever want to make it run in parallel) */
    if (opt2bSet("-membed", nfile, fnm))
    {
        if (MASTER(cr))
        {
            fprintf(stderr, "Initializing membed");
        }
        membed = init_membed(fplog, nfile, fnm, mtop, inputrec, state, cr, &cpt_period);
    }

    if (PAR(cr))
    {
        /* now broadcast everything to the non-master nodes/threads: */
        init_parallel(cr, inputrec, mtop);

        /* The master rank decided on the use of GPUs,
         * broadcast this information to all ranks.
         */
        gmx_bcast_sim(sizeof(bUseGPU), &bUseGPU, cr);
    }

    if (fplog != NULL)
    {
        pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
        fprintf(fplog, "\n");
    }

    /* now make sure the state is initialized and propagated */
    set_state_entries(state, inputrec);

    /* A parallel command line option consistency check that we can
       only do after any threads have started. */
    if (!PAR(cr) &&
        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
    {
        gmx_fatal(FARGS,
                  "The -dd or -npme option request a parallel simulation, "
#ifndef GMX_MPI
                  "but %s was compiled without threads or MPI enabled"
#else
#ifdef GMX_THREAD_MPI
                  "but the number of threads (option -nt) is 1"
#else
                  "but %s was not started through mpirun/mpiexec or only one rank was requested through mpirun/mpiexec"
#endif
#endif
                  , output_env_get_program_display_name(oenv)
                  );
    }

    if (bRerunMD &&
        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
    {
        gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
    }

    if (can_use_allvsall(inputrec, TRUE, cr, fplog) && DOMAINDECOMP(cr))
    {
        gmx_fatal(FARGS, "All-vs-all loops do not work with domain decomposition, use a single MPI rank");
    }

    if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
    {
        if (cr->npmenodes > 0)
        {
            gmx_fatal_collective(FARGS, cr, NULL,
                                 "PME-only ranks are requested, but the system does not use PME for electrostatics or LJ");
        }

        cr->npmenodes = 0;
    }

    if (bUseGPU && cr->npmenodes < 0)
    {
        /* With GPUs we don't automatically use PME-only ranks. PME ranks can
         * improve performance with many threads per GPU, since our OpenMP
         * scaling is bad, but it's difficult to automate the setup.
         */
        cr->npmenodes = 0;
    }

#ifdef GMX_FAHCORE
    if (MASTER(cr))
    {
        fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
    }
#endif

    /* NMR restraints must be initialized before load_checkpoint,
     * since with time averaging the history is added to t_state.
     * For proper consistency check we therefore need to extend
     * t_state here.
     * So the PME-only nodes (if present) will also initialize
     * the distance restraints.
     */
    snew(fcd, 1);

    /* This needs to be called before read_checkpoint to extend the state */
    init_disres(fplog, mtop, inputrec, cr, fcd, state, repl_ex_nst > 0);

    init_orires(fplog, mtop, state->x, inputrec, cr, &(fcd->orires),
                state);

    if (DEFORM(*inputrec))
    {
        /* Store the deform reference box before reading the checkpoint */
        if (SIMMASTER(cr))
        {
            copy_mat(state->box, box);
        }
        if (PAR(cr))
        {
            gmx_bcast(sizeof(box), box, cr);
        }
        /* Because we do not have the update struct available yet
         * in which the reference values should be stored,
         * we store them temporarily in static variables.
         * This should be thread safe, since they are only written once
         * and with identical values.
         */
        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
        deform_init_init_step_tpx = inputrec->init_step;
        copy_mat(box, deform_init_box_tpx);
        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
    }

    if (opt2bSet("-cpi", nfile, fnm))
    {
        /* Check if checkpoint file exists before doing continuation.
         * This way we can use identical input options for the first and subsequent runs...
         */
        if (gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr) )
        {
            load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog,
                            cr, ddxyz,
                            inputrec, state, &bReadEkin,
                            (Flags & MD_APPENDFILES),
                            (Flags & MD_APPENDFILESSET));

            if (bReadEkin)
            {
                Flags |= MD_READ_EKIN;
            }
        }
    }

    if (MASTER(cr) && (Flags & MD_APPENDFILES))
    {
        gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr,
                     Flags, &fplog);
    }

    /* override nsteps with value from cmdline */
    override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);

    if (SIMMASTER(cr))
    {
        copy_mat(state->box, box);
    }

    if (PAR(cr))
    {
        gmx_bcast(sizeof(box), box, cr);
    }

    /* Essential dynamics */
    if (opt2bSet("-ei", nfile, fnm))
    {
        /* Open input and output files, allocate space for ED data structure */
        ed = ed_open(mtop->natoms, &state->edsamstate, nfile, fnm, Flags, oenv, cr);
    }

    if (PAR(cr) && !(EI_TPI(inputrec->eI) ||
                     inputrec->eI == eiNM))
    {
        cr->dd = init_domain_decomposition(fplog, cr, Flags, ddxyz, rdd, rconstr,
                                           dddlb_opt, dlb_scale,
                                           ddcsx, ddcsy, ddcsz,
                                           mtop, inputrec,
                                           box, state->x,
                                           &ddbox, &npme_major, &npme_minor);

        make_dd_communicators(fplog, cr, dd_node_order);

        /* Set overallocation to avoid frequent reallocation of arrays */
        set_over_alloc_dd(TRUE);
    }
    else
    {
        /* PME, if used, is done on all nodes with 1D decomposition */
        cr->npmenodes = 0;
        cr->duty      = (DUTY_PP | DUTY_PME);
        npme_major    = 1;
        npme_minor    = 1;

        if (inputrec->ePBC == epbcSCREW)
        {
            gmx_fatal(FARGS,
                      "pbc=%s is only implemented with domain decomposition",
                      epbc_names[inputrec->ePBC]);
        }
    }

    if (PAR(cr))
    {
        /* After possible communicator splitting in make_dd_communicators.
         * we can set up the intra/inter node communication.
         */
        gmx_setup_nodecomm(fplog, cr);
    }

    /* Initialize per-physical-node MPI process/thread ID and counters. */
    gmx_init_intranode_counters(cr);
#ifdef GMX_MPI
    if (MULTISIM(cr))
    {
        md_print_info(cr, fplog,
                      "This is simulation %d out of %d running as a composite GROMACS\n"
                      "multi-simulation job. Setup for this simulation:\n\n",
                      cr->ms->sim, cr->ms->nsim);
    }
    md_print_info(cr, fplog, "Using %d MPI %s\n",
                  cr->nnodes,
#ifdef GMX_THREAD_MPI
                  cr->nnodes == 1 ? "thread" : "threads"
#else
                  cr->nnodes == 1 ? "process" : "processes"
#endif
                  );
    fflush(stderr);
#endif

    /* Check and update hw_opt for the cut-off scheme */
    check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme);

    /* Check and update hw_opt for the number of MPI ranks */
    check_and_update_hw_opt_3(hw_opt);

    gmx_omp_nthreads_init(fplog, cr,
                          hwinfo->nthreads_hw_avail,
                          hw_opt->nthreads_omp,
                          hw_opt->nthreads_omp_pme,
                          (cr->duty & DUTY_PP) == 0,
                          inputrec->cutoff_scheme == ecutsVERLET);

#ifndef NDEBUG
    if (integrator[inputrec->eI].func != do_tpi &&
        inputrec->cutoff_scheme == ecutsVERLET)
    {
        gmx_feenableexcept();
    }
#endif

    if (bUseGPU)
    {
        /* Select GPU id's to use */
        gmx_select_gpu_ids(fplog, cr, &hwinfo->gpu_info, bForceUseGPU,
                           &hw_opt->gpu_opt);
    }
    else
    {
        /* Ignore (potentially) manually selected GPUs */
        hw_opt->gpu_opt.n_dev_use = 0;
    }

    /* check consistency across ranks of things like SIMD
     * support and number of GPUs selected */
    gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt, bUseGPU);

    /* Now that we know the setup is consistent, check for efficiency */
    check_resource_division_efficiency(hwinfo, hw_opt, Flags & MD_NTOMPSET,
                                       cr, fplog);

    if (DOMAINDECOMP(cr))
    {
        /* When we share GPUs over ranks, we need to know this for the DLB */
        dd_setup_dlb_resource_sharing(cr, hwinfo, hw_opt);
    }

    /* getting number of PP/PME threads
       PME: env variable should be read only on one node to make sure it is
       identical everywhere;
     */
    /* TODO nthreads_pp is only used for pinning threads.
     * This is a temporary solution until we have a hw topology library.
     */
    nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
    nthreads_pme = gmx_omp_nthreads_get(emntPME);

    wcycle = wallcycle_init(fplog, resetstep, cr, nthreads_pp, nthreads_pme);

    if (PAR(cr))
    {
        /* Master synchronizes its value of reset_counters with all nodes
         * including PME only nodes */
        reset_counters = wcycle_get_reset_counters(wcycle);
        gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
        wcycle_set_reset_counters(wcycle, reset_counters);
    }

    snew(nrnb, 1);
    if (cr->duty & DUTY_PP)
    {
        bcast_state(cr, state);

        /* Initiate forcerecord */
        fr          = mk_forcerec();
        fr->hwinfo  = hwinfo;
        fr->gpu_opt = &hw_opt->gpu_opt;
        init_forcerec(fplog, oenv, fr, fcd, inputrec, mtop, cr, box,
                      opt2fn("-table", nfile, fnm),
                      opt2fn("-tabletf", nfile, fnm),
                      opt2fn("-tablep", nfile, fnm),
                      opt2fn("-tableb", nfile, fnm),
                      nbpu_opt,
                      FALSE,
                      pforce);

        /* version for PCA_NOT_READ_NODE (see md.c) */
        /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
           "nofile","nofile","nofile","nofile",FALSE,pforce);
         */

        /* Initialize QM-MM */
        if (fr->bQMMM)
        {
            init_QMMMrec(cr, mtop, inputrec, fr);
        }

        /* Initialize the mdatoms structure.
         * mdatoms is not filled with atom data,
         * as this can not be done now with domain decomposition.
         */
        mdatoms = init_mdatoms(fplog, mtop, inputrec->efep != efepNO);

        /* Initialize the virtual site communication */
        vsite = init_vsite(mtop, cr, FALSE);

        calc_shifts(box, fr->shift_vec);

        /* With periodic molecules the charge groups should be whole at start up
         * and the virtual sites should not be far from their proper positions.
         */
        if (!inputrec->bContinuation && MASTER(cr) &&
            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
        {
            /* Make molecules whole at start of run */
            if (fr->ePBC != epbcNONE)
            {
                do_pbc_first_mtop(fplog, inputrec->ePBC, box, mtop, state->x);
            }
            if (vsite)
            {
                /* Correct initial vsite positions are required
                 * for the initial distribution in the domain decomposition
                 * and for the initial shell prediction.
                 */
                construct_vsites_mtop(vsite, mtop, state->x);
            }
        }

        if (EEL_PME(fr->eeltype) || EVDW_PME(fr->vdwtype))
        {
            ewaldcoeff_q  = fr->ewaldcoeff_q;
            ewaldcoeff_lj = fr->ewaldcoeff_lj;
            pmedata       = &fr->pmedata;
        }
        else
        {
            pmedata = NULL;
        }
    }
    else
    {
        /* This is a PME only node */

        /* We don't need the state */
        done_state(state);

        ewaldcoeff_q  = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol);
        ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj);
        snew(pmedata, 1);
    }

    if (hw_opt->thread_affinity != threadaffOFF)
    {
        /* Before setting affinity, check whether the affinity has changed
         * - which indicates that probably the OpenMP library has changed it
         * since we first checked).
         */
        gmx_check_thread_affinity_set(fplog, cr,
                                      hw_opt, hwinfo->nthreads_hw_avail, TRUE);

        /* Set the CPU affinity */
        gmx_set_thread_affinity(fplog, cr, hw_opt, hwinfo);
    }

    /* Initiate PME if necessary,
     * either on all nodes or on dedicated PME nodes only. */
    if (EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))
    {
        if (mdatoms)
        {
            nChargePerturbed = mdatoms->nChargePerturbed;
            if (EVDW_PME(inputrec->vdwtype))
            {
                nTypePerturbed   = mdatoms->nTypePerturbed;
            }
        }
        if (cr->npmenodes > 0)
        {
            /* The PME only nodes need to know nChargePerturbed(FEP on Q) and nTypePerturbed(FEP on LJ)*/
            gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
            gmx_bcast_sim(sizeof(nTypePerturbed), &nTypePerturbed, cr);
        }

        if (cr->duty & DUTY_PME)
        {
            status = gmx_pme_init(pmedata, cr, npme_major, npme_minor, inputrec,
                                  mtop ? mtop->natoms : 0, nChargePerturbed, nTypePerturbed,
                                  (Flags & MD_REPRODUCIBLE), nthreads_pme);
            if (status != 0)
            {
                gmx_fatal(FARGS, "Error %d initializing PME", status);
            }
        }
    }


    if (integrator[inputrec->eI].func == do_md)
    {
        /* Turn on signal handling on all nodes */
        /*
         * (A user signal from the PME nodes (if any)
         * is communicated to the PP nodes.
         */
        signal_handler_install();
    }

    if (cr->duty & DUTY_PP)
    {
        /* Assumes uniform use of the number of OpenMP threads */
        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));

        if (inputrec->bPull)
        {
            /* Initialize pull code */
            inputrec->pull_work =
                init_pull(fplog, inputrec->pull, inputrec, nfile, fnm,
                          mtop, cr, oenv, inputrec->fepvals->init_lambda,
                          EI_DYNAMICS(inputrec->eI) && MASTER(cr), Flags);
        }

        if (inputrec->bRot)
        {
            /* Initialize enforced rotation code */
            init_rot(fplog, inputrec, nfile, fnm, cr, state->x, box, mtop, oenv,
                     bVerbose, Flags);
        }

        if (inputrec->eSwapCoords != eswapNO)
        {
            /* Initialize ion swapping code */
            init_swapcoords(fplog, bVerbose, inputrec, opt2fn_master("-swap", nfile, fnm, cr),
                            mtop, state->x, state->box, &state->swapstate, cr, oenv, Flags);
        }

        constr = init_constraints(fplog, mtop, inputrec, ed, state, cr);

        if (DOMAINDECOMP(cr))
        {
            GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP");
            dd_init_bondeds(fplog, cr->dd, mtop, vsite, inputrec,
                            Flags & MD_DDBONDCHECK, fr->cginfo_mb);

            set_dd_parameters(fplog, cr->dd, dlb_scale, inputrec, &ddbox);

            setup_dd_grid(fplog, cr->dd);
        }

        /* Now do whatever the user wants us to do (how flexible...) */
        integrator[inputrec->eI].func(fplog, cr, nfile, fnm,
                                      oenv, bVerbose, bCompact,
                                      nstglobalcomm,
                                      vsite, constr,
                                      nstepout, inputrec, mtop,
                                      fcd, state,
                                      mdatoms, nrnb, wcycle, ed, fr,
                                      repl_ex_nst, repl_ex_nex, repl_ex_seed,
                                      membed,
                                      cpt_period, max_hours,
                                      imdport,
                                      Flags,
                                      walltime_accounting);

        if (inputrec->bPull)
        {
            finish_pull(inputrec->pull_work);
        }

        if (inputrec->bRot)
        {
            finish_rot(inputrec->rot);
        }

    }
    else
    {
        GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
        /* do PME only */
        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
        gmx_pmeonly(*pmedata, cr, nrnb, wcycle, walltime_accounting, ewaldcoeff_q, ewaldcoeff_lj, inputrec);
    }

    wallcycle_stop(wcycle, ewcRUN);

    /* Finish up, write some stuff
     * if rerunMD, don't write last frame again
     */
    finish_run(fplog, cr,
               inputrec, nrnb, wcycle, walltime_accounting,
               fr ? fr->nbv : NULL,
               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));


    /* Free GPU memory and context */
    free_gpu_resources(fr, cr, &hwinfo->gpu_info, fr ? fr->gpu_opt : NULL);

    if (opt2bSet("-membed", nfile, fnm))
    {
        sfree(membed);
    }

    gmx_hardware_info_free(hwinfo);

    /* Does what it says */
    print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());
    walltime_accounting_destroy(walltime_accounting);

    /* PLUMED */
    if(plumedswitch){
      plumed_finalize(plumedmain);
    }
    /* END PLUMED */

    /* Close logfile already here if we were appending to it */
    if (MASTER(cr) && (Flags & MD_APPENDFILES))
    {
        gmx_log_close(fplog);
    }

    rc = (int)gmx_get_stop_condition();

    done_ed(&ed);

#ifdef GMX_THREAD_MPI
    /* we need to join all threads. The sub-threads join when they
       exit this function, but the master thread needs to be told to
       wait for that. */
    if (PAR(cr) && MASTER(cr))
    {
        tMPI_Finalize();
    }
#endif

    return rc;
}