Example #1
0
void omp_times(FixOMP *fix, const char *label, enum Timer::ttype which,
                      const int nthreads,FILE *scr, FILE *log)
{
  const char fmt[] = "%-8s|%- 12.5g|%- 12.5g|%- 12.5g|%6.1f |%6.2f\n";
  double time_min, time_max, time_avg, time_total, time_std;

  time_min =  1.0e100;
  time_max = -1.0e100;
  time_total = time_avg = time_std = 0.0;

  for (int i=0; i < nthreads; ++i) {
    ThrData *thr = fix->get_thr(i);
    double tmp=thr->get_time(which);
    time_min = MIN(time_min,tmp);
    time_max = MAX(time_max,tmp);
    time_avg += tmp;
    time_std += tmp*tmp;
    time_total += thr->get_time(Timer::ALL);
  }

  time_avg /= nthreads;
  time_std /= nthreads;
  time_total /= nthreads;

  if (time_avg > 1.0e-10)
    time_std = sqrt(time_std/time_avg - time_avg)*100.0;
  else
    time_std = 0.0;

  if (scr) fprintf(scr,fmt,label,time_min,time_avg,time_max,time_std,
                   time_avg/time_total*100.0);
  if (log) fprintf(log,fmt,label,time_min,time_avg,time_max,time_std,
                   time_avg/time_total*100.0);
}
Example #2
0
void Finish::end(int flag)
{
  int i,m,nneigh,nneighfull;
  int histo[10];
  int minflag,prdflag,tadflag,timeflag,fftflag,histoflag,neighflag;
  double time,tmp,ave,max,min;
  double time_loop,time_other,cpu_loop;

  int me,nprocs;
  MPI_Comm_rank(world,&me);
  MPI_Comm_size(world,&nprocs);

  const int nthreads = comm->nthreads;

  // recompute natoms in case atoms have been lost

  bigint nblocal = atom->nlocal;
  MPI_Allreduce(&nblocal,&atom->natoms,1,MPI_LMP_BIGINT,MPI_SUM,world);

  // choose flavors of statistical output
  // flag determines caller
  // flag = 0 = just loop summary
  // flag = 1 = dynamics or minimization
  // flag = 2 = PRD
  // flag = 3 = TAD
  // turn off neighflag for Kspace partition of verlet/split integrator

  minflag = prdflag = tadflag = timeflag = fftflag = histoflag = neighflag = 0;
  time_loop = cpu_loop = time_other = 0.0;

  if (flag == 1) {
    if (update->whichflag == 2) minflag = 1;
    timeflag = histoflag = 1;
    neighflag = 1;
    if (update->whichflag == 1 &&
        strncmp(update->integrate_style,"verlet/split",12) == 0 &&
        universe->iworld == 1) neighflag = 0;
    if (force->kspace && force->kspace_match("pppm",0)
        && force->kspace->fftbench) fftflag = 1;
  }
  if (flag == 2) prdflag = timeflag = histoflag = neighflag = 1;
  if (flag == 3) tadflag = histoflag = neighflag = 1;

  // loop stats

  if (timer->has_loop()) {

    // overall loop time

    time_loop = timer->get_wall(Timer::TOTAL);
    cpu_loop = timer->get_cpu(Timer::TOTAL);
    MPI_Allreduce(&time_loop,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
    time_loop = tmp/nprocs;
    MPI_Allreduce(&cpu_loop,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
    cpu_loop = tmp/nprocs;
    if (time_loop > 0.0) cpu_loop = cpu_loop/time_loop*100.0;

    if (me == 0) {
      int ntasks = nprocs * nthreads;
      const char fmt1[] = "Loop time of %g on %d procs "
        "for %d steps with " BIGINT_FORMAT " atoms\n\n";
      if (screen) fprintf(screen,fmt1,time_loop,ntasks,update->nsteps,
                          atom->natoms);
      if (logfile) fprintf(logfile,fmt1,time_loop,ntasks,update->nsteps,
                           atom->natoms);

      // Gromacs/NAMD-style performance metric for suitable unit settings

      if ( timeflag && !minflag && !prdflag && !tadflag &&
           (update->nsteps > 0) && (update->dt != 0.0) &&
           ((strcmp(update->unit_style,"lj") == 0) ||
            (strcmp(update->unit_style,"metal") == 0) ||
            (strcmp(update->unit_style,"micro") == 0) ||
            (strcmp(update->unit_style,"nano") == 0) ||
            (strcmp(update->unit_style,"electron") == 0) ||
            (strcmp(update->unit_style,"real") == 0)) ) {
        double one_fs = force->femtosecond;
        double t_step = ((double) time_loop) / ((double) update->nsteps);
        double step_t = 1.0/t_step;

        if (strcmp(update->unit_style,"lj") == 0) {
          double tau_day = 24.0*3600.0 / t_step * update->dt / one_fs;
          const char perf[] = "Performance: %.3f tau/day, %.3f timesteps/s\n";
          if (screen) fprintf(screen,perf,tau_day,step_t);
          if (logfile) fprintf(logfile,perf,tau_day,step_t);
        } else {
          double hrs_ns = t_step / update->dt * 1000000.0 * one_fs / 3600.0;
          double ns_day = 24.0*3600.0 / t_step * update->dt / one_fs/1000000.0;
          const char perf[] =
            "Performance: %.3f ns/day, %.3f hours/ns, %.3f timesteps/s\n";
          if (screen) fprintf(screen,perf,ns_day,hrs_ns,step_t);
          if (logfile) fprintf(logfile,perf,ns_day,hrs_ns,step_t);
        }
      }

      // CPU use on MPI tasks and OpenMP threads

#ifdef LMP_USER_OMP
      const char fmt2[] =
        "%.1f%% CPU use with %d MPI tasks x %d OpenMP threads\n";
      if (screen) fprintf(screen,fmt2,cpu_loop,nprocs,nthreads);
      if (logfile) fprintf(logfile,fmt2,cpu_loop,nprocs,nthreads);
#else
      if (lmp->kokkos) {
        const char fmt2[] =
          "%.1f%% CPU use with %d MPI tasks x %d OpenMP threads\n";
        if (screen) fprintf(screen,fmt2,cpu_loop,nprocs,lmp->kokkos->num_threads);
        if (logfile) fprintf(logfile,fmt2,cpu_loop,nprocs,lmp->kokkos->num_threads);
      } else {
        const char fmt2[] =
          "%.1f%% CPU use with %d MPI tasks x no OpenMP threads\n";
        if (screen) fprintf(screen,fmt2,cpu_loop,nprocs);
        if (logfile) fprintf(logfile,fmt2,cpu_loop,nprocs);
      }
#endif

    }
  }

  // avoid division by zero for very short runs

  if (time_loop == 0.0) time_loop = 1.0;
  if (cpu_loop == 0.0) cpu_loop = 100.0;

  // get "Other" wall time for later use

  if (timer->has_normal())
    time_other = timer->get_wall(Timer::TOTAL) - timer->get_wall(Timer::ALL);

  // minimization stats

  if (minflag) {
    if (me == 0) {
      if (screen) fprintf(screen,"\n");
      if (logfile) fprintf(logfile,"\n");
    }

    if (me == 0) {
      if (screen) {
        fprintf(screen,"Minimization stats:\n");
        fprintf(screen,"  Stopping criterion = %s\n",
                update->minimize->stopstr);
        fprintf(screen,"  Energy initial, next-to-last, final = \n"
                "    %18.12g %18.12g %18.12g\n",
                update->minimize->einitial,update->minimize->eprevious,
                update->minimize->efinal);
        fprintf(screen,"  Force two-norm initial, final = %g %g\n",
                update->minimize->fnorm2_init,update->minimize->fnorm2_final);
        fprintf(screen,"  Force max component initial, final = %g %g\n",
                update->minimize->fnorminf_init,
                update->minimize->fnorminf_final);
        fprintf(screen,"  Final line search alpha, max atom move = %g %g\n",
                update->minimize->alpha_final,
                update->minimize->alpha_final*
                update->minimize->fnorminf_final);
        fprintf(screen,"  Iterations, force evaluations = %d %d\n",
                update->minimize->niter,update->minimize->neval);
      }
      if (logfile) {
        fprintf(logfile,"Minimization stats:\n");
        fprintf(logfile,"  Stopping criterion = %s\n",
                update->minimize->stopstr);
        fprintf(logfile,"  Energy initial, next-to-last, final = \n"
                "    %18.12g %18.12g %18.12g\n",
                update->minimize->einitial,update->minimize->eprevious,
                update->minimize->efinal);
        fprintf(logfile,"  Force two-norm initial, final = %g %g\n",
                update->minimize->fnorm2_init,update->minimize->fnorm2_final);
        fprintf(logfile,"  Force max component initial, final = %g %g\n",
                update->minimize->fnorminf_init,
                update->minimize->fnorminf_final);
        fprintf(logfile,"  Final line search alpha, max atom move = %g %g\n",
                update->minimize->alpha_final,
                update->minimize->alpha_final*
                update->minimize->fnorminf_final);
        fprintf(logfile,"  Iterations, force evaluations = %d %d\n",
                update->minimize->niter,update->minimize->neval);
      }
    }
  }

  // PRD stats using PAIR,BOND,KSPACE for dephase,dynamics,quench

  if (prdflag) {
    if (me == 0) {
      if (screen) fprintf(screen,"\n");
      if (logfile) fprintf(logfile,"\n");
    }

    if (screen) fprintf(screen,"PRD stats:\n");
    if (logfile) fprintf(logfile,"PRD stats:\n");

    time = timer->get_wall(Timer::DEPHASE);
    MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
    time = tmp/nprocs;
    if (me == 0) {
      if (screen)
        fprintf(screen,"  Dephase  time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
      if (logfile)
        fprintf(logfile,"  Dephase  time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
    }

    time = timer->get_wall(Timer::DYNAMICS);
    MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
    time = tmp/nprocs;
    if (me == 0) {
      if (screen)
        fprintf(screen,"  Dynamics time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
      if (logfile)
        fprintf(logfile,"  Dynamics time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
    }

    time = timer->get_wall(Timer::QUENCH);
    MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
    time = tmp/nprocs;
    if (me == 0) {
      if (screen)
        fprintf(screen,"  Quench   time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
      if (logfile)
        fprintf(logfile,"  Quench   time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
    }

      time = timer->get_wall(Timer::REPCOMM);
    MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
    time = tmp/nprocs;
    if (me == 0) {
      if (screen)
        fprintf(screen,"  Comm     time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
      if (logfile)
        fprintf(logfile,"  Comm     time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
    }


    time = timer->get_wall(Timer::REPOUT);
    MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
    time = tmp/nprocs;
    if (me == 0) {
      if (screen)
        fprintf(screen,"  Output   time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
      if (logfile)
        fprintf(logfile,"  Output   time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
    }

    time = time_other;
    MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
    time = tmp/nprocs;
    if (me == 0) { // XXXX: replica comm, replica output
      if (screen)
        fprintf(screen,"  Other    time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
      if (logfile)
        fprintf(logfile,"  Other    time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
    }
  }

  // TAD stats using PAIR,BOND,KSPACE for neb,dynamics,quench

  if (tadflag) {
    if (me == 0) {
      if (screen) fprintf(screen,"\n");
      if (logfile) fprintf(logfile,"\n");
    }

    if (screen) fprintf(screen,"TAD stats:\n");
    if (logfile) fprintf(logfile,"TAD stats:\n");

    time = timer->get_wall(Timer::NEB);
    MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
    time = tmp/nprocs;
    if (me == 0) {
      if (screen)
        fprintf(screen,"  NEB      time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
      if (logfile)
        fprintf(logfile,"  NEB      time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
    }

    time = timer->get_wall(Timer::DYNAMICS);
    MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
    time = tmp/nprocs;
    if (me == 0) {
      if (screen)
        fprintf(screen,"  Dynamics time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
      if (logfile)
        fprintf(logfile,"  Dynamics time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
    }

    time = timer->get_wall(Timer::QUENCH);
    MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
    time = tmp/nprocs;
    if (me == 0) {
      if (screen)
        fprintf(screen,"  Quench   time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
      if (logfile)
        fprintf(logfile,"  Quench   time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
    }


    time = timer->get_wall(Timer::REPCOMM);
    MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
    time = tmp/nprocs;
    if (me == 0) {
      if (screen)
        fprintf(screen,"  Comm     time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
      if (logfile)
        fprintf(logfile,"  Comm     time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
    }


    time = timer->get_wall(Timer::REPOUT);
    MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
    time = tmp/nprocs;
    if (me == 0) {
      if (screen)
        fprintf(screen,"  Output   time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
      if (logfile)
        fprintf(logfile,"  Output   time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
    }

    time = time_other;
    MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
    time = tmp/nprocs;
    if (me == 0) {
      if (screen)
        fprintf(screen,"  Other    time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
      if (logfile)
        fprintf(logfile,"  Other    time (%%) = %g (%g)\n",
                time,time/time_loop*100.0);
    }
  }

  if (timeflag && timer->has_normal()) {

    if (timer->has_full()) {
      const char hdr[] = "\nMPI task timing breakdown:\n"
        "Section |  min time  |  avg time  |  max time  |%varavg|  %CPU | %total\n"
        "-----------------------------------------------------------------------\n";
      if (me == 0) {
        if (screen)  fputs(hdr,screen);
        if (logfile) fputs(hdr,logfile);
      }
    } else {
      const char hdr[] = "\nMPI task timing breakdown:\n"
        "Section |  min time  |  avg time  |  max time  |%varavg| %total\n"
        "---------------------------------------------------------------\n";
      if (me == 0) {
        if (screen)  fputs(hdr,screen);
        if (logfile) fputs(hdr,logfile);
      }
    }

    mpi_timings("Pair",timer,Timer::PAIR, world,nprocs,
                nthreads,me,time_loop,screen,logfile);

    if (atom->molecular)
      mpi_timings("Bond",timer,Timer::BOND,world,nprocs,
                  nthreads,me,time_loop,screen,logfile);

    if (force->kspace)
      mpi_timings("Kspace",timer,Timer::KSPACE,world,nprocs,
                  nthreads,me,time_loop,screen,logfile);

    mpi_timings("Neigh",timer,Timer::NEIGH,world,nprocs,
                nthreads,me,time_loop,screen,logfile);
    mpi_timings("Comm",timer,Timer::COMM,world,nprocs,
                nthreads,me,time_loop,screen,logfile);
    mpi_timings("Output",timer,Timer::OUTPUT,world,nprocs,
                nthreads,me,time_loop,screen,logfile);
    mpi_timings("Modify",timer,Timer::MODIFY,world,nprocs,
                nthreads,me,time_loop,screen,logfile);
    if (timer->has_sync())
      mpi_timings("Sync",timer,Timer::SYNC,world,nprocs,
                  nthreads,me,time_loop,screen,logfile);

    time = time_other;
    MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
    time = tmp/nprocs;

    const char *fmt;
    if (timer->has_full())
      fmt = "Other   |            |%- 12.4g|            |       |       |%6.2f\n";
    else
      fmt = "Other   |            |%- 12.4g|            |       |%6.2f\n";

    if (me == 0) {
      if (screen) fprintf(screen,fmt,time,time/time_loop*100.0);
      if (logfile) fprintf(logfile,fmt,time,time/time_loop*100.0);
    }
  }

#ifdef LMP_USER_OMP
  const char thr_hdr_fmt[] =
    "\nThread timing breakdown (MPI rank %d):\nTotal threaded time %.4g / %.1f%%\n";
  const char thr_header[] =
    "Section |  min time  |  avg time  |  max time  |%varavg| %total\n"
    "---------------------------------------------------------------\n";

  int ifix = modify->find_fix("package_omp");

  // print thread breakdown only with full timer detail

  if ((ifix >= 0) && timer->has_full() && me == 0) {
    double thr_total = 0.0;
    ThrData *td;
    FixOMP *fixomp = static_cast<FixOMP *>(lmp->modify->fix[ifix]);
    for (i=0; i < nthreads; ++i) {
      td = fixomp->get_thr(i);
      thr_total += td->get_time(Timer::ALL);
    }
    thr_total /= (double) nthreads;

    if (thr_total > 0.0) {
      if (screen) {
        fprintf(screen,thr_hdr_fmt,me,thr_total,thr_total/time_loop*100.0);
        fputs(thr_header,screen);
      }
      if (logfile) {
        fprintf(logfile,thr_hdr_fmt,me,thr_total,thr_total/time_loop*100.0);
        fputs(thr_header,logfile);
      }

      omp_times(fixomp,"Pair",Timer::PAIR,nthreads,screen,logfile);

      if (atom->molecular)
        omp_times(fixomp,"Bond",Timer::BOND,nthreads,screen,logfile);

      if (force->kspace)
        omp_times(fixomp,"Kspace",Timer::KSPACE,nthreads,screen,logfile);

      omp_times(fixomp,"Neigh",Timer::NEIGH,nthreads,screen,logfile);
      omp_times(fixomp,"Reduce",Timer::COMM,nthreads,screen,logfile);
    }
  }
#endif

  if (lmp->kokkos && lmp->kokkos->ngpu > 0)
    if (const char* env_clb = getenv("CUDA_LAUNCH_BLOCKING"))
      if (!(strcmp(env_clb,"1") == 0)) {
        error->warning(FLERR,"Timing breakdown may not be accurate since GPU/CPU overlap is enabled. "
          "Using 'export CUDA_LAUNCH_BLOCKING=1' will give an accurate timing breakdown but will reduce performance");
      }

  // FFT timing statistics
  // time3d,time1d = total time during run for 3d and 1d FFTs
  // loop on timing() until nsample FFTs require at least 1.0 CPU sec
  // time_kspace may be 0.0 if another partition is doing Kspace

  if (fftflag) {
    if (me == 0) {
      if (screen) fprintf(screen,"\n");
      if (logfile) fprintf(logfile,"\n");
    }

    int nsteps = update->nsteps;

    double time3d;
    int nsample = 1;
    int nfft = force->kspace->timing_3d(nsample,time3d);
    while (time3d < 1.0) {
      nsample *= 2;
      nfft = force->kspace->timing_3d(nsample,time3d);
    }

    time3d = nsteps * time3d / nsample;
    MPI_Allreduce(&time3d,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
    time3d = tmp/nprocs;

    double time1d;
    nsample = 1;
    nfft = force->kspace->timing_1d(nsample,time1d);
    while (time1d < 1.0) {
      nsample *= 2;
      nfft = force->kspace->timing_1d(nsample,time1d);
    }

    time1d = nsteps * time1d / nsample;
    MPI_Allreduce(&time1d,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
    time1d = tmp/nprocs;

    double time_kspace = timer->get_wall(Timer::KSPACE);
    MPI_Allreduce(&time_kspace,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
    time_kspace = tmp/nprocs;

    double ntotal = 1.0 * force->kspace->nx_pppm *
      force->kspace->ny_pppm * force->kspace->nz_pppm;
    double nflops = 5.0 * ntotal * log(ntotal);

    double fraction,flop3,flop1;
    if (nsteps) {
      if (time_kspace) fraction = time3d/time_kspace*100.0;
      else fraction = 0.0;
      flop3 = nfft*nflops/1.0e9/(time3d/nsteps);
      flop1 = nfft*nflops/1.0e9/(time1d/nsteps);
    } else fraction = flop3 = flop1 = 0.0;

    if (me == 0) {
      if (screen) {
        fprintf(screen,"FFT time (%% of Kspce) = %g (%g)\n",time3d,fraction);
        fprintf(screen,"FFT Gflps 3d (1d only) = %g %g\n",flop3,flop1);
      }
      if (logfile) {
        fprintf(logfile,"FFT time (%% of Kspce) = %g (%g)\n",time3d,fraction);
        fprintf(logfile,"FFT Gflps 3d (1d only) = %g %g\n",flop3,flop1);
      }
    }
  }

  if (histoflag) {
    if (me == 0) {
      if (screen) fprintf(screen,"\n");
      if (logfile) fprintf(logfile,"\n");
    }

    tmp = atom->nlocal;
    stats(1,&tmp,&ave,&max,&min,10,histo);
    if (me == 0) {
      if (screen) {
        fprintf(screen,"Nlocal:    %g ave %g max %g min\n",ave,max,min);
        fprintf(screen,"Histogram:");
        for (i = 0; i < 10; i++) fprintf(screen," %d",histo[i]);
        fprintf(screen,"\n");
      }
      if (logfile) {
        fprintf(logfile,"Nlocal:    %g ave %g max %g min\n",ave,max,min);
        fprintf(logfile,"Histogram:");
        for (i = 0; i < 10; i++) fprintf(logfile," %d",histo[i]);
        fprintf(logfile,"\n");
      }
    }

    tmp = atom->nghost;
    stats(1,&tmp,&ave,&max,&min,10,histo);
    if (me == 0) {
      if (screen) {
        fprintf(screen,"Nghost:    %g ave %g max %g min\n",ave,max,min);
        fprintf(screen,"Histogram:");
        for (i = 0; i < 10; i++) fprintf(screen," %d",histo[i]);
        fprintf(screen,"\n");
      }
      if (logfile) {
        fprintf(logfile,"Nghost:    %g ave %g max %g min\n",ave,max,min);
        fprintf(logfile,"Histogram:");
        for (i = 0; i < 10; i++) fprintf(logfile," %d",histo[i]);
        fprintf(logfile,"\n");
      }
    }

    // find a non-skip neighbor list containing half pairwise interactions
    // count neighbors in that list for stats purposes
    // allow it to be Kokkos neigh list as well

    for (m = 0; m < neighbor->old_nrequest; m++) {
      if ((neighbor->old_requests[m]->half ||
           neighbor->old_requests[m]->gran ||
           neighbor->old_requests[m]->respaouter ||
           neighbor->old_requests[m]->half_from_full) &&
          neighbor->old_requests[m]->skip == 0 &&
          neighbor->lists[m] && neighbor->lists[m]->numneigh) {
        if (!neighbor->lists[m] && lmp->kokkos &&
            lmp->kokkos->neigh_list_kokkos(m)) break;
        else break;
      }
    }

    nneigh = 0;
    if (m < neighbor->old_nrequest) {
      if (neighbor->lists[m]) {
        int inum = neighbor->lists[m]->inum;
        int *ilist = neighbor->lists[m]->ilist;
        int *numneigh = neighbor->lists[m]->numneigh;
        for (i = 0; i < inum; i++)
          nneigh += numneigh[ilist[i]];
      } else if (lmp->kokkos) nneigh = lmp->kokkos->neigh_count(m);
    }

    tmp = nneigh;
    stats(1,&tmp,&ave,&max,&min,10,histo);
    if (me == 0) {
      if (screen) {
        fprintf(screen,"Neighs:    %g ave %g max %g min\n",ave,max,min);
        fprintf(screen,"Histogram:");
        for (i = 0; i < 10; i++) fprintf(screen," %d",histo[i]);
        fprintf(screen,"\n");
      }
      if (logfile) {
        fprintf(logfile,"Neighs:    %g ave %g max %g min\n",ave,max,min);
        fprintf(logfile,"Histogram:");
        for (i = 0; i < 10; i++) fprintf(logfile," %d",histo[i]);
        fprintf(logfile,"\n");
      }
    }

    // find a non-skip neighbor list containing full pairwise interactions
    // count neighbors in that list for stats purposes
    // allow it to be Kokkos neigh list as well

    for (m = 0; m < neighbor->old_nrequest; m++) {
      if (neighbor->old_requests[m]->full &&
          neighbor->old_requests[m]->skip == 0) {
        if (lmp->kokkos && lmp->kokkos->neigh_list_kokkos(m)) break;
        else break;
      }
    }

    nneighfull = 0;
    if (m < neighbor->old_nrequest) {
      if (neighbor->lists[m] && neighbor->lists[m]->numneigh) {
        int inum = neighbor->lists[m]->inum;
        int *ilist = neighbor->lists[m]->ilist;
        int *numneigh = neighbor->lists[m]->numneigh;
        for (i = 0; i < inum; i++)
          nneighfull += numneigh[ilist[i]];
      } else if (!neighbor->lists[m] && lmp->kokkos)
          nneighfull = lmp->kokkos->neigh_count(m);

      tmp = nneighfull;
      stats(1,&tmp,&ave,&max,&min,10,histo);
      if (me == 0) {
        if (screen) {
          fprintf(screen,"FullNghs:  %g ave %g max %g min\n",ave,max,min);
          fprintf(screen,"Histogram:");
          for (i = 0; i < 10; i++) fprintf(screen," %d",histo[i]);
          fprintf(screen,"\n");
        }
        if (logfile) {
          fprintf(logfile,"FullNghs:  %g ave %g max %g min\n",ave,max,min);
          fprintf(logfile,"Histogram:");
          for (i = 0; i < 10; i++) fprintf(logfile," %d",histo[i]);
          fprintf(logfile,"\n");
        }
      }
    }
  }

  if (neighflag) {
    if (me == 0) {
      if (screen) fprintf(screen,"\n");
      if (logfile) fprintf(logfile,"\n");
    }

    tmp = MAX(nneigh,nneighfull);
    double nall;
    MPI_Allreduce(&tmp,&nall,1,MPI_DOUBLE,MPI_SUM,world);

    int nspec;
    double nspec_all = 0;
    if (atom->molecular == 1) {
      int **nspecial = atom->nspecial;
      int nlocal = atom->nlocal;
      nspec = 0;
      for (i = 0; i < nlocal; i++) nspec += nspecial[i][2];
      tmp = nspec;
      MPI_Allreduce(&tmp,&nspec_all,1,MPI_DOUBLE,MPI_SUM,world);
    } else if (atom->molecular == 2) {
      Molecule **onemols = atom->avec->onemols;
      int *molindex = atom->molindex;
      int *molatom = atom->molatom;
      int nlocal = atom->nlocal;
      int imol,iatom;
      nspec = 0;
      for (i = 0; i < nlocal; i++) {
        if (molindex[i] < 0) continue;
        imol = molindex[i];
        iatom = molatom[i];
        nspec += onemols[imol]->nspecial[iatom][2];
      }
      tmp = nspec;
      MPI_Allreduce(&tmp,&nspec_all,1,MPI_DOUBLE,MPI_SUM,world);
    }

    if (me == 0) {
      if (screen) {
        if (nall < 2.0e9)
          fprintf(screen,
                  "Total # of neighbors = %d\n",static_cast<int> (nall));
        else fprintf(screen,"Total # of neighbors = %g\n",nall);
        if (atom->natoms > 0)
          fprintf(screen,"Ave neighs/atom = %g\n",nall/atom->natoms);
        if (atom->molecular && atom->natoms > 0)
          fprintf(screen,"Ave special neighs/atom = %g\n",
                  nspec_all/atom->natoms);
        fprintf(screen,"Neighbor list builds = " BIGINT_FORMAT "\n",
                neighbor->ncalls);
        if (neighbor->dist_check)
          fprintf(screen,"Dangerous builds = " BIGINT_FORMAT "\n",
                  neighbor->ndanger);
        else fprintf(screen,"Dangerous builds not checked\n");
      }
      if (logfile) {
        if (nall < 2.0e9)
          fprintf(logfile,
                  "Total # of neighbors = %d\n",static_cast<int> (nall));
        else fprintf(logfile,"Total # of neighbors = %g\n",nall);
        if (atom->natoms > 0)
          fprintf(logfile,"Ave neighs/atom = %g\n",nall/atom->natoms);
        if (atom->molecular && atom->natoms > 0)
          fprintf(logfile,"Ave special neighs/atom = %g\n",
                  nspec_all/atom->natoms);
        fprintf(logfile,"Neighbor list builds = " BIGINT_FORMAT "\n",
                neighbor->ncalls);
        if (neighbor->dist_check)
          fprintf(logfile,"Dangerous builds = " BIGINT_FORMAT "\n",
                  neighbor->ndanger);
        else fprintf(logfile,"Dangerous builds not checked\n");
      }
    }
  }

  if (logfile) fflush(logfile);
}