/*! \brief * A convenience wrapper for launching either the GPU or CPU FFT. * * \param[in] pme The PME structure. * \param[in] gridIndex The grid index - should currently always be 0. * \param[in] dir The FFT direction enum. * \param[in] wcycle The wallclock counter. */ void inline parallel_3dfft_execute_gpu_wrapper(gmx_pme_t *pme, const int gridIndex, enum gmx_fft_direction dir, gmx_wallcycle_t wcycle) { GMX_ASSERT(gridIndex == 0, "Only single grid supported"); if (pme_gpu_performs_FFT(pme->gpu)) { wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); pme_gpu_3dfft(pme->gpu, dir, gridIndex); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); wallcycle_stop(wcycle, ewcLAUNCH_GPU); } else { wallcycle_start(wcycle, ewcPME_FFT_MIXED_MODE); #pragma omp parallel for num_threads(pme->nthread) schedule(static) for (int thread = 0; thread < pme->nthread; thread++) { gmx_parallel_3dfft_execute(pme->pfft_setup[gridIndex], dir, thread, wcycle); } wallcycle_stop(wcycle, ewcPME_FFT_MIXED_MODE); } }
void posres_wrapper_lambda(struct gmx_wallcycle *wcycle, const t_lambda *fepvals, const t_idef *idef, const struct t_pbc *pbc, const rvec x[], gmx_enerdata_t *enerd, real *lambda, t_forcerec *fr) { real v; int i; if (0 == idef->il[F_POSRES].nr) { return; } wallcycle_sub_start_nocount(wcycle, ewcsRESTRAINTS); for (i = 0; i < enerd->n_lambda; i++) { real dvdl_dum = 0, lambda_dum; lambda_dum = (i == 0 ? lambda[efptRESTRAINT] : fepvals->all_lambda[efptRESTRAINT][i-1]); v = posres(idef->il[F_POSRES].nr, idef->il[F_POSRES].iatoms, idef->iparams_posres, x, NULL, NULL, fr->ePBC == epbcNONE ? NULL : pbc, lambda_dum, &dvdl_dum, fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB); enerd->enerpart_lambda[i] += v; } wallcycle_sub_stop(wcycle, ewcsRESTRAINTS); }
void pme_gpu_launch_spread(gmx_pme_t *pme, const rvec *x, gmx_wallcycle *wcycle) { GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); PmeGpu *pmeGpu = pme->gpu; // The only spot of PME GPU where LAUNCH_GPU counter increases call-count wallcycle_start(wcycle, ewcLAUNCH_GPU); // The only spot of PME GPU where ewcsLAUNCH_GPU_PME subcounter increases call-count wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_PME); pme_gpu_copy_input_coordinates(pmeGpu, x); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); wallcycle_stop(wcycle, ewcLAUNCH_GPU); const unsigned int gridIndex = 0; real *fftgrid = pme->fftgrid[gridIndex]; if (pmeGpu->settings.currentFlags & GMX_PME_SPREAD) { /* Spread the coefficients on a grid */ const bool computeSplines = true; const bool spreadCharges = true; wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); pme_gpu_spread(pmeGpu, gridIndex, fftgrid, computeSplines, spreadCharges); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); wallcycle_stop(wcycle, ewcLAUNCH_GPU); } }
void pme_gpu_launch_complex_transforms(gmx_pme_t *pme, gmx_wallcycle *wcycle) { PmeGpu *pmeGpu = pme->gpu; const bool computeEnergyAndVirial = (pmeGpu->settings.currentFlags & GMX_PME_CALC_ENER_VIR) != 0; const bool performBackFFT = (pmeGpu->settings.currentFlags & (GMX_PME_CALC_F | GMX_PME_CALC_POT)) != 0; const unsigned int gridIndex = 0; t_complex *cfftgrid = pme->cfftgrid[gridIndex]; if (pmeGpu->settings.currentFlags & GMX_PME_SPREAD) { if (!pme_gpu_performs_FFT(pmeGpu)) { wallcycle_start(wcycle, ewcWAIT_GPU_PME_SPREAD); pme_gpu_sync_spread_grid(pme->gpu); wallcycle_stop(wcycle, ewcWAIT_GPU_PME_SPREAD); } } try { if (pmeGpu->settings.currentFlags & GMX_PME_SOLVE) { /* do R2C 3D-FFT */ parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_REAL_TO_COMPLEX, wcycle); /* solve in k-space for our local cells */ if (pme_gpu_performs_solve(pmeGpu)) { const auto gridOrdering = pme_gpu_uses_dd(pmeGpu) ? GridOrdering::YZX : GridOrdering::XYZ; wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); pme_gpu_solve(pmeGpu, cfftgrid, gridOrdering, computeEnergyAndVirial); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); wallcycle_stop(wcycle, ewcLAUNCH_GPU); } else { wallcycle_start(wcycle, ewcPME_SOLVE_MIXED_MODE); #pragma omp parallel for num_threads(pme->nthread) schedule(static) for (int thread = 0; thread < pme->nthread; thread++) { solve_pme_yzx(pme, cfftgrid, pme->boxVolume, computeEnergyAndVirial, pme->nthread, thread); } wallcycle_stop(wcycle, ewcPME_SOLVE_MIXED_MODE); } } if (performBackFFT) { parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_COMPLEX_TO_REAL, wcycle); } } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; }
void pme_gpu_reinit_computation(const gmx_pme_t *pme, gmx_wallcycle *wcycle) { GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); pme_gpu_clear_grids(pme->gpu); pme_gpu_clear_energy_virial(pme->gpu); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); wallcycle_stop(wcycle, ewcLAUNCH_GPU); }
void pme_gpu_launch_gather(const gmx_pme_t *pme, gmx_wallcycle gmx_unused *wcycle, PmeForceOutputHandling forceTreatment) { GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); if (!pme_gpu_performs_gather(pme->gpu)) { return; } wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); const unsigned int gridIndex = 0; real *fftgrid = pme->fftgrid[gridIndex]; pme_gpu_gather(pme->gpu, forceTreatment, reinterpret_cast<float *>(fftgrid)); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); wallcycle_stop(wcycle, ewcLAUNCH_GPU); }
void pme_gpu_prepare_computation(gmx_pme_t *pme, bool needToUpdateBox, const matrix box, gmx_wallcycle *wcycle, int flags) { GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); GMX_ASSERT(pme->nnodes > 0, ""); GMX_ASSERT(pme->nnodes == 1 || pme->ndecompdim > 0, ""); PmeGpu *pmeGpu = pme->gpu; pmeGpu->settings.currentFlags = flags; // TODO these flags are only here to honor the CPU PME code, and probably should be removed bool shouldUpdateBox = false; for (int i = 0; i < DIM; ++i) { for (int j = 0; j <= i; ++j) { shouldUpdateBox |= (pmeGpu->common->previousBox[i][j] != box[i][j]); pmeGpu->common->previousBox[i][j] = box[i][j]; } } if (needToUpdateBox || shouldUpdateBox) // || is to make the first computation always update { wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); pme_gpu_update_input_box(pmeGpu, box); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); wallcycle_stop(wcycle, ewcLAUNCH_GPU); if (!pme_gpu_performs_solve(pmeGpu)) { // TODO remove code duplication and add test coverage matrix scaledBox; pmeGpu->common->boxScaler->scaleBox(box, scaledBox); gmx::invertBoxMatrix(scaledBox, pme->recipbox); pme->boxVolume = scaledBox[XX][XX] * scaledBox[YY][YY] * scaledBox[ZZ][ZZ]; } } }