//! Construct a reduction mask for which parts (blocks) of the force array are touched on which thread task
static void calc_bonded_reduction_mask(int natoms,
                                       f_thread_t *f_thread,
                                       const t_idef *idef,
                                       int thread, int nthread)
{
    static_assert(BITMASK_SIZE == GMX_OPENMP_MAX_THREADS, "For the error message below we assume these two are equal.");

    if (nthread > BITMASK_SIZE)
    {
#pragma omp master
        gmx_fatal(FARGS, "You are using %d OpenMP threads, which is larger than GMX_OPENMP_MAX_THREADS (%d). Decrease the number of OpenMP threads or rebuild GROMACS with a larger value for GMX_OPENMP_MAX_THREADS.",
                  nthread, GMX_OPENMP_MAX_THREADS);
#pragma omp barrier
    }
    GMX_ASSERT(nthread <= BITMASK_SIZE, "We need at least nthread bits in the mask");

    int nblock = (natoms + reduction_block_size - 1) >> reduction_block_bits;

    if (nblock > f_thread->block_nalloc)
    {
        f_thread->block_nalloc = over_alloc_large(nblock);
        srenew(f_thread->mask,        f_thread->block_nalloc);
        srenew(f_thread->block_index, f_thread->block_nalloc);
        sfree_aligned(f_thread->f);
        snew_aligned(f_thread->f, f_thread->block_nalloc*reduction_block_size, 128);
    }

    gmx_bitmask_t *mask = f_thread->mask;

    for (int b = 0; b < nblock; b++)
    {
        bitmask_clear(&mask[b]);
    }

    for (int ftype = 0; ftype < F_NRE; ftype++)
    {
        if (ftype_is_bonded_potential(ftype))
        {
            int nb = idef->il[ftype].nr;
            if (nb > 0)
            {
                int nat1 = interaction_function[ftype].nratoms + 1;

                int nb0 = idef->il_thread_division[ftype*(nthread + 1) + thread];
                int nb1 = idef->il_thread_division[ftype*(nthread + 1) + thread + 1];

                for (int i = nb0; i < nb1; i += nat1)
                {
                    for (int a = 1; a < nat1; a++)
                    {
                        bitmask_set_bit(&mask[idef->il[ftype].iatoms[i+a] >> reduction_block_bits], thread);
                    }
                }
            }
        }
    }
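/* The function above divides the force array into blocks of reduction_block_size
 * atoms (with reduction_block_size == 1 << reduction_block_bits) and, per block,
 * records in a bitmask which threads write to it, so a later reduction only has
 * to visit blocks whose bit is set. The sketch below is illustrative only and is
 * not GROMACS API: it uses a plain uint64_t per block instead of gmx_bitmask_t,
 * and the names SKETCH_BLOCK_BITS, markAtom and reduceTouchedBlocks are made up.
 */
#include <stdint.h>
#include <stddef.h>

/* stand-ins for reduction_block_bits / reduction_block_size */
#define SKETCH_BLOCK_BITS 5
#define SKETCH_BLOCK_SIZE (1 << SKETCH_BLOCK_BITS)

/* Record that 'thread' touches the block containing 'atom'. */
static void markAtom(uint64_t *mask, int atom, int thread)
{
    mask[atom >> SKETCH_BLOCK_BITS] |= ((uint64_t)1 << thread);
}

/* A consumer only needs to reduce blocks whose bit is set for this thread. */
static void reduceTouchedBlocks(const uint64_t *mask, size_t nblock, int thread)
{
    for (size_t b = 0; b < nblock; b++)
    {
        if (mask[b] & ((uint64_t)1 << thread))
        {
            /* reduce force elements [b*SKETCH_BLOCK_SIZE, (b+1)*SKETCH_BLOCK_SIZE) here */
        }
    }
}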
static void realloc_work(struct pme_solve_work_t *work, int nkx)
{
    if (nkx > work->nalloc)
    {
        int simd_width, i;

        work->nalloc = nkx;
        srenew(work->mhx, work->nalloc);
        srenew(work->mhy, work->nalloc);
        srenew(work->mhz, work->nalloc);
        srenew(work->m2, work->nalloc);
        /* Allocate an aligned pointer for SIMD operations, including extra
         * elements at the end for padding.
         */
#ifdef PME_SIMD_SOLVE
        simd_width = GMX_SIMD_REAL_WIDTH;
#else
        /* We can use any alignment, apart from 0, so we use 4 */
        simd_width = 4;
#endif
        sfree_aligned(work->denom);
        sfree_aligned(work->tmp1);
        sfree_aligned(work->tmp2);
        sfree_aligned(work->eterm);
        snew_aligned(work->denom, work->nalloc+simd_width, simd_width*sizeof(real));
        snew_aligned(work->tmp1,  work->nalloc+simd_width, simd_width*sizeof(real));
        snew_aligned(work->tmp2,  work->nalloc+simd_width, simd_width*sizeof(real));
        snew_aligned(work->eterm, work->nalloc+simd_width, simd_width*sizeof(real));
        srenew(work->m2inv, work->nalloc);

        /* Init all allocated elements of denom to 1 to avoid 1/0 exceptions
         * of simd padded elements.
         */
        for (i = 0; i < work->nalloc+simd_width; i++)
        {
            work->denom[i] = 1;
        }
    }
}
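/* The pattern above gives each SIMD work array simd_width elements of padding
 * and seeds the divisor array with 1, so a full-width vector division over the
 * padded tail never divides by zero. The sketch below shows the same idea with
 * plain C11 aligned_alloc instead of snew_aligned; it is illustrative only, and
 * simdWidth stands in for GMX_SIMD_REAL_WIDTH.
 */
#include <stdlib.h>

static double *alloc_padded_divisor(size_t n, size_t simdWidth)
{
    /* Round up so there is at least one full vector of padding and the byte
     * count is a multiple of the alignment, as aligned_alloc requires. */
    size_t  nAlloc = (n/simdWidth + 2)*simdWidth;
    double *p      = aligned_alloc(simdWidth*sizeof(double), nAlloc*sizeof(double));

    if (p != NULL)
    {
        for (size_t i = 0; i < nAlloc; i++)
        {
            p[i] = 1.0; /* safe divisor for unused, padded SIMD lanes */
        }
    }
    return p;
}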
/*! \brief Allocates nbytes of host memory. Use ocl_free to free memory allocated with this function.
 *
 *  \todo
 *  This function should allocate page-locked memory to help reduce D2H and H2D
 *  transfer times, similar to pmalloc from pmalloc_cuda.cu.
 *
 * \param[in,out]    h_ptr   Pointer where to store the address of the newly allocated buffer.
 * \param[in]        nbytes  Size in bytes of the buffer to be allocated.
 */
void ocl_pmalloc(void **h_ptr, size_t nbytes)
{
    /* Need a temporary type whose size is 1 byte, so that the
     * implementation of snew_aligned can cope without issuing
     * warnings.
     */
    char **temporary = reinterpret_cast<char **>(h_ptr);

    /* 16-byte alignment is required by the neighbour-searching code,
     * because it uses four-wide SIMD for bounding-box calculation.
     * However, when we organize using page-locked memory for
     * device-host transfers, it will probably need to be aligned to a
     * 4kb page, like CUDA does.
     */
    snew_aligned(*temporary, nbytes, 16);
}
static void realloc_splinevec(splinevec th, real **ptr_z, int nalloc)
{
    const int padding = 4;
    int       i;

    srenew(th[XX], nalloc);
    srenew(th[YY], nalloc);
    /* In z we add padding, this is only required for the aligned SIMD code */
    sfree_aligned(*ptr_z);
    snew_aligned(*ptr_z, nalloc+2*padding, SIMD4_ALIGNMENT);
    th[ZZ] = *ptr_z + padding;

    for (i = 0; i < padding; i++)
    {
        (*ptr_z)[              i] = 0;
        (*ptr_z)[padding+nalloc+i] = 0;
    }
}
void pmegrid_init(pmegrid_t *grid,
                  int cx, int cy, int cz,
                  int x0, int y0, int z0,
                  int x1, int y1, int z1,
                  gmx_bool set_alignment,
                  int pme_order,
                  real *ptr)
{
    int nz, gridsize;

    grid->ci[XX]     = cx;
    grid->ci[YY]     = cy;
    grid->ci[ZZ]     = cz;
    grid->offset[XX] = x0;
    grid->offset[YY] = y0;
    grid->offset[ZZ] = z0;
    grid->n[XX]      = x1 - x0 + pme_order - 1;
    grid->n[YY]      = y1 - y0 + pme_order - 1;
    grid->n[ZZ]      = z1 - z0 + pme_order - 1;
    copy_ivec(grid->n, grid->s);

    nz = grid->s[ZZ];
    set_grid_alignment(&nz, pme_order);
    if (set_alignment)
    {
        grid->s[ZZ] = nz;
    }
    else if (nz != grid->s[ZZ])
    {
        gmx_incons("pmegrid_init call with an unaligned z size");
    }

    grid->order = pme_order;
    if (ptr == NULL)
    {
        gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
        set_gridsize_alignment(&gridsize, pme_order);
        snew_aligned(grid->grid, gridsize, SIMD4_ALIGNMENT);
    }
    else
    {
        grid->grid = ptr;
    }
}
struct pme_spline_work *make_pme_spline_work(int gmx_unused order)
{
    struct pme_spline_work *work;

#ifdef PME_SIMD4_SPREAD_GATHER
    real             tmp[GMX_SIMD4_WIDTH*3], *tmp_aligned;
    gmx_simd4_real_t zero_S;
    gmx_simd4_real_t real_mask_S0, real_mask_S1;
    int              of, i;

    snew_aligned(work, 1, SIMD4_ALIGNMENT);

    tmp_aligned = gmx_simd4_align_r(tmp);

    zero_S = gmx_simd4_setzero_r();

    /* Generate bit masks to mask out the unused grid entries,
     * as we only operate on order of the 8 grid entries that are
     * loaded into 2 SIMD registers.
     */
    for (of = 0; of < 2*GMX_SIMD4_WIDTH-(order-1); of++)
    {
        for (i = 0; i < 2*GMX_SIMD4_WIDTH; i++)
        {
            tmp_aligned[i] = (i >= of && i < of+order ? -1.0 : 1.0);
        }
        real_mask_S0      = gmx_simd4_load_r(tmp_aligned);
        real_mask_S1      = gmx_simd4_load_r(tmp_aligned+GMX_SIMD4_WIDTH);
        work->mask_S0[of] = gmx_simd4_cmplt_r(real_mask_S0, zero_S);
        work->mask_S1[of] = gmx_simd4_cmplt_r(real_mask_S1, zero_S);
    }
#else
    work = NULL;
#endif

    return work;
}
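/* Each mask above selects 'order' consecutive lanes out of the 2*GMX_SIMD4_WIDTH
 * (here 8) grid entries held in two SIMD4 registers, starting at offset 'of':
 * entries set to -1.0 compare as less-than-zero and become active lanes. The
 * sketch below is illustrative only and just prints the lane pattern that the
 * (i >= of && i < of+order) condition produces.
 */
#include <stdio.h>

static void print_spline_masks(int order)
{
    const int width = 4; /* stand-in for GMX_SIMD4_WIDTH */
    int       of, i;

    for (of = 0; of < 2*width - (order - 1); of++)
    {
        printf("offset %d: ", of);
        for (i = 0; i < 2*width; i++)
        {
            putchar(i >= of && i < of + order ? '1' : '0');
        }
        putchar('\n');
    }
}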
void pmegrids_init(pmegrids_t *grids,
                   int nx, int ny, int nz, int nz_base,
                   int pme_order,
                   gmx_bool bUseThreads,
                   int nthread,
                   int overlap_x,
                   int overlap_y)
{
    ivec n, n_base;
    int  t, x, y, z, d, i, tfac;
    int  max_comm_lines = -1;

    n[XX] = nx - (pme_order - 1);
    n[YY] = ny - (pme_order - 1);
    n[ZZ] = nz - (pme_order - 1);

    copy_ivec(n, n_base);
    n_base[ZZ] = nz_base;

    pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX], n[YY], n[ZZ], FALSE, pme_order,
                 NULL);

    grids->nthread = nthread;

    make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);

    if (bUseThreads)
    {
        ivec nst;
        int  gridsize;

        for (d = 0; d < DIM; d++)
        {
            nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1;
        }
        set_grid_alignment(&nst[ZZ], pme_order);

        if (debug)
        {
            fprintf(debug, "pmegrid thread local division: %d x %d x %d\n",
                    grids->nc[XX], grids->nc[YY], grids->nc[ZZ]);
            fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
                    nx, ny, nz,
                    nst[XX], nst[YY], nst[ZZ]);
        }

        snew(grids->grid_th, grids->nthread);
        t        = 0;
        gridsize = nst[XX]*nst[YY]*nst[ZZ];
        set_gridsize_alignment(&gridsize, pme_order);
        snew_aligned(grids->grid_all,
                     grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
                     SIMD4_ALIGNMENT);

        for (x = 0; x < grids->nc[XX]; x++)
        {
            for (y = 0; y < grids->nc[YY]; y++)
            {
                for (z = 0; z < grids->nc[ZZ]; z++)
                {
                    pmegrid_init(&grids->grid_th[t],
                                 x, y, z,
                                 (n[XX]*(x  ))/grids->nc[XX],
                                 (n[YY]*(y  ))/grids->nc[YY],
                                 (n[ZZ]*(z  ))/grids->nc[ZZ],
                                 (n[XX]*(x+1))/grids->nc[XX],
                                 (n[YY]*(y+1))/grids->nc[YY],
                                 (n[ZZ]*(z+1))/grids->nc[ZZ],
                                 TRUE,
                                 pme_order,
                                 grids->grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
                    t++;
                }
            }
        }
    }
    else
    {
        grids->grid_th = NULL;
    }

    snew(grids->g2t, DIM);
    tfac = 1;
    for (d = DIM-1; d >= 0; d--)
    {
        snew(grids->g2t[d], n[d]);
        t = 0;
        for (i = 0; i < n[d]; i++)
        {
            /* The second check should match the parameters
             * of the pmegrid_init call above.
             */
            while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
            {
                t++;
            }
            grids->g2t[d][i] = t*tfac;
        }
        tfac *= grids->nc[d];

        switch (d)
        {
            case XX: max_comm_lines = overlap_x;     break;
            case YY: max_comm_lines = overlap_y;     break;
            case ZZ: max_comm_lines = pme_order - 1; break;
        }
        grids->nthread_comm[d] = 0;
        while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines &&
               grids->nthread_comm[d] < grids->nc[d])
        {
            grids->nthread_comm[d]++;
        }
        if (debug != NULL)
        {
            fprintf(debug, "pmegrid thread grid communication range in %c: %d\n",
                    'x'+d, grids->nthread_comm[d]);
        }
        /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
         * work, but this is not a problematic restriction.
         */
        if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
        {
            gmx_fatal(FARGS, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME",
                      grids->nthread);
        }
    }
}
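/* The g2t map above assigns grid line i along dimension d to the thread whose
 * subgrid [ (n*t)/nc, (n*(t+1))/nc ) contains it, encoded as t*tfac so that the
 * three dimensions compose into a single thread index. The helper below is
 * illustrative only (not GROMACS API) and mirrors the while-loop used to fill g2t.
 */
static int grid_line_owner(int i, int n, int nc)
{
    int t = 0;

    while (t + 1 < nc && i >= (n*(t + 1))/nc)
    {
        t++;
    }
    return t;
}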
/* NxMxK the size of the data
 * comm communicator to use for fft5d
 * P0 number of processors in the 1st axis (can be null for automatic)
 * lin is allocated by fft5d because size of array is only known after planning phase
 * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
 */
fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags,
                         t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3,
                         int nthreads)
{
    int        P[2], bMaster, prank[2], i, t;
    int        rNG, rMG, rKG;
    int       *N0 = 0, *N1 = 0, *M0 = 0, *M1 = 0, *K0 = 0, *K1 = 0, *oN0 = 0, *oN1 = 0, *oM0 = 0, *oM1 = 0, *oK0 = 0, *oK1 = 0;
    int        N[3], M[3], K[3], pN[3], pM[3], pK[3], oM[3], oK[3], *iNin[3] = {0}, *oNin[3] = {0}, *iNout[3] = {0}, *oNout[3] = {0};
    int        C[3], rC[3], nP[2];
    int        lsize;
    t_complex *lin = 0, *lout = 0, *lout2 = 0, *lout3 = 0;
    fft5d_plan plan;
    int        s;

    /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
#ifdef GMX_MPI
    if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
    {
        MPI_Comm_size(comm[0], &P[0]);
        MPI_Comm_rank(comm[0], &prank[0]);
    }
    else
#endif
    {
        P[0]     = 1;
        prank[0] = 0;
    }
#ifdef GMX_MPI
    if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
    {
        MPI_Comm_size(comm[1], &P[1]);
        MPI_Comm_rank(comm[1], &prank[1]);
    }
    else
#endif
    {
        P[1]     = 1;
        prank[1] = 0;
    }

    bMaster = (prank[0] == 0 && prank[1] == 0);

    if (debug)
    {
        fprintf(debug, "FFT5D: Using %dx%d processor grid, rank %d,%d\n",
                P[0], P[1], prank[0], prank[1]);
    }

    if (bMaster)
    {
        if (debug)
        {
            fprintf(debug, "FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
                    NG, MG, KG, P[0], P[1], (flags&FFT5D_REALCOMPLEX) > 0, (flags&FFT5D_BACKWARD) > 0, (flags&FFT5D_ORDER_YZ) > 0, (flags&FFT5D_DEBUG) > 0);
        }
        /* The check below is not correct, one prime factor 11 or 13 is ok.
           if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7)
           {
               printf("WARNING: FFT very slow with prime factors larger 7\n");
               printf("Change FFT size or in case you cannot change it look at\n");
               printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
           }
         */
    }

    if (NG == 0 || MG == 0 || KG == 0)
    {
        if (bMaster)
        {
            printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
        }
        return 0;
    }

    rNG = NG;
    rMG = MG;
    rKG = KG;

    if (flags&FFT5D_REALCOMPLEX)
    {
        if (!(flags&FFT5D_BACKWARD))
        {
            NG = NG/2+1;
        }
        else
        {
            if (!(flags&FFT5D_ORDER_YZ))
            {
                MG = MG/2+1;
            }
            else
            {
                KG = KG/2+1;
            }
        }
    }

    /* for transpose we need to know the size for each processor not only our own size */

    N0  = (int*)malloc(P[0]*sizeof(int)); N1  = (int*)malloc(P[1]*sizeof(int));
    M0  = (int*)malloc(P[0]*sizeof(int)); M1  = (int*)malloc(P[1]*sizeof(int));
    K0  = (int*)malloc(P[0]*sizeof(int)); K1  = (int*)malloc(P[1]*sizeof(int));
    oN0 = (int*)malloc(P[0]*sizeof(int)); oN1 = (int*)malloc(P[1]*sizeof(int));
    oM0 = (int*)malloc(P[0]*sizeof(int)); oM1 = (int*)malloc(P[1]*sizeof(int));
    oK0 = (int*)malloc(P[0]*sizeof(int)); oK1 = (int*)malloc(P[1]*sizeof(int));

    for (i = 0; i < P[0]; i++)
    {
#define EVENDIST
#ifndef EVENDIST
        oN0[i] = i*ceil((double)NG/P[0]);
        oM0[i] = i*ceil((double)MG/P[0]);
        oK0[i] = i*ceil((double)KG/P[0]);
#else
        oN0[i] = (NG*i)/P[0];
        oM0[i] = (MG*i)/P[0];
        oK0[i] = (KG*i)/P[0];
#endif
    }
    for (i = 0; i < P[1]; i++)
    {
#ifndef EVENDIST
        oN1[i] = i*ceil((double)NG/P[1]);
        oM1[i] = i*ceil((double)MG/P[1]);
        oK1[i] = i*ceil((double)KG/P[1]);
#else
        oN1[i] = (NG*i)/P[1];
        oM1[i] = (MG*i)/P[1];
        oK1[i] = (KG*i)/P[1];
#endif
    }
    for (i = 0; i < P[0]-1; i++)
    {
        N0[i] = oN0[i+1]-oN0[i];
        M0[i] = oM0[i+1]-oM0[i];
        K0[i] = oK0[i+1]-oK0[i];
    }
    N0[P[0]-1] = NG-oN0[P[0]-1];
    M0[P[0]-1] = MG-oM0[P[0]-1];
    K0[P[0]-1] = KG-oK0[P[0]-1];
    for (i = 0; i < P[1]-1; i++)
    {
        N1[i] = oN1[i+1]-oN1[i];
        M1[i] = oM1[i+1]-oM1[i];
        K1[i] = oK1[i+1]-oK1[i];
    }
    N1[P[1]-1] = NG-oN1[P[1]-1];
    M1[P[1]-1] = MG-oM1[P[1]-1];
    K1[P[1]-1] = KG-oK1[P[1]-1];

    /* for step 1-3 the local N,M,K sizes of the transposed system
       C: contiguous dimension, and nP: number of processors in subcommunicator
       for that step */

    pM[0] = M0[prank[0]];
    oM[0] = oM0[prank[0]];
    pK[0] = K1[prank[1]];
    oK[0] = oK1[prank[1]];
    C[0]  = NG;
    rC[0] = rNG;
    if (!(flags&FFT5D_ORDER_YZ))
    {
        N[0]     = vmax(N1, P[1]);
        M[0]     = M0[prank[0]];
        K[0]     = vmax(K1, P[1]);
        pN[0]    = N1[prank[1]];
        iNout[0] = N1;
        oNout[0] = oN1;
        nP[0]    = P[1];
        C[1]     = KG;
        rC[1]    = rKG;
        N[1]     = vmax(K0, P[0]);
        pN[1]    = K0[prank[0]];
        iNin[1]  = K1;
        oNin[1]  = oK1;
        iNout[1] = K0;
        oNout[1] = oK0;
        M[1]     = vmax(M0, P[0]);
        pM[1]    = M0[prank[0]];
        oM[1]    = oM0[prank[0]];
        K[1]     = N1[prank[1]];
        pK[1]    = N1[prank[1]];
        oK[1]    = oN1[prank[1]];
        nP[1]    = P[0];
        C[2]     = MG;
        rC[2]    = rMG;
        iNin[2]  = M0;
        oNin[2]  = oM0;
        M[2]     = vmax(K0, P[0]);
        pM[2]    = K0[prank[0]];
        oM[2]    = oK0[prank[0]];
        K[2]     = vmax(N1, P[1]);
        pK[2]    = N1[prank[1]];
        oK[2]    = oN1[prank[1]];
        free(N0); free(oN0); /* these are not used for this order */
        free(M1); free(oM1); /* the rest is freed in destroy */
    }
    else
    {
        N[0]     = vmax(N0, P[0]);
        M[0]     = vmax(M0, P[0]);
        K[0]     = K1[prank[1]];
        pN[0]    = N0[prank[0]];
        iNout[0] = N0;
        oNout[0] = oN0;
        nP[0]    = P[0];
        C[1]     = MG;
        rC[1]    = rMG;
        N[1]     = vmax(M1, P[1]);
        pN[1]    = M1[prank[1]];
        iNin[1]  = M0;
        oNin[1]  = oM0;
        iNout[1] = M1;
        oNout[1] = oM1;
        M[1]     = N0[prank[0]];
        pM[1]    = N0[prank[0]];
        oM[1]    = oN0[prank[0]];
        K[1]     = vmax(K1, P[1]);
        pK[1]    = K1[prank[1]];
        oK[1]    = oK1[prank[1]];
        nP[1]    = P[1];
        C[2]     = KG;
        rC[2]    = rKG;
        iNin[2]  = K1;
        oNin[2]  = oK1;
        M[2]     = vmax(N0, P[0]);
        pM[2]    = N0[prank[0]];
        oM[2]    = oN0[prank[0]];
        K[2]     = vmax(M1, P[1]);
        pK[2]    = M1[prank[1]];
        oK[2]    = oM1[prank[1]];
        free(N1); free(oN1); /* these are not used for this order */
        free(K0); free(oK0); /* the rest is freed in destroy */
    }
    N[2] = pN[2] = -1; /* not used */

    /*
       Difference between x-y-z regarding 2d decomposition is whether they are
       distributed along axis 1, 2 or both
     */

    /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
    lsize = std::max(N[0]*M[0]*K[0]*nP[0], std::max(N[1]*M[1]*K[1]*nP[1], C[2]*M[2]*K[2]));
    /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
    if (!(flags&FFT5D_NOMALLOC))
    {
        snew_aligned(lin, lsize, 32);
        snew_aligned(lout, lsize, 32);
        if (nthreads > 1)
        {
            /* We need extra transpose buffers to avoid OpenMP barriers */
            snew_aligned(lout2, lsize, 32);
            snew_aligned(lout3, lsize, 32);
        }
        else
        {
            /* We can reuse the buffers to avoid cache misses */
            lout2 = lin;
            lout3 = lout;
        }
    }
    else
    {
        lin  = *rlin;
        lout = *rlout;
        if (nthreads > 1)
        {
            lout2 = *rlout2;
            lout3 = *rlout3;
        }
        else
        {
            lout2 = lin;
            lout3 = lout;
        }
    }

    plan = (fft5d_plan)calloc(1, sizeof(struct fft5d_plan_t));

    if (debug)
    {
        fprintf(debug, "Running on %d threads\n", nthreads);
    }

#ifdef GMX_FFT_FFTW3
    /* If not FFTW, then we don't do a 3d plan but instead use only 1D plans */
    /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
     * within a parallel region. For now deactivated. If it should be supported, it has to be made sure that
     * the execution of the 3d plan is in a master/serial block (since it contains its own parallel region)
     * and that the 3d plan is faster than the 1d plan.
     */
    if ((!(flags&FFT5D_INPLACE)) && (!(P[0] > 1 || P[1] > 1)) && nthreads == 1) /* don't do 3d plan in parallel or if in_place requested */
    {
        int fftwflags = FFTW_DESTROY_INPUT;
        FFTW(iodim) dims[3];
        int inNG = NG, outMG = MG, outKG = KG;

        FFTW_LOCK;
        if (!(flags&FFT5D_NOMEASURE))
        {
            fftwflags |= FFTW_MEASURE;
        }
        if (flags&FFT5D_REALCOMPLEX)
        {
            if (!(flags&FFT5D_BACKWARD))        /* input pointer is not complex */
            {
                inNG *= 2;
            }
            else                                /* output pointer is not complex */
            {
                if (!(flags&FFT5D_ORDER_YZ))
                {
                    outMG *= 2;
                }
                else
                {
                    outKG *= 2;
                }
            }
        }

        if (!(flags&FFT5D_BACKWARD))
        {
            dims[0].n  = KG;
            dims[1].n  = MG;
            dims[2].n  = rNG;

            dims[0].is = inNG*MG;   /* N M K */
            dims[1].is = inNG;
            dims[2].is = 1;
            if (!(flags&FFT5D_ORDER_YZ))
            {
                dims[0].os = MG;    /* M K N */
                dims[1].os = 1;
                dims[2].os = MG*KG;
            }
            else
            {
                dims[0].os = 1;     /* K N M */
                dims[1].os = KG*NG;
                dims[2].os = KG;
            }
        }
        else
        {
            if (!(flags&FFT5D_ORDER_YZ))
            {
                dims[0].n  = NG;
                dims[1].n  = KG;
                dims[2].n  = rMG;

                dims[0].is = 1;
                dims[1].is = NG*MG;
                dims[2].is = NG;

                dims[0].os = outMG*KG;
                dims[1].os = outMG;
                dims[2].os = 1;
            }
            else
            {
                dims[0].n  = MG;
                dims[1].n  = NG;
                dims[2].n  = rKG;

                dims[0].is = NG;
                dims[1].is = 1;
                dims[2].is = NG*MG;

                dims[0].os = outKG*NG;
                dims[1].os = outKG;
                dims[2].os = 1;
            }
        }
#ifdef FFT5D_THREADS
#ifdef FFT5D_FFTW_THREADS
        FFTW(plan_with_nthreads)(nthreads);
#endif
#endif
        if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD))
        {
            plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
                                                /*howmany*/ 0, /*howmany_dims*/ 0,
                                                (real*)lin, (FFTW(complex) *) lout,
                                                /*flags*/ fftwflags);
        }
t_forcetable make_tables(FILE *out, const output_env_t oenv,
                         const t_forcerec *fr,
                         gmx_bool bVerbose, const char *fn,
                         real rtab, int flags)
{
    const char *fns[3]   = { "ctab.xvg", "dtab.xvg", "rtab.xvg" };
    const char *fns14[3] = { "ctab14.xvg", "dtab14.xvg", "rtab14.xvg" };
    FILE        *fp;
    t_tabledata *td;
    gmx_bool    b14only, bReadTab, bGenTab;
    real        x0, y0, yp;
    int         i, j, k, nx, nx0, tabsel[etiNR];
    t_forcetable table;

    b14only = (flags & GMX_MAKETABLES_14ONLY);

    if (flags & GMX_MAKETABLES_FORCEUSER)
    {
        tabsel[etiCOUL] = etabUSER;
        tabsel[etiLJ6]  = etabUSER;
        tabsel[etiLJ12] = etabUSER;
    }
    else
    {
        set_table_type(tabsel, fr, b14only);
    }
    snew(td, etiNR);
    table.r         = rtab;
    table.scale     = 0;
    table.n         = 0;
    table.scale_exp = 0;
    nx0             = 10;
    nx              = 0;

    /* Check whether we have to read or generate */
    bReadTab = FALSE;
    bGenTab  = FALSE;
    for (i = 0; (i < etiNR); i++)
    {
        if (ETAB_USER(tabsel[i]))
            bReadTab = TRUE;
        if (tabsel[i] != etabUSER)
            bGenTab = TRUE;
    }
    if (bReadTab)
    {
        read_tables(out, fn, etiNR, 0, td);
        if (rtab == 0 || (flags & GMX_MAKETABLES_14ONLY))
        {
            rtab    = td[0].x[td[0].nx-1];
            table.n = td[0].nx;
            nx      = table.n;
        }
        else
        {
            if (td[0].x[td[0].nx-1] < rtab)
                gmx_fatal(FARGS, "Tables in file %s not long enough for cut-off:\n"
                          "\tshould be at least %f nm\n", fn, rtab);
            nx = table.n = (int)(rtab*td[0].tabscale + 0.5);
        }
        table.scale = td[0].tabscale;
        nx0         = td[0].nx0;
    }
    if (bGenTab)
    {
        if (!bReadTab)
        {
#ifdef GMX_DOUBLE
            table.scale = 2000.0;
#else
            table.scale = 500.0;
#endif
            nx = table.n = rtab*table.scale;
        }
    }
    if (fr->bBHAM)
    {
        if (fr->bham_b_max != 0)
            table.scale_exp = table.scale/fr->bham_b_max;
        else
            table.scale_exp = table.scale;
    }

    /* Each table type (e.g. coul,lj6,lj12) requires four
     * numbers per nx+1 data points. For performance reasons we want
     * the table data to be aligned to 16-byte.
     */
    snew_aligned(table.tab, 12*(nx+1)*sizeof(real), 16);

    for (k = 0; (k < etiNR); k++)
    {
        if (tabsel[k] != etabUSER)
        {
            init_table(out, nx, nx0,
                       (tabsel[k] == etabEXPMIN) ? table.scale_exp : table.scale,
                       &(td[k]), !bReadTab);
            fill_table(&(td[k]), tabsel[k], fr);
            if (out)
                fprintf(out, "%s table with %d data points for %s%s.\n"
                        "Tabscale = %g points/nm\n",
                        ETAB_USER(tabsel[k]) ? "Modified" : "Generated",
                        td[k].nx, b14only ? "1-4 " : "", tprops[tabsel[k]].name,
                        td[k].tabscale);
        }
        copy2table(table.n, k*4, 12, td[k].x, td[k].v, td[k].f, table.tab);

        if (bDebugMode() && bVerbose)
        {
            if (b14only)
                fp = xvgropen(fns14[k], fns14[k], "r", "V", oenv);
            else
                fp = xvgropen(fns[k], fns[k], "r", "V", oenv);
            /* plot the output 5 times denser than the table data */
            for (i = 5*((nx0+1)/2); i < 5*table.n; i++)
            {
                x0 = i*table.r/(5*(table.n-1));
                evaluate_table(table.tab, 4*k, 12, table.scale, x0, &y0, &yp);
                fprintf(fp, "%15.10e  %15.10e  %15.10e\n", x0, y0, yp);
            }
            gmx_fio_fclose(fp);
        }
        done_tabledata(&(td[k]));
    }
    sfree(td);

    return table;
}
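/* The table above packs the three interaction types (coul, lj6, lj12) with four
 * numbers each per point, giving a stride of 12 reals per point; interaction k
 * at point i therefore starts at element 12*i + 4*k, which is the indexing
 * implied by the copy2table(..., k*4, 12, ...) and evaluate_table(..., 4*k, 12, ...)
 * calls. The helper below is illustrative only and not a GROMACS function.
 */
static int table_entry_offset(int point, int k, int formatsize, int ninteractions)
{
    int stride = formatsize*ninteractions; /* 4*3 = 12 for this table */

    return stride*point + formatsize*k;
}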
t_forcetable make_atf_table(FILE *out, const output_env_t oenv,
                            const t_forcerec *fr,
                            const char *fn,
                            matrix box)
{
    const char *fns[3] = { "tf_tab.xvg", "atfdtab.xvg", "atfrtab.xvg" };
    FILE        *fp;
    t_tabledata *td;
    real        x0, y0, yp, rtab;
    int         i, nx, nx0;
    real        rx, ry, rz, box_r;
    t_forcetable table;

    /* Set the table dimensions for ATF, not really necessary to
     * use etiNR (since we only have one table, but ...)
     */
    snew(td, 1);

    if (fr->adress_type == eAdressSphere)
    {
        /* take half box diagonal direction as tab range */
        rx    = 0.5*box[0][0]+0.5*box[1][0]+0.5*box[2][0];
        ry    = 0.5*box[0][1]+0.5*box[1][1]+0.5*box[2][1];
        rz    = 0.5*box[0][2]+0.5*box[1][2]+0.5*box[2][2];
        box_r = sqrt(rx*rx+ry*ry+rz*rz);
    }
    else
    {
        /* xsplit: take half box x direction as tab range */
        box_r = box[0][0]/2;
    }
    table.r         = box_r;
    table.scale     = 0;
    table.n         = 0;
    table.scale_exp = 0;
    nx0             = 10;
    nx              = 0;

    read_tables(out, fn, 1, 0, td);
    rtab = td[0].x[td[0].nx-1];

    if (fr->adress_type == eAdressXSplit && (rtab < box[0][0]/2))
    {
        gmx_fatal(FARGS, "AdResS full box therm force table in file %s extends to %f:\n"
                  "\tshould extend to at least half the length of the box in x-direction"
                  "%f\n", fn, rtab, box[0][0]/2);
    }
    if (rtab < box_r)
    {
        gmx_fatal(FARGS, "AdResS full box therm force table in file %s extends to %f:\n"
                  "\tshould extend to at least for spherical adress"
                  "%f (=distance from center to furthermost point in box \n", fn, rtab, box_r);
    }

    table.n     = td[0].nx;
    nx          = table.n;
    table.scale = td[0].tabscale;
    nx0         = td[0].nx0;

    /* Each table type (e.g. coul,lj6,lj12) requires four
     * numbers per datapoint. For performance reasons we want
     * the table data to be aligned to 16-byte. This is accomplished
     * by allocating 16 bytes extra to a temporary pointer, and then
     * calculating an aligned pointer. This new pointer must not be
     * used in a free() call, but thankfully we're sloppy enough not
     * to do this :-)
     */
    snew_aligned(table.data, 4*nx, 16);

    copy2table(table.n, 0, 4, td[0].x, td[0].v, td[0].f, 1.0, table.data);

    if (bDebugMode())
    {
        fp = xvgropen(fns[0], fns[0], "r", "V", oenv);
        /* plot the output 5 times denser than the table data */
        /* for (i = 5*nx0; i < 5*table.n; i++) */
        for (i = 5*((nx0+1)/2); i < 5*table.n; i++)
        {
            /* x0 = i*table.r/(5*table.n); */
            x0 = i*table.r/(5*(table.n-1));
            evaluate_table(table.data, 0, 4, table.scale, x0, &y0, &yp);
            fprintf(fp, "%15.10e  %15.10e  %15.10e\n", x0, y0, yp);
        }
        ffclose(fp);
    }

    done_tabledata(&(td[0]));
    sfree(td);

    table.interaction   = GMX_TABLE_INTERACTION_ELEC_VDWREP_VDWDISP;
    table.format        = GMX_TABLE_FORMAT_CUBICSPLINE_YFGH;
    table.formatsize    = 4;
    table.ninteractions = 3;
    table.stride        = table.formatsize*table.ninteractions;

    return table;
}
t_forcetable make_gb_table(FILE *out, const output_env_t oenv,
                           const t_forcerec *fr,
                           const char *fn,
                           real rtab)
{
    const char *fns[3]   = { "gbctab.xvg", "gbdtab.xvg", "gbrtab.xvg" };
    const char *fns14[3] = { "gbctab14.xvg", "gbdtab14.xvg", "gbrtab14.xvg" };
    FILE        *fp;
    t_tabledata *td;
    gmx_bool    bReadTab, bGenTab;
    real        x0, y0, yp;
    int         i, j, k, nx, nx0, tabsel[etiNR];
    double      r, r2, Vtab, Ftab, expterm;
    t_forcetable table;

    double abs_error_r, abs_error_r2;
    double rel_error_r, rel_error_r2;
    double rel_error_r_old = 0, rel_error_r2_old = 0;
    double x0_r_error, x0_r2_error;

    /* Only set a Coulomb table for GB */
    /*
       tabsel[0]=etabGB;
       tabsel[1]=-1;
       tabsel[2]=-1;
     */

    /* Set the table dimensions for GB, not really necessary to
     * use etiNR (since we only have one table, but ...)
     */
    snew(td, 1);
    table.interaction   = GMX_TABLE_INTERACTION_ELEC;
    table.format        = GMX_TABLE_FORMAT_CUBICSPLINE_YFGH;
    table.r             = fr->gbtabr;
    table.scale         = fr->gbtabscale;
    table.scale_exp     = 0;
    table.n             = table.scale*table.r;
    table.formatsize    = 4;
    table.ninteractions = 1;
    table.stride        = table.formatsize*table.ninteractions;
    nx0                 = 0;
    nx                  = table.scale*table.r;

    /* Check whether we have to read or generate
     * We will always generate a table, so remove the read code
     * (Compare with original make_table function
     */
    bReadTab = FALSE;
    bGenTab  = TRUE;

    /* Each table type (e.g. coul,lj6,lj12) requires four
     * numbers per datapoint. For performance reasons we want
     * the table data to be aligned to 16-byte. This is accomplished
     * by allocating 16 bytes extra to a temporary pointer, and then
     * calculating an aligned pointer. This new pointer must not be
     * used in a free() call, but thankfully we're sloppy enough not
     * to do this :-)
     */
    snew_aligned(table.data, 4*nx, 16);

    init_table(out, nx, nx0, table.scale, &(td[0]), !bReadTab);

    /* Local implementation so we don't have to use the etabGB
     * enum above, which will cause problems later when
     * making the other tables (right now even though we are using
     * GB, the normal Coulomb tables will be created, but this
     * will cause a problem since fr->eeltype==etabGB which will not
     * be defined in fill_table and set_table_type
     */

    for (i = nx0; i < nx; i++)
    {
        Vtab    = 0.0;
        Ftab    = 0.0;
        r       = td->x[i];
        r2      = r*r;
        expterm = exp(-0.25*r2);

        Vtab = 1/sqrt(r2+expterm);
        Ftab = (r-0.25*r*expterm)/((r2+expterm)*sqrt(r2+expterm));

        /* Convert to single precision when we store to mem */
        td->v[i] = Vtab;
        td->f[i] = Ftab;
    }

    copy2table(table.n, 0, 4, td[0].x, td[0].v, td[0].f, 1.0, table.data);

    if (bDebugMode())
    {
        fp = xvgropen(fns[0], fns[0], "r", "V", oenv);
        /* plot the output 5 times denser than the table data */
        /* for (i = 5*nx0; i < 5*table.n; i++) */
        for (i = nx0; i < table.n; i++)
        {
            /* x0 = i*table.r/(5*table.n); */
            x0 = i*table.r/table.n;
            evaluate_table(table.data, 0, 4, table.scale, x0, &y0, &yp);
            fprintf(fp, "%15.10e  %15.10e  %15.10e\n", x0, y0, yp);
        }
        gmx_fio_fclose(fp);
    }

    /*
       for (i = 100*nx0; i < 99.81*table.n; i++)
       {
           r       = i*table.r/(100*table.n);
           r2      = r*r;
           expterm = exp(-0.25*r2);

           Vtab = 1/sqrt(r2+expterm);
           Ftab = (r-0.25*r*expterm)/((r2+expterm)*sqrt(r2+expterm));

           evaluate_table(table.data,0,4,table.scale,r,&y0,&yp);
           printf("gb: i=%d, x0=%g, y0=%15.15f, Vtab=%15.15f, yp=%15.15f, Ftab=%15.15f\n",i,r, y0, Vtab, yp, Ftab);

           abs_error_r  = fabs(y0-Vtab);
           abs_error_r2 = fabs(yp-(-1)*Ftab);

           rel_error_r  = abs_error_r/y0;
           rel_error_r2 = fabs(abs_error_r2/yp);

           if (rel_error_r > rel_error_r_old)
           {
               rel_error_r_old = rel_error_r;
               x0_r_error      = x0;
           }

           if (rel_error_r2 > rel_error_r2_old)
           {
               rel_error_r2_old = rel_error_r2;
               x0_r2_error      = x0;
           }
       }
       printf("gb: MAX REL ERROR IN R=%15.15f, MAX REL ERROR IN R2=%15.15f\n", rel_error_r_old, rel_error_r2_old);
       printf("gb: XO_R=%g, X0_R2=%g\n", x0_r_error, x0_r2_error);

       exit(1);
     */

    done_tabledata(&(td[0]));
    sfree(td);

    return table;
}
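/* The generation loop above tabulates the analytic GB pair function
 * V(r) = 1/sqrt(r^2 + exp(-r^2/4)); differentiating gives
 * dV/dr = -(r - 0.25*r*exp(-r^2/4))/(r^2 + exp(-r^2/4))^(3/2), so the stored
 * force term is -dV/dr. The standalone helper below is illustrative only and
 * simply mirrors the loop body.
 */
#include <math.h>

static void gb_table_entry(double r, double *v, double *f)
{
    double r2      = r*r;
    double expterm = exp(-0.25*r2);

    *v = 1.0/sqrt(r2 + expterm);                                    /* V(r)   */
    *f = (r - 0.25*r*expterm)/((r2 + expterm)*sqrt(r2 + expterm));  /* -dV/dr */
}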
t_forcetable make_tables(FILE *out, const output_env_t oenv,
                         const t_forcerec *fr,
                         gmx_bool bVerbose, const char *fn,
                         real rtab, int flags)
{
    const char *fns[3]   = { "ctab.xvg", "dtab.xvg", "rtab.xvg" };
    const char *fns14[3] = { "ctab14.xvg", "dtab14.xvg", "rtab14.xvg" };
    FILE        *fp;
    t_tabledata *td;
    gmx_bool    b14only, bReadTab, bGenTab;
    real        x0, y0, yp;
    int         i, j, k, nx, nx0, tabsel[etiNR];
    real        scalefactor;
    t_forcetable table;

    b14only = (flags & GMX_MAKETABLES_14ONLY);

    if (flags & GMX_MAKETABLES_FORCEUSER)
    {
        tabsel[etiCOUL] = etabUSER;
        tabsel[etiLJ6]  = etabUSER;
        tabsel[etiLJ12] = etabUSER;
    }
    else
    {
        set_table_type(tabsel, fr, b14only);
    }
    snew(td, etiNR);
    table.r         = rtab;
    table.scale     = 0;
    table.n         = 0;
    table.scale_exp = 0;
    nx0             = 10;
    nx              = 0;

    table.interaction   = GMX_TABLE_INTERACTION_ELEC_VDWREP_VDWDISP;
    table.format        = GMX_TABLE_FORMAT_CUBICSPLINE_YFGH;
    table.formatsize    = 4;
    table.ninteractions = 3;
    table.stride        = table.formatsize*table.ninteractions;

    /* Check whether we have to read or generate */
    bReadTab = FALSE;
    bGenTab  = FALSE;
    for (i = 0; (i < etiNR); i++)
    {
        if (ETAB_USER(tabsel[i]))
            bReadTab = TRUE;
        if (tabsel[i] != etabUSER)
            bGenTab = TRUE;
    }
    if (bReadTab)
    {
        read_tables(out, fn, etiNR, 0, td);
        if (rtab == 0 || (flags & GMX_MAKETABLES_14ONLY))
        {
            rtab    = td[0].x[td[0].nx-1];
            table.n = td[0].nx;
            nx      = table.n;
        }
        else
        {
            if (td[0].x[td[0].nx-1] < rtab)
                gmx_fatal(FARGS, "Tables in file %s not long enough for cut-off:\n"
                          "\tshould be at least %f nm\n", fn, rtab);
            nx = table.n = (int)(rtab*td[0].tabscale + 0.5);
        }
        table.scale = td[0].tabscale;
        nx0         = td[0].nx0;
    }
    if (bGenTab)
    {
        if (!bReadTab)
        {
#ifdef GMX_DOUBLE
            table.scale = 2000.0;
#else
            table.scale = 500.0;
#endif
            nx = table.n = rtab*table.scale;
        }
    }
    if (fr->bBHAM)
    {
        if (fr->bham_b_max != 0)
            table.scale_exp = table.scale/fr->bham_b_max;
        else
            table.scale_exp = table.scale;
    }

    /* Each table type (e.g. coul,lj6,lj12) requires four
     * numbers per nx+1 data points. For performance reasons we want
     * the table data to be aligned to 16-byte.
     */
    snew_aligned(table.data, 12*(nx+1)*sizeof(real), 16);

    for (k = 0; (k < etiNR); k++)
    {
        if (tabsel[k] != etabUSER)
        {
            init_table(out, nx, nx0,
                       (tabsel[k] == etabEXPMIN) ? table.scale_exp : table.scale,
                       &(td[k]), !bReadTab);
            fill_table(&(td[k]), tabsel[k], fr);
            if (out)
                fprintf(out, "%s table with %d data points for %s%s.\n"
                        "Tabscale = %g points/nm\n",
                        ETAB_USER(tabsel[k]) ? "Modified" : "Generated",
                        td[k].nx, b14only ? "1-4 " : "", tprops[tabsel[k]].name,
                        td[k].tabscale);
        }

        /* Set scalefactor for c6/c12 tables. This is because we save flops in the non-table kernels
         * by including the derivative constants (6.0 or 12.0) in the parameters, since
         * we no longer calculate force in most steps. This means the c6/c12 parameters
         * have been scaled up, so we need to scale down the table interactions too.
         * It comes here since we need to scale user tables too.
         */
        if (k == etiLJ6)
        {
            scalefactor = 1.0/6.0;
        }
        else if (k == etiLJ12 && tabsel[k] != etabEXPMIN)
        {
            scalefactor = 1.0/12.0;
        }
        else
        {
            scalefactor = 1.0;
        }

        copy2table(table.n, k*4, 12, td[k].x, td[k].v, td[k].f, scalefactor, table.data);

        if (bDebugMode() && bVerbose)
        {
            if (b14only)
                fp = xvgropen(fns14[k], fns14[k], "r", "V", oenv);
            else
                fp = xvgropen(fns[k], fns[k], "r", "V", oenv);
            /* plot the output 5 times denser than the table data */
            for (i = 5*((nx0+1)/2); i < 5*table.n; i++)
            {
                x0 = i*table.r/(5*(table.n-1));
                evaluate_table(table.data, 4*k, 12, table.scale, x0, &y0, &yp);
                fprintf(fp, "%15.10e  %15.10e  %15.10e\n", x0, y0, yp);
            }
            gmx_fio_fclose(fp);
        }
        done_tabledata(&(td[k]));
    }
    sfree(td);

    return table;
}
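/* The 1/6 and 1/12 scale factors above compensate for the factors 6 and 12 that
 * the non-table kernels fold into the c6 and c12 parameters: the product of the
 * scaled-up parameter and the scaled-down table entry is unchanged. The snippet
 * below is illustrative only and just makes that cancellation explicit for the
 * dispersion term.
 */
#include <stdio.h>

static void show_lj6_scaling(double c6, double tableValue)
{
    double unscaled = c6*tableValue;             /* plain parameter and table    */
    double scaled   = (6.0*c6)*(tableValue/6.0); /* pre-multiplied c6, 1/6 table */

    printf("unscaled %g, scaled %g\n", unscaled, scaled); /* identical up to rounding */
}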