Code Example #1
//! Construct a reduction mask for which parts (blocks) of the force array are touched on which thread task
static void
calc_bonded_reduction_mask(int natoms,
                           f_thread_t *f_thread,
                           const t_idef *idef,
                           int thread, int nthread)
{
    static_assert(BITMASK_SIZE == GMX_OPENMP_MAX_THREADS, "For the error message below we assume these two are equal.");

    if (nthread > BITMASK_SIZE)
    {
#pragma omp master
        gmx_fatal(FARGS, "You are using %d OpenMP threads, which is larger than GMX_OPENMP_MAX_THREADS (%d). Decrease the number of OpenMP threads or rebuild GROMACS with a larger value for GMX_OPENMP_MAX_THREADS.",
                  nthread, GMX_OPENMP_MAX_THREADS);
#pragma omp barrier
    }
    GMX_ASSERT(nthread <= BITMASK_SIZE, "We need at least nthread bits in the mask");

    int nblock = (natoms + reduction_block_size - 1) >> reduction_block_bits;

    if (nblock > f_thread->block_nalloc)
    {
        f_thread->block_nalloc = over_alloc_large(nblock);
        srenew(f_thread->mask,        f_thread->block_nalloc);
        srenew(f_thread->block_index, f_thread->block_nalloc);
        sfree_aligned(f_thread->f);
        snew_aligned(f_thread->f,     f_thread->block_nalloc*reduction_block_size, 128);
    }

    gmx_bitmask_t *mask = f_thread->mask;

    for (int b = 0; b < nblock; b++)
    {
        bitmask_clear(&mask[b]);
    }

    for (int ftype = 0; ftype < F_NRE; ftype++)
    {
        if (ftype_is_bonded_potential(ftype))
        {
            int nb = idef->il[ftype].nr;
            if (nb > 0)
            {
                int nat1 = interaction_function[ftype].nratoms + 1;

                int nb0 = idef->il_thread_division[ftype*(nthread + 1) + thread];
                int nb1 = idef->il_thread_division[ftype*(nthread + 1) + thread + 1];

                for (int i = nb0; i < nb1; i += nat1)
                {
                    for (int a = 1; a < nat1; a++)
                    {
                        bitmask_set_bit(&mask[idef->il[ftype].iatoms[i+a] >> reduction_block_bits], thread);
                    }
                }
            }
        }
    }
}
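
Example #1 records, for every force-reduction block, which thread tasks write into it: an atom index is mapped to a block by a right shift with reduction_block_bits, and that thread's bit is set in the block's mask. The standalone sketch below mimics this bookkeeping with a plain uint64_t in place of gmx_bitmask_t; the block size of 32 atoms (5 bits) is an assumed illustration value, not taken from the snippet above.

/* Sketch only: the block/bit bookkeeping of example #1, outside GROMACS.
 * A plain uint64_t stands in for gmx_bitmask_t, and the block size of
 * 32 atoms (SKETCH_BLOCK_BITS = 5) is an assumption for illustration.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_BLOCK_BITS 5
#define SKETCH_BLOCK_SIZE (1 << SKETCH_BLOCK_BITS)

int main(void)
{
    int      natoms   = 1000;
    int      nblock   = (natoms + SKETCH_BLOCK_SIZE - 1) >> SKETCH_BLOCK_BITS;
    uint64_t mask[32] = { 0 };   /* one bitmask per block, one bit per thread */

    int thread = 3;              /* this thread task ...                      */
    int atom   = 70;             /* ... touches the force of atom 70          */

    mask[atom >> SKETCH_BLOCK_BITS] |= (uint64_t)1 << thread;

    printf("natoms=%d -> %d blocks; atom %d is in block %d, mask=0x%llx\n",
           natoms, nblock, atom, atom >> SKETCH_BLOCK_BITS,
           (unsigned long long)mask[atom >> SKETCH_BLOCK_BITS]);
    return 0;
}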
Code Example #2
File: pme-solve.cpp Project: rmcgibbo/gromacs
static void realloc_work(struct pme_solve_work_t *work, int nkx)
{
    if (nkx > work->nalloc)
    {
        int simd_width, i;

        work->nalloc = nkx;
        srenew(work->mhx, work->nalloc);
        srenew(work->mhy, work->nalloc);
        srenew(work->mhz, work->nalloc);
        srenew(work->m2, work->nalloc);
        /* Allocate an aligned pointer for SIMD operations, including extra
         * elements at the end for padding.
         */
#ifdef PME_SIMD_SOLVE
        simd_width = GMX_SIMD_REAL_WIDTH;
#else
        /* We can use any alignment, apart from 0, so we use 4 */
        simd_width = 4;
#endif
        sfree_aligned(work->denom);
        sfree_aligned(work->tmp1);
        sfree_aligned(work->tmp2);
        sfree_aligned(work->eterm);
        snew_aligned(work->denom, work->nalloc+simd_width, simd_width*sizeof(real));
        snew_aligned(work->tmp1,  work->nalloc+simd_width, simd_width*sizeof(real));
        snew_aligned(work->tmp2,  work->nalloc+simd_width, simd_width*sizeof(real));
        snew_aligned(work->eterm, work->nalloc+simd_width, simd_width*sizeof(real));
        srenew(work->m2inv, work->nalloc);

        /* Init all allocated elements of denom to 1 to avoid 1/0 exceptions
         * in the SIMD padded elements.
         */
        for (i = 0; i < work->nalloc+simd_width; i++)
        {
            work->denom[i] = 1;
        }
    }
}
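
realloc_work shows the aligned-reallocation idiom used throughout the PME code: aligned buffers cannot be grown with srenew, so they are freed with sfree_aligned and re-created with snew_aligned, with simd_width extra elements of padding so the SIMD loops can safely read past the last real entry. Below is a minimal sketch of that idiom, assuming the GROMACS real type and smalloc.h macros are in scope; the include paths are an assumption for recent versions and the sketch_ name is hypothetical.

/* Sketch only: SIMD-padded aligned (re)allocation, following example #2.
 * Assumes the GROMACS smalloc.h macros and the real type; the include
 * paths are an assumption and differ between GROMACS versions.
 */
#include "gromacs/utility/real.h"
#include "gromacs/utility/smalloc.h"

static void sketch_realloc_padded(real **denom, int *nalloc, int nkx,
                                  int simd_width)
{
    if (nkx > *nalloc)
    {
        *nalloc = nkx;
        /* Aligned memory cannot be srenew'd, so free and allocate again */
        sfree_aligned(*denom);
        snew_aligned(*denom, *nalloc + simd_width, simd_width*sizeof(real));
        /* Fill everything, including the padding, with 1 so that 1/denom[i]
         * never divides by zero in the padded SIMD lanes. */
        for (int i = 0; i < *nalloc + simd_width; i++)
        {
            (*denom)[i] = 1;
        }
    }
}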
Code Example #3
File: oclutils.cpp Project: MrTheodor/gromacs
/*! \brief Allocates nbytes of host memory. Use ocl_free to free memory allocated with this function.
 *
 *  \todo
 *  This function should allocate page-locked memory to help reduce D2H and H2D
 *  transfer times, similar to pmalloc from pmalloc_cuda.cu.
 *
 * \param[in,out]    h_ptr   Pointer where to store the address of the newly allocated buffer.
 * \param[in]        nbytes  Size in bytes of the buffer to be allocated.
 */
void ocl_pmalloc(void **h_ptr, size_t nbytes)
{
    /* Need a temporary type whose size is 1 byte, so that the
     * implementation of snew_aligned can cope without issuing
     * warnings. */
    char **temporary = reinterpret_cast<char **>(h_ptr);

    /* 16-byte alignment is required by the neighbour-searching code,
     * because it uses four-wide SIMD for bounding-box calculation.
     * However, once we switch to page-locked memory for device-host
     * transfers, the allocation will probably need to be aligned to a
     * 4 KiB page, as it is with CUDA. */
    snew_aligned(*temporary, nbytes, 16);
}
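
ocl_pmalloc promises 16-byte alignment because the neighbour-searching code relies on it. The standalone check below is a sketch of how a caller could verify that guarantee; it frees with sfree_aligned only to stay self-contained, whereas the doc comment above says real callers should pair ocl_pmalloc with ocl_free (not shown in this snippet).

/* Sketch only: verify the 16-byte alignment promised by ocl_pmalloc.
 * sfree_aligned is used here purely to keep the sketch self-contained;
 * per the doc comment above, real code should free with ocl_free instead.
 * The smalloc.h include path is an assumption for recent GROMACS versions.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include "gromacs/utility/smalloc.h"

void ocl_pmalloc(void **h_ptr, size_t nbytes);   /* defined in example #3 */

static void sketch_check_pmalloc_alignment(size_t nbytes)
{
    void *buf = NULL;
    ocl_pmalloc(&buf, nbytes);
    assert(((uintptr_t)buf % 16) == 0);          /* required by the neighbour search */
    sfree_aligned(buf);
}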
Code Example #4
static void realloc_splinevec(splinevec th, real **ptr_z, int nalloc)
{
    const int padding = 4;
    int       i;

    srenew(th[XX], nalloc);
    srenew(th[YY], nalloc);
    /* In z we add padding, this is only required for the aligned SIMD code */
    sfree_aligned(*ptr_z);
    snew_aligned(*ptr_z, nalloc+2*padding, SIMD4_ALIGNMENT);
    th[ZZ] = *ptr_z + padding;

    for (i = 0; i < padding; i++)
    {
        (*ptr_z)[               i] = 0;
        (*ptr_z)[padding+nalloc+i] = 0;
    }
}
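
realloc_splinevec keeps the z spline in an aligned buffer with `padding` zeroed elements on both ends and points th[ZZ] past the leading pad; only the original aligned pointer (*ptr_z), never the shifted th[ZZ], may be handed back to sfree_aligned. Below is a minimal sketch of that shifted-pointer idiom, with hypothetical sketch_ names and an assumed 16-byte alignment in place of SIMD4_ALIGNMENT.

/* Sketch only: aligned buffer with leading/trailing zero padding and a
 * shifted working pointer, following example #4. SKETCH_ALIGNMENT stands
 * in for SIMD4_ALIGNMENT; the include paths are an assumption.
 */
#include "gromacs/utility/real.h"
#include "gromacs/utility/smalloc.h"

#define SKETCH_ALIGNMENT 16
#define SKETCH_PADDING   4

static real *sketch_alloc_padded(real **base, int n)
{
    snew_aligned(*base, n + 2*SKETCH_PADDING, SKETCH_ALIGNMENT);
    for (int i = 0; i < SKETCH_PADDING; i++)
    {
        (*base)[i]                      = 0;   /* leading pad  */
        (*base)[SKETCH_PADDING + n + i] = 0;   /* trailing pad */
    }
    return *base + SKETCH_PADDING;             /* working pointer: do NOT free this */
}

static void sketch_free_padded(real *base)
{
    sfree_aligned(base);                       /* free only the original pointer */
}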
Code Example #5
File: pme-grid.cpp Project: wangxubo0201/gromacs
void pmegrid_init(pmegrid_t *grid,
                  int cx, int cy, int cz,
                  int x0, int y0, int z0,
                  int x1, int y1, int z1,
                  gmx_bool set_alignment,
                  int pme_order,
                  real *ptr)
{
    int nz, gridsize;

    grid->ci[XX]     = cx;
    grid->ci[YY]     = cy;
    grid->ci[ZZ]     = cz;
    grid->offset[XX] = x0;
    grid->offset[YY] = y0;
    grid->offset[ZZ] = z0;
    grid->n[XX]      = x1 - x0 + pme_order - 1;
    grid->n[YY]      = y1 - y0 + pme_order - 1;
    grid->n[ZZ]      = z1 - z0 + pme_order - 1;
    copy_ivec(grid->n, grid->s);

    nz = grid->s[ZZ];
    set_grid_alignment(&nz, pme_order);
    if (set_alignment)
    {
        grid->s[ZZ] = nz;
    }
    else if (nz != grid->s[ZZ])
    {
        gmx_incons("pmegrid_init call with an unaligned z size");
    }

    grid->order = pme_order;
    if (ptr == NULL)
    {
        gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
        set_gridsize_alignment(&gridsize, pme_order);
        snew_aligned(grid->grid, gridsize, SIMD4_ALIGNMENT);
    }
    else
    {
        grid->grid = ptr;
    }
}
Code Example #6
File: pme-spline-work.c Project: furtheraway/gromacs
struct pme_spline_work *make_pme_spline_work(int gmx_unused order)
{
    struct pme_spline_work *work;

#ifdef PME_SIMD4_SPREAD_GATHER
    real             tmp[GMX_SIMD4_WIDTH*3], *tmp_aligned;
    gmx_simd4_real_t zero_S;
    gmx_simd4_real_t real_mask_S0, real_mask_S1;
    int              of, i;

    snew_aligned(work, 1, SIMD4_ALIGNMENT);

    tmp_aligned = gmx_simd4_align_r(tmp);

    zero_S = gmx_simd4_setzero_r();

    /* Generate bit masks to mask out the unused grid entries,
     * as we only operate on 'order' of the 8 grid entries that are
     * loaded into 2 SIMD registers.
     */
    for (of = 0; of < 2*GMX_SIMD4_WIDTH-(order-1); of++)
    {
        for (i = 0; i < 2*GMX_SIMD4_WIDTH; i++)
        {
            tmp_aligned[i] = (i >= of && i < of+order ? -1.0 : 1.0);
        }
        real_mask_S0      = gmx_simd4_load_r(tmp_aligned);
        real_mask_S1      = gmx_simd4_load_r(tmp_aligned+GMX_SIMD4_WIDTH);
        work->mask_S0[of] = gmx_simd4_cmplt_r(real_mask_S0, zero_S);
        work->mask_S1[of] = gmx_simd4_cmplt_r(real_mask_S1, zero_S);
    }
#else
    work = NULL;
#endif

    return work;
}
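
make_pme_spline_work writes -1.0 into the `order` active positions and 1.0 elsewhere, then turns a compare-less-than-zero into a SIMD selection mask. The scalar sketch below reproduces the same selection pattern without intrinsics, purely to show which of the 2*GMX_SIMD4_WIDTH = 8 grid entries each offset keeps; a SIMD4 width of 4 is assumed.

/* Sketch only: scalar equivalent of the spread/gather masks in example #6.
 * Entry i is active for offset 'of' exactly when of <= i < of + order.
 * Assumes a SIMD4 width of 4, i.e. 8 grid entries across two registers.
 */
#include <stdio.h>

#define SKETCH_SIMD4_WIDTH 4

static void sketch_print_masks(int order)
{
    for (int of = 0; of < 2*SKETCH_SIMD4_WIDTH - (order - 1); of++)
    {
        printf("of=%d:", of);
        for (int i = 0; i < 2*SKETCH_SIMD4_WIDTH; i++)
        {
            printf(" %d", (i >= of && i < of + order) ? 1 : 0);
        }
        printf("\n");
    }
}

int main(void)
{
    sketch_print_masks(4);   /* PME order 4: each offset keeps 4 consecutive entries */
    return 0;
}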
Code Example #7
File: pme-grid.cpp Project: wangxubo0201/gromacs
void pmegrids_init(pmegrids_t *grids,
                   int nx, int ny, int nz, int nz_base,
                   int pme_order,
                   gmx_bool bUseThreads,
                   int nthread,
                   int overlap_x,
                   int overlap_y)
{
    ivec n, n_base;
    int  t, x, y, z, d, i, tfac;
    int  max_comm_lines = -1;

    n[XX] = nx - (pme_order - 1);
    n[YY] = ny - (pme_order - 1);
    n[ZZ] = nz - (pme_order - 1);

    copy_ivec(n, n_base);
    n_base[ZZ] = nz_base;

    pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX], n[YY], n[ZZ], FALSE, pme_order,
                 NULL);

    grids->nthread = nthread;

    make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);

    if (bUseThreads)
    {
        ivec nst;
        int  gridsize;

        for (d = 0; d < DIM; d++)
        {
            nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1;
        }
        set_grid_alignment(&nst[ZZ], pme_order);

        if (debug)
        {
            fprintf(debug, "pmegrid thread local division: %d x %d x %d\n",
                    grids->nc[XX], grids->nc[YY], grids->nc[ZZ]);
            fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
                    nx, ny, nz,
                    nst[XX], nst[YY], nst[ZZ]);
        }

        snew(grids->grid_th, grids->nthread);
        t        = 0;
        gridsize = nst[XX]*nst[YY]*nst[ZZ];
        set_gridsize_alignment(&gridsize, pme_order);
        snew_aligned(grids->grid_all,
                     grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
                     SIMD4_ALIGNMENT);

        for (x = 0; x < grids->nc[XX]; x++)
        {
            for (y = 0; y < grids->nc[YY]; y++)
            {
                for (z = 0; z < grids->nc[ZZ]; z++)
                {
                    pmegrid_init(&grids->grid_th[t],
                                 x, y, z,
                                 (n[XX]*(x  ))/grids->nc[XX],
                                 (n[YY]*(y  ))/grids->nc[YY],
                                 (n[ZZ]*(z  ))/grids->nc[ZZ],
                                 (n[XX]*(x+1))/grids->nc[XX],
                                 (n[YY]*(y+1))/grids->nc[YY],
                                 (n[ZZ]*(z+1))/grids->nc[ZZ],
                                 TRUE,
                                 pme_order,
                                 grids->grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
                    t++;
                }
            }
        }
    }
    else
    {
        grids->grid_th = NULL;
    }

    snew(grids->g2t, DIM);
    tfac = 1;
    for (d = DIM-1; d >= 0; d--)
    {
        snew(grids->g2t[d], n[d]);
        t = 0;
        for (i = 0; i < n[d]; i++)
        {
            /* The second check should match the parameters
             * of the pmegrid_init call above.
             */
            while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
            {
                t++;
            }
            grids->g2t[d][i] = t*tfac;
        }

        tfac *= grids->nc[d];

        switch (d)
        {
            case XX: max_comm_lines = overlap_x;     break;
            case YY: max_comm_lines = overlap_y;     break;
            case ZZ: max_comm_lines = pme_order - 1; break;
        }
        grids->nthread_comm[d] = 0;
        while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines &&
               grids->nthread_comm[d] < grids->nc[d])
        {
            grids->nthread_comm[d]++;
        }
        if (debug != NULL)
        {
            fprintf(debug, "pmegrid thread grid communication range in %c: %d\n",
                    'x'+d, grids->nthread_comm[d]);
        }
        /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
         * work, but this is not a problematic restriction.
         */
        if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
        {
            gmx_fatal(FARGS, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids->nthread);
        }
    }
}
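
pmegrids_init builds g2t[d][i], the thread chunk (per dimension) that owns grid line i, by advancing t whenever i crosses the boundary (n*(t+1))/nc, the same split used for the pmegrid_init calls above. A one-dimensional sketch of that mapping:

/* Sketch only: 1-D version of the grid-line -> thread-chunk mapping (g2t)
 * built in example #7. Chunk t owns lines (n*t)/nc up to (n*(t+1))/nc - 1.
 */
#include <stdio.h>

int main(void)
{
    int n  = 10;   /* grid lines in this dimension    */
    int nc = 3;    /* thread chunks in this dimension */
    int g2t[10];
    int t  = 0;

    for (int i = 0; i < n; i++)
    {
        while (t + 1 < nc && i >= (n*(t + 1))/nc)
        {
            t++;
        }
        g2t[i] = t;
        printf("line %d -> chunk %d\n", i, g2t[i]);
    }
    return 0;
}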
Code Example #8
File: fft5d.cpp Project: daniellandau/gromacs
/* NxMxK is the size of the data
 * comm: communicators to use for fft5d
 * P0: number of processors along the 1st axis (can be 0 for automatic)
 * lin is allocated by fft5d because the size of the array is only known after the planning phase
 * rlout2 is only used as an intermediate buffer - it is returned after allocation so it can be reused for the back transform - it should not be used by the caller
 */
fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
{

    int        P[2], bMaster, prank[2], i, t;
    int        rNG, rMG, rKG;
    int       *N0 = 0, *N1 = 0, *M0 = 0, *M1 = 0, *K0 = 0, *K1 = 0, *oN0 = 0, *oN1 = 0, *oM0 = 0, *oM1 = 0, *oK0 = 0, *oK1 = 0;
    int        N[3], M[3], K[3], pN[3], pM[3], pK[3], oM[3], oK[3], *iNin[3] = {0}, *oNin[3] = {0}, *iNout[3] = {0}, *oNout[3] = {0};
    int        C[3], rC[3], nP[2];
    int        lsize;
    t_complex *lin = 0, *lout = 0, *lout2 = 0, *lout3 = 0;
    fft5d_plan plan;
    int        s;

    /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
#ifdef GMX_MPI
    if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
    {
        MPI_Comm_size(comm[0], &P[0]);
        MPI_Comm_rank(comm[0], &prank[0]);
    }
    else
#endif
    {
        P[0]     = 1;
        prank[0] = 0;
    }
#ifdef GMX_MPI
    if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
    {
        MPI_Comm_size(comm[1], &P[1]);
        MPI_Comm_rank(comm[1], &prank[1]);
    }
    else
#endif
    {
        P[1]     = 1;
        prank[1] = 0;
    }

    bMaster = (prank[0] == 0 && prank[1] == 0);


    if (debug)
    {
        fprintf(debug, "FFT5D: Using %dx%d processor grid, rank %d,%d\n",
                P[0], P[1], prank[0], prank[1]);
    }

    if (bMaster)
    {
        if (debug)
        {
            fprintf(debug, "FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
                    NG, MG, KG, P[0], P[1], (flags&FFT5D_REALCOMPLEX) > 0, (flags&FFT5D_BACKWARD) > 0, (flags&FFT5D_ORDER_YZ) > 0, (flags&FFT5D_DEBUG) > 0);
        }
        /* The check below is not correct, one prime factor 11 or 13 is ok.
           if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
            printf("WARNING: FFT very slow with prime factors larger 7\n");
            printf("Change FFT size or in case you cannot change it look at\n");
            printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
           }
         */
    }

    if (NG == 0 || MG == 0 || KG == 0)
    {
        if (bMaster)
        {
            printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
        }
        return 0;
    }

    rNG = NG; rMG = MG; rKG = KG;

    if (flags&FFT5D_REALCOMPLEX)
    {
        if (!(flags&FFT5D_BACKWARD))
        {
            NG = NG/2+1;
        }
        else
        {
            if (!(flags&FFT5D_ORDER_YZ))
            {
                MG = MG/2+1;
            }
            else
            {
                KG = KG/2+1;
            }
        }
    }


    /* for the transposes we need to know the sizes for each processor, not only our own size */

    N0  = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
    M0  = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
    K0  = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
    oN0 = (int*)malloc(P[0]*sizeof(int)); oN1 = (int*)malloc(P[1]*sizeof(int));
    oM0 = (int*)malloc(P[0]*sizeof(int)); oM1 = (int*)malloc(P[1]*sizeof(int));
    oK0 = (int*)malloc(P[0]*sizeof(int)); oK1 = (int*)malloc(P[1]*sizeof(int));

    for (i = 0; i < P[0]; i++)
    {
        #define EVENDIST
        #ifndef EVENDIST
        oN0[i] = i*ceil((double)NG/P[0]);
        oM0[i] = i*ceil((double)MG/P[0]);
        oK0[i] = i*ceil((double)KG/P[0]);
        #else
        oN0[i] = (NG*i)/P[0];
        oM0[i] = (MG*i)/P[0];
        oK0[i] = (KG*i)/P[0];
        #endif
    }
    for (i = 0; i < P[1]; i++)
    {
        #ifndef EVENDIST
        oN1[i] = i*ceil((double)NG/P[1]);
        oM1[i] = i*ceil((double)MG/P[1]);
        oK1[i] = i*ceil((double)KG/P[1]);
        #else
        oN1[i] = (NG*i)/P[1];
        oM1[i] = (MG*i)/P[1];
        oK1[i] = (KG*i)/P[1];
        #endif
    }
    for (i = 0; i < P[0]-1; i++)
    {
        N0[i] = oN0[i+1]-oN0[i];
        M0[i] = oM0[i+1]-oM0[i];
        K0[i] = oK0[i+1]-oK0[i];
    }
    N0[P[0]-1] = NG-oN0[P[0]-1];
    M0[P[0]-1] = MG-oM0[P[0]-1];
    K0[P[0]-1] = KG-oK0[P[0]-1];
    for (i = 0; i < P[1]-1; i++)
    {
        N1[i] = oN1[i+1]-oN1[i];
        M1[i] = oM1[i+1]-oM1[i];
        K1[i] = oK1[i+1]-oK1[i];
    }
    N1[P[1]-1] = NG-oN1[P[1]-1];
    M1[P[1]-1] = MG-oM1[P[1]-1];
    K1[P[1]-1] = KG-oK1[P[1]-1];

    /* for steps 1-3: the local N,M,K sizes of the transposed system,
       C: the contiguous dimension, and nP: the number of processors in the
       subcommunicator for that step */


    pM[0] = M0[prank[0]];
    oM[0] = oM0[prank[0]];
    pK[0] = K1[prank[1]];
    oK[0] = oK1[prank[1]];
    C[0]  = NG;
    rC[0] = rNG;
    if (!(flags&FFT5D_ORDER_YZ))
    {
        N[0]     = vmax(N1, P[1]);
        M[0]     = M0[prank[0]];
        K[0]     = vmax(K1, P[1]);
        pN[0]    = N1[prank[1]];
        iNout[0] = N1;
        oNout[0] = oN1;
        nP[0]    = P[1];
        C[1]     = KG;
        rC[1]    = rKG;
        N[1]     = vmax(K0, P[0]);
        pN[1]    = K0[prank[0]];
        iNin[1]  = K1;
        oNin[1]  = oK1;
        iNout[1] = K0;
        oNout[1] = oK0;
        M[1]     = vmax(M0, P[0]);
        pM[1]    = M0[prank[0]];
        oM[1]    = oM0[prank[0]];
        K[1]     = N1[prank[1]];
        pK[1]    = N1[prank[1]];
        oK[1]    = oN1[prank[1]];
        nP[1]    = P[0];
        C[2]     = MG;
        rC[2]    = rMG;
        iNin[2]  = M0;
        oNin[2]  = oM0;
        M[2]     = vmax(K0, P[0]);
        pM[2]    = K0[prank[0]];
        oM[2]    = oK0[prank[0]];
        K[2]     = vmax(N1, P[1]);
        pK[2]    = N1[prank[1]];
        oK[2]    = oN1[prank[1]];
        free(N0); free(oN0); /*these are not used for this order*/
        free(M1); free(oM1); /*the rest is freed in destroy*/
    }
    else
    {
        N[0]     = vmax(N0, P[0]);
        M[0]     = vmax(M0, P[0]);
        K[0]     = K1[prank[1]];
        pN[0]    = N0[prank[0]];
        iNout[0] = N0;
        oNout[0] = oN0;
        nP[0]    = P[0];
        C[1]     = MG;
        rC[1]    = rMG;
        N[1]     = vmax(M1, P[1]);
        pN[1]    = M1[prank[1]];
        iNin[1]  = M0;
        oNin[1]  = oM0;
        iNout[1] = M1;
        oNout[1] = oM1;
        M[1]     = N0[prank[0]];
        pM[1]    = N0[prank[0]];
        oM[1]    = oN0[prank[0]];
        K[1]     = vmax(K1, P[1]);
        pK[1]    = K1[prank[1]];
        oK[1]    = oK1[prank[1]];
        nP[1]    = P[1];
        C[2]     = KG;
        rC[2]    = rKG;
        iNin[2]  = K1;
        oNin[2]  = oK1;
        M[2]     = vmax(N0, P[0]);
        pM[2]    = N0[prank[0]];
        oM[2]    = oN0[prank[0]];
        K[2]     = vmax(M1, P[1]);
        pK[2]    = M1[prank[1]];
        oK[2]    = oM1[prank[1]];
        free(N1); free(oN1); /*these are not used for this order*/
        free(K0); free(oK0); /*the rest is freed in destroy*/
    }
    N[2] = pN[2] = -1;       /*not used*/

    /*
       Difference between x-y-z regarding 2d decomposition is whether they are
       distributed along axis 1, 2 or both
     */

    /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
    lsize = std::max(N[0]*M[0]*K[0]*nP[0], std::max(N[1]*M[1]*K[1]*nP[1], C[2]*M[2]*K[2]));
    /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
    if (!(flags&FFT5D_NOMALLOC))
    {
        snew_aligned(lin, lsize, 32);
        snew_aligned(lout, lsize, 32);
        if (nthreads > 1)
        {
            /* We need extra transpose buffers to avoid OpenMP barriers */
            snew_aligned(lout2, lsize, 32);
            snew_aligned(lout3, lsize, 32);
        }
        else
        {
            /* We can reuse the buffers to avoid cache misses */
            lout2 = lin;
            lout3 = lout;
        }
    }
    else
    {
        lin  = *rlin;
        lout = *rlout;
        if (nthreads > 1)
        {
            lout2 = *rlout2;
            lout3 = *rlout3;
        }
        else
        {
            lout2 = lin;
            lout3 = lout;
        }
    }

    plan = (fft5d_plan)calloc(1, sizeof(struct fft5d_plan_t));


    if (debug)
    {
        fprintf(debug, "Running on %d threads\n", nthreads);
    }

#ifdef GMX_FFT_FFTW3                                                            /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
    /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
     * within a parallel region. For now this is deactivated. If it should be supported, it has to be made sure
     * that the execution of the 3d plan happens in a master/serial block (since it contains its own parallel region)
     * and that the 3d plan is faster than the 1d plans.
     */
    if ((!(flags&FFT5D_INPLACE)) && (!(P[0] > 1 || P[1] > 1)) && nthreads == 1) /*don't do 3d plan in parallel or if in_place requested */
    {
        int fftwflags = FFTW_DESTROY_INPUT;
        FFTW(iodim) dims[3];
        int inNG = NG, outMG = MG, outKG = KG;

        FFTW_LOCK;
        if (!(flags&FFT5D_NOMEASURE))
        {
            fftwflags |= FFTW_MEASURE;
        }
        if (flags&FFT5D_REALCOMPLEX)
        {
            if (!(flags&FFT5D_BACKWARD))        /*input pointer is not complex*/
            {
                inNG *= 2;
            }
            else                                /*output pointer is not complex*/
            {
                if (!(flags&FFT5D_ORDER_YZ))
                {
                    outMG *= 2;
                }
                else
                {
                    outKG *= 2;
                }
            }
        }

        if (!(flags&FFT5D_BACKWARD))
        {
            dims[0].n  = KG;
            dims[1].n  = MG;
            dims[2].n  = rNG;

            dims[0].is = inNG*MG;         /*N M K*/
            dims[1].is = inNG;
            dims[2].is = 1;
            if (!(flags&FFT5D_ORDER_YZ))
            {
                dims[0].os = MG;           /*M K N*/
                dims[1].os = 1;
                dims[2].os = MG*KG;
            }
            else
            {
                dims[0].os = 1;           /*K N M*/
                dims[1].os = KG*NG;
                dims[2].os = KG;
            }
        }
        else
        {
            if (!(flags&FFT5D_ORDER_YZ))
            {
                dims[0].n  = NG;
                dims[1].n  = KG;
                dims[2].n  = rMG;

                dims[0].is = 1;
                dims[1].is = NG*MG;
                dims[2].is = NG;

                dims[0].os = outMG*KG;
                dims[1].os = outMG;
                dims[2].os = 1;
            }
            else
            {
                dims[0].n  = MG;
                dims[1].n  = NG;
                dims[2].n  = rKG;

                dims[0].is = NG;
                dims[1].is = 1;
                dims[2].is = NG*MG;

                dims[0].os = outKG*NG;
                dims[1].os = outKG;
                dims[2].os = 1;
            }
        }
#ifdef FFT5D_THREADS
#ifdef FFT5D_FFTW_THREADS
        FFTW(plan_with_nthreads)(nthreads);
#endif
#endif
        if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD))
        {
            plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
                                                         /*howmany*/ 0, /*howmany_dims*/ 0,
                                                         (real*)lin, (FFTW(complex) *) lout,
                                                         /*flags*/ fftwflags);
        }
Code Example #9
t_forcetable make_tables(FILE *out,const output_env_t oenv,
                         const t_forcerec *fr,
			 gmx_bool bVerbose,const char *fn,
			 real rtab,int flags)
{
  const char *fns[3] = { "ctab.xvg", "dtab.xvg", "rtab.xvg" };
  const char *fns14[3] = { "ctab14.xvg", "dtab14.xvg", "rtab14.xvg" };
  FILE        *fp;
  t_tabledata *td;
  gmx_bool        b14only,bReadTab,bGenTab;
  real        x0,y0,yp;
  int         i,j,k,nx,nx0,tabsel[etiNR];
  
  t_forcetable table;

  b14only = (flags & GMX_MAKETABLES_14ONLY);

  if (flags & GMX_MAKETABLES_FORCEUSER) {
    tabsel[etiCOUL] = etabUSER;
    tabsel[etiLJ6]  = etabUSER;
    tabsel[etiLJ12] = etabUSER;
  } else {
    set_table_type(tabsel,fr,b14only);
  }
  snew(td,etiNR);
  table.r         = rtab;
  table.scale     = 0;
  table.n         = 0;
  table.scale_exp = 0;
  nx0             = 10;
  nx              = 0;
  
  /* Check whether we have to read or generate */
  bReadTab = FALSE;
  bGenTab  = FALSE;
  for(i=0; (i<etiNR); i++) {
    if (ETAB_USER(tabsel[i]))
      bReadTab = TRUE;
    if (tabsel[i] != etabUSER)
      bGenTab  = TRUE;
  }
  if (bReadTab) {
    read_tables(out,fn,etiNR,0,td);
    if (rtab == 0 || (flags & GMX_MAKETABLES_14ONLY)) {
      rtab      = td[0].x[td[0].nx-1];
      table.n   = td[0].nx;
      nx        = table.n;
    } else {
      if (td[0].x[td[0].nx-1] < rtab) 
	gmx_fatal(FARGS,"Tables in file %s not long enough for cut-off:\n"
		  "\tshould be at least %f nm\n",fn,rtab);
      nx        = table.n = (int)(rtab*td[0].tabscale + 0.5);
    }
    table.scale = td[0].tabscale;
    nx0         = td[0].nx0;
  }
  if (bGenTab) {
    if (!bReadTab) {
#ifdef GMX_DOUBLE
      table.scale = 2000.0;
#else
      table.scale = 500.0;
#endif
      nx = table.n = rtab*table.scale;
    }
  }
  if (fr->bBHAM) {
    if(fr->bham_b_max!=0)
      table.scale_exp = table.scale/fr->bham_b_max;
    else
      table.scale_exp = table.scale;
  }

  /* Each table type (e.g. coul,lj6,lj12) requires four
   * numbers for each of the nx+1 data points. For performance reasons we want
   * the table data to be 16-byte aligned.
   */
  snew_aligned(table.tab, 12*(nx+1)*sizeof(real),16);

  for(k=0; (k<etiNR); k++) {
    if (tabsel[k] != etabUSER) {
      init_table(out,nx,nx0,
		 (tabsel[k] == etabEXPMIN) ? table.scale_exp : table.scale,
		 &(td[k]),!bReadTab);
      fill_table(&(td[k]),tabsel[k],fr);
      if (out) 
	fprintf(out,"%s table with %d data points for %s%s.\n"
		"Tabscale = %g points/nm\n",
		ETAB_USER(tabsel[k]) ? "Modified" : "Generated",
		td[k].nx,b14only?"1-4 ":"",tprops[tabsel[k]].name,
		td[k].tabscale);
    }
    copy2table(table.n,k*4,12,td[k].x,td[k].v,td[k].f,table.tab);
    
    if (bDebugMode() && bVerbose) {
      if (b14only)
	fp=xvgropen(fns14[k],fns14[k],"r","V",oenv);
      else
	fp=xvgropen(fns[k],fns[k],"r","V",oenv);
      /* plot the output 5 times denser than the table data */
      for(i=5*((nx0+1)/2); i<5*table.n; i++) {
	x0 = i*table.r/(5*(table.n-1));
	evaluate_table(table.tab,4*k,12,table.scale,x0,&y0,&yp);
	fprintf(fp,"%15.10e  %15.10e  %15.10e\n",x0,y0,yp);
      }
      gmx_fio_fclose(fp);
    }
    done_tabledata(&(td[k]));
  }
  sfree(td);

  return table;
}
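
The layout behind the 12*(nx+1) allocation in make_tables: each of the three interactions (coul, lj6, lj12) stores four spline numbers per table point, so the stride per point is 12 and interaction k starts at column 4*k, which is exactly how copy2table and evaluate_table are called above. A small index-arithmetic sketch, assuming the usual etiCOUL/etiLJ6/etiLJ12 ordering and the Y/F/G/H coefficient layout named GMX_TABLE_FORMAT_CUBICSPLINE_YFGH in the later examples:

/* Sketch only: index arithmetic for the packed table of example #9.
 * stride = formatsize (4) * ninteractions (3) = 12 reals per table point;
 * interaction k occupies columns 4*k .. 4*k+3 of each point.
 */
static int sketch_table_index(int point, int interaction, int coefficient)
{
    const int formatsize    = 4;   /* Y, F, G, H per point (assumed layout) */
    const int ninteractions = 3;   /* coul, lj6, lj12                       */
    const int stride        = formatsize*ninteractions;

    return point*stride + interaction*formatsize + coefficient;
}

/* e.g. the F coefficient of the lj6 table at point i would be read as
 *     table.tab[sketch_table_index(i, 1, 1)]
 * assuming etiLJ6 == 1, which is an assumption of this sketch.
 */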
Code Example #10
File: tables.c Project: martinhoefling/gromacs
t_forcetable make_atf_table(FILE *out,const output_env_t oenv,
			    const t_forcerec *fr,
			    const char *fn,
                            matrix box)
{
	const char *fns[3] = { "tf_tab.xvg", "atfdtab.xvg", "atfrtab.xvg" };
	FILE        *fp;
	t_tabledata *td;
	real        x0,y0,yp,rtab;
	int         i,nx,nx0;
        real        rx, ry, rz, box_r;
	
	t_forcetable table;
	
	
	/* Set the table dimensions for ATF, not really necessary to
	 * use etiNR (since we only have one table, but ...) 
	 */
	snew(td,1);
        
        if (fr->adress_type == eAdressSphere){
            /* take half box diagonal direction as tab range */
               rx = 0.5*box[0][0]+0.5*box[1][0]+0.5*box[2][0];
               ry = 0.5*box[0][1]+0.5*box[1][1]+0.5*box[2][1];
               rz = 0.5*box[0][2]+0.5*box[1][2]+0.5*box[2][2];
               box_r = sqrt(rx*rx+ry*ry+rz*rz);
               
        }else{
            /* xsplit: take half box x direction as tab range */
               box_r        = box[0][0]/2;
        }
        table.r         = box_r;
	table.scale     = 0;
	table.n         = 0;
	table.scale_exp = 0;
	nx0             = 10;
	nx              = 0;
	
        read_tables(out,fn,1,0,td);
        rtab      = td[0].x[td[0].nx-1];

       if (fr->adress_type == eAdressXSplit && (rtab < box[0][0]/2)){
           gmx_fatal(FARGS,"AdResS full box therm force table in file %s extends to %f:\n"
                        "\tit should extend to at least half the length of the box in the x-direction, "
                        "%f\n",fn,rtab, box[0][0]/2);
       }
       if (rtab < box_r){
               gmx_fatal(FARGS,"AdResS full box therm force table in file %s extends to %f:\n"
                "\tfor spherical AdResS it should extend to at least "
                "%f (= the distance from the center to the furthermost point in the box)\n",fn,rtab, box_r);
       }


        table.n   = td[0].nx;
        nx        = table.n;
        table.scale = td[0].tabscale;
        nx0         = td[0].nx0;

	/* Each table type (e.g. coul,lj6,lj12) requires four
	 * numbers per data point. For performance reasons we want
	 * the table data to be 16-byte aligned. This is accomplished
	 * by allocating 16 bytes extra to a temporary pointer, and then
	 * calculating an aligned pointer. This new pointer must not be
	 * used in a free() call, but thankfully we're sloppy enough not
	 * to do this :-)
	 */
	
    snew_aligned(table.data,4*nx,16);

	copy2table(table.n,0,4,td[0].x,td[0].v,td[0].f,1.0,table.data);
	
	if(bDebugMode())
	  {
	    fp=xvgropen(fns[0],fns[0],"r","V",oenv);
	    /* plot the output 5 times denser than the table data */
	    /* for(i=5*nx0;i<5*table.n;i++) */
	   
            for(i=5*((nx0+1)/2); i<5*table.n; i++)
	      {
		/* x0=i*table.r/(5*table.n); */
		x0 = i*table.r/(5*(table.n-1));
		evaluate_table(table.data,0,4,table.scale,x0,&y0,&yp);
		fprintf(fp,"%15.10e  %15.10e  %15.10e\n",x0,y0,yp);
		
	      }
	    ffclose(fp);
	  }

	done_tabledata(&(td[0]));
	sfree(td);

    table.interaction   = GMX_TABLE_INTERACTION_ELEC_VDWREP_VDWDISP;
    table.format        = GMX_TABLE_FORMAT_CUBICSPLINE_YFGH;
    table.formatsize    = 4;
    table.ninteractions = 3;
    table.stride        = table.formatsize*table.ninteractions;

	
	return table;
}
Code Example #11
File: tables.c Project: martinhoefling/gromacs
t_forcetable make_gb_table(FILE *out,const output_env_t oenv,
                           const t_forcerec *fr,
                           const char *fn,
                           real rtab)
{
	const char *fns[3] = { "gbctab.xvg", "gbdtab.xvg", "gbrtab.xvg" };
	const char *fns14[3] = { "gbctab14.xvg", "gbdtab14.xvg", "gbrtab14.xvg" };
	FILE        *fp;
	t_tabledata *td;
	gmx_bool        bReadTab,bGenTab;
	real        x0,y0,yp;
	int         i,j,k,nx,nx0,tabsel[etiNR];
	double      r,r2,Vtab,Ftab,expterm;
	
	t_forcetable table;
	
	double abs_error_r, abs_error_r2;
	double rel_error_r, rel_error_r2;
	double rel_error_r_old=0, rel_error_r2_old=0;
	double x0_r_error, x0_r2_error;
	
	
	/* Only set a Coulomb table for GB */
	/* 
	 tabsel[0]=etabGB;
	 tabsel[1]=-1;
	 tabsel[2]=-1;
	*/
	
	/* Set the table dimensions for GB, not really necessary to
	 * use etiNR (since we only have one table, but ...) 
	 */
	snew(td,1);
    table.interaction   = GMX_TABLE_INTERACTION_ELEC;
    table.format        = GMX_TABLE_FORMAT_CUBICSPLINE_YFGH;
	table.r             = fr->gbtabr;
	table.scale         = fr->gbtabscale;
	table.scale_exp     = 0;
	table.n             = table.scale*table.r;
    table.formatsize    = 4;
    table.ninteractions = 1;
    table.stride        = table.formatsize*table.ninteractions;
	nx0                 = 0;
	nx                  = table.scale*table.r;
	
	/* Check whether we have to read or generate.
	 * We will always generate a table, so the read code has been removed
	 * (compare with the original make_tables function).
	 */
	bReadTab = FALSE;
	bGenTab  = TRUE;
	
	/* Each table type (e.g. coul,lj6,lj12) requires four
	 * numbers per data point. For performance reasons we want
	 * the table data to be 16-byte aligned. This is accomplished
	 * by allocating 16 bytes extra to a temporary pointer, and then
	 * calculating an aligned pointer. This new pointer must not be
	 * used in a free() call, but thankfully we're sloppy enough not
	 * to do this :-)
	 */
	
	snew_aligned(table.data,4*nx,16);
	
	init_table(out,nx,nx0,table.scale,&(td[0]),!bReadTab);
	
	/* Local implementation so we don't have to use the etabGB
	 * enum above, which would cause problems later when
	 * making the other tables (right now, even though we are using
	 * GB, the normal Coulomb tables will be created, but this
	 * would cause a problem since fr->eeltype==etabGB is not
	 * handled in fill_table and set_table_type).
	 */
	
	for(i=nx0;i<nx;i++)
    {
		Vtab    = 0.0;
		Ftab    = 0.0;
		r       = td->x[i];
		r2      = r*r;
		expterm = exp(-0.25*r2);
		
		Vtab = 1/sqrt(r2+expterm);
		Ftab = (r-0.25*r*expterm)/((r2+expterm)*sqrt(r2+expterm));
		
		/* Convert to single precision when we store to mem */
		td->v[i]  = Vtab;
		td->f[i]  = Ftab;
		
    }
	
	copy2table(table.n,0,4,td[0].x,td[0].v,td[0].f,1.0,table.data);
	
	if(bDebugMode())
    {
		fp=xvgropen(fns[0],fns[0],"r","V",oenv);
		/* plot the output 5 times denser than the table data */
		/* for(i=5*nx0;i<5*table.n;i++) */
		for(i=nx0;i<table.n;i++)
		{
			/* x0=i*table.r/(5*table.n); */
			x0=i*table.r/table.n;
			evaluate_table(table.data,0,4,table.scale,x0,&y0,&yp);
			fprintf(fp,"%15.10e  %15.10e  %15.10e\n",x0,y0,yp);
			
		}
		gmx_fio_fclose(fp);
    }
	
	/*
	 for(i=100*nx0;i<99.81*table.n;i++)
	 {
	 r = i*table.r/(100*table.n);
	 r2      = r*r;
	 expterm = exp(-0.25*r2);
	 
	 Vtab = 1/sqrt(r2+expterm);
	 Ftab = (r-0.25*r*expterm)/((r2+expterm)*sqrt(r2+expterm));
	 
	 
	 evaluate_table(table.data,0,4,table.scale,r,&y0,&yp);
	 printf("gb: i=%d, x0=%g, y0=%15.15f, Vtab=%15.15f, yp=%15.15f, Ftab=%15.15f\n",i,r, y0, Vtab, yp, Ftab);
	 
	 abs_error_r=fabs(y0-Vtab);
	 abs_error_r2=fabs(yp-(-1)*Ftab);
	 
	 rel_error_r=abs_error_r/y0;
	 rel_error_r2=fabs(abs_error_r2/yp);
	 
	 
	 if(rel_error_r>rel_error_r_old)
	 {
	 rel_error_r_old=rel_error_r;
	 x0_r_error=x0;
	 }
	 
	 if(rel_error_r2>rel_error_r2_old)
	 {
	 rel_error_r2_old=rel_error_r2;
	 x0_r2_error=x0;	
	 }
	 }
	 
	 printf("gb: MAX REL ERROR IN R=%15.15f, MAX REL ERROR IN R2=%15.15f\n",rel_error_r_old, rel_error_r2_old);
	 printf("gb: XO_R=%g, X0_R2=%g\n",x0_r_error, x0_r2_error);
	 
	 exit(1); */
	done_tabledata(&(td[0]));
	sfree(td);
	
	return table;
	
	
}
Code Example #12
File: tables.c Project: martinhoefling/gromacs
t_forcetable make_tables(FILE *out,const output_env_t oenv,
                         const t_forcerec *fr,
			 gmx_bool bVerbose,const char *fn,
			 real rtab,int flags)
{
  const char *fns[3] = { "ctab.xvg", "dtab.xvg", "rtab.xvg" };
  const char *fns14[3] = { "ctab14.xvg", "dtab14.xvg", "rtab14.xvg" };
  FILE        *fp;
  t_tabledata *td;
  gmx_bool        b14only,bReadTab,bGenTab;
  real        x0,y0,yp;
  int         i,j,k,nx,nx0,tabsel[etiNR];
  real        scalefactor;

  t_forcetable table;

  b14only = (flags & GMX_MAKETABLES_14ONLY);

  if (flags & GMX_MAKETABLES_FORCEUSER) {
    tabsel[etiCOUL] = etabUSER;
    tabsel[etiLJ6]  = etabUSER;
    tabsel[etiLJ12] = etabUSER;
  } else {
    set_table_type(tabsel,fr,b14only);
  }
  snew(td,etiNR);
  table.r         = rtab;
  table.scale     = 0;
  table.n         = 0;
  table.scale_exp = 0;
  nx0             = 10;
  nx              = 0;
  
  table.interaction   = GMX_TABLE_INTERACTION_ELEC_VDWREP_VDWDISP;
  table.format        = GMX_TABLE_FORMAT_CUBICSPLINE_YFGH;
  table.formatsize    = 4;
  table.ninteractions = 3;
  table.stride        = table.formatsize*table.ninteractions;

  /* Check whether we have to read or generate */
  bReadTab = FALSE;
  bGenTab  = FALSE;
  for(i=0; (i<etiNR); i++) {
    if (ETAB_USER(tabsel[i]))
      bReadTab = TRUE;
    if (tabsel[i] != etabUSER)
      bGenTab  = TRUE;
  }
  if (bReadTab) {
    read_tables(out,fn,etiNR,0,td);
    if (rtab == 0 || (flags & GMX_MAKETABLES_14ONLY)) {
      rtab      = td[0].x[td[0].nx-1];
      table.n   = td[0].nx;
      nx        = table.n;
    } else {
      if (td[0].x[td[0].nx-1] < rtab) 
	gmx_fatal(FARGS,"Tables in file %s not long enough for cut-off:\n"
		  "\tshould be at least %f nm\n",fn,rtab);
      nx        = table.n = (int)(rtab*td[0].tabscale + 0.5);
    }
    table.scale = td[0].tabscale;
    nx0         = td[0].nx0;
  }
  if (bGenTab) {
    if (!bReadTab) {
#ifdef GMX_DOUBLE
      table.scale = 2000.0;
#else
      table.scale = 500.0;
#endif
      nx = table.n = rtab*table.scale;
    }
  }
  if (fr->bBHAM) {
    if(fr->bham_b_max!=0)
      table.scale_exp = table.scale/fr->bham_b_max;
    else
      table.scale_exp = table.scale;
  }

  /* Each table type (e.g. coul,lj6,lj12) requires four
   * numbers for each of the nx+1 data points. For performance reasons we want
   * the table data to be 16-byte aligned.
   */
  snew_aligned(table.data, 12*(nx+1)*sizeof(real),16);

  for(k=0; (k<etiNR); k++) {
    if (tabsel[k] != etabUSER) {
      init_table(out,nx,nx0,
		 (tabsel[k] == etabEXPMIN) ? table.scale_exp : table.scale,
		 &(td[k]),!bReadTab);
      fill_table(&(td[k]),tabsel[k],fr);
      if (out) 
	fprintf(out,"%s table with %d data points for %s%s.\n"
		"Tabscale = %g points/nm\n",
		ETAB_USER(tabsel[k]) ? "Modified" : "Generated",
		td[k].nx,b14only?"1-4 ":"",tprops[tabsel[k]].name,
		td[k].tabscale);
    }

    /* Set scalefactor for c6/c12 tables. This is because we save flops in the non-table kernels
     * by including the derivative constants (6.0 or 12.0) in the parameters, since
     * we no longer calculate force in most steps. This means the c6/c12 parameters
     * have been scaled up, so we need to scale down the table interactions too.
     * It comes here since we need to scale user tables too.
     */
      if(k==etiLJ6)
      {
          scalefactor = 1.0/6.0;
      }
      else if(k==etiLJ12 && tabsel[k]!=etabEXPMIN)
      {
          scalefactor = 1.0/12.0;
      }
      else
      {
          scalefactor = 1.0;
      }

    copy2table(table.n,k*4,12,td[k].x,td[k].v,td[k].f,scalefactor,table.data);
    
    if (bDebugMode() && bVerbose) {
      if (b14only)
	fp=xvgropen(fns14[k],fns14[k],"r","V",oenv);
      else
	fp=xvgropen(fns[k],fns[k],"r","V",oenv);
      /* plot the output 5 times denser than the table data */
      for(i=5*((nx0+1)/2); i<5*table.n; i++) {
	x0 = i*table.r/(5*(table.n-1));
	evaluate_table(table.data,4*k,12,table.scale,x0,&y0,&yp);
	fprintf(fp,"%15.10e  %15.10e  %15.10e\n",x0,y0,yp);
      }
      gmx_fio_fclose(fp);
    }
    done_tabledata(&(td[k]));
  }
  sfree(td);

  return table;
}
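
To see why the lj6/lj12 tables in example #12 get scale factors of 1/6 and 1/12: the non-table kernels fold the derivative constants into the parameters, so they effectively carry 6*C6 and 12*C12, and scaling the tabulated values down makes the products come out right again (the Buckingham exponential repulsion, etabEXPMIN, is excluded above). A purely numeric sanity check of that cancellation, a sketch using a schematic r^-6/r^-12 shape rather than the real tabulated functions:

/* Sketch only: the 1/6 and 1/12 scale factors cancel the 6*C6 / 12*C12
 * pre-scaling described in the comment of example #12. The r^-6 and r^-12
 * shapes are schematic stand-ins for the tabulated values.
 */
#include <math.h>
#include <stdio.h>

int main(void)
{
    double c6 = 1.5e-3, c12 = 2.0e-6, r = 0.4;

    double v_disp = -pow(r, -6.0);    /* unscaled dispersion table value  */
    double v_rep  =  pow(r, -12.0);   /* unscaled repulsion table value   */

    /* The kernel parameters carry the derivative constants ...           */
    double c6_scaled  = 6.0*c6;
    double c12_scaled = 12.0*c12;

    /* ... so the tables carry 1/6 and 1/12, and the products still agree */
    printf("dispersion: %g vs %g\n", c6_scaled*(v_disp/6.0), -c6*pow(r, -6.0));
    printf("repulsion : %g vs %g\n", c12_scaled*(v_rep/12.0), c12*pow(r, -12.0));
    return 0;
}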