Esempio n. 1
0
/*! \brief Tabulates the Ewald Coulomb force and initializes the size/scale
 * and the table GPU array.
 *
 * If called with an already allocated table, it just re-uploads the
 * table.
 */
static void init_ewald_coulomb_force_table(const interaction_const_t       *ic,
                                           cl_nbparam_t                    *nbp,
                                           const gmx_device_runtime_data_t *runData)
{
    cl_mem       coul_tab;

    cl_int       cl_error;

    if (nbp->coulomb_tab_climg2d != nullptr)
    {
        freeDeviceBuffer(&(nbp->coulomb_tab_climg2d));
    }

    /* Switched from using textures to using buffers */
    // TODO: decide which alternative is most efficient - textures or buffers.
    /*
       cl_image_format array_format;

       array_format.image_channel_data_type = CL_FLOAT;
       array_format.image_channel_order     = CL_R;

       coul_tab = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
       &array_format, tabsize, 1, 0, ftmp, &cl_error);
     */

    coul_tab = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ic->tabq_size*sizeof(cl_float), ic->tabq_coul_F, &cl_error);
    assert(cl_error == CL_SUCCESS);
    // TODO: handle errors, check clCreateBuffer flags

    nbp->coulomb_tab_climg2d  = coul_tab;
    nbp->coulomb_tab_scale    = ic->tabq_scale;
}
Esempio n. 2
0
// ****************************************************************************
// Function: RunBenchmark
//
// Purpose: 
//   Runs the stablity test. The algorithm for the parallel
//   version of the test, which enables testing of an entire GPU
//   cluster at the same time, is as follows. Each participating node
//   first allocates its data, while node zero additionally determines
//   start and finish times based on a user input parameter. All nodes
//   then enter the outermost loop, copying fresh data from the CPU
//   before entering the core of the test. In the core, each node
//   performs a loop consisting of the forward kernel, a potential
//   check, and then the inverse kernel. After performing a configurable
//   number of forward/inverse iterations, along with a configurable
//   number of checks, each node sends the number of failures it
//   encountered to node zero. Node zero collects and reports the error
//   counts, determines whether the test has run its course, and
//   broadcasts the decision. If the decision is to proceed, each node
//   begins the next iteration of the outer loop, copying fresh data and
//   then performing the kernels and checks of the core loop.
//
// Arguments:
//   resultDB: the benchmark stores its results in this ResultDatabase
//   op: the options parser / parameter database
//
// Returns:  nothing
//
// Programmer: Collin McCurdy
// Creation: September 08, 2009
//
// Modifications:
//
// ****************************************************************************
void RunBenchmark(ResultDatabase &resultDB, OptionParser& op)
{	
    int mpi_rank, mpi_size, node_rank;
    int i, j;
    float2* source, * result;
    void* work, * chk;

#ifdef PARALLEL
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);

    NodeInfo NI;
    node_rank = NI.nodeRank();

    cout << "MPI Task " << mpi_rank << " of " << mpi_size 
         << " (noderank=" << node_rank << ") starting....\n";
#else
    mpi_rank = 0;
    mpi_size = 1;
    node_rank = 0;
#endif
   
    // ensure chk buffer alloc succeeds before grabbing the
    // rest of available memory.
    allocDeviceBuffer(&chk, 1);
    unsigned long avail_bytes = findAvailBytes();
    // unsigned long avail_bytes = 1024*1024*1024-1;
    
    // now determine how much available memory will be used (subject
    // to CUDA's constraint on the maximum block dimension size)
    int blocks = avail_bytes / (512*sizeof(float2));
    int slices = 1;
    while (blocks/slices > 65535) {
        slices *= 2;
    }
    int half_n_ffts = ((blocks/slices)*slices)/2;
    int n_ffts = half_n_ffts * 2;
    fprintf(stderr, "avail_bytes=%ld, blocks=%d, n_ffts=%d\n", 
            avail_bytes, blocks, n_ffts);

    int half_n_cmplx = half_n_ffts * 512;
    unsigned long used_bytes = half_n_cmplx * 2 * sizeof(float2);

    cout << mpi_rank << ": testing " 
         << used_bytes/((double)1024*1024) << " MBs\n";

    // allocate host memory
    source = (float2*)malloc(used_bytes);
    result = (float2*)malloc(used_bytes);

    // alloc device memory
    allocDeviceBuffer(&work, used_bytes);

    // alloc gather buffer
    int* recvbuf = (int*)malloc(mpi_size*sizeof(int));
    
    // compute start and finish times
    time_t start = time(NULL);
    time_t finish = start + (time_t)(op.getOptionInt("time")*60);
    struct tm start_tm, finish_tm;
    localtime_r(&start, &start_tm);
    localtime_r(&finish, &finish_tm);
    if (mpi_rank == 0) {
        printf("start = %s", asctime(&start_tm));
        printf("finish = %s", asctime(&finish_tm));
    }
    
    for (int iter = 0; ; iter++) {
        bool failed = false;
        int errorCount = 0, stop = 0;

        // (re-)init host memory...
        for (i = 0; i < half_n_cmplx; i++) {
            source[i].x = (rand()/(float)RAND_MAX)*2-1;
            source[i].y = (rand()/(float)RAND_MAX)*2-1;
            source[i+half_n_cmplx].x = source[i].x;
            source[i+half_n_cmplx].y = source[i].y;
        }

        // copy to device
        copyToDevice(work, source, used_bytes);
        copyToDevice(chk, &errorCount, 1);

        forward(work, n_ffts);
        if (check(work, chk, half_n_ffts, half_n_cmplx)) {
            fprintf(stderr, "First check failed...");
            failed = true;
        }

        if (!failed) {
            for (i = 1; i <= CHECKS; i++) {
                for (j = 1; j <= ITERS_PER_CHECK; j++) {
                    inverse(work, n_ffts);
                    forward(work, n_ffts);
                }

                if (check(work, chk, half_n_ffts, half_n_cmplx)) {
                    failed = true;
                    break;
                }
            }
        }

        // failing node is responsible for verifying failure, counting
        // errors and reporting count to node 0.
        if (failed) {
            fprintf(stderr, "Failure on node %d, iter %d:", mpi_rank, iter);
            
            // repeat check on CPU
            copyFromDevice(result, work, used_bytes);
            float2* result2 = result + half_n_cmplx;
            for (j = 0; j < half_n_cmplx; j++) {
                if (result[j].x != result2[j].x || 
                    result[j].y != result2[j].y) 
                {
                    errorCount++;
                }
            }
            if (!errorCount) {
                fprintf(stderr, "verification failed!\n");
            }
            else {
                fprintf(stderr, "%d errors\n", errorCount);
            }
        }

#ifdef PARALLEL
        MPI_Gather(&errorCount, 1, MPI_INT, recvbuf, 1, MPI_INT, 
                   0, MPI_COMM_WORLD);
#else
        recvbuf[0] = errorCount;
#endif

        // node 0 collects and reports error counts, determines
        // whether test has run its course, and broadcasts decision
        if (mpi_rank == 0) {
            time_t curtime = time(NULL);
            struct tm curtm;
            localtime_r(&curtime, &curtm);
            fprintf(stderr, "iter=%d: %s", iter, asctime(&curtm));
            
            for (i = 0; i < mpi_size; i++) {
                if (recvbuf[i]) {
                    fprintf(stderr, "--> %d failures on node %d\n", recvbuf[i], i);
                }
            }

            if (curtime > finish) {
                stop = 1;
            }
        }
        
#ifdef PARALLEL
        MPI_Bcast(&stop, 1, MPI_INT, 0, MPI_COMM_WORLD);
#endif

        resultDB.AddResult("Check", "", "Failures", errorCount);
        if (stop) break;
    }
    
    freeDeviceBuffer(work);
    freeDeviceBuffer(chk);

    free(source);
    free(result);
    free(recvbuf);
}
Esempio n. 3
0
//! This function is documented in the header file
void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t               *nb,
                             const nbnxn_atomdata_t        *nbat)
{
    cl_int           cl_error;
    int              nalloc, natoms;
    bool             realloced;
    bool             bDoTime = nb->bDoTime == CL_TRUE;
    cl_timers_t     *timers  = nb->timers;
    cl_atomdata_t   *d_atdat = nb->atdat;
    cl_command_queue ls      = nb->stream[eintLocal];

    natoms    = nbat->natoms;
    realloced = false;

    if (bDoTime)
    {
        /* time async copy */
        timers->atdat.openTimingRegion(ls);
    }

    /* need to reallocate if we have to copy more atoms than the amount of space
       available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
    if (natoms > d_atdat->nalloc)
    {
        nalloc = over_alloc_small(natoms);

        /* free up first if the arrays have already been initialized */
        if (d_atdat->nalloc != -1)
        {
            freeDeviceBuffer(&d_atdat->f);
            freeDeviceBuffer(&d_atdat->xq);
            freeDeviceBuffer(&d_atdat->lj_comb);
            freeDeviceBuffer(&d_atdat->atom_types);
        }

        d_atdat->f_elem_size = sizeof(rvec);

        // TODO: handle errors, check clCreateBuffer flags
        d_atdat->f = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE, nalloc * d_atdat->f_elem_size, nullptr, &cl_error);
        assert(CL_SUCCESS == cl_error);

        // TODO: change the flag to read-only
        d_atdat->xq = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE, nalloc * sizeof(cl_float4), nullptr, &cl_error);
        assert(CL_SUCCESS == cl_error);
        // TODO: handle errors, check clCreateBuffer flags

        if (useLjCombRule(nb->nbparam->vdwtype))
        {
            // TODO: change the flag to read-only
            d_atdat->lj_comb = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE, nalloc * sizeof(cl_float2), nullptr, &cl_error);
            assert(CL_SUCCESS == cl_error);
            // TODO: handle errors, check clCreateBuffer flags
        }
        else
        {
            // TODO: change the flag to read-only
            d_atdat->atom_types = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE, nalloc * sizeof(int), nullptr, &cl_error);
            assert(CL_SUCCESS == cl_error);
            // TODO: handle errors, check clCreateBuffer flags
        }

        d_atdat->nalloc = nalloc;
        realloced       = true;
    }

    d_atdat->natoms       = natoms;
    d_atdat->natoms_local = nbat->natoms_local;

    /* need to clear GPU f output if realloc happened */
    if (realloced)
    {
        nbnxn_ocl_clear_f(nb, nalloc);
    }

    if (useLjCombRule(nb->nbparam->vdwtype))
    {
        ocl_copy_H2D_async(d_atdat->lj_comb, nbat->lj_comb, 0,
                           natoms*sizeof(cl_float2), ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }
    else
    {
        ocl_copy_H2D_async(d_atdat->atom_types, nbat->type, 0,
                           natoms*sizeof(int), ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr);

    }

    if (bDoTime)
    {
        timers->atdat.closeTimingRegion(ls);
    }

    /* kick off the tasks enqueued above to ensure concurrency with the search */
    cl_error = clFlush(ls);
    assert(CL_SUCCESS == cl_error);
}
Esempio n. 4
0
//! This function is documented in the header file
void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb)
{
    if (nb == nullptr)
    {
        return;
    }

    /* Free kernels */
    int kernel_count = sizeof(nb->kernel_ener_noprune_ptr) / sizeof(nb->kernel_ener_noprune_ptr[0][0]);
    free_kernels(nb->kernel_ener_noprune_ptr[0], kernel_count);

    kernel_count = sizeof(nb->kernel_ener_prune_ptr) / sizeof(nb->kernel_ener_prune_ptr[0][0]);
    free_kernels(nb->kernel_ener_prune_ptr[0], kernel_count);

    kernel_count = sizeof(nb->kernel_noener_noprune_ptr) / sizeof(nb->kernel_noener_noprune_ptr[0][0]);
    free_kernels(nb->kernel_noener_noprune_ptr[0], kernel_count);

    kernel_count = sizeof(nb->kernel_noener_prune_ptr) / sizeof(nb->kernel_noener_prune_ptr[0][0]);
    free_kernels(nb->kernel_noener_prune_ptr[0], kernel_count);

    free_kernel(&(nb->kernel_zero_e_fshift));

    /* Free atdat */
    freeDeviceBuffer(&(nb->atdat->xq));
    freeDeviceBuffer(&(nb->atdat->f));
    freeDeviceBuffer(&(nb->atdat->e_lj));
    freeDeviceBuffer(&(nb->atdat->e_el));
    freeDeviceBuffer(&(nb->atdat->fshift));
    freeDeviceBuffer(&(nb->atdat->lj_comb));
    freeDeviceBuffer(&(nb->atdat->atom_types));
    freeDeviceBuffer(&(nb->atdat->shift_vec));
    sfree(nb->atdat);

    /* Free nbparam */
    freeDeviceBuffer(&(nb->nbparam->nbfp_climg2d));
    freeDeviceBuffer(&(nb->nbparam->nbfp_comb_climg2d));
    freeDeviceBuffer(&(nb->nbparam->coulomb_tab_climg2d));
    sfree(nb->nbparam);

    /* Free plist */
    auto *plist = nb->plist[eintLocal];
    freeDeviceBuffer(&plist->sci);
    freeDeviceBuffer(&plist->cj4);
    freeDeviceBuffer(&plist->imask);
    freeDeviceBuffer(&plist->excl);
    sfree(plist);
    if (nb->bUseTwoStreams)
    {
        auto *plist_nl = nb->plist[eintNonlocal];
        freeDeviceBuffer(&plist_nl->sci);
        freeDeviceBuffer(&plist_nl->cj4);
        freeDeviceBuffer(&plist_nl->imask);
        freeDeviceBuffer(&plist_nl->excl);
        sfree(plist_nl);
    }

    /* Free nbst */
    pfree(nb->nbst.e_lj);
    nb->nbst.e_lj = nullptr;

    pfree(nb->nbst.e_el);
    nb->nbst.e_el = nullptr;

    pfree(nb->nbst.fshift);
    nb->nbst.fshift = nullptr;

    /* Free command queues */
    clReleaseCommandQueue(nb->stream[eintLocal]);
    nb->stream[eintLocal] = nullptr;
    if (nb->bUseTwoStreams)
    {
        clReleaseCommandQueue(nb->stream[eintNonlocal]);
        nb->stream[eintNonlocal] = nullptr;
    }
    /* Free other events */
    if (nb->nonlocal_done)
    {
        clReleaseEvent(nb->nonlocal_done);
        nb->nonlocal_done = nullptr;
    }
    if (nb->misc_ops_and_local_H2D_done)
    {
        clReleaseEvent(nb->misc_ops_and_local_H2D_done);
        nb->misc_ops_and_local_H2D_done = nullptr;
    }

    free_gpu_device_runtime_data(nb->dev_rundata);
    sfree(nb->dev_rundata);

    /* Free timers and timings */
    delete nb->timers;
    sfree(nb->timings);
    sfree(nb);

    if (debug)
    {
        fprintf(debug, "Cleaned up OpenCL data structures.\n");
    }
}