/*! \brief Tabulates the Ewald Coulomb force and initializes the size/scale
 *  and the table GPU array.
 *
 *  If called with an already allocated table, it just re-uploads the
 *  table.
 */
static void init_ewald_coulomb_force_table(const interaction_const_t       *ic,
                                           cl_nbparam_t                    *nbp,
                                           const gmx_device_runtime_data_t *runData)
{
    cl_mem coul_tab;
    cl_int cl_error;

    if (nbp->coulomb_tab_climg2d != nullptr)
    {
        freeDeviceBuffer(&(nbp->coulomb_tab_climg2d));
    }

    /* Switched from using textures to using buffers */
    // TODO: decide which alternative is most efficient - textures or buffers.
    /*
       cl_image_format array_format;

       array_format.image_channel_data_type = CL_FLOAT;
       array_format.image_channel_order     = CL_R;

       coul_tab = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
       &array_format, tabsize, 1, 0, ftmp, &cl_error);
     */

    coul_tab = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                              ic->tabq_size*sizeof(cl_float), ic->tabq_coul_F, &cl_error);
    assert(cl_error == CL_SUCCESS);
    // TODO: handle errors, check clCreateBuffer flags

    nbp->coulomb_tab_climg2d = coul_tab;
    nbp->coulomb_tab_scale   = ic->tabq_scale;
}
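/* The TODOs in this file ask for real error handling in place of assert().
 * A minimal sketch of one way to do that, assuming the gmx_fatal() reporting
 * path available elsewhere in GROMACS; the helper name is hypothetical and
 * the block is illustration only, hence the #if 0 guard. */
#if 0
static cl_mem createBufferOrDie(cl_context   context,
                                cl_mem_flags flags,
                                size_t       bytes,
                                void        *hostPtr)
{
    cl_int cl_error;
    cl_mem buf = clCreateBuffer(context, flags, bytes, hostPtr, &cl_error);
    if (cl_error != CL_SUCCESS)
    {
        /* clCreateBuffer reports e.g. CL_INVALID_BUFFER_SIZE or
           CL_OUT_OF_HOST_MEMORY through its last argument. */
        gmx_fatal(FARGS, "clCreateBuffer of %zu bytes failed: OpenCL error %d",
                  bytes, cl_error);
    }
    return buf;
}
#endif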
// ****************************************************************************
// Function: RunBenchmark
//
// Purpose:
//   Runs the stability test. The algorithm for the parallel
//   version of the test, which enables testing of an entire GPU
//   cluster at the same time, is as follows. Each participating node
//   first allocates its data, while node zero additionally determines
//   start and finish times based on a user input parameter. All nodes
//   then enter the outermost loop, copying fresh data from the CPU
//   before entering the core of the test. In the core, each node
//   performs a loop consisting of the forward kernel, a potential
//   check, and then the inverse kernel. After performing a configurable
//   number of forward/inverse iterations, along with a configurable
//   number of checks, each node sends the number of failures it
//   encountered to node zero. Node zero collects and reports the error
//   counts, determines whether the test has run its course, and
//   broadcasts the decision. If the decision is to proceed, each node
//   begins the next iteration of the outer loop, copying fresh data and
//   then performing the kernels and checks of the core loop.
//
// Arguments:
//   resultDB: the benchmark stores its results in this ResultDatabase
//   op: the options parser / parameter database
//
// Returns:  nothing
//
// Programmer: Collin McCurdy
// Creation: September 08, 2009
//
// Modifications:
//
// ****************************************************************************
void RunBenchmark(ResultDatabase &resultDB, OptionParser &op)
{
    int     mpi_rank, mpi_size, node_rank;
    int     i, j;
    float2 *source, *result;
    void   *work, *chk;

#ifdef PARALLEL
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
    NodeInfo NI;
    node_rank = NI.nodeRank();
    cout << "MPI Task " << mpi_rank << " of " << mpi_size
         << " (noderank=" << node_rank << ") starting....\n";
#else
    mpi_rank  = 0;
    mpi_size  = 1;
    node_rank = 0;
#endif

    // ensure chk buffer alloc succeeds before grabbing the
    // rest of available memory.
    allocDeviceBuffer(&chk, 1);
    unsigned long avail_bytes = findAvailBytes();
    // unsigned long avail_bytes = 1024*1024*1024-1;

    // now determine how much available memory will be used (subject
    // to CUDA's constraint on the maximum block dimension size)
    int blocks = avail_bytes / (512*sizeof(float2));
    int slices = 1;
    while (blocks/slices > 65535)
    {
        slices *= 2;
    }
    // Example: with 4 GiB available, blocks = 2^20 and slices doubles up
    // to 32, so that blocks/slices = 32768 stays within the 65535 limit.
    int half_n_ffts = ((blocks/slices)*slices)/2;
    int n_ffts      = half_n_ffts * 2;
    fprintf(stderr, "avail_bytes=%lu, blocks=%d, n_ffts=%d\n",
            avail_bytes, blocks, n_ffts);

    int half_n_cmplx = half_n_ffts * 512;
    unsigned long used_bytes = half_n_cmplx * 2 * sizeof(float2);

    cout << mpi_rank << ": testing " << used_bytes/((double)1024*1024) << " MBs\n";

    // allocate host memory
    source = (float2*)malloc(used_bytes);
    result = (float2*)malloc(used_bytes);

    // alloc device memory
    allocDeviceBuffer(&work, used_bytes);

    // alloc gather buffer
    int *recvbuf = (int*)malloc(mpi_size*sizeof(int));

    // compute start and finish times
    time_t start  = time(NULL);
    time_t finish = start + (time_t)(op.getOptionInt("time")*60);
    struct tm start_tm, finish_tm;
    localtime_r(&start, &start_tm);
    localtime_r(&finish, &finish_tm);
    if (mpi_rank == 0)
    {
        printf("start = %s", asctime(&start_tm));
        printf("finish = %s", asctime(&finish_tm));
    }

    for (int iter = 0; ; iter++)
    {
        bool failed = false;
        int  errorCount = 0, stop = 0;

        // (re-)init host memory: fill the first half with random data in
        // [-1, 1) and mirror it into the second half, so the two halves
        // must stay bitwise identical under identical transforms.
        for (i = 0; i < half_n_cmplx; i++)
        {
            source[i].x = (rand()/(float)RAND_MAX)*2-1;
            source[i].y = (rand()/(float)RAND_MAX)*2-1;
            source[i+half_n_cmplx].x = source[i].x;
            source[i+half_n_cmplx].y = source[i].y;
        }

        // copy to device
        copyToDevice(work, source, used_bytes);
        copyToDevice(chk, &errorCount, 1);

        forward(work, n_ffts);
        if (check(work, chk, half_n_ffts, half_n_cmplx))
        {
            fprintf(stderr, "First check failed...");
            failed = true;
        }

        if (!failed)
        {
            for (i = 1; i <= CHECKS; i++)
            {
                for (j = 1; j <= ITERS_PER_CHECK; j++)
                {
                    inverse(work, n_ffts);
                    forward(work, n_ffts);
                }
                if (check(work, chk, half_n_ffts, half_n_cmplx))
                {
                    failed = true;
                    break;
                }
            }
        }

        // failing node is responsible for verifying failure, counting
        // errors and reporting count to node 0.
        if (failed)
        {
            fprintf(stderr, "Failure on node %d, iter %d:", mpi_rank, iter);

            // repeat check on CPU
            copyFromDevice(result, work, used_bytes);
            float2 *result2 = result + half_n_cmplx;
            for (j = 0; j < half_n_cmplx; j++)
            {
                if (result[j].x != result2[j].x || result[j].y != result2[j].y)
                {
                    errorCount++;
                }
            }
            if (!errorCount)
            {
                fprintf(stderr, "verification failed!\n");
            }
            else
            {
                fprintf(stderr, "%d errors\n", errorCount);
            }
        }

#ifdef PARALLEL
        MPI_Gather(&errorCount, 1, MPI_INT, recvbuf, 1, MPI_INT, 0,
                   MPI_COMM_WORLD);
#else
        recvbuf[0] = errorCount;
#endif

        // node 0 collects and reports error counts, determines
        // whether test has run its course, and broadcasts decision
        if (mpi_rank == 0)
        {
            time_t    curtime = time(NULL);
            struct tm curtm;
            localtime_r(&curtime, &curtm);
            fprintf(stderr, "iter=%d: %s", iter, asctime(&curtm));

            for (i = 0; i < mpi_size; i++)
            {
                if (recvbuf[i])
                {
                    fprintf(stderr, "--> %d failures on node %d\n", recvbuf[i], i);
                }
            }
            if (curtime > finish)
            {
                stop = 1;
            }
        }

#ifdef PARALLEL
        MPI_Bcast(&stop, 1, MPI_INT, 0, MPI_COMM_WORLD);
#endif

        resultDB.AddResult("Check", "", "Failures", errorCount);

        if (stop)
        {
            break;
        }
    }

    freeDeviceBuffer(work);
    freeDeviceBuffer(chk);
    free(source);
    free(result);
    free(recvbuf);
}
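/* The Gather/Bcast pair above is the heart of the parallel protocol described
 * in the header comment. A minimal standalone sketch of the same handshake,
 * with a hypothetical helper name, guarded out of compilation since it is
 * illustration only (assumes <mpi.h> is included and MPI_Init has run): */
#if 0
static int collectAndDecide(int localErrors, int mpi_rank, int mpi_size,
                            time_t finish)
{
    int  stop   = 0;
    int *counts = (int*)malloc(mpi_size * sizeof(int));

    // every rank contributes its error count; rank 0 receives them all
    MPI_Gather(&localErrors, 1, MPI_INT, counts, 1, MPI_INT, 0,
               MPI_COMM_WORLD);

    if (mpi_rank == 0)
    {
        // only rank 0 looks at the per-rank counts ...
        for (int i = 0; i < mpi_size; i++)
        {
            if (counts[i])
            {
                fprintf(stderr, "--> %d failures on rank %d\n", counts[i], i);
            }
        }
        // ... and at the clock, deciding whether the test has run its course
        stop = (time(NULL) > finish) ? 1 : 0;
    }

    // rank 0's decision reaches every rank, so all outer loops exit together
    MPI_Bcast(&stop, 1, MPI_INT, 0, MPI_COMM_WORLD);

    free(counts);
    return stop;
}
#endif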
//! This function is documented in the header file
void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t        *nb,
                             const nbnxn_atomdata_t *nbat)
{
    cl_int           cl_error;
    int              nalloc, natoms;
    bool             realloced;
    bool             bDoTime = nb->bDoTime == CL_TRUE;
    cl_timers_t     *timers  = nb->timers;
    cl_atomdata_t   *d_atdat = nb->atdat;
    cl_command_queue ls      = nb->stream[eintLocal];

    natoms    = nbat->natoms;
    realloced = false;

    if (bDoTime)
    {
        /* time async copy */
        timers->atdat.openTimingRegion(ls);
    }

    /* need to reallocate if we have to copy more atoms than the amount of
       space available and only allocate if we haven't initialized yet,
       i.e. d_atdat->nalloc == -1 */
    if (natoms > d_atdat->nalloc)
    {
        nalloc = over_alloc_small(natoms);

        /* free up first if the arrays have already been initialized */
        if (d_atdat->nalloc != -1)
        {
            freeDeviceBuffer(&d_atdat->f);
            freeDeviceBuffer(&d_atdat->xq);
            freeDeviceBuffer(&d_atdat->lj_comb);
            freeDeviceBuffer(&d_atdat->atom_types);
        }

        d_atdat->f_elem_size = sizeof(rvec);

        // TODO: handle errors, check clCreateBuffer flags
        d_atdat->f = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE,
                                    nalloc*d_atdat->f_elem_size, nullptr, &cl_error);
        assert(CL_SUCCESS == cl_error);

        // TODO: change the flag to read-only
        d_atdat->xq = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE,
                                     nalloc*sizeof(cl_float4), nullptr, &cl_error);
        assert(CL_SUCCESS == cl_error);
        // TODO: handle errors, check clCreateBuffer flags

        if (useLjCombRule(nb->nbparam->vdwtype))
        {
            // TODO: change the flag to read-only
            d_atdat->lj_comb = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE,
                                              nalloc*sizeof(cl_float2), nullptr, &cl_error);
            assert(CL_SUCCESS == cl_error);
            // TODO: handle errors, check clCreateBuffer flags
        }
        else
        {
            // TODO: change the flag to read-only
            d_atdat->atom_types = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE,
                                                 nalloc*sizeof(int), nullptr, &cl_error);
            assert(CL_SUCCESS == cl_error);
            // TODO: handle errors, check clCreateBuffer flags
        }

        d_atdat->nalloc = nalloc;
        realloced       = true;
    }

    d_atdat->natoms       = natoms;
    d_atdat->natoms_local = nbat->natoms_local;

    /* need to clear GPU f output if realloc happened */
    if (realloced)
    {
        nbnxn_ocl_clear_f(nb, nalloc);
    }

    if (useLjCombRule(nb->nbparam->vdwtype))
    {
        ocl_copy_H2D_async(d_atdat->lj_comb, nbat->lj_comb, 0,
                           natoms*sizeof(cl_float2), ls,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }
    else
    {
        ocl_copy_H2D_async(d_atdat->atom_types, nbat->type, 0,
                           natoms*sizeof(int), ls,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }

    if (bDoTime)
    {
        timers->atdat.closeTimingRegion(ls);
    }

    /* kick off the tasks enqueued above to ensure concurrency with the search */
    cl_error = clFlush(ls);
    assert(CL_SUCCESS == cl_error);
}
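/* The reallocation block above implements a grow-only scheme: device buffers
 * are reallocated only when the atom count exceeds the current capacity, and
 * over_alloc_small() pads the new capacity to amortize future growth. A
 * minimal sketch of the same idea in isolation (hypothetical helper name and
 * a made-up 10% headroom factor standing in for over_alloc_small();
 * illustration only): */
#if 0
static void ensureCapacity(cl_context context, cl_mem *buf,
                           int *nalloc, int needed, size_t elemSize)
{
    if (needed <= *nalloc)
    {
        return; /* current buffer is large enough; keep it */
    }
    if (*nalloc != -1)
    {
        clReleaseMemObject(*buf); /* free the old, too-small buffer */
    }
    int    newCapacity = needed + needed/10; /* over-allocate ~10% headroom */
    cl_int cl_error;
    *buf = clCreateBuffer(context, CL_MEM_READ_WRITE,
                          newCapacity*elemSize, nullptr, &cl_error);
    assert(CL_SUCCESS == cl_error);
    *nalloc = newCapacity;
}
#endif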
//! This function is documented in the header file
void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb)
{
    if (nb == nullptr)
    {
        return;
    }

    /* Free kernels */
    int kernel_count = sizeof(nb->kernel_ener_noprune_ptr) / sizeof(nb->kernel_ener_noprune_ptr[0][0]);
    free_kernels(nb->kernel_ener_noprune_ptr[0], kernel_count);

    kernel_count = sizeof(nb->kernel_ener_prune_ptr) / sizeof(nb->kernel_ener_prune_ptr[0][0]);
    free_kernels(nb->kernel_ener_prune_ptr[0], kernel_count);

    kernel_count = sizeof(nb->kernel_noener_noprune_ptr) / sizeof(nb->kernel_noener_noprune_ptr[0][0]);
    free_kernels(nb->kernel_noener_noprune_ptr[0], kernel_count);

    kernel_count = sizeof(nb->kernel_noener_prune_ptr) / sizeof(nb->kernel_noener_prune_ptr[0][0]);
    free_kernels(nb->kernel_noener_prune_ptr[0], kernel_count);

    free_kernel(&(nb->kernel_zero_e_fshift));

    /* Free atdat */
    freeDeviceBuffer(&(nb->atdat->xq));
    freeDeviceBuffer(&(nb->atdat->f));
    freeDeviceBuffer(&(nb->atdat->e_lj));
    freeDeviceBuffer(&(nb->atdat->e_el));
    freeDeviceBuffer(&(nb->atdat->fshift));
    freeDeviceBuffer(&(nb->atdat->lj_comb));
    freeDeviceBuffer(&(nb->atdat->atom_types));
    freeDeviceBuffer(&(nb->atdat->shift_vec));
    sfree(nb->atdat);

    /* Free nbparam */
    freeDeviceBuffer(&(nb->nbparam->nbfp_climg2d));
    freeDeviceBuffer(&(nb->nbparam->nbfp_comb_climg2d));
    freeDeviceBuffer(&(nb->nbparam->coulomb_tab_climg2d));
    sfree(nb->nbparam);

    /* Free plist */
    auto *plist = nb->plist[eintLocal];
    freeDeviceBuffer(&plist->sci);
    freeDeviceBuffer(&plist->cj4);
    freeDeviceBuffer(&plist->imask);
    freeDeviceBuffer(&plist->excl);
    sfree(plist);
    if (nb->bUseTwoStreams)
    {
        auto *plist_nl = nb->plist[eintNonlocal];
        freeDeviceBuffer(&plist_nl->sci);
        freeDeviceBuffer(&plist_nl->cj4);
        freeDeviceBuffer(&plist_nl->imask);
        freeDeviceBuffer(&plist_nl->excl);
        sfree(plist_nl);
    }

    /* Free nbst */
    pfree(nb->nbst.e_lj);
    nb->nbst.e_lj = nullptr;

    pfree(nb->nbst.e_el);
    nb->nbst.e_el = nullptr;

    pfree(nb->nbst.fshift);
    nb->nbst.fshift = nullptr;

    /* Free command queues */
    clReleaseCommandQueue(nb->stream[eintLocal]);
    nb->stream[eintLocal] = nullptr;
    if (nb->bUseTwoStreams)
    {
        clReleaseCommandQueue(nb->stream[eintNonlocal]);
        nb->stream[eintNonlocal] = nullptr;
    }

    /* Free other events */
    if (nb->nonlocal_done)
    {
        clReleaseEvent(nb->nonlocal_done);
        nb->nonlocal_done = nullptr;
    }
    if (nb->misc_ops_and_local_H2D_done)
    {
        clReleaseEvent(nb->misc_ops_and_local_H2D_done);
        nb->misc_ops_and_local_H2D_done = nullptr;
    }

    free_gpu_device_runtime_data(nb->dev_rundata);
    sfree(nb->dev_rundata);

    /* Free timers and timings */
    delete nb->timers;
    sfree(nb->timings);
    sfree(nb);

    if (debug)
    {
        fprintf(debug, "Cleaned up OpenCL data structures.\n");
    }
}