//! This function is documented in the header file void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_ocl_t *nb, const nbnxn_atomdata_t *nbatom) { cl_atomdata_t *adat = nb->atdat; cl_command_queue ls = nb->stream[eintLocal]; /* only if we have a dynamic box */ if (nbatom->bDynamicBox || !adat->bShiftVecUploaded) { ocl_copy_H2D_async(adat->shift_vec, nbatom->shift_vec, 0, SHIFTS * adat->shift_vec_elem_size, ls, nullptr); adat->bShiftVecUploaded = CL_TRUE; } }
/*! \brief Launch GPU kernel As we execute nonbonded workload in separate queues, before launching the kernel we need to make sure that he following operations have completed: - atomdata allocation and related H2D transfers (every nstlist step); - pair list H2D transfer (every nstlist step); - shift vector H2D transfer (every nstlist step); - force (+shift force and energy) output clearing (every step). These operations are issued in the local queue at the beginning of the step and therefore always complete before the local kernel launch. The non-local kernel is launched after the local on the same device/context, so this is inherently scheduled after the operations in the local stream (including the above "misc_ops"). However, for the sake of having a future-proof implementation, we use the misc_ops_done event to record the point in time when the above operations are finished and synchronize with this event in the non-local stream. */ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, const struct nbnxn_atomdata_t *nbatom, int flags, int iloc) { cl_int cl_error; int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ /* OpenCL kernel launch-related stuff */ int shmem; size_t local_work_size[3], global_work_size[3]; cl_kernel nb_kernel = NULL; /* fn pointer to the nonbonded kernel */ cl_atomdata_t *adat = nb->atdat; cl_nbparam_t *nbp = nb->nbparam; cl_plist_t *plist = nb->plist[iloc]; cl_timers_t *t = nb->timers; cl_command_queue stream = nb->stream[iloc]; bool bCalcEner = flags & GMX_FORCE_ENERGY; int bCalcFshift = flags & GMX_FORCE_VIRIAL; bool bDoTime = nb->bDoTime; cl_uint arg_no; cl_nbparam_params_t nbparams_params; #ifdef DEBUG_OCL float * debug_buffer_h; size_t debug_buffer_size; #endif /* turn energy calculation always on/off (for debugging/testing only) */ bCalcEner = (bCalcEner || always_ener) && !never_ener; /* Don't launch the non-local kernel if there is no work to do. Doing the same for the local kernel is more complicated, since the local part of the force array also depends on the non-local kernel. So to avoid complicating the code and to reduce the risk of bugs, we always call the local kernel, the local x+q copy and later (not in this function) the stream wait, local f copyback and the f buffer clearing. All these operations, except for the local interaction kernel, are needed for the non-local interactions. The skip of the local kernel call is taken care of later in this function. */ if (iloc == eintNonlocal && plist->nsci == 0) { return; } /* calculate the atom data index range based on locality */ if (LOCAL_I(iloc)) { adat_begin = 0; adat_len = adat->natoms_local; } else { adat_begin = adat->natoms_local; adat_len = adat->natoms - adat->natoms_local; } /* beginning of timed HtoD section */ /* HtoD x, q */ ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4, adat_len * sizeof(float) * 4, stream, bDoTime ? (&(t->nb_h2d[iloc])) : NULL); /* When we get here all misc operations issues in the local stream as well as the local xq H2D are done, so we record that in the local stream and wait for it in the nonlocal one. */ if (nb->bUseTwoStreams) { if (iloc == eintLocal) { #ifdef CL_VERSION_1_2 cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_and_local_H2D_done)); #else cl_error = clEnqueueMarker(stream, &(nb->misc_ops_and_local_H2D_done)); #endif assert(CL_SUCCESS == cl_error); /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed * in the local stream in order to be able to sync with the above event * from the non-local stream. */ cl_error = clFlush(stream); assert(CL_SUCCESS == cl_error); } else { sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done)); } } if (plist->nsci == 0) { /* Don't launch an empty local kernel (is not allowed with OpenCL). * TODO: Separate H2D and kernel launch into separate functions. */ return; } /* beginning of timed nonbonded calculation section */ /* get the pointer to the kernel flavor we need to use */ nb_kernel = select_nbnxn_kernel(nb, nbp->eeltype, nbp->vdwtype, bCalcEner, plist->bDoPrune || always_prune); /* kernel launch config */ local_work_size[0] = CL_SIZE; local_work_size[1] = CL_SIZE; local_work_size[2] = 1; global_work_size[0] = plist->nsci * local_work_size[0]; global_work_size[1] = 1 * local_work_size[1]; global_work_size[2] = 1 * local_work_size[2]; validate_global_work_size(global_work_size, 3, nb->dev_info); shmem = calc_shmem_required(); #ifdef DEBUG_OCL { static int run_step = 1; if (DEBUG_RUN_STEP == run_step) { debug_buffer_size = global_work_size[0] * global_work_size[1] * global_work_size[2] * sizeof(float); debug_buffer_h = (float*)calloc(1, debug_buffer_size); assert(NULL != debug_buffer_h); if (NULL == nb->debug_buffer) { nb->debug_buffer = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, debug_buffer_size, debug_buffer_h, &cl_error); assert(CL_SUCCESS == cl_error); } } run_step++; } #endif if (debug) { fprintf(debug, "GPU launch configuration:\n\tLocal work size: %dx%dx%d\n\t" "Global work size : %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n", (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]), (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*NCL_PER_SUPERCL, NCL_PER_SUPERCL, plist->na_c); } fillin_ocl_structures(nbp, &nbparams_params); arg_no = 0; cl_error = clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &(adat->ntypes)); cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(nbparams_params), &(nbparams_params)); cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->xq)); cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->f)); cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_lj)); cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_el)); cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->fshift)); cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->atom_types)); cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec)); cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_climg2d)); cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_comb_climg2d)); cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->coulomb_tab_climg2d)); cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->sci)); cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->cj4)); cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->excl)); cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &bCalcFshift); cl_error |= clSetKernelArg(nb_kernel, arg_no++, shmem, NULL); cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nb->debug_buffer)); assert(cl_error == CL_SUCCESS); if (cl_error) { printf("ClERROR! %d\n", cl_error); } cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? &(t->nb_k[iloc]) : NULL); assert(cl_error == CL_SUCCESS); #ifdef DEBUG_OCL { static int run_step = 1; if (DEBUG_RUN_STEP == run_step) { FILE *pf; char file_name[256] = {0}; ocl_copy_D2H_async(debug_buffer_h, nb->debug_buffer, 0, debug_buffer_size, stream, NULL); // Make sure all data has been transfered back from device clFinish(stream); printf("\nWriting debug_buffer to debug_buffer_ocl.txt..."); sprintf(file_name, "debug_buffer_ocl_%d.txt", DEBUG_RUN_STEP); pf = fopen(file_name, "wt"); assert(pf != NULL); fprintf(pf, "%20s", ""); for (int j = 0; j < global_work_size[0]; j++) { char label[20]; sprintf(label, "(wIdx=%2d thIdx=%2d)", j / local_work_size[0], j % local_work_size[0]); fprintf(pf, "%20s", label); } for (int i = 0; i < global_work_size[1]; i++) { char label[20]; sprintf(label, "(wIdy=%2d thIdy=%2d)", i / local_work_size[1], i % local_work_size[1]); fprintf(pf, "\n%20s", label); for (int j = 0; j < global_work_size[0]; j++) { fprintf(pf, "%20.5f", debug_buffer_h[i * global_work_size[0] + j]); } //fprintf(pf, "\n"); } fclose(pf); printf(" done.\n"); free(debug_buffer_h); debug_buffer_h = NULL; } run_step++; } #endif }
//! This function is documented in the header file void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t *nb, const nbnxn_atomdata_t *nbat) { cl_int cl_error; int nalloc, natoms; bool realloced; bool bDoTime = nb->bDoTime == CL_TRUE; cl_timers_t *timers = nb->timers; cl_atomdata_t *d_atdat = nb->atdat; cl_command_queue ls = nb->stream[eintLocal]; natoms = nbat->natoms; realloced = false; if (bDoTime) { /* time async copy */ timers->atdat.openTimingRegion(ls); } /* need to reallocate if we have to copy more atoms than the amount of space available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */ if (natoms > d_atdat->nalloc) { nalloc = over_alloc_small(natoms); /* free up first if the arrays have already been initialized */ if (d_atdat->nalloc != -1) { freeDeviceBuffer(&d_atdat->f); freeDeviceBuffer(&d_atdat->xq); freeDeviceBuffer(&d_atdat->lj_comb); freeDeviceBuffer(&d_atdat->atom_types); } d_atdat->f_elem_size = sizeof(rvec); // TODO: handle errors, check clCreateBuffer flags d_atdat->f = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE, nalloc * d_atdat->f_elem_size, nullptr, &cl_error); assert(CL_SUCCESS == cl_error); // TODO: change the flag to read-only d_atdat->xq = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE, nalloc * sizeof(cl_float4), nullptr, &cl_error); assert(CL_SUCCESS == cl_error); // TODO: handle errors, check clCreateBuffer flags if (useLjCombRule(nb->nbparam->vdwtype)) { // TODO: change the flag to read-only d_atdat->lj_comb = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE, nalloc * sizeof(cl_float2), nullptr, &cl_error); assert(CL_SUCCESS == cl_error); // TODO: handle errors, check clCreateBuffer flags } else { // TODO: change the flag to read-only d_atdat->atom_types = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE, nalloc * sizeof(int), nullptr, &cl_error); assert(CL_SUCCESS == cl_error); // TODO: handle errors, check clCreateBuffer flags } d_atdat->nalloc = nalloc; realloced = true; } d_atdat->natoms = natoms; d_atdat->natoms_local = nbat->natoms_local; /* need to clear GPU f output if realloc happened */ if (realloced) { nbnxn_ocl_clear_f(nb, nalloc); } if (useLjCombRule(nb->nbparam->vdwtype)) { ocl_copy_H2D_async(d_atdat->lj_comb, nbat->lj_comb, 0, natoms*sizeof(cl_float2), ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr); } else { ocl_copy_H2D_async(d_atdat->atom_types, nbat->type, 0, natoms*sizeof(int), ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr); } if (bDoTime) { timers->atdat.closeTimingRegion(ls); } /* kick off the tasks enqueued above to ensure concurrency with the search */ cl_error = clFlush(ls); assert(CL_SUCCESS == cl_error); }