Example #1
     operator cl_event() const
     {
         cl_int err = CL_SUCCESS;
         if(e_.size() == 1)
         {
             return e_[0];
         }
         if(!e_.empty())
         {
             cl_context ctx = get_info<command_queue::context_info_type>(command_queue());
             cl_event e = clCreateUserEvent(ctx, &err);
             OCLM_THROW_IF_EXCEPTION(err, "clCreateUserEvent");
 #ifdef CL_VERSION_1_2
             err = clEnqueueMarkerWithWaitList(command_queue(),
                 static_cast<cl_uint>(e_.size()), &e_[0], &e);
             OCLM_THROW_IF_EXCEPTION(err, "clEnqueueMarkerWithWaitList");
 #else
             get();
             err = clSetUserEventStatus(e, CL_COMPLETE);
 #endif
             return e;
         }
         else
             return cl_event();
     }
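The conversion operator above collapses a vector of events (e_) into a single cl_event: with OpenCL 1.2 headers it enqueues a marker that waits on all of them, otherwise it falls back to a user event that is completed after get() has waited on the list. A minimal standalone sketch of the 1.2 path only; combine_events is an illustrative name and a valid cl_command_queue is assumed:

#include <CL/cl.h>
#include <stdexcept>
#include <vector>

// Hedged sketch: collapse several events into one via a 1.2 marker.
cl_event combine_events(cl_command_queue queue, const std::vector<cl_event> &events)
{
    if (events.size() == 1)
        return events[0];                              // already a single event

    cl_event combined = NULL;
    // With an empty vector this degenerates to "wait on all prior commands".
    cl_int err = clEnqueueMarkerWithWaitList(
        queue, static_cast<cl_uint>(events.size()),
        events.empty() ? NULL : &events[0], &combined);
    if (err != CL_SUCCESS)
        throw std::runtime_error("clEnqueueMarkerWithWaitList failed");
    return combined;                                   // completes once all inputs complete
}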
Example #2
JNIEXPORT jint JNICALL Java_org_lwjgl_opencl_CL12_nclEnqueueMarkerWithWaitList(JNIEnv *env, jclass clazz, jlong command_queue, jint num_events_in_wait_list, jlong event_wait_list, jlong event, jlong function_pointer) {
	const cl_event *event_wait_list_address = (const cl_event *)(intptr_t)event_wait_list;
	cl_event *event_address = (cl_event *)(intptr_t)event;
	clEnqueueMarkerWithWaitListPROC clEnqueueMarkerWithWaitList = (clEnqueueMarkerWithWaitListPROC)((intptr_t)function_pointer);
	cl_int __result = clEnqueueMarkerWithWaitList((cl_command_queue)(intptr_t)command_queue, num_events_in_wait_list, event_wait_list_address, event_address);
	return __result;
}
Example #3
    /// Enqueues a marker after \p events in the queue and returns an
    /// event that can be used to track its progress.
    ///
    /// \opencl_version_warning{1,2}
    event enqueue_marker(const wait_list &events)
    {
        event event_;

        cl_int ret = clEnqueueMarkerWithWaitList(
            m_queue, events.size(), events.get_event_ptr(), &event_.get()
        );

        if(ret != CL_SUCCESS){
            BOOST_THROW_EXCEPTION(opencl_error(ret));
        }

        return event_;
    }
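The snippet appears to come from Boost.Compute (BOOST_THROW_EXCEPTION and opencl_error suggest as much). Assuming that, a hedged usage sketch might look like the following; wait_for_copy and copy_done are illustrative names, not part of the example:

#include <boost/compute/command_queue.hpp>
#include <boost/compute/event.hpp>
#include <boost/compute/utility/wait_list.hpp>

// `queue` and `copy_done` stand in for objects created elsewhere.
void wait_for_copy(boost::compute::command_queue &queue,
                   const boost::compute::event &copy_done)
{
    boost::compute::wait_list deps;
    deps.insert(copy_done);                          // dependency from an earlier enqueue
    boost::compute::event marker = queue.enqueue_marker(deps);
    marker.wait();                                   // returns once copy_done has completed
}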
Example #4
    /// Enqueues a marker in the queue and returns an event that can be
    /// used to track its progress.
    event enqueue_marker()
    {
        event event_;

        #ifdef CL_VERSION_1_2
        cl_int ret = clEnqueueMarkerWithWaitList(m_queue, 0, 0, &event_.get());
        #else
        cl_int ret = clEnqueueMarker(m_queue, &event_.get());
        #endif

        if(ret != CL_SUCCESS){
            BOOST_THROW_EXCEPTION(opencl_error(ret));
        }

        return event_;
    }
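The #ifdef guard above keeps the code building against pre-1.2 headers, where only the (later deprecated) clEnqueueMarker is available. A small hedged helper capturing the same fallback in plain C; enqueue_marker_compat is an illustrative name:

#include <CL/cl.h>

/* Enqueue a marker that completes after all previously enqueued commands,
 * whether the headers are OpenCL 1.1 or 1.2+. Illustrative helper only. */
static cl_int enqueue_marker_compat(cl_command_queue queue, cl_event *marker)
{
#ifdef CL_VERSION_1_2
    /* 1.2+: an empty wait list means "wait on all prior commands" */
    return clEnqueueMarkerWithWaitList(queue, 0, NULL, marker);
#else
    /* 1.1: clEnqueueMarker has the same "all prior commands" semantics */
    return clEnqueueMarker(queue, marker);
#endif
}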
Example #5
/*! \brief
 * Asynchronously launches the download of nonbonded forces from the GPU
 * (and of energies/shift forces if required).
 */
void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t               *nb,
                              const struct nbnxn_atomdata_t *nbatom,
                              int                            flags,
                              int                            aloc)
{
    cl_int gmx_unused cl_error;
    int               adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
    int               iloc = -1;

    /* determine interaction locality from atom locality */
    if (LOCAL_A(aloc))
    {
        iloc = eintLocal;
    }
    else if (NONLOCAL_A(aloc))
    {
        iloc = eintNonlocal;
    }
    else
    {
        char stmp[STRLEN];
        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);

        gmx_incons(stmp);
    }

    cl_atomdata_t   *adat    = nb->atdat;
    cl_timers_t     *t       = nb->timers;
    bool             bDoTime = nb->bDoTime;
    cl_command_queue stream  = nb->stream[iloc];

    bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
    int              bCalcFshift = flags & GMX_FORCE_VIRIAL;


    /* don't launch non-local copy-back if there was no non-local work to do */
    if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
    {
        return;
    }

    /* calculate the atom data index range based on locality */
    if (LOCAL_A(aloc))
    {
        adat_begin  = 0;
        adat_len    = adat->natoms_local;
    }
    else
    {
        adat_begin  = adat->natoms_local;
        adat_len    = adat->natoms - adat->natoms_local;
    }

    /* beginning of timed D2H section */

    /* With DD the local D2H transfer can only start after the non-local
       has been launched. */
    if (iloc == eintLocal && nb->bUseTwoStreams)
    {
        sync_ocl_event(stream, &(nb->nonlocal_done));
    }

    /* DtoH f */
    ocl_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f, adat_begin*3*sizeof(float),
                       (adat_len)* adat->f_elem_size, stream, bDoTime ? &(t->nb_d2h_f[iloc]) : NULL);

    /* kick off work */
    cl_error = clFlush(stream);
    assert(CL_SUCCESS == cl_error);

    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded, which signals that the local D2H can proceed. This event is not
       placed after the non-local kernel because we first need the non-local
       data back. */
    if (iloc == eintNonlocal)
    {
#ifdef CL_VERSION_1_2
        cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->nonlocal_done));
#else
        cl_error = clEnqueueMarker(stream, &(nb->nonlocal_done));
#endif
        assert(CL_SUCCESS == cl_error);
    }

    /* only transfer energies in the local stream */
    if (LOCAL_I(iloc))
    {
        /* DtoH fshift */
        if (bCalcFshift)
        {
            ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
                               SHIFTS * adat->fshift_elem_size, stream, bDoTime ? &(t->nb_d2h_fshift[iloc]) : NULL);
        }

        /* DtoH energies */
        if (bCalcEner)
        {
            ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
                               sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_lj[iloc]) : NULL);

            ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0,
                               sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_el[iloc]) : NULL);
        }
    }

    debug_dump_cj4_f_fshift(nb, nbatom, stream, adat_begin, adat_len);
}
Example #6
/*! \brief Launch GPU kernel

   As we execute the nonbonded workload in separate queues, before launching
   the kernel we need to make sure that the following operations have completed:
   - atomdata allocation and related H2D transfers (every nstlist step);
   - pair list H2D transfer (every nstlist step);
   - shift vector H2D transfer (every nstlist step);
   - force (+shift force and energy) output clearing (every step).

   These operations are issued in the local queue at the beginning of the step
   and therefore always complete before the local kernel launch. The non-local
   kernel is launched after the local on the same device/context, so this is
   inherently scheduled after the operations in the local stream (including the
   above "misc_ops").
   However, for the sake of having a future-proof implementation, we use the
   misc_ops_and_local_H2D_done event to record the point in time when the above
   operations are finished and synchronize with this event in the non-local stream.
 */
void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
                             const struct nbnxn_atomdata_t *nbatom,
                             int                            flags,
                             int                            iloc)
{
    cl_int               cl_error;
    int                  adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
    /* OpenCL kernel launch-related stuff */
    int                  shmem;
    size_t               local_work_size[3], global_work_size[3];
    cl_kernel            nb_kernel = NULL; /* fn pointer to the nonbonded kernel */

    cl_atomdata_t       *adat    = nb->atdat;
    cl_nbparam_t        *nbp     = nb->nbparam;
    cl_plist_t          *plist   = nb->plist[iloc];
    cl_timers_t         *t       = nb->timers;
    cl_command_queue     stream  = nb->stream[iloc];

    bool                 bCalcEner   = flags & GMX_FORCE_ENERGY;
    int                  bCalcFshift = flags & GMX_FORCE_VIRIAL;
    bool                 bDoTime     = nb->bDoTime;
    cl_uint              arg_no;

    cl_nbparam_params_t  nbparams_params;
#ifdef DEBUG_OCL
    float              * debug_buffer_h;
    size_t               debug_buffer_size;
#endif

    /* turn energy calculation always on/off (for debugging/testing only) */
    bCalcEner = (bCalcEner || always_ener) && !never_ener;

    /* Don't launch the non-local kernel if there is no work to do.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local kernel, the local x+q copy and later (not in
       this function) the stream wait, local f copyback and the f buffer
       clearing. All these operations, except for the local interaction kernel,
       are needed for the non-local interactions. The skip of the local kernel
       call is taken care of later in this function. */
    if (iloc == eintNonlocal && plist->nsci == 0)
    {
        return;
    }

    /* calculate the atom data index range based on locality */
    if (LOCAL_I(iloc))
    {
        adat_begin  = 0;
        adat_len    = adat->natoms_local;
    }
    else
    {
        adat_begin  = adat->natoms_local;
        adat_len    = adat->natoms - adat->natoms_local;
    }

    /* beginning of timed HtoD section */

    /* HtoD x, q */
    ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4,
                       adat_len * sizeof(float) * 4, stream, bDoTime ? (&(t->nb_h2d[iloc])) : NULL);

    /* When we get here all misc operations issued in the local stream, as well as
       the local xq H2D, are done, so we record that in the local stream and wait
       for it in the non-local one. */
    if (nb->bUseTwoStreams)
    {
        if (iloc == eintLocal)
        {
#ifdef CL_VERSION_1_2
            cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_and_local_H2D_done));
#else
            cl_error = clEnqueueMarker(stream, &(nb->misc_ops_and_local_H2D_done));
#endif
            assert(CL_SUCCESS == cl_error);

            /* Based on section 5.13 of the OpenCL 1.2 spec, a flush is needed
             * in the local stream in order to be able to sync with the above event
             * from the non-local stream.
             */
            cl_error = clFlush(stream);
            assert(CL_SUCCESS == cl_error);
        }
        else
        {
            sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done));
        }
    }

    if (plist->nsci == 0)
    {
        /* Don't launch an empty local kernel (not allowed in OpenCL).
         * TODO: Separate H2D and kernel launch into separate functions.
         */
        return;
    }

    /* beginning of timed nonbonded calculation section */

    /* get the pointer to the kernel flavor we need to use */
    nb_kernel = select_nbnxn_kernel(nb,
                                    nbp->eeltype,
                                    nbp->vdwtype,
                                    bCalcEner,
                                    plist->bDoPrune || always_prune);

    /* kernel launch config */
    local_work_size[0] = CL_SIZE;
    local_work_size[1] = CL_SIZE;
    local_work_size[2] = 1;

    global_work_size[0] = plist->nsci * local_work_size[0];
    global_work_size[1] = 1 * local_work_size[1];
    global_work_size[2] = 1 * local_work_size[2];

    validate_global_work_size(global_work_size, 3, nb->dev_info);

    shmem     = calc_shmem_required();

#ifdef DEBUG_OCL
    {
        static int run_step = 1;

        if (DEBUG_RUN_STEP == run_step)
        {
            debug_buffer_size = global_work_size[0] * global_work_size[1] * global_work_size[2] * sizeof(float);
            debug_buffer_h    = (float*)calloc(1, debug_buffer_size);
            assert(NULL != debug_buffer_h);

            if (NULL == nb->debug_buffer)
            {
                nb->debug_buffer = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                                  debug_buffer_size, debug_buffer_h, &cl_error);

                assert(CL_SUCCESS == cl_error);
            }
        }

        run_step++;
    }
#endif
    if (debug)
    {
        fprintf(debug, "GPU launch configuration:\n\tLocal work size: %dx%dx%d\n\t"
                "Global work size : %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
                (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]),
                (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*NCL_PER_SUPERCL,
                NCL_PER_SUPERCL, plist->na_c);
    }

    fillin_ocl_structures(nbp, &nbparams_params);

    arg_no    = 0;
    cl_error  = clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &(adat->ntypes));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(nbparams_params), &(nbparams_params));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->xq));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->f));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_lj));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_el));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->fshift));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->atom_types));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_climg2d));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_comb_climg2d));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->coulomb_tab_climg2d));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->sci));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->cj4));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->excl));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &bCalcFshift);
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, shmem, NULL);
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nb->debug_buffer));

    assert(cl_error == CL_SUCCESS);

    if (cl_error)
    {
        printf("ClERROR! %d\n", cl_error);
    }
    cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? &(t->nb_k[iloc]) : NULL);
    assert(cl_error == CL_SUCCESS);

#ifdef DEBUG_OCL
    {
        static int run_step = 1;

        if (DEBUG_RUN_STEP == run_step)
        {
            FILE *pf;
            char  file_name[256] = {0};

            ocl_copy_D2H_async(debug_buffer_h, nb->debug_buffer, 0,
                               debug_buffer_size, stream, NULL);

            // Make sure all data has been transferred back from the device
            clFinish(stream);

            printf("\nWriting debug_buffer to debug_buffer_ocl.txt...");

            sprintf(file_name, "debug_buffer_ocl_%d.txt", DEBUG_RUN_STEP);
            pf = fopen(file_name, "wt");
            assert(pf != NULL);

            fprintf(pf, "%20s", "");
            for (int j = 0; j < global_work_size[0]; j++)
            {
                char label[20];
                sprintf(label, "(wIdx=%2d thIdx=%2d)", j / local_work_size[0], j % local_work_size[0]);
                fprintf(pf, "%20s", label);
            }

            for (int i = 0; i < global_work_size[1]; i++)
            {
                char label[20];
                sprintf(label, "(wIdy=%2d thIdy=%2d)", i / local_work_size[1], i % local_work_size[1]);
                fprintf(pf, "\n%20s", label);

                for (int j = 0; j < global_work_size[0]; j++)
                {
                    fprintf(pf, "%20.5f", debug_buffer_h[i * global_work_size[0] + j]);
                }

                //fprintf(pf, "\n");
            }

            fclose(pf);

            printf(" done.\n");


            free(debug_buffer_h);
            debug_buffer_h = NULL;
        }

        run_step++;
    }
#endif
}
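The header comment above describes a cross-queue handshake: record a marker in one queue, flush that queue, and make the other queue wait on the resulting event. A hedged sketch of that pattern in isolation, using only standard OpenCL 1.2 calls; sync_queues is an illustrative name and GROMACS' own sync_ocl_event helper is not reproduced:

#include <assert.h>
#include <CL/cl.h>

/* Make `consumer` wait for everything enqueued so far in `producer`.
 * Sketch only: real code should handle errors instead of asserting. */
static void sync_queues(cl_command_queue producer, cl_command_queue consumer)
{
    cl_event done = NULL;

    /* record the current tail of the producer queue */
    cl_int cl_error = clEnqueueMarkerWithWaitList(producer, 0, NULL, &done);
    assert(CL_SUCCESS == cl_error);

    /* per section 5.13 of the OpenCL 1.2 spec, the producer queue must be
       flushed before another queue can wait on the event */
    cl_error = clFlush(producer);
    assert(CL_SUCCESS == cl_error);

    /* block all subsequent commands in the consumer queue on the marker */
    cl_error = clEnqueueBarrierWithWaitList(consumer, 1, &done, NULL);
    assert(CL_SUCCESS == cl_error);

    clReleaseEvent(done);
}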
Example #7
  void enqueue() {
    // Errors in this function cannot be handled by opencl_err.hpp
    // because they require non-standard error handling.
    CAF_LOG_TRACE("command::enqueue()");
    this->ref(); // reference held by the OpenCL command queue
    cl_event event_k;
    auto data_or_nullptr = [](const dim_vec& vec) {
      return vec.empty() ? nullptr : vec.data();
    };
    // OpenCL expects cl_uint (unsigned int), hence the cast
    cl_int err = clEnqueueNDRangeKernel(
      queue_.get(), actor_facade_->kernel_.get(),
      static_cast<cl_uint>(actor_facade_->config_.dimensions().size()),
      data_or_nullptr(actor_facade_->config_.offsets()),
      data_or_nullptr(actor_facade_->config_.dimensions()),
      data_or_nullptr(actor_facade_->config_.local_dimensions()),
      static_cast<cl_uint>(mem_in_events_.size()),
      (mem_in_events_.empty() ? nullptr : mem_in_events_.data()), &event_k
    );
    if (err != CL_SUCCESS) {
      CAF_LOGMF(CAF_ERROR, "clEnqueueNDRangeKernel: " << get_opencl_error(err));
      clReleaseEvent(event_k);
      this->deref();
      return;
    } else {
      enqueue_read_buffers(event_k, detail::get_indices(result_buffers_));
      cl_event marker;
#if defined(__APPLE__)
      err = clEnqueueMarkerWithWaitList(
        queue_.get(),
        static_cast<cl_uint>(mem_out_events_.size()),
        mem_out_events_.data(), &marker
      );
#else
      err = clEnqueueMarker(queue_.get(), &marker);
#endif
      if (err != CL_SUCCESS) {
        CAF_LOGMF(CAF_ERROR, "clSetEventCallback: " << get_opencl_error(err));
        clReleaseEvent(marker);
        clReleaseEvent(event_k);
        this->deref(); // callback is not set
        return;
      }
      err = clSetEventCallback(marker, CL_COMPLETE,
                               [](cl_event, cl_int, void* data) {
                                 auto cmd = reinterpret_cast<command*>(data);
                                 cmd->handle_results();
                                 cmd->deref();
                               },
                               this);
      if (err != CL_SUCCESS) {
        CAF_LOGMF(CAF_ERROR, "clSetEventCallback: " << get_opencl_error(err));
        clReleaseEvent(marker);
        clReleaseEvent(event_k);
        this->deref(); // callback is not set
        return;
      }
      err = clFlush(queue_.get());
      if (err != CL_SUCCESS) {
        CAF_LOGMF(CAF_ERROR, "clFlush: " << get_opencl_error(err));
      }
      mem_out_events_.push_back(std::move(event_k));
      mem_out_events_.push_back(std::move(marker));
    }
  }
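The snippet above attaches a completion callback to a marker so that result handling runs only once all output-buffer reads have finished. A stripped-down hedged sketch of just that callback pattern; on_done and notify_when_done are illustrative names, not part of the CAF code:

#include <stdio.h>
#include <CL/cl.h>

/* Invoked by the OpenCL runtime once the marker reaches CL_COMPLETE. */
static void CL_CALLBACK on_done(cl_event ev, cl_int status, void *user_data)
{
    (void)ev;
    printf("all reads finished (status %d) for job %p\n", status, user_data);
}

/* Fire on_done once every event in `reads` (e.g. output-buffer reads) has
 * completed. The caller owns *marker and should release it eventually. */
static cl_int notify_when_done(cl_command_queue queue,
                               cl_uint n_reads, const cl_event *reads,
                               void *user_data, cl_event *marker)
{
    cl_int err = clEnqueueMarkerWithWaitList(queue, n_reads, reads, marker);
    if (err != CL_SUCCESS)
        return err;

    err = clSetEventCallback(*marker, CL_COMPLETE, on_done, user_data);
    if (err != CL_SUCCESS)
        return err;

    /* flush so the marker (and the commands before it) actually get submitted */
    return clFlush(queue);
}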
Example #8
int
ImageOverlap::runCLKernels()
{
    cl_int status;
	
	// wait for the fill operations to finish
	status=clEnqueueMarkerWithWaitList(commandQueue[2],2,eventlist,&enqueueEvent);
	CHECK_OPENCL_ERROR(status,"clEnqueueMarkerWithWaitList failed.(commandQueue[2])");

    // Set appropriate arguments to the kernelOverLap

    // map buffer image
	status = clSetKernelArg(
		kernelOverLap,
		0,
		sizeof(cl_mem),
		&mapImage);
    CHECK_OPENCL_ERROR(status,"clSetKernelArg failed. (mapImage)");

    // fill Buffer image
    status = clSetKernelArg(
        kernelOverLap,
        1,
        sizeof(cl_mem),
        &fillImage);
    CHECK_OPENCL_ERROR(status,"clSetKernelArg failed. (fillImage)");

	// output buffer image
	status = clSetKernelArg(
		kernelOverLap,
		2,
		sizeof(cl_mem),
		&outputImage);
	CHECK_OPENCL_ERROR(status,"clSetKernelArg failed. (outputImage)");

    // Enqueue a kernel run call.
    size_t globalThreads[] = {width, height};
    size_t localThreads[] = {blockSizeX, blockSizeY};

    status = clEnqueueNDRangeKernel(
        commandQueue[2],
        kernelOverLap,
        2,
        NULL,
        globalThreads,
        localThreads,
        1,
        &enqueueEvent,
        NULL);
    CHECK_OPENCL_ERROR(status,"clEnqueueNDRangeKernel failed.");

    // Enqueue Read Image
    size_t origin[] = {0, 0, 0};
    size_t region[] = {width, height, 1};
	size_t  rowPitch; 
	size_t  slicePitch;
    // Read copy
	outputImageData = (cl_uchar4*)clEnqueueMapImage( commandQueue[2],
		outputImage, 
		CL_FALSE, 
		mapFlag,
		origin, region,
		&rowPitch, &slicePitch,
		0, NULL,
		NULL,
		&status );
     CHECK_OPENCL_ERROR(status,"clEnqueueMapImage failed.(commandQueue[2])");

	clFlush(commandQueue[0]);
	clFlush(commandQueue[1]);
    
	status = clEnqueueUnmapMemObject(commandQueue[2], outputImage, (void*)outputImageData, 0, NULL, NULL);
	CHECK_OPENCL_ERROR(status,"clEnqueueUnmapMemObject failed.(outputImage)");

	// Wait for the read buffer to finish execution
    status = clFinish(commandQueue[2]);
    CHECK_OPENCL_ERROR(status,"clFinish failed.(commandQueue[2])");


	return SDK_SUCCESS;
}