Beispiel #1
0
cl_int WINAPI wine_clEnqueueMarker(cl_command_queue command_queue, cl_event * event)
{
    cl_int ret;
    TRACE("\n");
    ret = clEnqueueMarker(command_queue, event);
    return ret;
}
Beispiel #2
0
/*!
    Synchronizes the host against the active command queue.
    This function will block until all currently queued commands
    have finished execution.
*/
void QCLContext::sync()
{
    cl_event event;
    cl_int error = clEnqueueMarker(activeQueue(), &event);
    reportError("QCLContext::sync:", error);
    if (error == CL_SUCCESS) {
        clWaitForEvents(1, &event);
        clReleaseEvent(event);
    }
}
Beispiel #3
0
/*!
    Returns a marker event for the active command queue.  The event
    will be signalled when all commands that were queued before this
    point have finished.

    \sa barrier(), sync()
*/
QCLEvent QCLContext::marker()
{
    cl_event evid;
    cl_int error = clEnqueueMarker(activeQueue(), &evid);
    reportError("QCLContext::marker:", error);
    if (error != CL_SUCCESS)
        return QCLEvent();
    else
        return QCLEvent(evid);
}
Beispiel #4
0
cl_mem cl_fft<float>::runInternal(const cl_mem input, cl_event *out_startEvent, cl_event *out_kernelEvents)
{
	CL_CHECK_ERR("clEnqueueMarker", clEnqueueMarker(command_queue, out_startEvent));

	// Lanci del kernel
	const cl_uint Nhalf = samplesPerRun / 2;
	cl_event prev_evt = *out_startEvent;
	for (unsigned int i = 0; i < launches.size(); i++)
	{
		if (launches[i].isOptibase == false)
		{
			// Solo il primo step ha input reali
			cl_kernel kernel = (i == 0) ? k_fftstep_real2cpx : k_fftstep_cpx2cpx;
			CL_CHECK_ERR("clSetKernelArg", clSetKernelArg(kernel, 0, sizeof(cl_mem), (i == 0) ? &input : &v_tmp1));
			CL_CHECK_ERR("clSetKernelArg", clSetKernelArg(kernel, 1, sizeof(cl_mem), &v_tmp2));
			CL_CHECK_ERR("clSetKernelArg", clSetKernelArg(kernel, 2, sizeof(cl_mem), &v_twiddleFactors));
			CL_CHECK_ERR("clSetKernelArg", clSetKernelArg(kernel, 3, sizeof(cl_uint), &launches[i].Wshift));
			CL_CHECK_ERR("clSetKernelArg", clSetKernelArg(kernel, 4, sizeof(cl_uint), &Nhalf));
			CL_CHECK_ERR("clEnqueueNDRangeKernel", clEnqueueNDRangeKernel(command_queue,
				kernel,
				2,
				NULL,
				launches[i].globalSize,
				launches[i].groupSize,
				1,
				&prev_evt,
				&out_kernelEvents[i]
			));
		}
		else
		{
			CL_CHECK_ERR("clSetKernelArg", clSetKernelArg(k_fftstep_optibase, 0, sizeof(cl_mem), &v_tmp1));
			CL_CHECK_ERR("clSetKernelArg", clSetKernelArg(k_fftstep_optibase, 1, sizeof(cl_mem), &v_tmp2));
			CL_CHECK_ERR("clSetKernelArg", clSetKernelArg(k_fftstep_optibase, 2, sizeof(cl_mem), &v_twiddleFactors));
			CL_CHECK_ERR("clSetKernelArg", clSetKernelArg(k_fftstep_optibase, 3, sizeof(cl_uint), &Nhalf));
			CL_CHECK_ERR("clEnqueueNDRangeKernel", clEnqueueNDRangeKernel(command_queue,
				k_fftstep_optibase,
				1,
				NULL,
				launches[i].globalSize,
				launches[i].groupSize,
				1,
				&prev_evt,
				&out_kernelEvents[i]
			));
		}

		prev_evt = out_kernelEvents[i];
		swap(v_tmp1, v_tmp2);
	}

	return v_tmp1;
}
Beispiel #5
0
void
Wait( cl_command_queue queue )
{
	cl_event wait;

	cl_int status = clEnqueueMarker( queue, &wait );
	if( status != CL_SUCCESS )
		fprintf( stderr, "Wait: clEnqueueMarker failed\n" );

	status = clEnqueueWaitForEvents( queue, 1, &wait );
	if( status != CL_SUCCESS )
		fprintf( stderr, "Wait: clEnqueueWaitForEvents failed\n" );
}
Beispiel #6
0
    /// Enqueues a marker in the queue and returns an event that can be
    /// used to track its progress.
    event enqueue_marker()
    {
        event event_;

        #ifdef CL_VERSION_1_2
        cl_int ret = clEnqueueMarkerWithWaitList(m_queue, 0, 0, &event_.get());
        #else
        cl_int ret = clEnqueueMarker(m_queue, &event_.get());
        #endif

        if(ret != CL_SUCCESS){
            BOOST_THROW_EXCEPTION(opencl_error(ret));
        }

        return event_;
    }
Beispiel #7
0
int acc_event_record (void* event, void* stream){
  // debug info
  if (verbose_print){
    fprintf(stdout, "\n ... EVENT RECORDING ... \n");
    fprintf(stdout, " ---> Entering: acc_event_record.\n");
  }

  // local event and queue pointers
  cl_event *clevent = (cl_event *) event;
  acc_opencl_stream_type *clstream = (acc_opencl_stream_type *) stream;

  // set a marker 'event' to listen on to queue 'stream' 
  cl_error = clEnqueueMarker((*clstream).queue, clevent);
  if (acc_opencl_error_check(cl_error, __LINE__))
    return -1;

  // debug info
  if (verbose_print){
    fprintf(stdout, " ---> Leaving: acc_event_record.\n");
  }

  // assign return value
  return 0;
}
Beispiel #8
0
/*! \brief
 * Launch asynchronously the download of nonbonded forces from the GPU
 * (and energies/shift forces if required).
 */
void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t               *nb,
                              const struct nbnxn_atomdata_t *nbatom,
                              int                            flags,
                              int                            aloc)
{
    cl_int gmx_unused cl_error;
    int               adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
    int               iloc = -1;

    /* determine interaction locality from atom locality */
    if (LOCAL_A(aloc))
    {
        iloc = eintLocal;
    }
    else if (NONLOCAL_A(aloc))
    {
        iloc = eintNonlocal;
    }
    else
    {
        char stmp[STRLEN];
        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);

        gmx_incons(stmp);
    }

    cl_atomdata_t   *adat    = nb->atdat;
    cl_timers_t     *t       = nb->timers;
    bool             bDoTime = nb->bDoTime;
    cl_command_queue stream  = nb->stream[iloc];

    bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
    int              bCalcFshift = flags & GMX_FORCE_VIRIAL;


    /* don't launch non-local copy-back if there was no non-local work to do */
    if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
    {
        return;
    }

    /* calculate the atom data index range based on locality */
    if (LOCAL_A(aloc))
    {
        adat_begin  = 0;
        adat_len    = adat->natoms_local;
    }
    else
    {
        adat_begin  = adat->natoms_local;
        adat_len    = adat->natoms - adat->natoms_local;
    }

    /* beginning of timed D2H section */

    /* With DD the local D2H transfer can only start after the non-local
       has been launched. */
    if (iloc == eintLocal && nb->bUseTwoStreams)
    {
        sync_ocl_event(stream, &(nb->nonlocal_done));
    }

    /* DtoH f */
    ocl_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f, adat_begin*3*sizeof(float),
                       (adat_len)* adat->f_elem_size, stream, bDoTime ? &(t->nb_d2h_f[iloc]) : NULL);

    /* kick off work */
    cl_error = clFlush(stream);
    assert(CL_SUCCESS == cl_error);

    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded which signals that the local D2H can proceed. This event is not
       placed after the non-local kernel because we first need the non-local
       data back first. */
    if (iloc == eintNonlocal)
    {
#ifdef CL_VERSION_1_2
        cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->nonlocal_done));
#else
        cl_error = clEnqueueMarker(stream, &(nb->nonlocal_done));
#endif
        assert(CL_SUCCESS == cl_error);
    }

    /* only transfer energies in the local stream */
    if (LOCAL_I(iloc))
    {
        /* DtoH fshift */
        if (bCalcFshift)
        {
            ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
                               SHIFTS * adat->fshift_elem_size, stream, bDoTime ? &(t->nb_d2h_fshift[iloc]) : NULL);
        }

        /* DtoH energies */
        if (bCalcEner)
        {
            ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
                               sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_lj[iloc]) : NULL);

            ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0,
                               sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_el[iloc]) : NULL);
        }
    }

    debug_dump_cj4_f_fshift(nb, nbatom, stream, adat_begin, adat_len);
}
Beispiel #9
0
/*! \brief Launch GPU kernel

   As we execute nonbonded workload in separate queues, before launching
   the kernel we need to make sure that he following operations have completed:
   - atomdata allocation and related H2D transfers (every nstlist step);
   - pair list H2D transfer (every nstlist step);
   - shift vector H2D transfer (every nstlist step);
   - force (+shift force and energy) output clearing (every step).

   These operations are issued in the local queue at the beginning of the step
   and therefore always complete before the local kernel launch. The non-local
   kernel is launched after the local on the same device/context, so this is
   inherently scheduled after the operations in the local stream (including the
   above "misc_ops").
   However, for the sake of having a future-proof implementation, we use the
   misc_ops_done event to record the point in time when the above  operations
   are finished and synchronize with this event in the non-local stream.
 */
void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
                             const struct nbnxn_atomdata_t *nbatom,
                             int                            flags,
                             int                            iloc)
{
    cl_int               cl_error;
    int                  adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
    /* OpenCL kernel launch-related stuff */
    int                  shmem;
    size_t               local_work_size[3], global_work_size[3];
    cl_kernel            nb_kernel = NULL; /* fn pointer to the nonbonded kernel */

    cl_atomdata_t       *adat    = nb->atdat;
    cl_nbparam_t        *nbp     = nb->nbparam;
    cl_plist_t          *plist   = nb->plist[iloc];
    cl_timers_t         *t       = nb->timers;
    cl_command_queue     stream  = nb->stream[iloc];

    bool                 bCalcEner   = flags & GMX_FORCE_ENERGY;
    int                  bCalcFshift = flags & GMX_FORCE_VIRIAL;
    bool                 bDoTime     = nb->bDoTime;
    cl_uint              arg_no;

    cl_nbparam_params_t  nbparams_params;
#ifdef DEBUG_OCL
    float              * debug_buffer_h;
    size_t               debug_buffer_size;
#endif

    /* turn energy calculation always on/off (for debugging/testing only) */
    bCalcEner = (bCalcEner || always_ener) && !never_ener;

    /* Don't launch the non-local kernel if there is no work to do.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local kernel, the local x+q copy and later (not in
       this function) the stream wait, local f copyback and the f buffer
       clearing. All these operations, except for the local interaction kernel,
       are needed for the non-local interactions. The skip of the local kernel
       call is taken care of later in this function. */
    if (iloc == eintNonlocal && plist->nsci == 0)
    {
        return;
    }

    /* calculate the atom data index range based on locality */
    if (LOCAL_I(iloc))
    {
        adat_begin  = 0;
        adat_len    = adat->natoms_local;
    }
    else
    {
        adat_begin  = adat->natoms_local;
        adat_len    = adat->natoms - adat->natoms_local;
    }

    /* beginning of timed HtoD section */

    /* HtoD x, q */
    ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4,
                       adat_len * sizeof(float) * 4, stream, bDoTime ? (&(t->nb_h2d[iloc])) : NULL);

    /* When we get here all misc operations issues in the local stream as well as
       the local xq H2D are done,
       so we record that in the local stream and wait for it in the nonlocal one. */
    if (nb->bUseTwoStreams)
    {
        if (iloc == eintLocal)
        {
#ifdef CL_VERSION_1_2
            cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_and_local_H2D_done));
#else
            cl_error = clEnqueueMarker(stream, &(nb->misc_ops_and_local_H2D_done));
#endif
            assert(CL_SUCCESS == cl_error);

            /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
             * in the local stream in order to be able to sync with the above event
             * from the non-local stream.
             */
            cl_error = clFlush(stream);
            assert(CL_SUCCESS == cl_error);
        }
        else
        {
            sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done));
        }
    }

    if (plist->nsci == 0)
    {
        /* Don't launch an empty local kernel (is not allowed with OpenCL).
         * TODO: Separate H2D and kernel launch into separate functions.
         */
        return;
    }

    /* beginning of timed nonbonded calculation section */

    /* get the pointer to the kernel flavor we need to use */
    nb_kernel = select_nbnxn_kernel(nb,
                                    nbp->eeltype,
                                    nbp->vdwtype,
                                    bCalcEner,
                                    plist->bDoPrune || always_prune);

    /* kernel launch config */
    local_work_size[0] = CL_SIZE;
    local_work_size[1] = CL_SIZE;
    local_work_size[2] = 1;

    global_work_size[0] = plist->nsci * local_work_size[0];
    global_work_size[1] = 1 * local_work_size[1];
    global_work_size[2] = 1 * local_work_size[2];

    validate_global_work_size(global_work_size, 3, nb->dev_info);

    shmem     = calc_shmem_required();

#ifdef DEBUG_OCL
    {
        static int run_step = 1;

        if (DEBUG_RUN_STEP == run_step)
        {
            debug_buffer_size = global_work_size[0] * global_work_size[1] * global_work_size[2] * sizeof(float);
            debug_buffer_h    = (float*)calloc(1, debug_buffer_size);
            assert(NULL != debug_buffer_h);

            if (NULL == nb->debug_buffer)
            {
                nb->debug_buffer = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                                  debug_buffer_size, debug_buffer_h, &cl_error);

                assert(CL_SUCCESS == cl_error);
            }
        }

        run_step++;
    }
#endif
    if (debug)
    {
        fprintf(debug, "GPU launch configuration:\n\tLocal work size: %dx%dx%d\n\t"
                "Global work size : %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
                (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]),
                (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*NCL_PER_SUPERCL,
                NCL_PER_SUPERCL, plist->na_c);
    }

    fillin_ocl_structures(nbp, &nbparams_params);

    arg_no    = 0;
    cl_error  = clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &(adat->ntypes));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(nbparams_params), &(nbparams_params));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->xq));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->f));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_lj));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_el));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->fshift));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->atom_types));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_climg2d));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_comb_climg2d));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->coulomb_tab_climg2d));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->sci));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->cj4));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->excl));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &bCalcFshift);
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, shmem, NULL);
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nb->debug_buffer));

    assert(cl_error == CL_SUCCESS);

    if (cl_error)
    {
        printf("ClERROR! %d\n", cl_error);
    }
    cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? &(t->nb_k[iloc]) : NULL);
    assert(cl_error == CL_SUCCESS);

#ifdef DEBUG_OCL
    {
        static int run_step = 1;

        if (DEBUG_RUN_STEP == run_step)
        {
            FILE *pf;
            char  file_name[256] = {0};

            ocl_copy_D2H_async(debug_buffer_h, nb->debug_buffer, 0,
                               debug_buffer_size, stream, NULL);

            // Make sure all data has been transfered back from device
            clFinish(stream);

            printf("\nWriting debug_buffer to debug_buffer_ocl.txt...");

            sprintf(file_name, "debug_buffer_ocl_%d.txt", DEBUG_RUN_STEP);
            pf = fopen(file_name, "wt");
            assert(pf != NULL);

            fprintf(pf, "%20s", "");
            for (int j = 0; j < global_work_size[0]; j++)
            {
                char label[20];
                sprintf(label, "(wIdx=%2d thIdx=%2d)", j / local_work_size[0], j % local_work_size[0]);
                fprintf(pf, "%20s", label);
            }

            for (int i = 0; i < global_work_size[1]; i++)
            {
                char label[20];
                sprintf(label, "(wIdy=%2d thIdy=%2d)", i / local_work_size[1], i % local_work_size[1]);
                fprintf(pf, "\n%20s", label);

                for (int j = 0; j < global_work_size[0]; j++)
                {
                    fprintf(pf, "%20.5f", debug_buffer_h[i * global_work_size[0] + j]);
                }

                //fprintf(pf, "\n");
            }

            fclose(pf);

            printf(" done.\n");


            free(debug_buffer_h);
            debug_buffer_h = NULL;
        }

        run_step++;
    }
#endif
}
Beispiel #10
0
////////////////////////////////////////////////////////////////////////////////
// Test driver
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    cl_platform_id cpPlatform;
    cl_device_id cdDevice;
    cl_context cxGPUContext;                        //OpenCL context
    cl_command_queue cqCommandQueue;                //OpenCL command queue
    cl_mem c_Kernel, d_Input, d_Buffer, d_Output;   //OpenCL memory buffer objects
    cl_float *h_Kernel, *h_Input, *h_Buffer, *h_OutputCPU, *h_OutputGPU;

    cl_int ciErrNum;

    const unsigned int imageW = 3072;
    const unsigned int imageH = 3072;

    shrQAStart(argc, argv);

    // set logfile name and start logs
    shrSetLogFileName ("oclConvolutionSeparable.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    shrLog("Allocating and initializing host memory...\n");
        h_Kernel    = (cl_float *)malloc(KERNEL_LENGTH * sizeof(cl_float));
        h_Input     = (cl_float *)malloc(imageW * imageH * sizeof(cl_float));
        h_Buffer    = (cl_float *)malloc(imageW * imageH * sizeof(cl_float));
        h_OutputCPU = (cl_float *)malloc(imageW * imageH * sizeof(cl_float));
        h_OutputGPU = (cl_float *)malloc(imageW * imageH * sizeof(cl_float));

        srand(2009);
        for(unsigned int i = 0; i < KERNEL_LENGTH; i++)
            h_Kernel[i] = (cl_float)(rand() % 16);

        for(unsigned int i = 0; i < imageW * imageH; i++)
            h_Input[i] = (cl_float)(rand() % 16);

    shrLog("Initializing OpenCL...\n");
        //Get the NVIDIA platform
        ciErrNum = oclGetPlatformID(&cpPlatform);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Get the devices
        ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);

        //Create the context
        cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Create a command-queue
        cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Initializing OpenCL separable convolution...\n");
        initConvolutionSeparable(cxGPUContext, cqCommandQueue, (const char **)argv);

    shrLog("Creating OpenCL memory objects...\n");
        c_Kernel = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, KERNEL_LENGTH * sizeof(cl_float), h_Kernel, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, imageW * imageH * sizeof(cl_float), h_Input, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        d_Buffer = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, imageW * imageH * sizeof(cl_float), NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        d_Output = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, imageW * imageH * sizeof(cl_float), NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Applying separable convolution to %u x %u image...\n\n", imageW, imageH);
        //Just a single run or a warmup iteration
        convolutionRows(
            NULL,
            d_Buffer,
            d_Input,
            c_Kernel,
            imageW,
            imageH
        );

        convolutionColumns(
            NULL,
            d_Output,
            d_Buffer,
            c_Kernel,
            imageW,
            imageH
        );

#ifdef GPU_PROFILING
    const int numIterations = 16;
    cl_event startMark, endMark;
    ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark);
    ciErrNum |= clFinish(cqCommandQueue);
    shrCheckError(ciErrNum, CL_SUCCESS);
    shrDeltaT(0);

    for(int iter = 0; iter < numIterations; iter++){
        convolutionRows(
            cqCommandQueue,
            d_Buffer,
            d_Input,
            c_Kernel,
            imageW,
            imageH
        );

        convolutionColumns(
            cqCommandQueue,
            d_Output,
            d_Buffer,
            c_Kernel,
            imageW,
            imageH
        );
    }
    ciErrNum  = clEnqueueMarker(cqCommandQueue, &endMark);
    ciErrNum |= clFinish(cqCommandQueue);
    shrCheckError(ciErrNum, CL_SUCCESS);

    //Calculate performance metrics by wallclock time
    double gpuTime = shrDeltaT(0) / (double)numIterations;
    shrLogEx(LOGBOTH | MASTER, 0, "oclConvolutionSeparable, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n",
            (1.0e-6 * (double)(imageW * imageH)/ gpuTime), gpuTime, (imageW * imageH), 1, 0);

    //Get OpenCL profiler  info
    cl_ulong startTime = 0, endTime = 0;
    ciErrNum  = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL);
    ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
    shrCheckError(ciErrNum, CL_SUCCESS);
    shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime)/ (double)numIterations);
#endif

    shrLog("Reading back OpenCL results...\n\n");
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, imageW * imageH * sizeof(cl_float), h_OutputGPU, 0, NULL, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Comparing against Host/C++ computation...\n"); 
        convolutionRowHost(h_Buffer, h_Input, h_Kernel, imageW, imageH, KERNEL_RADIUS);
        convolutionColumnHost(h_OutputCPU, h_Buffer, h_Kernel, imageW, imageH, KERNEL_RADIUS);
        double sum = 0, delta = 0;
        double L2norm;
        for(unsigned int i = 0; i < imageW * imageH; i++){
            delta += (h_OutputCPU[i] - h_OutputGPU[i]) * (h_OutputCPU[i] - h_OutputGPU[i]);
            sum += h_OutputCPU[i] * h_OutputCPU[i];
        }
        L2norm = sqrt(delta / sum);
        shrLog("Relative L2 norm: %.3e\n\n", L2norm);

    // cleanup
    closeConvolutionSeparable();
    ciErrNum  = clReleaseMemObject(d_Output);
    ciErrNum |= clReleaseMemObject(d_Buffer);
    ciErrNum |= clReleaseMemObject(d_Input);
    ciErrNum |= clReleaseMemObject(c_Kernel);
    ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
    ciErrNum |= clReleaseContext(cxGPUContext);
    oclCheckError(ciErrNum, CL_SUCCESS);

    free(h_OutputGPU);
    free(h_OutputCPU);
    free(h_Buffer);
    free(h_Input);
    free(h_Kernel);

    // finish
    shrQAFinishExit(argc, (const char **)argv, (L2norm < 1e-6) ? QA_PASSED : QA_FAILED);
}
Beispiel #11
0
  void enqueue() {
    // Errors in this function can not be handled by opencl_err.hpp
    // because they require non-standard error handling
    CAF_LOG_TRACE("command::enqueue()");
    this->ref(); // reference held by the OpenCL comand queue
    cl_event event_k;
    auto data_or_nullptr = [](const dim_vec& vec) {
      return vec.empty() ? nullptr : vec.data();
    };
    // OpenCL expects cl_uint (unsigned int), hence the cast
    cl_int err = clEnqueueNDRangeKernel(
      queue_.get(), actor_facade_->kernel_.get(),
      static_cast<cl_uint>(actor_facade_->config_.dimensions().size()),
      data_or_nullptr(actor_facade_->config_.offsets()),
      data_or_nullptr(actor_facade_->config_.dimensions()),
      data_or_nullptr(actor_facade_->config_.local_dimensions()),
      static_cast<cl_uint>(mem_in_events_.size()),
      (mem_in_events_.empty() ? nullptr : mem_in_events_.data()), &event_k
    );
    if (err != CL_SUCCESS) {
      CAF_LOGMF(CAF_ERROR, "clEnqueueNDRangeKernel: " << get_opencl_error(err));
      clReleaseEvent(event_k);
      this->deref();
      return;
    } else {
      enqueue_read_buffers(event_k, detail::get_indices(result_buffers_));
      cl_event marker;
#if defined(__APPLE__)
      err = clEnqueueMarkerWithWaitList(
        queue_.get(),
        static_cast<cl_uint>(mem_out_events_.size()),
        mem_out_events_.data(), &marker
      );
#else
      err = clEnqueueMarker(queue_.get(), &marker);
#endif
      if (err != CL_SUCCESS) {
        CAF_LOGMF(CAF_ERROR, "clSetEventCallback: " << get_opencl_error(err));
        clReleaseEvent(marker);
        clReleaseEvent(event_k);
        this->deref(); // callback is not set
        return;
      }
      err = clSetEventCallback(marker, CL_COMPLETE,
                               [](cl_event, cl_int, void* data) {
                                 auto cmd = reinterpret_cast<command*>(data);
                                 cmd->handle_results();
                                 cmd->deref();
                               },
                               this);
      if (err != CL_SUCCESS) {
        CAF_LOGMF(CAF_ERROR, "clSetEventCallback: " << get_opencl_error(err));
        clReleaseEvent(marker);
        clReleaseEvent(event_k);
        this->deref(); // callback is not set
        return;
      }
      err = clFlush(queue_.get());
      if (err != CL_SUCCESS) {
        CAF_LOGMF(CAF_ERROR, "clFlush: " << get_opencl_error(err));
      }
      mem_out_events_.push_back(std::move(event_k));
      mem_out_events_.push_back(std::move(marker));
    }
  }
Beispiel #12
0
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    cl_platform_id cpPlatform;       //OpenCL platform
    cl_device_id cdDevice;           //OpenCL device
    cl_context       cxGPUContext;   //OpenCL context
    cl_command_queue cqCommandQueue; //OpenCL command que
    cl_mem      d_Input, d_Output;   //OpenCL memory buffer objects

    cl_int ciErrNum;

    float *h_Input, *h_OutputCPU, *h_OutputGPU;

    const uint
        imageW = 2048,
        imageH = 2048,
        stride = 2048;

    const int dir = DCT_FORWARD;

    shrQAStart(argc, argv);


    int use_gpu = 0;
    for(int i = 0; i < argc && argv; i++)
    {
        if(!argv[i])
            continue;
          
        if(strstr(argv[i], "cpu"))
            use_gpu = 0;        

        else if(strstr(argv[i], "gpu"))
            use_gpu = 1;
    }

    // set logfile name and start logs
    shrSetLogFileName ("oclDCT8x8.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    shrLog("Allocating and initializing host memory...\n");
        h_Input     = (float *)malloc(imageH * stride * sizeof(float));
        h_OutputCPU = (float *)malloc(imageH * stride * sizeof(float));
        h_OutputGPU = (float *)malloc(imageH * stride * sizeof(float));
        srand(2009);
        for(uint i = 0; i < imageH; i++)
            for(uint j = 0; j < imageW; j++)
                h_Input[i * stride + j] = (float)rand() / (float)RAND_MAX;

    shrLog("Initializing OpenCL...\n");
        //Get the NVIDIA platform
        ciErrNum = oclGetPlatformID(&cpPlatform);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Get a GPU device
        ciErrNum = clGetDeviceIDs(cpPlatform, use_gpu?CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU, 1, &cdDevice, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Create the context
        cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Create a command-queue
        cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Initializing OpenCL DCT 8x8...\n");
        initDCT8x8(cxGPUContext, cqCommandQueue, (const char **)argv);

    shrLog("Creating OpenCL memory objects...\n");
        d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, imageH * stride *  sizeof(cl_float), h_Input, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        d_Output = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, imageH * stride * sizeof(cl_float), NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Performing DCT8x8 of %u x %u image...\n\n", imageH, imageW);
        //Just a single iteration or a warmup iteration
        DCT8x8(
            cqCommandQueue,
            d_Output,
            d_Input,
            stride,
            imageH,
            imageW,
            dir
        );

#ifdef GPU_PROFILING
    const int numIterations = 16;
    cl_event startMark, endMark;
    ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark);
    ciErrNum |= clFinish(cqCommandQueue);
    shrCheckError(ciErrNum, CL_SUCCESS);
    shrDeltaT(0);

    for(int iter = 0; iter < numIterations; iter++)
        DCT8x8(
            NULL,
            d_Output,
            d_Input,
            stride,
            imageH,
            imageW,
            dir
        );

    ciErrNum  = clEnqueueMarker(cqCommandQueue, &endMark);
    ciErrNum |= clFinish(cqCommandQueue);
    shrCheckError(ciErrNum, CL_SUCCESS);

    //Calculate performance metrics by wallclock time
    double gpuTime = shrDeltaT(0) / (double)numIterations;
    shrLogEx(LOGBOTH | MASTER, 0, "oclDCT8x8, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n", 
            (1.0e-6 * (double)(imageW * imageH)/ gpuTime), gpuTime, (imageW * imageH), 1, 0); 

    //Get profiler time
    cl_ulong startTime = 0, endTime = 0;
    ciErrNum  = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL);
    ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
    shrCheckError(ciErrNum, CL_SUCCESS);
    shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime) / (double)numIterations);
#endif

    shrLog("Reading back OpenCL results...\n");
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, imageH * stride * sizeof(cl_float), h_OutputGPU, 0, NULL, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Comparing against Host/C++ computation...\n"); 
        DCT8x8CPU(h_OutputCPU, h_Input, stride, imageH, imageW, dir);
        double sum = 0, delta = 0;
        double L2norm;
        for(uint i = 0; i < imageH; i++)
            for(uint j = 0; j < imageW; j++){
                sum += h_OutputCPU[i * stride + j] * h_OutputCPU[i * stride + j];
                delta += (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]) * (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]);
            }
        L2norm = sqrt(delta / sum);
        shrLog("Relative L2 norm: %.3e\n\n", L2norm);

    shrLog("Shutting down...\n");
        //Release kernels and program
        closeDCT8x8();

        //Release other OpenCL objects
        ciErrNum  = clReleaseMemObject(d_Output);
        ciErrNum |= clReleaseMemObject(d_Input);
        ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
        ciErrNum |= clReleaseContext(cxGPUContext);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Release host buffers
        free(h_OutputGPU);
        free(h_OutputCPU);
        free(h_Input);

        //Finish
        shrQAFinishExit(argc, (const char **)argv, (L2norm < 1E-6) ? QA_PASSED : QA_FAILED);
}
END_TEST

START_TEST (test_misc_events)
{
    cl_platform_id platform = 0;
    cl_device_id device;
    cl_context ctx;
    cl_command_queue queue;
    cl_int result;
    cl_event uevent1, uevent2, marker1, marker2;

    result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
    fail_if(
        result != CL_SUCCESS,
        "unable to get the default device"
    );

    ctx = clCreateContext(0, 1, &device, 0, 0, &result);
    fail_if(
        result != CL_SUCCESS || ctx == 0,
        "unable to create a valid context"
    );

    queue = clCreateCommandQueue(ctx, device, 0, &result);
    fail_if(
        result != CL_SUCCESS || queue == 0,
        "cannot create a command queue"
    );

    /*
     * This test will build a command queue blocked by an user event. The events
     * will be in this order :
     *
     * -: UserEvent1
     * 0: WaitForEvents1 (wait=UserEvent1)
     * 1: Marker1
     * -: UserEvent2
     * 2: WaitForEvents2 (wait=UserEvent2)
     * 3: Barrier
     * 4: Marker2 (to check the barrier worked)
     *
     * When the command queue is built, we :
     *  - Check that Marker1 is Queued (WaitForEvents waits)
     *  - Set UserEvent1 to Complete
     *  - Check that Marker1 is Complete (WaitForEvents stopped to wait)
     *  - Check that Marker2 is Queued (Barrier is there)
     *  - Set UserEvent2 to Complete
     *  - Check that Marker2 is Complete (no more barrier)
     */
    uevent1 = clCreateUserEvent(ctx, &result);
    fail_if(
        result != CL_SUCCESS,
        "unable to create UserEvent1"
    );

    uevent2 = clCreateUserEvent(ctx, &result);
    fail_if(
        result != CL_SUCCESS,
        "unable to create UserEvent2"
    );

    result = clEnqueueWaitForEvents(queue, 1, &uevent1);
    fail_if(
        result != CL_SUCCESS,
        "unable to enqueue WaitForEvents(UserEvent1)"
    );

    result = clEnqueueMarker(queue, &marker1);
    fail_if(
        result != CL_SUCCESS,
        "unable to enqueue Marker1"
    );

    result = clEnqueueWaitForEvents(queue, 1, &uevent2);
    fail_if(
        result != CL_SUCCESS,
        "unable to enqueue WaitForEvents(UserEvent2)"
    );

    result = clEnqueueBarrier(queue);
    fail_if(
        result != CL_SUCCESS,
        "unable to enqueue Barrier"
    );

    result = clEnqueueMarker(queue, &marker2);
    fail_if(
        result != CL_SUCCESS,
        "unable to enqueue Marker2"
    );

    // Now the checks
    cl_int status;

    result = clGetEventInfo(marker1, CL_EVENT_COMMAND_EXECUTION_STATUS,
                            sizeof(cl_int), &status, 0);
    fail_if(
        result != CL_SUCCESS || status != CL_QUEUED,
        "Marker1 must be Queued"
    );

    result = clSetUserEventStatus(uevent1, CL_COMPLETE);
    fail_if(
        result != CL_SUCCESS,
        "unable to set UserEvent1 to Complete"
    );

    result = clGetEventInfo(marker1, CL_EVENT_COMMAND_EXECUTION_STATUS,
                            sizeof(cl_int), &status, 0);
    fail_if(
        result != CL_SUCCESS || status != CL_COMPLETE,
        "Marker1 must be Complete"
    );

    result = clGetEventInfo(marker2, CL_EVENT_COMMAND_EXECUTION_STATUS,
                            sizeof(cl_int), &status, 0);
    fail_if(
        result != CL_SUCCESS || status != CL_QUEUED,
        "Marker2 must be Queued"
    );

    result = clSetUserEventStatus(uevent2, CL_COMPLETE);
    fail_if(
        result != CL_SUCCESS,
        "unable to set UserEvent2 to Complete"
    );

    result = clGetEventInfo(marker2, CL_EVENT_COMMAND_EXECUTION_STATUS,
                            sizeof(cl_int), &status, 0);
    fail_if(
        result != CL_SUCCESS || status != CL_COMPLETE,
        "Marker2 must be Complete"
    );

    clFinish(queue);

    clReleaseEvent(uevent1);
    clReleaseEvent(uevent2);
    clReleaseEvent(marker1);
    clReleaseEvent(marker2);
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
}
void Extrae_OpenCL_clCreateCommandQueue (cl_command_queue queue,
	cl_device_id device, cl_command_queue_properties properties)
{
	if (!Extrae_OpenCL_lookForOpenCLQueue (queue, NULL))
	{
		cl_int err;
		char _threadname[THREAD_INFO_NAME_LEN];
		char _hostname[HOST_NAME_MAX];
		char *_device_type;
		int prev_threadid, found, idx;
		cl_device_type device_type;
		cl_event event;

		idx = nCommandQueues;
		CommandQueues = (RegisteredCommandQueue_t*) realloc (
			CommandQueues,
			sizeof(RegisteredCommandQueue_t)*(nCommandQueues+1));
		if (CommandQueues == NULL)
		{
			fprintf (stderr, PACKAGE_NAME": Fatal error! Failed to allocate memory for OpenCL Command Queues\n");
			exit (-1);
		}

		CommandQueues[idx].queue = queue;
		CommandQueues[idx].isOutOfOrder =
			(properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0;

		err = clGetDeviceInfo (device, CL_DEVICE_TYPE, sizeof(device_type), &device_type, NULL);
		if (err == CL_SUCCESS)
		{
			if (device_type  == CL_DEVICE_TYPE_GPU)
				_device_type = "GPU";
			else if (device_type == CL_DEVICE_TYPE_CPU)
				_device_type = "CPU";
			else
				_device_type = "Other";
		}
		else
			_device_type = "Unknown";

		/* Was the thread created before (i.e. did we executed a cudadevicereset?) */
		if (gethostname(_hostname, HOST_NAME_MAX) == 0)
			sprintf (_threadname, "OpenCL-%s-CQ%d-%s", _device_type, 1+idx,
			  _hostname);
		else
			sprintf (_threadname, "OpenCL-%s-CQ%d-%s", _device_type, 1+idx,
			  "unknown-host");

		prev_threadid = Extrae_search_thread_name (_threadname, &found);

		if (found)
		{
			/* If thread name existed, reuse its thread id */
			CommandQueues[idx].threadid = prev_threadid;
		}
		else
		{
			/* For timing purposes we change num of threads here instead of doing Backend_getNumberOfThreads() + CUDAdevices*/
			Backend_ChangeNumberOfThreads (Backend_getNumberOfThreads() + 1);
			CommandQueues[idx].threadid = Backend_getNumberOfThreads()-1;

			/* Set thread name */
			Extrae_set_thread_name (CommandQueues[idx].threadid, _threadname);
		}

		CommandQueues[idx].nevents = 0;

#ifdef CL_VERSION_1_2
		err = clEnqueueBarrierWithWaitList (queue, 0, NULL, &event);
#else
		err = clEnqueueBarrier (queue);
		if (err == CL_SUCCESS)
			err = clEnqueueMarker (queue, &event);
#endif
		CommandQueues[idx].host_reference_time = TIME;

		if (err == CL_SUCCESS)
		{
			err = clFinish(queue);
			if (err != CL_SUCCESS)
			{
				fprintf (stderr, PACKAGE_NAME": Error in clFinish (error = %d)! Dying...\n", err);
				exit (-1);
			}

			err = clGetEventProfilingInfo (event, CL_PROFILING_COMMAND_SUBMIT,
				sizeof(cl_ulong), &(CommandQueues[idx].device_reference_time),
				NULL);
			if (err != CL_SUCCESS)
			{
				fprintf (stderr, PACKAGE_NAME": Error in clGetEventProfilingInfo (error = %d)! Dying...\n", err);
				exit (-1);
			}
		}
		else
		{
			fprintf (stderr, PACKAGE_NAME": Error while looking for clock references in host & accelerator\n");
			exit (-1);
		}

		nCommandQueues++;
	}
}