コード例 #1
0
/* Return CL_TRUE on error */
static cl_bool setSummarizationWorkgroupSize(const CLInfo* ci)
{
    size_t maxGroupSize;
    size_t groupSize;
    size_t nextMultiple;
    cl_int err;

    err = clGetKernelWorkGroupInfo(_summarizationKernel,
                                   ci->dev,
                                   CL_KERNEL_WORK_GROUP_SIZE,
                                   sizeof(maxGroupSize), &maxGroupSize,
                                   NULL);
    if (err != CL_SUCCESS)
    {
        return CL_TRUE;
    }

    if (maxGroupSize == 1)
    {
        /* Seems to be a problem on OS X CPU implementation */
        mw_printf("Workgroup size of 1 for summarization is not acceptable\n");
        return CL_TRUE;
    }

    /* OpenCL 1.1 has CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
     * which we probably should be using if available  */

    nextMultiple = mwNextMultiple(64, ci->di.warpSize);
    if (nextMultiple <= maxGroupSize)
    {
        groupSize = nextMultiple;
    }
    else
    {
        groupSize = maxGroupSize;
    }

    if (groupSize > 128) /* Just in case */
    {
        groupSize = 128;
    }

    _summarizationWorkgroupSize = groupSize;

    return CL_FALSE;
}
コード例 #2
0
static cl_int runSummarization(CLInfo* ci,
                               SeparationCLMem* cm,
                               const IntegralArea* ia,
                               cl_uint which,
                               Kahan* resultOut)
{
    cl_int err = CL_SUCCESS;
    cl_mem buf;
    cl_uint offset;
    size_t global[1];
    size_t local[1];
    real result[2] = { -1.0, -1.0 };
    cl_uint nElements = ia->r_steps * ia->mu_steps;
    cl_mem sumBufs[2] = { cm->summarizationBufs[0], cm->summarizationBufs[1] };

    if (which == 0)
    {
        buf = cm->outBg;
        offset = 0;
    }
    else
    {
        buf = cm->outStreams;
        offset = (which - 1) * nElements;
    }


    /* First call reads from an offset into one of the output buffers */
    err |= clSetKernelArg(_summarizationKernel, 0, sizeof(cl_mem), &sumBufs[0]);
    err |= clSetKernelArg(_summarizationKernel, 1, sizeof(cl_mem), &buf);
    err |= clSetKernelArg(_summarizationKernel, 2, sizeof(cl_uint), &nElements);
    err |= clSetKernelArg(_summarizationKernel, 3, sizeof(cl_uint), &offset);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error setting summarization kernel arguments");
        return err;
    }

    local[0] = _summarizationWorkgroupSize;
    global[0] = mwNextMultiple(local[0], nElements);

    err = clEnqueueNDRangeKernel(ci->queue, _summarizationKernel, 1,
                                 NULL, global, local,
                                 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error enqueuing summarization kernel");
        return err;
    }

    /* Why is this necessary? It seems to frequently break on the 7970 and nowhere else without it */
    err = clFinish(ci->queue);
    //err = clFlush(ci->queue);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error finishing summarization kernel");
        return err;
    }

    /* Later calls swap between summarization buffers without an offset */
    nElements = (cl_uint) mwDivRoundup(global[0], local[0]);
    offset = 0;
    err |= clSetKernelArg(_summarizationKernel, 3, sizeof(cl_uint), &offset);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error setting summarization kernel offset argument");
        return err;
    }

    while (nElements > 1)
    {
        /* Swap old summarization buffer to the input and shrink the range */
        swapBuffers(sumBufs);

        global[0] = mwNextMultiple(local[0], nElements);

        err |= clSetKernelArg(_summarizationKernel, 0, sizeof(cl_mem), &sumBufs[0]);
        err |= clSetKernelArg(_summarizationKernel, 1, sizeof(cl_mem), &sumBufs[1]);
        err |= clSetKernelArg(_summarizationKernel, 2, sizeof(cl_uint), &nElements);
        if (err != CL_SUCCESS)
        {
            mwPerrorCL(err, "Error setting summarization kernel arguments");
            return err;
        }

        /*
        err = clEnqueueBarrier(ci->queue);
        if (err != CL_SUCCESS)
        {
            mwPerrorCL(err, "Error enqueuing summarization barrier");
            return err;
        }
        */

        err = clEnqueueNDRangeKernel(ci->queue, _summarizationKernel, 1,
                                     NULL, global, local,
                                     0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            mwPerrorCL(err, "Error enqueuing summarization kernel");
            return err;
        }

        err = clFinish(ci->queue);
        if (err != CL_SUCCESS)
        {
            mwPerrorCL(err, "Error finishing summarization kernel");
            return err;
        }

        nElements = (cl_uint) mwDivRoundup(global[0], local[0]);
    }


    err = clEnqueueBarrier(ci->queue);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error enqueuing summarization barrier");
        return err;
    }

    err = clEnqueueReadBuffer(ci->queue, sumBufs[0], CL_TRUE,
                              0, 2 * sizeof(real), result,
                              0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error reading summarization result buffer");
        return err;
    }

    resultOut->sum = result[0];
    resultOut->correction = result[1];

    return CL_SUCCESS;
}