/* Timing in nanoseconds */
cl_ulong mwEventTimeNS(cl_event ev)
{
    cl_int err;
    cl_ulong ts, te;

    if (!ev)
        return 0;

    err = clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ts, NULL);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to get event start time");
        return 0;
    }

    err = clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &te, NULL);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to get event end time");
        return 0;
    }

    assert(te >= ts);

    return te - ts;
}
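/* A hedged usage sketch (not part of the original source): timing one
 * enqueued kernel with mwEventTimeNS(). Assumes the queue was created with
 * CL_QUEUE_PROFILING_ENABLE (as in mwCreateCtxQueue() below) and that the
 * kernel, global and local sizes were already set up by the caller. */
static double exampleKernelTimeMS(cl_command_queue queue,
                                  cl_kernel kernel,
                                  const size_t global[1],
                                  const size_t local[1])
{
    cl_int err;
    cl_event ev;
    double ms = 0.0;

    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global, local, 0, NULL, &ev);
    if (err != CL_SUCCESS)
        return 0.0;

    /* Profiling info is only valid after the command has completed */
    err = clWaitForEvents(1, &ev);
    if (err == CL_SUCCESS)
        ms = 1.0e-6 * (double) mwEventTimeNS(ev);

    clReleaseEvent(ev);
    return ms;
}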
static cl_int createLBTrigBuffer(CLInfo* ci,
                                 SeparationCLMem* cm,
                                 const AstronomyParameters* ap,
                                 const IntegralArea* ia,
                                 const SeparationSizes* sizes,
                                 const cl_mem_flags constBufFlags)
{
    cl_int err = CL_SUCCESS;
    LTrigPair* lTrig = NULL;
    real* bSin = NULL;

    getSplitLBTrig(ap, ia, &lTrig, &bSin);

    cm->lTrig = clCreateBuffer(ci->clctx, constBufFlags, sizes->lTrig, lTrig, &err);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error creating lTrig buffer of size "ZU, sizes->lTrig);
        return err;
    }

    cm->bSin = clCreateBuffer(ci->clctx, constBufFlags, sizes->bSin, bSin, &err);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error creating bSin buffer of size "ZU, sizes->bSin);
        return err;
    }

    mwFreeA(lTrig);
    mwFreeA(bSin);

    return CL_SUCCESS;
}
static void mwGetPlatformInfo(PlatformInfo* pInfo, cl_platform_id platform)
{
    cl_int err;
    size_t readSize;

    err = clGetPlatformInfo(platform, CL_PLATFORM_NAME,
                            sizeof(pInfo->name), pInfo->name, &readSize);
    if ((err != CL_SUCCESS) || (readSize > sizeof(pInfo->name)))
        mwPerrorCL(err, "Failed to read platform name");

    err = clGetPlatformInfo(platform, CL_PLATFORM_VENDOR,
                            sizeof(pInfo->vendor), pInfo->vendor, &readSize);
    if ((err != CL_SUCCESS) || (readSize > sizeof(pInfo->vendor)))
        mwPerrorCL(err, "Failed to read platform vendor");

    err = clGetPlatformInfo(platform, CL_PLATFORM_VERSION,
                            sizeof(pInfo->version), pInfo->version, &readSize);
    if ((err != CL_SUCCESS) || (readSize > sizeof(pInfo->version)))
        mwPerrorCL(err, "Failed to read platform version");

    err = clGetPlatformInfo(platform, CL_PLATFORM_EXTENSIONS,
                            sizeof(pInfo->extensions), pInfo->extensions, &readSize);
    if ((err != CL_SUCCESS) || (readSize > sizeof(pInfo->extensions)))
        mwPerrorCL(err, "Failed to read platform extensions");

    err = clGetPlatformInfo(platform, CL_PLATFORM_PROFILE,
                            sizeof(pInfo->profile), pInfo->profile, &readSize);
    if ((err != CL_SUCCESS) || (readSize > sizeof(pInfo->profile)))
        mwPerrorCL(err, "Failed to read platform profile");
}
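/* A hedged sketch (not part of the original source): printing the fields
 * filled in by mwGetPlatformInfo(). Assumes PlatformInfo stores each query
 * result in a fixed-size char array, as the sizeof() usage above implies. */
static void examplePrintPlatformInfo(const PlatformInfo* pInfo)
{
    mw_printf("Platform:     %s\n"
              "  Vendor:     %s\n"
              "  Version:    %s\n"
              "  Profile:    %s\n"
              "  Extensions: %s\n",
              pInfo->name,
              pInfo->vendor,
              pInfo->version,
              pInfo->profile,
              pInfo->extensions);
}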
static cl_int mwCreateCtxQueue(CLInfo* ci, cl_bool useBufQueue, cl_bool enableProfiling)
{
    cl_int err = CL_SUCCESS;
    cl_command_queue_properties props = enableProfiling ? CL_QUEUE_PROFILING_ENABLE : 0;

    ci->clctx = clCreateContext(NULL, 1, &ci->dev, MW_CONTEXT_LOGGER, NULL, &err);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error creating context");
        return err;
    }

    ci->queue = clCreateCommandQueue(ci->clctx, ci->dev, props, &err);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error creating command queue");
        return err;
    }

    if (useBufQueue)
    {
        ci->bufQueue = clCreateCommandQueue(ci->clctx, ci->dev, 0, &err);
        if (err != CL_SUCCESS)
        {
            mwPerrorCL(err, "Error creating buffer command queue");
            return err;
        }
    }

    return CL_SUCCESS;
}
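/* A hedged usage sketch (not part of the original source): setting up a
 * profiling-capable queue for a CLInfo whose device is already selected
 * (e.g. by mwGetCLInfo() below), then tearing everything down with
 * mwDestroyCLInfo(). */
static cl_int exampleCreateProfilingQueue(CLInfo* ci)
{
    cl_int err = mwCreateCtxQueue(ci, CL_FALSE, CL_TRUE);
    if (err != CL_SUCCESS)
        return err;

    /* ... enqueue work, read timings with mwEventTimeNS() ... */

    return mwDestroyCLInfo(ci);
}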
static cl_int runIntegralKernel(CLInfo* ci, const RunSizes* runSizes, const size_t offset[1])
{
    cl_int err;
    cl_event ev;

    err = clEnqueueNDRangeKernel(ci->queue,
                                 _separationKernel,
                                 1,
                                 offset, runSizes->global, runSizes->local,
                                 0, NULL, &ev);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error enqueueing integral kernel execution");
        return err;
    }

    /* Give the screen a chance to redraw */
    err = mwCLWaitForEvent(ci, ev, runSizes->initialWait);
    clReleaseEvent(ev);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to wait for integral event");
        return err;
    }

    return CL_SUCCESS;
}
cl_mem mwDuplicateBuffer(CLInfo* ci, cl_mem buf)
{
    cl_mem bufCopy;
    size_t size;
    cl_mem_flags flags;
    cl_int err;
    cl_event ev;

    if (!buf)
    {
        return NULL;
    }

    err = clGetMemObjectInfo(buf, CL_MEM_FLAGS, sizeof(flags), &flags, NULL);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to get memory flags for buffer duplication");
        return NULL;
    }

    err = clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(size), &size, NULL);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to get memory size for buffer duplication");
        return NULL;
    }

    /* We may have initialized that one from a host pointer, but not this one */
    flags &= ~(cl_mem_flags) CL_MEM_COPY_HOST_PTR;

    /* Create a copy of the same size */
    bufCopy = clCreateBuffer(ci->clctx, flags, size, NULL, &err);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to create copy buffer of size "ZU, size);
        return NULL;
    }

    err = clEnqueueCopyBuffer(ci->queue, buf, bufCopy, 0, 0, size, 0, NULL, &ev);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to enqueue buffer copy of size"ZU, size);
        clReleaseMemObject(bufCopy);
        return NULL;
    }

    err = mwWaitReleaseEvent(&ev);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to wait for buffer copy");
        clReleaseMemObject(bufCopy);
        return NULL;
    }

    return bufCopy;
}
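/* A hedged sketch (not part of the original source) of the double-buffering
 * pattern mwDuplicateBuffer() enables: keep two copies of an output buffer
 * and swap which one the kernel writes each pass, so the previous pass can
 * be read back while the next one runs. */
static cl_int exampleSetupDoubleBuffer(CLInfo* ci, cl_mem out[2])
{
    cl_mem tmp;

    out[1] = mwDuplicateBuffer(ci, out[0]);
    if (!out[1])
        return MW_CL_ERROR;

    /* Between passes, swap the roles of the two buffers */
    tmp = out[0];
    out[0] = out[1];
    out[1] = tmp;

    return CL_SUCCESS;
}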
/* Sets the output buffers and the run-constant arguments; the nu-dependent arguments are set per step by setNuKernelArgs() */
cl_int separationSetKernelArgs(SeparationCLMem* cm, const RunSizes* runSizes)
{
    cl_int err = CL_SUCCESS;

    /* Set output buffer arguments */
    err |= clSetKernelArg(_separationKernel, 0, sizeof(cl_mem), &cm->outBg);
    err |= clSetKernelArg(_separationKernel, 1, sizeof(cl_mem), &cm->outStreams);

    /* The constant, global arguments */
    err |= clSetKernelArg(_separationKernel, 2, sizeof(cl_mem), &cm->rc);
    err |= clSetKernelArg(_separationKernel, 3, sizeof(cl_mem), &cm->rPts);
    err |= clSetKernelArg(_separationKernel, 4, sizeof(cl_mem), &cm->lTrig);
    err |= clSetKernelArg(_separationKernel, 5, sizeof(cl_mem), &cm->bSin);

    /* The __constant arguments */
    err |= clSetKernelArg(_separationKernel, 6, sizeof(cl_mem), &cm->ap);
    err |= clSetKernelArg(_separationKernel, 7, sizeof(cl_mem), &cm->sc);
    err |= clSetKernelArg(_separationKernel, 8, sizeof(cl_mem), &cm->sg_dx);

    err |= clSetKernelArg(_separationKernel, 9,  sizeof(cl_uint), &runSizes->extra);
    err |= clSetKernelArg(_separationKernel, 10, sizeof(cl_uint), &runSizes->r);
    err |= clSetKernelArg(_separationKernel, 11, sizeof(cl_uint), &runSizes->mu);
    err |= clSetKernelArg(_separationKernel, 12, sizeof(cl_uint), &runSizes->nu);

    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error setting kernel arguments");
        return err;
    }

    return CL_SUCCESS;
}
cl_int mwGetWorkGroupInfo(cl_kernel kern, const CLInfo* ci, WGInfo* wgi)
{
    cl_int err = CL_SUCCESS;

    err |= clGetKernelWorkGroupInfo(kern,
                                    ci->dev,
                                    CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
                                    sizeof(wgi->cwgs),
                                    wgi->cwgs,
                                    NULL);

    err |= clGetKernelWorkGroupInfo(kern,
                                    ci->dev,
                                    CL_KERNEL_WORK_GROUP_SIZE,
                                    sizeof(size_t),
                                    &wgi->wgs,
                                    NULL);

    err |= clGetKernelWorkGroupInfo(kern,
                                    ci->dev,
                                    CL_KERNEL_LOCAL_MEM_SIZE,
                                    sizeof(cl_ulong),
                                    &wgi->lms,
                                    NULL);
    if (err != CL_SUCCESS)
        mwPerrorCL(err, "Failed to get kernel work group info");

    return err;
}
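/* A hedged sketch (not part of the original source): using
 * mwGetWorkGroupInfo() to validate a planned local size before enqueueing.
 * Assumes WGInfo.wgs is the kernel's maximum work group size on this device,
 * as the CL_KERNEL_WORK_GROUP_SIZE query above suggests. */
static cl_bool exampleLocalSizeFits(cl_kernel kern, const CLInfo* ci, size_t localSize)
{
    WGInfo wgi;

    if (mwGetWorkGroupInfo(kern, ci, &wgi) != CL_SUCCESS)
        return CL_FALSE;

    return (localSize <= wgi.wgs) ? CL_TRUE : CL_FALSE;
}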
cl_int integrateCL(const AstronomyParameters* ap,
                   const IntegralArea* ia,
                   const StreamConstants* sc,
                   const StreamGauss sg,
                   EvaluationState* es,
                   const CLRequest* clr,
                   CLInfo* ci)
{
    cl_int err;
    RunSizes runSizes;
    SeparationSizes sizes;
    SeparationCLMem cm = EMPTY_SEPARATION_CL_MEM;

    /* Need to test sizes for each integral, since the area size can change */
    calculateSizes(&sizes, ap, ia);

    if (findRunSizes(&runSizes, ci, &ci->di, ap, ia, clr))
    {
        mw_printf("Failed to find good run sizes\n");
        return MW_CL_ERROR;
    }

    err = createSeparationBuffers(ci, &cm, ap, ia, sc, sg, &sizes);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to create CL buffers");
        return err;
    }

    err = separationSetKernelArgs(&cm, &runSizes);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to set integral kernel arguments");
        return err;
    }

    err = runIntegral(ci, &cm, &runSizes, es, clr, ap, ia);

    releaseSeparationBuffers(&cm);

    separationIntegralGetSums(es);

    return err;
}
static cl_int mwGetDeviceType(cl_device_id dev, cl_device_type* devType)
{
    cl_int err = CL_SUCCESS;

    err = clGetDeviceInfo(dev, CL_DEVICE_TYPE, sizeof(cl_device_type), devType, NULL);
    if (err != CL_SUCCESS)
        mwPerrorCL(err, "Failed to get device type");

    return err;
}
cl_int mwFinishEvent(cl_event ev)
{
    cl_int err;

    err = clSetUserEventStatus(ev, CL_COMPLETE);
    if (err != CL_SUCCESS)
        mwPerrorCL(err, "Failed to mark custom event as completed");

    return err;
}
static cl_int createRBuffers(CLInfo* ci,
                             SeparationCLMem* cm,
                             const AstronomyParameters* ap,
                             const IntegralArea* ia,
                             const StreamGauss sg,
                             const SeparationSizes* sizes,
                             cl_mem_flags constBufFlags)
{
    cl_int err;
    RPoints* r_pts;
    RConsts* rc;

    r_pts = precalculateRPts(ap, ia, sg, &rc, FALSE);

    cm->rPts = clCreateBuffer(ci->clctx, constBufFlags, sizes->rPts, r_pts, &err);

    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error creating stream r points buffer of size "ZU, sizes->rPts);
        return err;
    }

    cm->rc = clCreateBuffer(ci->clctx, constBufFlags, sizes->rc, rc, &err);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error creating stream r consts buffer of size "ZU, sizes->rc);
        return err;
    }

    cm->sg_dx = clCreateBuffer(ci->clctx, constBufFlags, sizes->sg_dx, sg.dx, &err);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error creating stream sg_dx buffer of size "ZU, sizes->sg_dx);
        return err;
    }

    mwFreeA(r_pts);
    mwFreeA(rc);

    return CL_SUCCESS;
}
static cl_int runIntegral(CLInfo* ci,
                          SeparationCLMem* cm,
                          RunSizes* runSizes,
                          EvaluationState* es,
                          const CLRequest* clr,
                          const AstronomyParameters* ap,
                          const IntegralArea* ia)
{
    cl_int err = CL_SUCCESS;
    double t1, t2, dt;
    double tAcc = 0.0;

    for (; es->nu_step < ia->nu_steps; es->nu_step++)
    {
        if (clr->enableCheckpointing && timeToCheckpointGPU(es, ia))
        {
            err = checkpointCL(ci, cm, ia, es);
            if (err != CL_SUCCESS)
                break;
        }

        t1 = mwGetTimeMilli();
        err = runNuStep(ci, ia, runSizes, es->nu_step);
        if (err != CL_SUCCESS)
        {
            mwPerrorCL(err, "Failed to run nu step");
            return err;
        }
        t2 = mwGetTimeMilli();

        dt = t2 - t1;
        tAcc += dt;

        reportProgress(ap, ia, es, es->nu_step + 1, dt);
    }

    es->nu_step = 0;

    mw_printf("Integration time: %f s. Average time per iteration = %f ms\n",
              tAcc / 1000.0, tAcc / (double) ia->nu_steps);

    if (err == CL_SUCCESS)
    {
        err = readKernelResults(ci, cm, es, ia);
        if (err != CL_SUCCESS)
            mw_printf("Failed to read final kernel results\n");

        /* Add final episode to running totals */
        addTmpCheckpointSums(es);
    }

    return err;
}
static cl_int setNuKernelArgs(const IntegralArea* ia, cl_uint nu_step)
{
    cl_int err;
    NuId nuid;

    nuid = calcNuStep(ia, nu_step);
    err = clSetKernelArg(_separationKernel, 13, sizeof(real), &nuid.id);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error setting nu_id argument for step %u", nu_step);
        return err;
    }

    err = clSetKernelArg(_separationKernel, 14, sizeof(cl_uint), &nu_step);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error setting nu_id argument for step %u", nu_step);
        return err;
    }

    return CL_SUCCESS;
}
/* Wait for an event then release it */
cl_int mwWaitReleaseEvent(cl_event* ev)
{
    cl_int err;

    assert(ev);

    err = clWaitForEvents(1, ev);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to wait for event");
        return err;
    }

    err = clReleaseEvent(*ev);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to release event");
        return err;
    }

    return CL_SUCCESS;
}
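/* A hedged sketch (not part of the original source): a synchronous read
 * built from a non-blocking enqueue plus mwWaitReleaseEvent(). */
static cl_int exampleReadBufferSync(CLInfo* ci, cl_mem buf, void* dst, size_t size)
{
    cl_int err;
    cl_event ev;

    err = clEnqueueReadBuffer(ci->queue, buf, CL_FALSE, 0, size, dst, 0, NULL, &ev);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to enqueue buffer read of size "ZU, size);
        return err;
    }

    return mwWaitReleaseEvent(&ev);
}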
cl_platform_id* mwGetAllPlatformIDs(cl_uint* nPlatformsOut)
{
    cl_uint nPlatform = 0;
    cl_uint nPlatformActual = 0;
    cl_platform_id* ids = NULL;
    cl_int err;

    err = clGetPlatformIDs(0, NULL, &nPlatform);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error getting number of platform");
        return NULL;
    }

    if (nPlatform == 0)
    {
        mw_printf("No CL platforms found\n");
        return NULL;
    }

    ids = mwMalloc(nPlatform * sizeof(cl_platform_id));
    err = clGetPlatformIDs(nPlatform, ids, &nPlatformActual);
    if ((err != CL_SUCCESS) || (nPlatformActual != nPlatform))
    {
        mwPerrorCL(err,
                   "Error getting platform IDs or inconsistent platform count (expected %u, actual %u)\n",
                   nPlatform,
                   nPlatformActual
            );
        free(ids);
        return NULL;
    }

    mw_printf("Found %u platform%s\n", nPlatform, nPlatform > 1 ? "s" : "");

    *nPlatformsOut = nPlatform;
    return ids;
}
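/* A hedged sketch (not part of the original source): enumerating every
 * device on every platform with mwGetAllPlatformIDs() and mwGetAllDevices().
 * Assumes both helpers are declared in the corresponding header. */
static void exampleListAllDevices(void)
{
    cl_uint i;
    cl_uint nPlatform = 0;
    cl_platform_id* platforms = mwGetAllPlatformIDs(&nPlatform);

    if (!platforms)
        return;

    for (i = 0; i < nPlatform; ++i)
    {
        cl_uint nDev = 0;
        cl_device_id* devs = mwGetAllDevices(platforms[i], &nDev);

        mw_printf("Platform %u: %u device(s)\n", i, nDev);
        free(devs);
    }

    free(platforms);
}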
cl_device_id* mwGetAllDevices(cl_platform_id platform, cl_uint* numDevOut)
{
    cl_int err;
    cl_device_id* devs;
    cl_uint numDev = 0;
    cl_device_type type = BOINC_APPLICATION ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_ALL;
    /*
      We may want to use CPUs for debugging, but the index BOINC gives
      you seems to only use GPUs.
     */

    err = clGetDeviceIDs(platform, type, 0, NULL, &numDev);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to find number of devices");
        return NULL;
    }

    if (numDev == 0)
    {
        mw_printf("Didn't find any CL devices\n");
        return NULL;
    }

    mw_printf("Found %u CL device%s\n", numDev, numDev > 1 ? "s" : "");

    devs = (cl_device_id*) mwMalloc(numDev * sizeof(cl_device_id));
    err = clGetDeviceIDs(platform, type, numDev, devs, &numDev);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to get device IDs");
        free(devs);
        return NULL;
    }

    *numDevOut = numDev;
    return devs;
}
cl_event mwCreateEvent(CLInfo* ci)
{
    cl_int err;
    cl_event ev;

    ev = clCreateUserEvent(ci->clctx, &err);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to create custom event");
        return NULL;
    }

    return ev;
}
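/* A hedged sketch (not part of the original source) of the user-event
 * pattern mwCreateEvent() and mwFinishEvent() support on OpenCL 1.1+:
 * enqueue work gated on the event, then release the gate when the host is
 * ready. The caller is assumed to own buf and keep src valid. */
static cl_int exampleGateWriteOnUserEvent(CLInfo* ci, cl_mem buf, const void* src, size_t size)
{
    cl_int err;
    cl_event gate = mwCreateEvent(ci);

    if (!gate)
        return MW_CL_ERROR;

    /* Enqueued now, but will not start until the gate event completes */
    err = clEnqueueWriteBuffer(ci->queue, buf, CL_FALSE, 0, size, src, 1, &gate, NULL);
    if (err == CL_SUCCESS)
    {
        err = mwFinishEvent(gate);
        if (err == CL_SUCCESS)
            err = clFinish(ci->queue);
    }

    clReleaseEvent(gate);
    return err;
}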
cl_bool mwPlatformSupportsAMDOfflineDevices(const CLInfo* ci)
{
    cl_int err;
    char exts[4096];
    size_t readSize = 0;

    err = clGetPlatformInfo(ci->plat, CL_PLATFORM_EXTENSIONS, sizeof(exts), exts, &readSize);
    if ((err != CL_SUCCESS) || (readSize >= sizeof(exts)))
    {
        mwPerrorCL(err, "Error reading platform extensions (readSize = "ZU")\n", readSize);
        return CL_FALSE;
    }

    return (strstr(exts, "cl_amd_offline_devices") != NULL);
}
static cl_mem mwCreateZeroReadWriteBufferComplete(CLInfo* ci, size_t size, cl_bool pinned)
{
    void* p;
    cl_mem mem = NULL;
    cl_int err = CL_SUCCESS;
    cl_mem_flags flags = CL_MEM_READ_WRITE;

    if (pinned)
    {
        flags |= CL_MEM_ALLOC_HOST_PTR;
        /* flags |= CL_MEM_USE_PERSISTENT_MEM_AMD; */
    }

    mem = clCreateBuffer(ci->clctx, flags, size, NULL, &err);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to create zero buffer of size "ZU, size);
        goto fail;
    }

    p = clEnqueueMapBuffer(ci->queue, mem, CL_TRUE, CL_MAP_WRITE,
                           0, size, 0, NULL, NULL, &err);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error mapping zero buffer");
        goto fail;
    }

    memset(p, 0, size);

    err = clEnqueueUnmapMemObject(ci->queue, mem, p, 0, NULL, NULL);
    if (err != CL_SUCCESS)
        mwPerrorCL(err, "Failed to unmap zero buffer");
fail:
    return mem;
}
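/* A hedged alternative sketch (not part of the original source): on OpenCL
 * 1.2 or later the map/memset/unmap above could be replaced with
 * clEnqueueFillBuffer(), which zeroes the buffer without a host mapping.
 * The map-based path above still covers 1.0/1.1 devices. */
static cl_int exampleZeroBufferCL12(CLInfo* ci, cl_mem mem, size_t size)
{
    const cl_uchar zero = 0;
    cl_int err;

    err = clEnqueueFillBuffer(ci->queue, mem, &zero, sizeof(zero), 0, size, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to fill zero buffer of size "ZU, size);
        return err;
    }

    return clFinish(ci->queue);
}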
static cl_int createAPBuffer(CLInfo* ci,
                             SeparationCLMem* cm,
                             const AstronomyParameters* ap,
                             const SeparationSizes* sizes,
                             const cl_mem_flags constBufFlags)
{
    cl_int err = CL_SUCCESS;
    double buf[16];
    union
    {
        double d;
        cl_uint i[2];
    } item;

    memset(buf, 0, sizeof(buf));

    buf[0] = 0.0;
    buf[1] = 0.0;

    item.i[0] = ap->convolve;
    item.i[1] = ap->number_streams;
    buf[2] = item.d;
    buf[3] = 0.0;

    buf[4] = ap->m_sun_r0;
    buf[5] = ap->r0;

    buf[6] = ap->q_inv_sqr;
    buf[7] = 0.0;

    buf[8] = ap->bg_a;
    buf[9] = ap->bg_b;

    buf[10] = ap->bg_c;
    buf[11] = 0.0;

    cm->ap = clCreateBuffer(ci->clctx, constBufFlags, sizeof(buf), (void*) buf, &err);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error creating astronomy parameters buffer of size "ZU, sizes->ap);
        return err;
    }

    return CL_SUCCESS;
}
cl_int mwDestroyCLInfo(CLInfo* ci)
{
    cl_int err = CL_SUCCESS;
    /* Depending on where things fail, some of these will be NULL, and
     * will spew errors when trying to cleanup. */
    if (ci->queue)
        err |= clReleaseCommandQueue(ci->queue);
    if (ci->bufQueue)
        err |= clReleaseCommandQueue(ci->bufQueue);
    if (ci->prog)
        err |= clReleaseProgram(ci->prog);
    if (ci->clctx)
        err |= clReleaseContext(ci->clctx);

    /* TODO: or'ing the err and showing = useless */
    if (err)
        mwPerrorCL(err, "Error cleaning up CLInfo");

    return err;
}
static cl_int createSCBuffer(CLInfo* ci,
                             SeparationCLMem* cm,
                             const StreamConstants* sc,
                             const SeparationSizes* sizes,
                             const cl_mem_flags constBufFlags)
{
    cl_int err;
    real* buf;
    cl_int i;

    buf = mwCallocA(sizes->nStream * 8, sizeof(real));

    /* Pack into format used by kernel */
    for (i = 0; i < sizes->nStream; ++i)
    {
        buf[8 * i + 0] = X(sc[i].a);
        buf[8 * i + 1] = X(sc[i].c);

        buf[8 * i + 2] = Y(sc[i].a);
        buf[8 * i + 3] = Y(sc[i].c);

        buf[8 * i + 4] = Z(sc[i].a);
        buf[8 * i + 5] = Z(sc[i].c);

        buf[8 * i + 6] = sc[i].sigma_sq2_inv;
        buf[8 * i + 7] = 0.0;
    }

    cm->sc = clCreateBuffer(ci->clctx, constBufFlags, sizes->sc, (void*) buf, &err);
    mwFreeA(buf);

    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error creating stream constants buffer of size "ZU, sizes->sc);
        return err;
    }

    return CL_SUCCESS;
}
/* Return CL_UINT_MAX if it doesn't find one */
static cl_uint choosePlatform(const char* prefVendor, const cl_platform_id* platforms, cl_uint nPlatform)
{
    cl_uint i;
    char platVendor[256];
    char prefBuf[256];
    cl_int err;

    if (!platforms || nPlatform == 0)
        return CL_UINT_MAX;

    /* No strnstr() on Windows, also be paranoid */
    memset(prefBuf, 0, sizeof(prefBuf));
    strncpy(prefBuf, prefVendor, sizeof(prefBuf));
    prefBuf[sizeof(prefBuf) - 1] = '\0';

    /* Out of the available platforms, see if one has a matching vendor */
    for (i = 0; i < nPlatform; ++i)
    {
        err = clGetPlatformInfo(platforms[i],
                                CL_PLATFORM_VENDOR,
                                sizeof(platVendor),
                                platVendor,
                                NULL);
        if (err != CL_SUCCESS)
        {
            mwPerrorCL(err, "Error getting platform vendor");
            return CL_UINT_MAX;
        }

        if (strstr(platVendor, prefBuf))
        {
            return i;
        }
    }

    return CL_UINT_MAX;
}
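/* A hedged usage sketch (not part of the original source): picking a
 * platform by vendor substring with choosePlatform() and falling back to
 * platform 0, mirroring the logic in mwGetCLInfo() further below. */
static cl_platform_id exampleSelectByVendor(const cl_platform_id* platforms,
                                            cl_uint nPlatform,
                                            const char* vendor)
{
    cl_uint which;

    if (!platforms || nPlatform == 0)
        return NULL;

    which = choosePlatform(vendor, platforms, nPlatform);
    return (which < nPlatform) ? platforms[which] : platforms[0];
}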
cl_int setupSeparationCL(CLInfo* ci,
                         const AstronomyParameters* ap,
                         const IntegralArea* ias,
                         const CLRequest* clr)
{
    char* compileFlags;
    cl_bool useILKernel;
    cl_int err = MW_CL_ERROR;
    const char* kernSrc = (const char*) probabilities_kernel_cl;
    size_t kernSrcLen = probabilities_kernel_cl_len;

    const char* summarizationKernSrc = (const char*) summarization_kernel_cl;
    size_t summarizationKernSrcLen = summarization_kernel_cl_len;


    err = mwSetupCL(ci, clr);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error getting device and context");
        return err;
    }

    if (!separationCheckDevCapabilities(&ci->di))
    {
        return MW_CL_ERROR;
    }

    useILKernel = usingILKernelIsAcceptable(ci, ap, clr);
    compileFlags = getCompilerFlags(ci, ap, useILKernel);
    if (!compileFlags)
    {
        mw_printf("Failed to get CL compiler flags\n");
        return MW_CL_ERROR;
    }

    if (clr->verbose)
    {
        mw_printf("\nCompiler flags:\n%s\n\n", compileFlags);
    }

    integrationProgram = mwCreateProgramFromSrc(ci, 1, &kernSrc, &kernSrcLen, compileFlags);
    if (!integrationProgram)
    {
        mw_printf("Error creating integral program from source\n");
        err = MW_CL_ERROR;
        goto setup_exit;
    }

    summarizationProgram = mwCreateProgramFromSrc(ci, 1, &summarizationKernSrc, &summarizationKernSrcLen, compileFlags);
    if (!summarizationProgram)
    {
        mw_printf("Error creating summarization program from source\n");
        err = MW_CL_ERROR;
        goto setup_exit;
    }

    if (useILKernel)
    {
        mw_printf("Using AMD IL kernel\n");
        err = setProgramFromILKernel(ci, ap);
        if (err != CL_SUCCESS)
        {
            mw_printf("Failed to create IL kernel. Falling back to source kernel\n");
            err = CL_SUCCESS;
        }
    }

    if (err == CL_SUCCESS)
    {
        _separationKernel = mwCreateKernel(integrationProgram, "probabilities");
        _summarizationKernel = mwCreateKernel(summarizationProgram, "summarization");
        if (   !_separationKernel
            || !_summarizationKernel
            || setSummarizationWorkgroupSize(ci)
            || !separationCheckDevMemory(&ci->di, ap, ias))
        {
            err = MW_CL_ERROR;
        }
    }


setup_exit:
    free(compileFlags);

    return err;
}
static cl_int runSummarization(CLInfo* ci,
                               SeparationCLMem* cm,
                               const IntegralArea* ia,
                               cl_uint which,
                               Kahan* resultOut)
{
    cl_int err = CL_SUCCESS;
    cl_mem buf;
    cl_uint offset;
    size_t global[1];
    size_t local[1];
    real result[2] = { -1.0, -1.0 };
    cl_uint nElements = ia->r_steps * ia->mu_steps;
    cl_mem sumBufs[2] = { cm->summarizationBufs[0], cm->summarizationBufs[1] };

    if (which == 0)
    {
        buf = cm->outBg;
        offset = 0;
    }
    else
    {
        buf = cm->outStreams;
        offset = (which - 1) * nElements;
    }


    /* First call reads from an offset into one of the output buffers */
    err |= clSetKernelArg(_summarizationKernel, 0, sizeof(cl_mem), &sumBufs[0]);
    err |= clSetKernelArg(_summarizationKernel, 1, sizeof(cl_mem), &buf);
    err |= clSetKernelArg(_summarizationKernel, 2, sizeof(cl_uint), &nElements);
    err |= clSetKernelArg(_summarizationKernel, 3, sizeof(cl_uint), &offset);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error setting summarization kernel arguments");
        return err;
    }

    local[0] = _summarizationWorkgroupSize;
    global[0] = mwNextMultiple(local[0], nElements);

    err = clEnqueueNDRangeKernel(ci->queue, _summarizationKernel, 1,
                                 NULL, global, local,
                                 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error enqueuing summarization kernel");
        return err;
    }

    /* Why is this necessary? It seems to frequently break on the 7970 and nowhere else without it */
    err = clFinish(ci->queue);
    //err = clFlush(ci->queue);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error finishing summarization kernel");
        return err;
    }

    /* Later calls swap between summarization buffers without an offset */
    nElements = (cl_uint) mwDivRoundup(global[0], local[0]);
    offset = 0;
    err |= clSetKernelArg(_summarizationKernel, 3, sizeof(cl_uint), &offset);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error setting summarization kernel offset argument");
        return err;
    }

    while (nElements > 1)
    {
        /* Swap old summarization buffer to the input and shrink the range */
        swapBuffers(sumBufs);

        global[0] = mwNextMultiple(local[0], nElements);

        err |= clSetKernelArg(_summarizationKernel, 0, sizeof(cl_mem), &sumBufs[0]);
        err |= clSetKernelArg(_summarizationKernel, 1, sizeof(cl_mem), &sumBufs[1]);
        err |= clSetKernelArg(_summarizationKernel, 2, sizeof(cl_uint), &nElements);
        if (err != CL_SUCCESS)
        {
            mwPerrorCL(err, "Error setting summarization kernel arguments");
            return err;
        }

        /*
        err = clEnqueueBarrier(ci->queue);
        if (err != CL_SUCCESS)
        {
            mwPerrorCL(err, "Error enqueuing summarization barrier");
            return err;
        }
        */

        err = clEnqueueNDRangeKernel(ci->queue, _summarizationKernel, 1,
                                     NULL, global, local,
                                     0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            mwPerrorCL(err, "Error enqueuing summarization kernel");
            return err;
        }

        err = clFinish(ci->queue);
        if (err != CL_SUCCESS)
        {
            mwPerrorCL(err, "Error finishing summarization kernel");
            return err;
        }

        nElements = (cl_uint) mwDivRoundup(global[0], local[0]);
    }


    err = clEnqueueBarrier(ci->queue);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error enqueuing summarization barrier");
        return err;
    }

    err = clEnqueueReadBuffer(ci->queue, sumBufs[0], CL_TRUE,
                              0, 2 * sizeof(real), result,
                              0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error reading summarization result buffer");
        return err;
    }

    resultOut->sum = result[0];
    resultOut->correction = result[1];

    return CL_SUCCESS;
}
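/* A hedged illustration (not part of the original source) of how the
 * reduction in runSummarization() converges: each pass emits one partial sum
 * per work group, so the element count shrinks by roughly the work group
 * size per pass. With an illustrative localSize of 64 and
 * nElements = 560000, successive passes hold 8750, 137, 3 and finally 1
 * elements, i.e. 4 launches in total. */
static cl_uint exampleCountReductionPasses(cl_uint nElements, cl_uint localSize)
{
    cl_uint passes = 0;

    if (localSize == 0)
        return 0;

    while (nElements > 1)
    {
        /* One partial sum per work group: ceil(nElements / localSize) remain */
        nElements = (nElements + localSize - 1) / localSize;
        ++passes;
    }

    return passes;
}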
/* Returns CL_TRUE on error */
cl_bool findRunSizes(RunSizes* sizes,
                     const CLInfo* ci,
                     const DevInfo* di,
                     const AstronomyParameters* ap,
                     const IntegralArea* ia,
                     const CLRequest* clr)
{
    WGInfo wgi;
    cl_int err;
    size_t nWavefrontPerCU;
    size_t blockSize; /* Size chunks should be multiples of */
    cl_bool forceOneChunk = clr->nonResponsive || di->nonOutput || di->hasGraphicsQOS;

    /* I assume this is how this works for 1D limit */
    const cl_ulong maxWorkDim = (cl_ulong) di->maxWorkItemSizes[0] * di->maxWorkItemSizes[1] * di->maxWorkItemSizes[2];
    const cl_ulong r = (cl_ulong) ia->r_steps;
    const cl_ulong mu = (cl_ulong) ia->mu_steps;

    sizes->r = ia->r_steps;
    sizes->mu = ia->mu_steps;
    sizes->nu = ia->nu_steps;

    if (r > CL_ULONG_MAX / mu)
    {
        mw_printf("Integral area overflows cl_ulong\n");
        return CL_TRUE;
    }

    sizes->area = r * mu;

    if (di->devType == CL_DEVICE_TYPE_CPU)
    {
        sizes->nChunk = sizes->nChunkEstimate = 1;
        sizes->chunkSize = sizes->effectiveArea = sizes->area;
        sizes->extra = 0;

        sizes->local[0] = 1;
        sizes->global[0] = sizes->area;

        return CL_FALSE;
    }

    err = mwGetWorkGroupInfo(_separationKernel, ci, &wgi);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to get work group info");
        return CL_TRUE;
    }

    if (clr->verbose)
    {
        mwPrintWorkGroupInfo(&wgi);
    }

    if (!mwDivisible(wgi.wgs, (size_t) di->warpSize))
    {
        mw_printf("Kernel reported work group size ("ZU") not a multiple of warp size (%u)\n",
                  wgi.wgs,
                  di->warpSize);
        return CL_TRUE;
    }

    /* This should give a good occupancy. If the global size isn't a
     * multiple of this bad performance things happen. */
    nWavefrontPerCU = wgi.wgs / di->warpSize;

    /* Since we don't use any workgroup features, it makes sense to
     * use the wavefront size as the workgroup size */
    sizes->local[0] = di->warpSize;

    /* For maximum efficiency, we want global work sizes to be multiples of
     * (warp size) * (number compute units) * (number of warps for good occupancy)
     * Then we throw in another factor since we can realistically do more work at once
     */
    blockSize = nWavefrontPerCU * di->warpSize * di->maxCompUnits;
    {
        cl_uint nBlockPerChunk = 1;
        sizes->nChunkEstimate = findNChunk(ap, ia, di, clr, &sizes->initialWait);

        /* Make a guess appropriate for the hardware. */

        /* m * b ~= area / n   */
        nBlockPerChunk = sizes->area / (sizes->nChunkEstimate * blockSize);
        if (nBlockPerChunk == 0)
            nBlockPerChunk = 1;

        sizes->chunkSize = nBlockPerChunk * blockSize;
    }

    //sizes->effectiveArea = sizes->chunkSize * mwDivRoundup(sizes->area, sizes->chunkSize);
    sizes->effectiveArea = di->warpSize * mwDivRoundup(sizes->area, di->warpSize);
    sizes->nChunk = forceOneChunk ? 1 : mwDivRoundup(sizes->effectiveArea, sizes->chunkSize);
    sizes->extra = (cl_uint) (sizes->effectiveArea - sizes->area);

    if (sizes->nChunk == 1) /* BlockPerChunk factor probably too high or very small workunit, or nonresponsive */
    {
        /* Like using nBlockPerChunk == 1 */
        sizes->effectiveArea = blockSize * mwDivRoundup(sizes->area, blockSize);
        sizes->chunkSize = sizes->effectiveArea;
        sizes->extra = sizes->effectiveArea - sizes->area;
    }

    mw_printf("Using a target frequency of %.1f\n"
              "Using a block size of "ZU" with "ZU" blocks/chunk\n",
              clr->targetFrequency,
              blockSize,
              sizes->chunkSize / blockSize
        );
    printPollMode(ci, sizes);

    sizes->chunkSize = sizes->effectiveArea / sizes->nChunk;

    /* We should be hitting memory size limits before we ever get to this */
    if (sizes->chunkSize > maxWorkDim)
    {
        mw_printf("Warning: Area too large for one chunk (max size = "LLU")\n", maxWorkDim);
        while (sizes->chunkSize > maxWorkDim)
        {
            sizes->nChunk *= 2;
            sizes->chunkSize = sizes->effectiveArea / sizes->nChunk;
        }

        if (!mwDivisible(sizes->chunkSize, sizes->local[0]))
        {
            mw_printf("FIXME: I'm too lazy to handle very large workunits properly\n");
            return CL_TRUE;
        }
        else if (!mwDivisible(sizes->chunkSize, blockSize))
        {
            mw_printf("FIXME: Very large workunit potentially slower than it should be\n");
        }
    }

    sizes->global[0] = sizes->chunkSize;

    printRunSizes(sizes, ia);

    if (sizes->effectiveArea < sizes->area)
    {
        mw_printf("Effective area less than actual area!\n");
        return CL_TRUE;
    }

    return CL_FALSE;
}
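/* A hedged worked example (not part of the original source) of the block
 * size arithmetic in findRunSizes(), with assumed numbers: a reported kernel
 * work group size of 256, a warp/wavefront size of 64 and 20 compute units
 * give nWavefrontPerCU = 256 / 64 = 4 and
 * blockSize = 4 * 64 * 20 = 5120 work items, so chunk sizes are then
 * multiples of 5120. */
static size_t exampleBlockSize(size_t kernelWorkGroupSize, cl_uint warpSize, cl_uint maxCompUnits)
{
    size_t nWavefrontPerCU = kernelWorkGroupSize / (size_t) warpSize;

    return nWavefrontPerCU * (size_t) warpSize * (size_t) maxCompUnits;
}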
static cl_int mwGetCLInfo(CLInfo* ci, const CLRequest* clr)
{
    cl_int err = CL_SUCCESS;
    cl_uint nPlatform = 0;
    cl_uint nDev = 0;
    cl_platform_id* ids;
    cl_device_id* devs;
    cl_uint platformChoice;

    ids = mwGetAllPlatformIDs(&nPlatform);
    if (!ids)
        return MW_CL_ERROR;

    if (mwIsFirstRun())
    {
        mwPrintPlatforms(ids, nPlatform);
    }

    /* We have this set by default to UINT_MAX, so if it's in a
     * legitimate range, it was specified */
    if (clr->platform < nPlatform)
    {
        platformChoice = clr->platform;
    }
    else if (strcmp(clr->preferredPlatformVendor, "")) /* Didn't specify platform by index, try picking one by name */
    {
        platformChoice = choosePlatform(clr->preferredPlatformVendor, ids, nPlatform);
    }
    else
    {
        platformChoice = 0;
    }

    if (platformChoice >= nPlatform)
    {
        mw_printf("Didn't find preferred platform\n");
        platformChoice = 0;
    }

    mw_printf("Using device %u on platform %u\n", clr->devNum, platformChoice);

    ci->plat = ids[platformChoice];
    devs = mwGetAllDevices(ci->plat, &nDev);
    if (!devs)
    {
        mw_printf("Error getting devices\n");
        free(ids);
        return MW_CL_ERROR;
    }

    err = mwSelectDevice(ci, devs, clr, nDev);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to select a device");
        err = -1;
    }

    free(ids);
    free(devs);

    return err;
}
cl_int mwGetDevInfo(DevInfo* di, cl_device_id dev)
{
    const AMDGPUData* amdData;
    cl_int err = CL_SUCCESS;

    di->devID = dev;

    err |= clGetDeviceInfo(dev, CL_DEVICE_TYPE,                     sizeof(di->devType),  &di->devType, NULL);

    err |= clGetDeviceInfo(dev, CL_DEVICE_NAME,                     sizeof(di->devName),  di->devName, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_VENDOR,                   sizeof(di->vendor),   di->vendor, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_VENDOR_ID,                sizeof(cl_uint),  &di->vendorID, NULL);
    err |= clGetDeviceInfo(dev, CL_DRIVER_VERSION,                  sizeof(di->driver),   di->driver, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_VERSION,                  sizeof(di->version),  di->version, NULL);
  //err |= clGetDeviceInfo(dev, CL_DEVICE_OPENCL_C_VERSION,         sizeof(di->clCVer),   di->clCVer, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_ENDIAN_LITTLE,            sizeof(cl_bool),  &di->littleEndian, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(cl_bool),  &di->errCorrect, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool),  &di->imgSupport, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_ADDRESS_BITS,             sizeof(cl_uint),  &di->addrBits, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_MAX_COMPUTE_UNITS,        sizeof(cl_uint),  &di->maxCompUnits, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_MAX_CLOCK_FREQUENCY,      sizeof(cl_uint),  &di->clockFreq, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_SIZE,          sizeof(cl_ulong), &di->memSize, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_MAX_MEM_ALLOC_SIZE,       sizeof(cl_ulong), &di->maxMemAlloc, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,    sizeof(cl_ulong), &di->gMemCache, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cl_uint), &di->cachelineSize, NULL);

  //err |= clGetDeviceInfo(dev, CL_DEVICE_HOST_UNIFIED_MEMORY,      sizeof(cl_ulong), &unifiedMem, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(cl_device_local_mem_type), &di->localMemType, NULL);

    err |= clGetDeviceInfo(dev, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(cl_device_fp_config), &di->doubleFPConfig, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(cl_device_fp_config), &di->floatFPConfig, NULL);

    err |= clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_SIZE,           sizeof(cl_ulong), &di->localMemSize, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_ARGS,        sizeof(cl_uint),  &di->maxConstArgs, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(cl_ulong), &di->maxConstBufSize, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof(size_t), &di->maxParamSize, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &di->maxWorkGroupSize, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &di->maxWorkItemDim, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(di->maxWorkItemSizes), di->maxWorkItemSizes, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &di->memBaseAddrAlign, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, sizeof(cl_uint), &di->minAlignSize, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(size_t), &di->timerRes, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, sizeof(di->exts), di->exts, NULL);

    di->computeCapabilityMajor = di->computeCapabilityMinor = 0;
    di->warpSize = 0;
    if (err == CL_SUCCESS)
    {
        if (strstr(di->exts, "cl_nv_device_attribute_query") != NULL)
        {
            err |= clGetDeviceInfo(dev, CL_DEVICE_WARP_SIZE_NV,
                                   sizeof(di->warpSize), &di->warpSize, NULL);
            err |= clGetDeviceInfo(di->devID, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
                                   sizeof(cl_uint), &di->computeCapabilityMajor, NULL);
            err |= clGetDeviceInfo(di->devID, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
                                   sizeof(cl_uint), &di->computeCapabilityMinor, NULL);
        }
        else
        {
            if (di->devType == CL_DEVICE_TYPE_CPU)
            {
                di->warpSize = 1;
            }
            else if (di->devType == CL_DEVICE_TYPE_GPU)
            {
                /* FIXME: How do I get this on AMD? It's 64 for all of
                 * the high end stuff, but 32 for lower. I think it's
                 * 64 for all the GPUs that do have doubles */
                di->warpSize = 64;
            }
            else
            {
                mw_printf("Unknown device type, using warp size = 1\n");
                di->warpSize = 1;
            }
        }
    }

    di->nonOutput = mwDeviceIsNonOutput(di);
    di->hasGraphicsQOS = mwDeviceHasGraphicsQOS(di);


    if (mwIsNvidiaGPUDevice(di))
    {
        di->aluPerCU = mwCUDACoresPerComputeUnit(di);
        di->doubleFrac = mwCUDAEstimateDoubleFrac(di);
        di->calTarget = MW_CAL_TARGET_INVALID;
    }
    else if (mwIsAMDGPUDevice(di))
    {
        amdData = mwLookupAMDGPUInfo(di);

        di->aluPerCU   = amdData->aluPerCU;
        di->doubleFrac = amdData->doubleFrac;
        di->calTarget  = amdData->target;
        di->warpSize   = amdData->wavefrontSize;
    }

    if (di->warpSize == 0)
    {
        mw_printf("Unknown device type, using warp size = 1\n");
        di->warpSize = 1;
    }

    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error getting device information");
    }
    else
    {
        di->doubleExts = mwGetDoubleExts(di->exts);
    }

    return err;
}