Example #1
/* Returns CL_TRUE on error */
cl_bool findRunSizes(RunSizes* sizes,
                     const CLInfo* ci,
                     const DevInfo* di,
                     const AstronomyParameters* ap,
                     const IntegralArea* ia,
                     const CLRequest* clr)
{
    WGInfo wgi;
    cl_int err;
    size_t nWavefrontPerCU;
    size_t blockSize; /* Size chunks should be multiples of */
    cl_bool forceOneChunk = clr->nonResponsive || di->nonOutput || di->hasGraphicsQOS;

    /* I assume this is how the 1D work-item limit works */
    const cl_ulong maxWorkDim = (cl_ulong) di->maxWorkItemSizes[0] * di->maxWorkItemSizes[1] * di->maxWorkItemSizes[2];
    const cl_ulong r = (cl_ulong) ia->r_steps;
    const cl_ulong mu = (cl_ulong) ia->mu_steps;

    if (r > CL_ULONG_MAX / mu)
    {
        mw_printf("Integral area overflows cl_ulong\n");
        return CL_TRUE;
    }

    sizes->r = ia->r_steps;
    sizes->mu = ia->mu_steps;
    sizes->nu = ia->nu_steps;
    sizes->area = r * mu;

    if (di->devType == CL_DEVICE_TYPE_CPU)
    {
        sizes->nChunk = sizes->nChunkEstimate = 1;
        sizes->chunkSize = sizes->effectiveArea = sizes->area;
        sizes->extra = 0;

        sizes->local[0] = 1;
        sizes->global[0] = sizes->area;

        return CL_FALSE;
    }

    err = mwGetWorkGroupInfo(_separationKernel, ci, &wgi);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to get work group info");
        return CL_TRUE;
    }

    if (clr->verbose)
    {
        mwPrintWorkGroupInfo(&wgi);
    }

    if (!mwDivisible(wgi.wgs, (size_t) di->warpSize))
    {
        mw_printf("Kernel reported work group size ("ZU") not a multiple of warp size (%u)\n",
                  wgi.wgs,
                  di->warpSize);
        return CL_TRUE;
    }

    /* This should give good occupancy. If the global size isn't a
     * multiple of this, performance suffers badly. */
    nWavefrontPerCU = wgi.wgs / di->warpSize;

    /* Since we don't use any workgroup features, it makes sense to
     * use the wavefront size as the workgroup size */
    sizes->local[0] = di->warpSize;

    /* For maximum efficiency, we want global work sizes to be multiples of
     * (warp size) * (number of compute units) * (wavefronts per CU for good occupancy).
     * Then we throw in another factor, since we can realistically do more work at once.
     */
    blockSize = nWavefrontPerCU * di->warpSize * di->maxCompUnits;
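    /* Illustrative example (numbers not from the source): with a 64-wide
     * wavefront, a reported work group size of 256 (so 4 wavefronts per CU)
     * and 20 compute units, blockSize = 4 * 64 * 20 = 5120 work items. */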
    {
        cl_uint nBlockPerChunk = 1;
        sizes->nChunkEstimate = findNChunk(ap, ia, di, clr, &sizes->initialWait);

        /* Make a guess appropriate for the hardware. */

        /* m * b ~= area / n   */
        nBlockPerChunk = sizes->area / (sizes->nChunkEstimate * blockSize);
        if (nBlockPerChunk == 0)
            nBlockPerChunk = 1;

        sizes->chunkSize = nBlockPerChunk * blockSize;
    }

    //sizes->effectiveArea = sizes->chunkSize * mwDivRoundup(sizes->area, sizes->chunkSize);
    sizes->effectiveArea = di->warpSize * mwDivRoundup(sizes->area, di->warpSize);
    sizes->nChunk = forceOneChunk ? 1 : mwDivRoundup(sizes->effectiveArea, sizes->chunkSize);
    sizes->extra = (cl_uint) (sizes->effectiveArea - sizes->area);

    if (sizes->nChunk == 1) /* BlockPerChunk factor probably too high or very small workunit, or nonresponsive */
    {
        /* Like using nBlockPerChunk == 1 */
        sizes->effectiveArea = blockSize * mwDivRoundup(sizes->area, blockSize);
        sizes->chunkSize = sizes->effectiveArea;
        sizes->extra = sizes->effectiveArea - sizes->area;
    }

    mw_printf("Using a target frequency of %.1f\n"
              "Using a block size of "ZU" with "ZU" blocks/chunk\n",
              clr->targetFrequency,
              blockSize,
              sizes->chunkSize / blockSize);
    printPollMode(ci, sizes);

    sizes->chunkSize = sizes->effectiveArea / sizes->nChunk;

    /* We should be hitting memory size limits before we ever get to this */
    if (sizes->chunkSize > maxWorkDim)
    {
        mw_printf("Warning: Area too large for one chunk (max size = "LLU")\n", maxWorkDim);
        while (sizes->chunkSize > maxWorkDim)
        {
            sizes->nChunk *= 2;
            sizes->chunkSize = sizes->effectiveArea / sizes->nChunk;
        }

        if (!mwDivisible(sizes->chunkSize, sizes->local[0]))
        {
            mw_printf("FIXME: I'm too lazy to handle very large workunits properly\n");
            return CL_TRUE;
        }
        else if (!mwDivisible(sizes->chunkSize, blockSize))
        {
            mw_printf("FIXME: Very large workunit potentially slower than it should be\n");
        }
    }

    sizes->global[0] = sizes->chunkSize;

    printRunSizes(sizes, ia);

    if (sizes->effectiveArea < sizes->area)
    {
        mw_printf("Effective area less than actual area!\n");
        return CL_TRUE;
    }

    return CL_FALSE;
}
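The chunking arithmetic in the GPU path above is easier to follow in isolation. The sketch below mirrors it with made-up numbers; divRoundup, the warp size, the compute unit count, and the chunk estimate are illustrative assumptions standing in for the project's mwDivRoundup() and device queries, not the actual helpers.

#include <stdio.h>

/* Assumed semantics of the project's mwDivRoundup(): ceiling division */
static size_t divRoundup(size_t a, size_t b)
{
    return (a + b - 1) / b;
}

int main(void)
{
    /* Illustrative device/workunit numbers, not taken from the source */
    size_t warpSize       = 64;
    size_t maxCompUnits   = 20;
    size_t wavefrontPerCU = 4;             /* wgi.wgs / warpSize            */
    size_t area           = 1600 * 700;    /* r_steps * mu_steps            */
    size_t nChunkEstimate = 8;

    /* Multiples of this keep every compute unit busy */
    size_t blockSize = wavefrontPerCU * warpSize * maxCompUnits;   /* 5120 */

    /* Roughly area / (nChunkEstimate * blockSize) blocks per chunk */
    size_t nBlockPerChunk = area / (nChunkEstimate * blockSize);
    if (nBlockPerChunk == 0)
        nBlockPerChunk = 1;

    size_t chunkSize     = nBlockPerChunk * blockSize;
    size_t effectiveArea = warpSize * divRoundup(area, warpSize);
    size_t nChunk        = divRoundup(effectiveArea, chunkSize);
    size_t extra         = effectiveArea - area;   /* padded-out work items */

    printf("blockSize=%zu chunkSize=%zu nChunk=%zu extra=%zu\n",
           blockSize, chunkSize, nChunk, extra);
    return 0;
}

With these numbers the area happens to already be a multiple of the warp size, so extra is 0; findRunSizes then splits the padded area evenly across the chunks and caps the chunk size against the device's 1D work-item limit.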
Example #2
/* Returns CL_TRUE on error */
cl_bool findRunSizes(RunSizes* sizes,
                     const CLInfo* ci,
                     const DevInfo* di,
                     const IntegralArea* ia,
                     const CLRequest* clr)
{
    WGInfo wgi;
    cl_int err;
    cl_uint groupSize;
    size_t i, nMod;
    size_t sum = 0;

    err = mwGetWorkGroupInfo(ci, &wgi);
    if (err != CL_SUCCESS)
    {
        mwCLWarn("Failed to get work group info", err);
        return CL_TRUE;
    }

    mwPrintWorkGroupInfo(&wgi);
    groupSize = mwFindGroupSize(di);

    sizes->local[0] = groupSize;
    sizes->local[1] = 1;

    sizes->nChunkEstimate = chooseNumChunk(ia, clr, di);
    sizes->nChunk = sizes->nChunkEstimate;

    /* Chunk boundary granularity, chosen for performance:
       be a bit more flexible when chunking, but use the whole
       block size when attacking everything at once.
    */
    nMod = sizes->nChunk == 1 ? mwBlockSize(di) : groupSize * di->maxCompUnits;

    sizes->area = ia->r_steps * ia->mu_steps;
    sizes->effectiveArea = nMod * mwDivRoundup(sizes->area, nMod);
    sizes->extra = sizes->effectiveArea - sizes->area;
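    /* Illustrative example (numbers not from the source): with groupSize = 64,
     * 20 compute units and chunking enabled, nMod = 1280; an area of 100000
     * then pads to effectiveArea = 101120, leaving extra = 1120 dummy items. */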

    warn("Keeping chunk boundaries as multiples of "ZU"\n", nMod);

    if (sizes->effectiveArea / sizes->nChunk < nMod)
    {
        sizes->nChunk = sizes->effectiveArea / nMod;
        warn("Warning: Estimated number of chunks ("ZU") too large. Using "ZU"\n", sizes->nChunkEstimate, sizes->nChunk);
    }

    if (nMod == 1)
    {
        /* When nMod = 1 we need to avoid losing pieces at the
         * beginning and end; the normal method doesn't quite work. */
        while (sizes->nChunk * (sizes->effectiveArea / sizes->nChunk) < sizes->effectiveArea)
        {
            sizes->nChunk++;
        }
        warn("Need to use "ZU" chunks to cover area using multiples of 1\n", sizes->nChunk);
    }

    sizes->chunkBorders = mwCallocA((sizes->nChunk + 1), sizeof(size_t));
    for (i = 0; i <= sizes->nChunk; ++i)
    {
        if (nMod == 1)
        {
            /* Avoid losing the 0 border */
            sizes->chunkBorders[i] = i * (sizes->effectiveArea / sizes->nChunk);
        }
        else
        {
            sizes->chunkBorders[i] = (i * sizes->effectiveArea + sizes->nChunk) / (sizes->nChunk * nMod);
            sizes->chunkBorders[i] *= nMod;
        }

        if (sizes->chunkBorders[i] > sizes->effectiveArea)
            sizes->chunkBorders[i] = sizes->effectiveArea;

        if (i > 0)
        {
            sum += sizes->chunkBorders[i] - sizes->chunkBorders[i - 1];
            assert(sizes->chunkBorders[i] - sizes->chunkBorders[i - 1] > 0);
        }
    }

    printRunSizes(sizes, ia, clr->verbose);

    if (sum != sizes->effectiveArea)  /* Assert that the divisions aren't broken */
    {
        warn("Chunk total does not match: "ZU" != "ZU"\n", sum, sizes->effectiveArea);
        free(sizes->chunkBorders);
        return CL_TRUE;
    }

    return CL_FALSE;
}
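The border computation in Example 2 is also easy to check standalone. The sketch below repeats the same rounding arithmetic with made-up values (nMod, area, and nChunk are illustrative assumptions, and plain calloc stands in for mwCallocA) and asserts that the aligned chunks tile the padded area exactly.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    /* Illustrative values, not taken from the source */
    size_t nMod          = 256;      /* e.g. groupSize * maxCompUnits       */
    size_t area          = 100000;   /* r_steps * mu_steps                  */
    size_t effectiveArea = nMod * ((area + nMod - 1) / nMod);
    size_t nChunk        = 7;

    size_t* borders = calloc(nChunk + 1, sizeof(size_t));
    size_t i, sum = 0;

    if (!borders)
        return 1;

    for (i = 0; i <= nChunk; ++i)
    {
        /* Same rounding as Example 2: take i/nChunk of the padded area,
           nudge it up slightly, and snap it to a multiple of nMod */
        borders[i] = (i * effectiveArea + nChunk) / (nChunk * nMod);
        borders[i] *= nMod;
        if (borders[i] > effectiveArea)
            borders[i] = effectiveArea;

        if (i > 0)
        {
            sum += borders[i] - borders[i - 1];
            assert(borders[i] > borders[i - 1]);
        }
    }

    /* The chunks must cover the padded area exactly */
    assert(sum == effectiveArea);
    printf("covered %zu work items in %zu aligned chunks\n", sum, nChunk);

    free(borders);
    return 0;
}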