/* Returns CL_TRUE on error */ cl_bool findRunSizes(RunSizes* sizes, const CLInfo* ci, const DevInfo* di, const AstronomyParameters* ap, const IntegralArea* ia, const CLRequest* clr) { WGInfo wgi; cl_int err; size_t nWavefrontPerCU; size_t blockSize; /* Size chunks should be multiples of */ cl_bool forceOneChunk = clr->nonResponsive || di->nonOutput || di->hasGraphicsQOS; /* I assume this is how this works for 1D limit */ const cl_ulong maxWorkDim = (cl_ulong) di->maxWorkItemSizes[0] * di->maxWorkItemSizes[1] * di->maxWorkItemSizes[2]; const cl_ulong r = (cl_ulong) ia->r_steps; const cl_ulong mu = (cl_ulong) ia->mu_steps; sizes->r = ia->r_steps; sizes->mu = ia->mu_steps; sizes->nu = ia->nu_steps; sizes->area = r * mu; if (r > CL_ULONG_MAX / mu) { mw_printf("Integral area overflows cl_ulong\n"); return CL_TRUE; } if (di->devType == CL_DEVICE_TYPE_CPU) { sizes->nChunk = sizes->nChunkEstimate = 1; sizes->chunkSize = sizes->effectiveArea = sizes->area; sizes->extra = 0; sizes->local[0] = 1; sizes->global[0] = sizes->area; return CL_FALSE; } err = mwGetWorkGroupInfo(_separationKernel, ci, &wgi); if (err != CL_SUCCESS) { mwPerrorCL(err, "Failed to get work group info"); return CL_TRUE; } if (clr->verbose) { mwPrintWorkGroupInfo(&wgi); } if (!mwDivisible(wgi.wgs, (size_t) di->warpSize)) { mw_printf("Kernel reported work group size ("ZU") not a multiple of warp size (%u)\n", wgi.wgs, di->warpSize); return CL_TRUE; } /* This should give a good occupancy. If the global size isn't a * multiple of this bad performance things happen. 
*/ nWavefrontPerCU = wgi.wgs / di->warpSize; /* Since we don't use any workgroup features, it makes sense to * use the wavefront size as the workgroup size */ sizes->local[0] = di->warpSize; /* For maximum efficiency, we want global work sizes to be multiples of * (warp size) * (number compute units) * (number of warps for good occupancy) * Then we throw in another factor since we can realistically do more work at once */ blockSize = nWavefrontPerCU * di->warpSize * di->maxCompUnits; { cl_uint nBlockPerChunk = 1; sizes->nChunkEstimate = findNChunk(ap, ia, di, clr, &sizes->initialWait); /* Make a guess appropriate for the hardware. */ /* m * b ~= area / n */ nBlockPerChunk = sizes->area / (sizes->nChunkEstimate * blockSize); if (nBlockPerChunk == 0) nBlockPerChunk = 1; sizes->chunkSize = nBlockPerChunk * blockSize; } //sizes->effectiveArea = sizes->chunkSize * mwDivRoundup(sizes->area, sizes->chunkSize); sizes->effectiveArea = di->warpSize * mwDivRoundup(sizes->area, di->warpSize); sizes->nChunk = forceOneChunk ? 
1 : mwDivRoundup(sizes->effectiveArea, sizes->chunkSize); sizes->extra = (cl_uint) (sizes->effectiveArea - sizes->area); if (sizes->nChunk == 1) /* BlockPerChunk factor probably too high or very small workunit, or nonresponsive */ { /* Like using nBlockPerChunk == 1 */ sizes->effectiveArea = blockSize * mwDivRoundup(sizes->area, blockSize); sizes->chunkSize = sizes->effectiveArea; sizes->extra = sizes->effectiveArea - sizes->area; } mw_printf("Using a target frequency of %.1f\n" "Using a block size of "ZU" with "ZU" blocks/chunk\n", clr->targetFrequency, blockSize, sizes->chunkSize / blockSize ); printPollMode(ci, sizes); sizes->chunkSize = sizes->effectiveArea / sizes->nChunk; /* We should be hitting memory size limits before we ever get to this */ if (sizes->chunkSize > maxWorkDim) { mw_printf("Warning: Area too large for one chunk (max size = "LLU")\n", maxWorkDim); while (sizes->chunkSize > maxWorkDim) { sizes->nChunk *= 2; sizes->chunkSize = sizes->effectiveArea / sizes->nChunk; } if (!mwDivisible(sizes->chunkSize, sizes->local[0])) { mw_printf("FIXME: I'm too lazy to handle very large workunits properly\n"); return CL_TRUE; } else if (!mwDivisible(sizes->chunkSize, blockSize)) { mw_printf("FIXME: Very large workunit potentially slower than it should be\n"); } } sizes->global[0] = sizes->chunkSize; printRunSizes(sizes, ia); if (sizes->effectiveArea < sizes->area) { mw_printf("Effective area less than actual area!\n"); return CL_TRUE; } return CL_FALSE; }
/*
 * Fill in `sizes` with the work layout used to integrate over the
 * r_steps x mu_steps area described by `ia`, including a freshly allocated
 * table of chunk borders.
 *
 * The area is padded up to a multiple of nMod (the full block size when a
 * single chunk is used, otherwise group size * compute units) and split
 * into sizes->nChunk pieces whose borders are multiples of nMod.
 *
 * sizes: output; local[0..1], nChunkEstimate, nChunk, area, effectiveArea,
 *        extra and chunkBorders are written. On success chunkBorders holds
 *        nChunk + 1 entries allocated with mwCallocA(); it is released and
 *        reset to NULL if the final consistency check fails.
 * ci:    passed to mwGetWorkGroupInfo().
 * di:    device description used to pick the group and block sizes.
 * ia:    integral dimensions.
 * clr:   user request (verbosity, chunking preferences).
 *
 * Returns CL_TRUE on error, CL_FALSE on success.
 */
cl_bool findRunSizes(RunSizes* sizes,
                     const CLInfo* ci,
                     const DevInfo* di,
                     const IntegralArea* ia,
                     const CLRequest* clr)
{
    WGInfo wgi;
    cl_int err;
    cl_uint groupSize;
    size_t i, nMod;
    size_t rSteps, muSteps;
    size_t sum = 0;

    err = mwGetWorkGroupInfo(ci, &wgi);
    if (err != CL_SUCCESS)
    {
        mwCLWarn("Failed to get work group info", err);
        return CL_TRUE;
    }

    mwPrintWorkGroupInfo(&wgi);

    groupSize = mwFindGroupSize(di);
    sizes->local[0] = groupSize;
    sizes->local[1] = 1;

    sizes->nChunkEstimate = chooseNumChunk(ia, clr, di);
    sizes->nChunk = sizes->nChunkEstimate;

    /* The chunk count is used as a divisor below */
    if (sizes->nChunk == 0)
    {
        warn("Estimated number of chunks is zero\n");
        return CL_TRUE;
    }

    /* Best for performance. Be a bit more flexible if chunking.
       Using the whole block size seems a bit better when attacking everything at once.
     */
    nMod = sizes->nChunk == 1 ? mwBlockSize(di) : groupSize * di->maxCompUnits;
    if (nMod == 0)
    {
        warn("Chunk size multiple is zero\n");
        return CL_TRUE;
    }

    /* Widen before multiplying and reject empty or overflowing areas; an
     * empty area would otherwise lead to a zero chunk count and a division
     * by zero when the borders are computed. */
    rSteps = (size_t) ia->r_steps;
    muSteps = (size_t) ia->mu_steps;
    if (rSteps == 0 || muSteps == 0)
    {
        warn("Integral area is empty\n");
        return CL_TRUE;
    }

    if (rSteps > ((size_t) -1) / muSteps)
    {
        warn("Integral area overflows size_t\n");
        return CL_TRUE;
    }

    sizes->area = rSteps * muSteps;
    sizes->effectiveArea = nMod * mwDivRoundup(sizes->area, nMod);
    sizes->extra = sizes->effectiveArea - sizes->area;

    warn("Keeping chunk boundaries as multiples of "ZU"\n", nMod);

    /* Never use more chunks than there are nMod-sized pieces */
    if (sizes->effectiveArea / sizes->nChunk < nMod)
    {
        sizes->nChunk = sizes->effectiveArea / nMod;
        warn("Warning: Estimated number of chunks ("ZU") too large. Using "ZU"\n", sizes->nChunkEstimate, sizes->nChunk);
    }

    if (nMod == 1)
    {
        /* When nMod = 1 we need to avoid losing pieces at the
         * beginning and end; the normal method doesn't quite work. */
        while (sizes->nChunk * (sizes->effectiveArea / sizes->nChunk) < sizes->effectiveArea)
        {
            sizes->nChunk++;
        }

        warn("Need to use "ZU" chunks to cover area using multiples of 1\n", sizes->nChunk);
    }

    sizes->chunkBorders = mwCallocA((sizes->nChunk + 1), sizeof(size_t));
    if (!sizes->chunkBorders)
    {
        warn("Failed to allocate chunk borders\n");
        return CL_TRUE;
    }

    for (i = 0; i <= sizes->nChunk; ++i)
    {
        if (nMod == 1)
        {
            /* Avoid losing out the 0 border */
            sizes->chunkBorders[i] = i * (sizes->effectiveArea / sizes->nChunk);
        }
        else
        {
            /* Split evenly, then snap the border to a multiple of nMod */
            sizes->chunkBorders[i] = (i * sizes->effectiveArea + sizes->nChunk) / (sizes->nChunk * nMod);
            sizes->chunkBorders[i] *= nMod;
        }

        if (sizes->chunkBorders[i] > sizes->effectiveArea)
        {
            sizes->chunkBorders[i] = sizes->effectiveArea;
        }

        if (i > 0)
        {
            sum += sizes->chunkBorders[i] - sizes->chunkBorders[i - 1];
            /* Compare directly: an unsigned difference wraps instead of
             * going negative, so "difference > 0" cannot catch a border
             * that moved backwards. */
            assert(sizes->chunkBorders[i] > sizes->chunkBorders[i - 1]);
        }
    }

    printRunSizes(sizes, ia, clr->verbose);

    if (sum != sizes->effectiveArea)  /* Assert that the divisions aren't broken */
    {
        warn("Chunk total does not match: "ZU" != "ZU"\n", sum, sizes->effectiveArea);
        /* NOTE(review): the buffer comes from mwCallocA(); confirm that
         * plain free() is the matching deallocator on every platform. */
        free(sizes->chunkBorders);
        sizes->chunkBorders = NULL;   /* don't leave a dangling pointer for the caller */
        return CL_TRUE;
    }

    return CL_FALSE;
}