/* Returns CL_TRUE on error */ cl_bool findRunSizes(RunSizes* sizes, const CLInfo* ci, const DevInfo* di, const AstronomyParameters* ap, const IntegralArea* ia, const CLRequest* clr) { WGInfo wgi; cl_int err; size_t nWavefrontPerCU; size_t blockSize; /* Size chunks should be multiples of */ cl_bool forceOneChunk = clr->nonResponsive || di->nonOutput || di->hasGraphicsQOS; /* I assume this is how this works for 1D limit */ const cl_ulong maxWorkDim = (cl_ulong) di->maxWorkItemSizes[0] * di->maxWorkItemSizes[1] * di->maxWorkItemSizes[2]; const cl_ulong r = (cl_ulong) ia->r_steps; const cl_ulong mu = (cl_ulong) ia->mu_steps; sizes->r = ia->r_steps; sizes->mu = ia->mu_steps; sizes->nu = ia->nu_steps; sizes->area = r * mu; if (r > CL_ULONG_MAX / mu) { mw_printf("Integral area overflows cl_ulong\n"); return CL_TRUE; } if (di->devType == CL_DEVICE_TYPE_CPU) { sizes->nChunk = sizes->nChunkEstimate = 1; sizes->chunkSize = sizes->effectiveArea = sizes->area; sizes->extra = 0; sizes->local[0] = 1; sizes->global[0] = sizes->area; return CL_FALSE; } err = mwGetWorkGroupInfo(_separationKernel, ci, &wgi); if (err != CL_SUCCESS) { mwPerrorCL(err, "Failed to get work group info"); return CL_TRUE; } if (clr->verbose) { mwPrintWorkGroupInfo(&wgi); } if (!mwDivisible(wgi.wgs, (size_t) di->warpSize)) { mw_printf("Kernel reported work group size ("ZU") not a multiple of warp size (%u)\n", wgi.wgs, di->warpSize); return CL_TRUE; } /* This should give a good occupancy. If the global size isn't a * multiple of this bad performance things happen. 
*/ nWavefrontPerCU = wgi.wgs / di->warpSize; /* Since we don't use any workgroup features, it makes sense to * use the wavefront size as the workgroup size */ sizes->local[0] = di->warpSize; /* For maximum efficiency, we want global work sizes to be multiples of * (warp size) * (number compute units) * (number of warps for good occupancy) * Then we throw in another factor since we can realistically do more work at once */ blockSize = nWavefrontPerCU * di->warpSize * di->maxCompUnits; { cl_uint nBlockPerChunk = 1; sizes->nChunkEstimate = findNChunk(ap, ia, di, clr, &sizes->initialWait); /* Make a guess appropriate for the hardware. */ /* m * b ~= area / n */ nBlockPerChunk = sizes->area / (sizes->nChunkEstimate * blockSize); if (nBlockPerChunk == 0) nBlockPerChunk = 1; sizes->chunkSize = nBlockPerChunk * blockSize; } //sizes->effectiveArea = sizes->chunkSize * mwDivRoundup(sizes->area, sizes->chunkSize); sizes->effectiveArea = di->warpSize * mwDivRoundup(sizes->area, di->warpSize); sizes->nChunk = forceOneChunk ? 
1 : mwDivRoundup(sizes->effectiveArea, sizes->chunkSize); sizes->extra = (cl_uint) (sizes->effectiveArea - sizes->area); if (sizes->nChunk == 1) /* BlockPerChunk factor probably too high or very small workunit, or nonresponsive */ { /* Like using nBlockPerChunk == 1 */ sizes->effectiveArea = blockSize * mwDivRoundup(sizes->area, blockSize); sizes->chunkSize = sizes->effectiveArea; sizes->extra = sizes->effectiveArea - sizes->area; } mw_printf("Using a target frequency of %.1f\n" "Using a block size of "ZU" with "ZU" blocks/chunk\n", clr->targetFrequency, blockSize, sizes->chunkSize / blockSize ); printPollMode(ci, sizes); sizes->chunkSize = sizes->effectiveArea / sizes->nChunk; /* We should be hitting memory size limits before we ever get to this */ if (sizes->chunkSize > maxWorkDim) { mw_printf("Warning: Area too large for one chunk (max size = "LLU")\n", maxWorkDim); while (sizes->chunkSize > maxWorkDim) { sizes->nChunk *= 2; sizes->chunkSize = sizes->effectiveArea / sizes->nChunk; } if (!mwDivisible(sizes->chunkSize, sizes->local[0])) { mw_printf("FIXME: I'm too lazy to handle very large workunits properly\n"); return CL_TRUE; } else if (!mwDivisible(sizes->chunkSize, blockSize)) { mw_printf("FIXME: Very large workunit potentially slower than it should be\n"); } } sizes->global[0] = sizes->chunkSize; printRunSizes(sizes, ia); if (sizes->effectiveArea < sizes->area) { mw_printf("Effective area less than actual area!\n"); return CL_TRUE; } return CL_FALSE; }
/* Set arguments using the same fit parameters as have been used.
 * Expects a flat table of fit parameters on top of the Lua stack:
 * the first NUMBER_FIT_BG_PARAMETERS entries are background values,
 * followed by NUMBER_FIT_STREAM_PARAMETERS entries per stream. */
static int luaDefaultSetBGStreamParametersFromArguments(lua_State* luaSt)
{
    /* Stream field names, in the order their values appear in the
       flat parameter list */
    static const char* const streamFields[] = { "epsilon", "mu", "r", "theta", "phi", "sigma" };
    const int nStreamFields = (int) (sizeof(streamFields) / sizeof(streamFields[0]));
    int i, j;
    int ptable, bgTable, streamTable, thisStream;
    int n, nStream, base;

    ptable = mw_lua_checktable(luaSt, lua_gettop(luaSt));
    n = luaL_getn(luaSt, ptable); /* Number parameters */

    /* Build the background table and publish it as a global */
    lua_newtable(luaSt);
    bgTable = lua_gettop(luaSt);
    lua_pushvalue(luaSt, bgTable);
    lua_setglobal(luaSt, BACKGROUND_NAME);
    bgTable = lua_gettop(luaSt);

    /* The first two fit parameters are the background q and r0 */
    lua_rawgeti(luaSt, ptable, 1);
    lua_setfield(luaSt, bgTable, "q");
    lua_rawgeti(luaSt, ptable, 2);
    lua_setfield(luaSt, bgTable, "r0");

    /* Fixed values, not included in the fit */
    lua_pushnumber(luaSt, 0.0);
    lua_setfield(luaSt, bgTable, "epsilon");
    lua_pushnumber(luaSt, 1.0);
    lua_setfield(luaSt, bgTable, "alpha");
    lua_pushnumber(luaSt, 1.0);
    lua_setfield(luaSt, bgTable, "delta");

    lua_pop(luaSt, 1); /* Done with the background table */

    /* Build the stream table and publish it as a global */
    lua_newtable(luaSt);
    streamTable = lua_gettop(luaSt);
    lua_pushvalue(luaSt, streamTable);
    lua_setglobal(luaSt, STREAMS_NAME);
    streamTable = lua_gettop(luaSt);

    if (!mwDivisible(n - NUMBER_FIT_BG_PARAMETERS, NUMBER_FIT_STREAM_PARAMETERS))
    {
        return luaL_error(luaSt, "Parameter count (%d) inconsistent with default argument mapping\n", n);
    }

    nStream = (n - NUMBER_FIT_BG_PARAMETERS) / NUMBER_FIT_STREAM_PARAMETERS;

    for (i = 0; i < nStream; ++i)
    {
        lua_newtable(luaSt);
        thisStream = lua_gettop(luaSt);

        /* Copy this stream's slice of the flat list into named fields */
        base = NUMBER_FIT_BG_PARAMETERS + i * NUMBER_FIT_STREAM_PARAMETERS;
        for (j = 0; j < nStreamFields; ++j)
        {
            lua_rawgeti(luaSt, ptable, base + j + 1);
            lua_setfield(luaSt, thisStream, streamFields[j]);
        }

        mw_lua_assert_top_type(luaSt, LUA_TTABLE);
        lua_rawseti(luaSt, streamTable, i + 1);
    }

    lua_pop(luaSt, 1); /* Done with the stream table */

    return 0;
}