/* Return the elapsed execution time of a profiled event in nanoseconds.
 * Returns 0 for a NULL event or when profiling info cannot be read. */
cl_ulong mwEventTimeNS(cl_event ev)
{
    cl_ulong startNS, endNS;
    cl_int rc;

    if (!ev)
        return 0;

    rc = clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startNS, NULL);
    if (rc != CL_SUCCESS)
    {
        mwPerrorCL(rc, "Failed to get event start time");
        return 0;
    }

    rc = clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endNS, NULL);
    if (rc != CL_SUCCESS)
    {
        mwPerrorCL(rc, "Failed to get event end time");
        return 0;
    }

    /* End timestamp can never precede the start timestamp */
    assert(endNS >= startNS);

    return endNS - startNS;
}
/* Create the lTrig/bSin constant device buffers from the precomputed
 * latitude/longitude trig tables for this integral area.
 *
 * Fix: the host-side tables from getSplitLBTrig() were leaked when
 * either clCreateBuffer() failed; they are now freed on all paths. */
static cl_int createLBTrigBuffer(CLInfo* ci,
                                 SeparationCLMem* cm,
                                 const AstronomyParameters* ap,
                                 const IntegralArea* ia,
                                 const SeparationSizes* sizes,
                                 const cl_mem_flags constBufFlags)
{
    cl_int err = CL_SUCCESS;
    LTrigPair* lTrig = NULL;
    real* bSin = NULL;

    getSplitLBTrig(ap, ia, &lTrig, &bSin);

    cm->lTrig = clCreateBuffer(ci->clctx, constBufFlags, sizes->lTrig, lTrig, &err);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error creating lTrig buffer of size "ZU, sizes->lTrig);
        goto cleanup;
    }

    cm->bSin = clCreateBuffer(ci->clctx, constBufFlags, sizes->bSin, bSin, &err);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error creating bSin buffer of size "ZU, sizes->bSin);
        goto cleanup;
    }

cleanup:
    /* constBufFlags presumably copies the host data at creation, so the
       host-side tables are no longer needed either way */
    mwFreeA(lTrig);
    mwFreeA(bSin);

    return err;
}
/* Fill pInfo's name/vendor/version/extensions/profile strings for the
 * given platform.
 *
 * Fix: the original checked only readSize, which clGetPlatformInfo
 * leaves unset on failure — reading it was then undefined behavior and
 * real errors (err != CL_SUCCESS) were silently ignored. Each query now
 * initializes readSize, checks err, and zeroes the destination on
 * failure so callers never see garbage. */
static void mwGetPlatformInfo(PlatformInfo* pInfo, cl_platform_id platform)
{
    /* Table-drive the queries so every field gets identical handling */
    struct
    {
        cl_platform_info param;
        char* dst;
        size_t dstSize;
        const char* what;
    } queries[] =
    {
        { CL_PLATFORM_NAME,       pInfo->name,       sizeof(pInfo->name),       "name"       },
        { CL_PLATFORM_VENDOR,     pInfo->vendor,     sizeof(pInfo->vendor),     "vendor"     },
        { CL_PLATFORM_VERSION,    pInfo->version,    sizeof(pInfo->version),    "version"    },
        { CL_PLATFORM_EXTENSIONS, pInfo->extensions, sizeof(pInfo->extensions), "extensions" },
        { CL_PLATFORM_PROFILE,    pInfo->profile,    sizeof(pInfo->profile),    "profile"    }
    };
    size_t i;

    for (i = 0; i < sizeof(queries) / sizeof(queries[0]); ++i)
    {
        size_t readSize = 0; /* defined even if the query fails */
        cl_int err = clGetPlatformInfo(platform,
                                       queries[i].param,
                                       queries[i].dstSize,
                                       queries[i].dst,
                                       &readSize);

        if ((err != CL_SUCCESS) || (readSize > queries[i].dstSize))
        {
            mwPerrorCL(err, "Failed to read platform %s", queries[i].what);
            queries[i].dst[0] = '\0';
        }
    }
}
/* Create the CL context and the main command queue on ci->dev.
 * Optionally enables profiling on the main queue and creates a second
 * queue (bufQueue) for buffer operations. */
static cl_int mwCreateCtxQueue(CLInfo* ci, cl_bool useBufQueue, cl_bool enableProfiling)
{
    cl_command_queue_properties props = 0;
    cl_int rc = CL_SUCCESS;

    if (enableProfiling)
    {
        props = CL_QUEUE_PROFILING_ENABLE;
    }

    ci->clctx = clCreateContext(NULL, 1, &ci->dev, MW_CONTEXT_LOGGER, NULL, &rc);
    if (rc != CL_SUCCESS)
    {
        mwPerrorCL(rc, "Error creating context");
        return rc;
    }

    ci->queue = clCreateCommandQueue(ci->clctx, ci->dev, props, &rc);
    if (rc != CL_SUCCESS)
    {
        mwPerrorCL(rc, "Error creating command queue");
        return rc;
    }

    if (useBufQueue)
    {
        /* Secondary queue never uses profiling */
        ci->bufQueue = clCreateCommandQueue(ci->clctx, ci->dev, 0, &rc);
        if (rc != CL_SUCCESS)
        {
            mwPerrorCL(rc, "Error creating buffer command queue");
            return rc;
        }
    }

    return CL_SUCCESS;
}
/* Enqueue one chunk of the integral kernel at the given 1D offset and
 * block until it completes. */
static cl_int runIntegralKernel(CLInfo* ci, const RunSizes* runSizes, const size_t offset[1])
{
    cl_event kernelEv;
    cl_int rc;

    rc = clEnqueueNDRangeKernel(ci->queue, _separationKernel, 1,
                                offset, runSizes->global, runSizes->local,
                                0, NULL, &kernelEv);
    if (rc != CL_SUCCESS)
    {
        mwPerrorCL(rc, "Error enqueueing integral kernel execution");
        return rc;
    }

    /* Give the screen a chance to redraw */
    rc = mwCLWaitForEvent(ci, kernelEv, runSizes->initialWait);
    clReleaseEvent(kernelEv);
    if (rc != CL_SUCCESS)
    {
        mwPerrorCL(rc, "Failed to wait for integral event");
        return rc;
    }

    return CL_SUCCESS;
}
/* Create a new device buffer with the same size and flags as buf and
 * copy buf's contents into it. Returns NULL on any failure.
 *
 * Fix: the original used `flags ^= CL_MEM_COPY_HOST_PTR`, which TOGGLES
 * the bit — if the source buffer was not created from a host pointer,
 * the XOR would set CL_MEM_COPY_HOST_PTR while passing a NULL host
 * pointer, making clCreateBuffer fail. The flag is now cleared.
 * Also restores the missing space in one error format string. */
cl_mem mwDuplicateBuffer(CLInfo* ci, cl_mem buf)
{
    cl_mem bufCopy;
    size_t size;
    cl_mem_flags flags;
    cl_int err;
    cl_event ev;

    if (!buf)
    {
        return NULL;
    }

    err = clGetMemObjectInfo(buf, CL_MEM_FLAGS, sizeof(flags), &flags, NULL);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to get memory flags for buffer duplication");
        return NULL;
    }

    err = clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(size), &size, NULL);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to get memory size for buffer duplication");
        return NULL;
    }

    /* We may have initialized that one from a host pointer, but not this
       one, so make sure the flag is cleared (not toggled) */
    flags &= ~(cl_mem_flags) CL_MEM_COPY_HOST_PTR;

    /* Create a copy of the same size */
    bufCopy = clCreateBuffer(ci->clctx, flags, size, NULL, &err);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to create copy buffer of size "ZU, size);
        return NULL;
    }

    err = clEnqueueCopyBuffer(ci->queue, buf, bufCopy, 0, 0, size, 0, NULL, &ev);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to enqueue buffer copy of size "ZU, size);
        clReleaseMemObject(bufCopy);
        return NULL;
    }

    err = mwWaitReleaseEvent(&ev);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to wait for buffer copy");
        clReleaseMemObject(bufCopy);
        return NULL;
    }

    return bufCopy;
}
/* Only sets the constant arguments, not the outputs which we double buffer.
 * Kernel argument order: output buffers (0-1), global constant buffers
 * (2-5), __constant buffers (6-8), then the run-size counts (9-12). */
cl_int separationSetKernelArgs(SeparationCLMem* cm, const RunSizes* runSizes)
{
    cl_int err = CL_SUCCESS;
    cl_uint i;

    /* Buffer arguments in kernel argument order */
    cl_mem* bufArgs[9] =
    {
        &cm->outBg,   /* output buffers */
        &cm->outStreams,
        &cm->rc,      /* global constant buffers */
        &cm->rPts,
        &cm->lTrig,
        &cm->bSin,
        &cm->ap,      /* __constant buffers */
        &cm->sc,
        &cm->sg_dx
    };

    const cl_uint* countArgs[4] =
    {
        &runSizes->extra,
        &runSizes->r,
        &runSizes->mu,
        &runSizes->nu
    };

    for (i = 0; i < 9; ++i)
    {
        err |= clSetKernelArg(_separationKernel, i, sizeof(cl_mem), bufArgs[i]);
    }

    for (i = 0; i < 4; ++i)
    {
        err |= clSetKernelArg(_separationKernel, 9 + i, sizeof(cl_uint), countArgs[i]);
    }

    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error setting kernel arguments");
        return err;
    }

    return CL_SUCCESS;
}
/* Fill wgi with the compile-time work group size, maximum work group
 * size, and local memory usage reported for kern on ci's device. */
cl_int mwGetWorkGroupInfo(cl_kernel kern, const CLInfo* ci, WGInfo* wgi)
{
    cl_int rc = CL_SUCCESS;

    rc |= clGetKernelWorkGroupInfo(kern, ci->dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
                                   sizeof(wgi->cwgs), wgi->cwgs, NULL);
    rc |= clGetKernelWorkGroupInfo(kern, ci->dev, CL_KERNEL_WORK_GROUP_SIZE,
                                   sizeof(size_t), &wgi->wgs, NULL);
    rc |= clGetKernelWorkGroupInfo(kern, ci->dev, CL_KERNEL_LOCAL_MEM_SIZE,
                                   sizeof(cl_ulong), &wgi->lms, NULL);

    if (rc != CL_SUCCESS)
    {
        mwPerrorCL(rc, "Failed to get kernel work group info");
    }

    return rc;
}
/* Run one integral on the CL device: size the run, create the device
 * buffers, bind kernel arguments, execute all nu steps, then read back
 * and accumulate the results into es.
 *
 * Fix: the separation buffers were leaked when separationSetKernelArgs()
 * failed; they are now released on that path too. */
cl_int integrateCL(const AstronomyParameters* ap,
                   const IntegralArea* ia,
                   const StreamConstants* sc,
                   const StreamGauss sg,
                   EvaluationState* es,
                   const CLRequest* clr,
                   CLInfo* ci)
{
    cl_int err;
    RunSizes runSizes;
    SeparationSizes sizes;
    SeparationCLMem cm = EMPTY_SEPARATION_CL_MEM;

    /* Need to test sizes for each integral, since the area size can change */
    calculateSizes(&sizes, ap, ia);

    if (findRunSizes(&runSizes, ci, &ci->di, ap, ia, clr))
    {
        mw_printf("Failed to find good run sizes\n");
        return MW_CL_ERROR;
    }

    err = createSeparationBuffers(ci, &cm, ap, ia, sc, sg, &sizes);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to create CL buffers");
        return err;
    }

    err = separationSetKernelArgs(&cm, &runSizes);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to set integral kernel arguments");
        releaseSeparationBuffers(&cm); /* was leaked on this path */
        return err;
    }

    err = runIntegral(ci, &cm, &runSizes, es, clr, ap, ia);

    releaseSeparationBuffers(&cm);

    separationIntegralGetSums(es);

    return err;
}
/* Query the CL device type (CPU/GPU/...) for dev into *devType */
static cl_int mwGetDeviceType(cl_device_id dev, cl_device_type* devType)
{
    cl_int rc = clGetDeviceInfo(dev, CL_DEVICE_TYPE, sizeof(cl_device_type), devType, NULL);

    if (rc != CL_SUCCESS)
    {
        mwPerrorCL(rc, "Failed to get device type");
    }

    return rc;
}
/* Mark a user event complete so anything waiting on it can proceed */
cl_int mwFinishEvent(cl_event ev)
{
    cl_int rc = clSetUserEventStatus(ev, CL_COMPLETE);

    if (rc != CL_SUCCESS)
    {
        mwPerrorCL(rc, "Failed to mark custom event as completed");
    }

    return rc;
}
/* Create the r-coordinate constant device buffers (r points, r consts,
 * and the stream gauss dx table) from the precomputed host tables.
 *
 * Fix: the host-side r_pts/rc tables from precalculateRPts() were
 * leaked when any clCreateBuffer() failed; they are now freed on all
 * paths. */
static cl_int createRBuffers(CLInfo* ci,
                             SeparationCLMem* cm,
                             const AstronomyParameters* ap,
                             const IntegralArea* ia,
                             const StreamGauss sg,
                             const SeparationSizes* sizes,
                             cl_mem_flags constBufFlags)
{
    cl_int err = CL_SUCCESS;
    RPoints* r_pts;
    RConsts* rc;

    r_pts = precalculateRPts(ap, ia, sg, &rc, FALSE);

    cm->rPts = clCreateBuffer(ci->clctx, constBufFlags, sizes->rPts, r_pts, &err);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error creating stream r points buffer of size "ZU, sizes->rPts);
        goto cleanup;
    }

    cm->rc = clCreateBuffer(ci->clctx, constBufFlags, sizes->rc, rc, &err);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error creating stream r consts buffer of size "ZU, sizes->rc);
        goto cleanup;
    }

    cm->sg_dx = clCreateBuffer(ci->clctx, constBufFlags, sizes->sg_dx, sg.dx, &err);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error creating stream sg_dx buffer of size "ZU, sizes->sg_dx);
        goto cleanup;
    }

cleanup:
    /* Buffers copy the host data at creation, so the host tables can go */
    mwFreeA(r_pts);
    mwFreeA(rc);

    return err;
}
/* Drive the full integral: iterate over all nu steps (resuming from
 * es->nu_step if restarted from a checkpoint), checkpointing
 * periodically, then read back the kernel results and fold them into
 * the running totals.
 *
 * Note the asymmetric error handling: a checkpoint failure breaks out
 * of the loop (so the timing summary still prints and err propagates),
 * while a failed nu step returns immediately, skipping the summary,
 * the result read-back, and the checkpoint-sum accumulation. */
static cl_int runIntegral(CLInfo* ci, SeparationCLMem* cm, RunSizes* runSizes, EvaluationState* es, const CLRequest* clr, const AstronomyParameters* ap, const IntegralArea* ia)
{
    cl_int err = CL_SUCCESS;
    double t1, t2, dt;
    double tAcc = 0.0; /* accumulated wall time over all steps, in ms */

    for (; es->nu_step < ia->nu_steps; es->nu_step++)
    {
        /* Checkpoint before running the next step when it's time */
        if (clr->enableCheckpointing && timeToCheckpointGPU(es, ia))
        {
            err = checkpointCL(ci, cm, ia, es);
            if (err != CL_SUCCESS)
                break;
        }

        t1 = mwGetTimeMilli();
        err = runNuStep(ci, ia, runSizes, es->nu_step);
        if (err != CL_SUCCESS)
        {
            mwPerrorCL(err, "Failed to run nu step");
            return err;
        }
        t2 = mwGetTimeMilli();

        dt = t2 - t1;
        tAcc += dt;

        reportProgress(ap, ia, es, es->nu_step + 1, dt);
    }
    /* NOTE(review): nu_step is reset even when the loop exited via a
       checkpoint failure above — confirm a later retry is expected to
       restart from the checkpoint rather than this counter */
    es->nu_step = 0;

    mw_printf("Integration time: %f s. Average time per iteration = %f ms\n", tAcc / 1000.0, tAcc / (double) ia->nu_steps);

    if (err == CL_SUCCESS)
    {
        err = readKernelResults(ci, cm, es, ia);
        if (err != CL_SUCCESS)
            mw_printf("Failed to read final kernel results\n");

        /* Add final episode to running totals */
        addTmpCheckpointSums(es);
    }

    return err;
}
/* Set the per-nu-step kernel arguments: the nu coordinate value (arg 13)
 * and the step index (arg 14) for the given step.
 *
 * Fix: the second error message was copy-pasted and wrongly referred to
 * the nu_id argument; it now names the nu_step argument. */
static cl_int setNuKernelArgs(const IntegralArea* ia, cl_uint nu_step)
{
    cl_int err;
    NuId nuid;

    nuid = calcNuStep(ia, nu_step);

    err = clSetKernelArg(_separationKernel, 13, sizeof(real), &nuid.id);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error setting nu_id argument for step %u", nu_step);
        return err;
    }

    err = clSetKernelArg(_separationKernel, 14, sizeof(cl_uint), &nu_step);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error setting nu_step argument for step %u", nu_step);
        return err;
    }

    return CL_SUCCESS;
}
/* Block until *ev completes, then release it */
cl_int mwWaitReleaseEvent(cl_event* ev)
{
    cl_int rc;

    assert(ev);

    rc = clWaitForEvents(1, ev);
    if (rc != CL_SUCCESS)
    {
        mwPerrorCL(rc, "Failed to wait for event");
        return rc;
    }

    rc = clReleaseEvent(*ev);
    if (rc != CL_SUCCESS)
    {
        mwPerrorCL(rc, "Failed to release event");
    }

    return rc;
}
/* Return a malloced array of every available CL platform ID, storing
 * the count in *nPlatformsOut. Returns NULL when no platforms exist or
 * on error; caller frees the array. */
cl_platform_id* mwGetAllPlatformIDs(cl_uint* nPlatformsOut)
{
    cl_platform_id* ids = NULL;
    cl_uint nPlatform = 0;
    cl_uint nPlatformActual = 0;
    cl_int rc;

    rc = clGetPlatformIDs(0, NULL, &nPlatform);
    if (rc != CL_SUCCESS)
    {
        mwPerrorCL(rc, "Error getting number of platform");
        return NULL;
    }

    if (nPlatform == 0)
    {
        mw_printf("No CL platforms found\n");
        return NULL;
    }

    ids = mwMalloc(nPlatform * sizeof(cl_platform_id));
    rc = clGetPlatformIDs(nPlatform, ids, &nPlatformActual);
    if ((rc != CL_SUCCESS) || (nPlatformActual != nPlatform))
    {
        /* Count changing between the two calls would leave a stale array */
        mwPerrorCL(rc,
                   "Error getting platform IDs or inconsistent platform count (expected %u, actual %u)\n",
                   nPlatform,
                   nPlatformActual);
        free(ids);
        return NULL;
    }

    mw_printf("Found %u platform%s\n", nPlatform, nPlatform > 1 ? "s" : "");

    *nPlatformsOut = nPlatform;
    return ids;
}
/* Return a malloced array of the device IDs on the platform, storing
 * the count in *numDevOut. Returns NULL if none are found or on error;
 * caller frees the array. */
cl_device_id* mwGetAllDevices(cl_platform_id platform, cl_uint* numDevOut)
{
    cl_device_id* devs;
    cl_uint numDev = 0;
    cl_int rc;

    /* We may want to use CPUs for debugging, but the index BOINC gives
       you seems to only use GPUs */
    cl_device_type type = BOINC_APPLICATION ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_ALL;

    rc = clGetDeviceIDs(platform, type, 0, NULL, &numDev);
    if (rc != CL_SUCCESS)
    {
        mwPerrorCL(rc, "Failed to find number of devices");
        return NULL;
    }

    if (numDev == 0)
    {
        mw_printf("Didn't find any CL devices\n");
        return NULL;
    }

    mw_printf("Found %u CL device%s\n", numDev, numDev > 1 ? "s" : "");

    devs = (cl_device_id*) mwMalloc(numDev * sizeof(cl_device_id));
    rc = clGetDeviceIDs(platform, type, numDev, devs, &numDev);
    if (rc != CL_SUCCESS)
    {
        mwPerrorCL(rc, "Failed to get device IDs");
        free(devs);
        return NULL;
    }

    *numDevOut = numDev;
    return devs;
}
/* Create a user event on ci's context; NULL on failure */
cl_event mwCreateEvent(CLInfo* ci)
{
    cl_int rc;
    cl_event userEv = clCreateUserEvent(ci->clctx, &rc);

    if (rc != CL_SUCCESS)
    {
        mwPerrorCL(rc, "Failed to create custom event");
        return NULL;
    }

    return userEv;
}
cl_bool mwPlatformSupportsAMDOfflineDevices(const CLInfo* ci) { cl_int err; char exts[4096]; size_t readSize = 0; err = clGetPlatformInfo(ci->plat, CL_PLATFORM_EXTENSIONS, sizeof(exts), exts, &readSize); if ((err != CL_SUCCESS) || (readSize >= sizeof(exts))) { mwPerrorCL(err, "Error reading platform extensions (readSize = "ZU")\n", readSize); return CL_FALSE; } return (strstr(exts, "cl_amd_offline_devices") != NULL); }
/* Create a read-write buffer of the given size (optionally pinned via
 * CL_MEM_ALLOC_HOST_PTR) and zero it by mapping it, memset, and
 * unmapping. Returns NULL on failure.
 *
 * Fix: when the map (or unmap) failed, the original still returned the
 * buffer handle — a buffer that was never zeroed, with no way for the
 * caller to tell. Failures now release the buffer and return NULL. */
static cl_mem mwCreateZeroReadWriteBufferComplete(CLInfo* ci, size_t size, cl_bool pinned)
{
    void* p;
    cl_mem mem = NULL;
    cl_int err = CL_SUCCESS;
    cl_mem_flags flags = CL_MEM_READ_WRITE;

    if (pinned)
    {
        flags |= CL_MEM_ALLOC_HOST_PTR;
        /* flags |= CL_MEM_USE_PERSISTENT_MEM_AMD; */
    }

    mem = clCreateBuffer(ci->clctx, flags, size, NULL, &err);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to create zero buffer of size "ZU, size);
        return NULL;
    }

    p = clEnqueueMapBuffer(ci->queue, mem, CL_TRUE, CL_MAP_WRITE, 0, size, 0, NULL, NULL, &err);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error mapping zero buffer");
        goto fail;
    }

    memset(p, 0, size);

    err = clEnqueueUnmapMemObject(ci->queue, mem, p, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to unmap zero buffer");
        goto fail;
    }

    return mem;

fail:
    clReleaseMemObject(mem);
    return NULL;
}
/* Pack the subset of AstronomyParameters the kernel reads into a fixed
 * 16-double constant buffer. Slot 2 carries convolve and number_streams
 * bit-punned as two cl_uints inside one double; unused slots stay zero. */
static cl_int createAPBuffer(CLInfo* ci, SeparationCLMem* cm, const AstronomyParameters* ap, const SeparationSizes* sizes, const cl_mem_flags constBufFlags)
{
    cl_int err = CL_SUCCESS;
    double packed[16] = { 0.0 }; /* zero-initialize every slot */
    union
    {
        double d;
        cl_uint i[2];
    } uintPair;

    /* Two integer counts punned into one double for slot 2 */
    uintPair.i[0] = ap->convolve;
    uintPair.i[1] = ap->number_streams;
    packed[2] = uintPair.d;

    packed[4] = ap->m_sun_r0;
    packed[5] = ap->r0;
    packed[6] = ap->q_inv_sqr;

    packed[8] = ap->bg_a;
    packed[9] = ap->bg_b;
    packed[10] = ap->bg_c;

    cm->ap = clCreateBuffer(ci->clctx, constBufFlags, sizeof(packed), (void*) packed, &err);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error creating astronomy parameters buffer of size "ZU, sizes->ap);
        return err;
    }

    return CL_SUCCESS;
}
/* Release every CL object held by ci. Safe on a partially initialized
 * CLInfo: NULL handles are skipped so setup failures can reuse this. */
cl_int mwDestroyCLInfo(CLInfo* ci)
{
    cl_int rc = CL_SUCCESS;

    if (ci->queue)
        rc |= clReleaseCommandQueue(ci->queue);
    if (ci->bufQueue)
        rc |= clReleaseCommandQueue(ci->bufQueue);
    if (ci->prog)
        rc |= clReleaseProgram(ci->prog);
    if (ci->clctx)
        rc |= clReleaseContext(ci->clctx);

    /* TODO: or'ing the err and showing = useless */
    if (rc)
        mwPerrorCL(rc, "Error cleaning up CLInfo");

    return rc;
}
/* Interleave the per-stream constants into the 8-reals-per-stream
 * layout the kernel expects (a/c components paired, sigma_sq2_inv,
 * padding) and upload them as a constant buffer. */
static cl_int createSCBuffer(CLInfo* ci, SeparationCLMem* cm, const StreamConstants* sc, const SeparationSizes* sizes, const cl_mem_flags constBufFlags)
{
    cl_int err;
    cl_int n;
    real* packed;

    packed = mwCallocA(sizes->nStream * 8, sizeof(real));

    /* Pack into format used by kernel */
    for (n = 0; n < sizes->nStream; ++n)
    {
        real* row = &packed[8 * n];

        row[0] = X(sc[n].a);
        row[1] = X(sc[n].c);
        row[2] = Y(sc[n].a);
        row[3] = Y(sc[n].c);
        row[4] = Z(sc[n].a);
        row[5] = Z(sc[n].c);
        row[6] = sc[n].sigma_sq2_inv;
        row[7] = 0.0; /* padding */
    }

    cm->sc = clCreateBuffer(ci->clctx, constBufFlags, sizes->sc, (void*) packed, &err);
    mwFreeA(packed);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error creating stream constants buffer of size "ZU, sizes->sc);
        return err;
    }

    return CL_SUCCESS;
}
/* Return the index of the first platform whose vendor string contains
 * prefVendor. Return CL_UINT_MAX if it doesn't find one.
 *
 * Fix: a NULL prefVendor previously reached strncpy (undefined
 * behavior); it is now rejected up front like the other bad inputs. */
static cl_uint choosePlatform(const char* prefVendor, const cl_platform_id* platforms, cl_uint nPlatform)
{
    cl_uint i;
    char platVendor[256];
    char prefBuf[256];
    cl_int err;

    if (!prefVendor || !platforms || nPlatform == 0)
        return CL_UINT_MAX;

    /* No strnstr() on Windows, also be paranoid */
    memset(prefBuf, 0, sizeof(prefBuf));
    strncpy(prefBuf, prefVendor, sizeof(prefBuf) - 1);
    prefBuf[sizeof(prefBuf) - 1] = '\0';

    /* Out of the available platforms, see if one has a matching vendor */
    for (i = 0; i < nPlatform; ++i)
    {
        err = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(platVendor), platVendor, NULL);
        if (err != CL_SUCCESS)
        {
            mwPerrorCL(err, "Error getting platform vendor");
            return CL_UINT_MAX;
        }

        if (strstr(platVendor, prefBuf))
        {
            return i;
        }
    }

    return CL_UINT_MAX;
}
cl_int setupSeparationCL(CLInfo* ci, const AstronomyParameters* ap, const IntegralArea* ias, const CLRequest* clr) { char* compileFlags; cl_bool useILKernel; cl_int err = MW_CL_ERROR; const char* kernSrc = (const char*) probabilities_kernel_cl; size_t kernSrcLen = probabilities_kernel_cl_len; const char* summarizationKernSrc = (const char*) summarization_kernel_cl; size_t summarizationKernSrcLen = summarization_kernel_cl_len; err = mwSetupCL(ci, clr); if (err != CL_SUCCESS) { mwPerrorCL(err, "Error getting device and context"); return err; } if (!separationCheckDevCapabilities(&ci->di)) { return MW_CL_ERROR; } useILKernel = usingILKernelIsAcceptable(ci, ap, clr); compileFlags = getCompilerFlags(ci, ap, useILKernel); if (!compileFlags) { mw_printf("Failed to get CL compiler flags\n"); return MW_CL_ERROR; } if (clr->verbose) { mw_printf("\nCompiler flags:\n%s\n\n", compileFlags); } integrationProgram = mwCreateProgramFromSrc(ci, 1, &kernSrc, &kernSrcLen, compileFlags); if (!integrationProgram) { mw_printf("Error creating integral program from source\n"); err = MW_CL_ERROR; goto setup_exit; } summarizationProgram = mwCreateProgramFromSrc(ci, 1, &summarizationKernSrc, &summarizationKernSrcLen, compileFlags); if (!summarizationProgram) { mw_printf("Error creating summarization program from source\n"); err = MW_CL_ERROR; goto setup_exit; } if (useILKernel) { mw_printf("Using AMD IL kernel\n"); err = setProgramFromILKernel(ci, ap); if (err != CL_SUCCESS) { mw_printf("Failed to create IL kernel. Falling back to source kernel\n"); } } if (err == CL_SUCCESS) { _separationKernel = mwCreateKernel(integrationProgram, "probabilities"); _summarizationKernel = mwCreateKernel(summarizationProgram, "summarization"); if ( !_separationKernel || !_summarizationKernel || setSummarizationWorkgroupSize(ci) || !separationCheckDevMemory(&ci->di, ap, ias)) { err = MW_CL_ERROR; } } setup_exit: free(compileFlags); return err; }
/* Reduce one output buffer to a single Kahan sum on the device.
 *
 * which == 0 selects the background output buffer; which >= 1 selects
 * the (which-1)'th stream's slice of the stream output buffer. The
 * reduction runs the summarization kernel repeatedly: the first pass
 * reads nElements values starting at `offset` in the selected output
 * buffer, each later pass ping-pongs between the two summarization
 * buffers, shrinking the element count by a factor of the workgroup
 * size until one partial sum (sum + correction pair) remains, which is
 * then read back into resultOut. */
static cl_int runSummarization(CLInfo* ci,
                               SeparationCLMem* cm,
                               const IntegralArea* ia,
                               cl_uint which,
                               Kahan* resultOut)
{
    cl_int err = CL_SUCCESS;
    cl_mem buf;
    cl_uint offset;
    size_t global[1];
    size_t local[1];
    real result[2] = { -1.0, -1.0 };
    cl_uint nElements = ia->r_steps * ia->mu_steps;
    cl_mem sumBufs[2] = { cm->summarizationBufs[0], cm->summarizationBufs[1] };

    if (which == 0)
    {
        buf = cm->outBg;
        offset = 0;
    }
    else
    {
        /* Stream outputs are laid out consecutively, one area per stream */
        buf = cm->outStreams;
        offset = (which - 1) * nElements;
    }

    /* First call reads from an offset into one of the output buffers */
    err |= clSetKernelArg(_summarizationKernel, 0, sizeof(cl_mem), &sumBufs[0]);
    err |= clSetKernelArg(_summarizationKernel, 1, sizeof(cl_mem), &buf);
    err |= clSetKernelArg(_summarizationKernel, 2, sizeof(cl_uint), &nElements);
    err |= clSetKernelArg(_summarizationKernel, 3, sizeof(cl_uint), &offset);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error setting summarization kernel arguments");
        return err;
    }

    local[0] = _summarizationWorkgroupSize;
    global[0] = mwNextMultiple(local[0], nElements);

    err = clEnqueueNDRangeKernel(ci->queue, _summarizationKernel, 1, NULL, global, local, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error enqueuing summarization kernel");
        return err;
    }

    /* Why is this necessary?
       It seems to frequently break on the 7970 and nowhere else without it */
    err = clFinish(ci->queue);
    //err = clFlush(ci->queue);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error finishing summarization kernel");
        return err;
    }

    /* Later calls swap between summarization buffers without an offset */
    nElements = (cl_uint) mwDivRoundup(global[0], local[0]);
    offset = 0;
    err |= clSetKernelArg(_summarizationKernel, 3, sizeof(cl_uint), &offset);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error setting summarization kernel offset argument");
        return err;
    }

    while (nElements > 1)
    {
        /* Swap old summarization buffer to the input and shrink the range */
        swapBuffers(sumBufs);

        global[0] = mwNextMultiple(local[0], nElements);

        err |= clSetKernelArg(_summarizationKernel, 0, sizeof(cl_mem), &sumBufs[0]);
        err |= clSetKernelArg(_summarizationKernel, 1, sizeof(cl_mem), &sumBufs[1]);
        err |= clSetKernelArg(_summarizationKernel, 2, sizeof(cl_uint), &nElements);
        if (err != CL_SUCCESS)
        {
            mwPerrorCL(err, "Error setting summarization kernel arguments");
            return err;
        }

        /*
        err = clEnqueueBarrier(ci->queue);
        if (err != CL_SUCCESS)
        {
            mwPerrorCL(err, "Error enqueuing summarization barrier");
            return err;
        }
        */

        err = clEnqueueNDRangeKernel(ci->queue, _summarizationKernel, 1, NULL, global, local, 0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            mwPerrorCL(err, "Error enqueuing summarization kernel");
            return err;
        }

        /* Same workaround as the first pass; see comment above */
        err = clFinish(ci->queue);
        if (err != CL_SUCCESS)
        {
            mwPerrorCL(err, "Error finishing summarization kernel");
            return err;
        }

        /* One partial sum per workgroup survives each pass */
        nElements = (cl_uint) mwDivRoundup(global[0], local[0]);
    }

    err = clEnqueueBarrier(ci->queue);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error enqueuing summarization barrier");
        return err;
    }

    /* Blocking read of the final (sum, correction) pair */
    err = clEnqueueReadBuffer(ci->queue, sumBufs[0], CL_TRUE, 0, 2 * sizeof(real), result, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error reading summarization result buffer");
        return err;
    }

    resultOut->sum = result[0];
    resultOut->correction = result[1];

    return CL_SUCCESS;
}
/* Choose global/local work sizes and the chunking scheme for the
 * integral kernel on this device. CPUs run everything as one chunk with
 * a workgroup size of 1; GPUs are sized from the kernel's reported
 * workgroup info, the warp/wavefront size, and a target-frequency-based
 * chunk-count estimate.
 * Returns CL_TRUE on error */
cl_bool findRunSizes(RunSizes* sizes,
                     const CLInfo* ci,
                     const DevInfo* di,
                     const AstronomyParameters* ap,
                     const IntegralArea* ia,
                     const CLRequest* clr)
{
    WGInfo wgi;
    cl_int err;
    size_t nWavefrontPerCU;
    size_t blockSize; /* Size chunks should be multiples of */
    cl_bool forceOneChunk = clr->nonResponsive || di->nonOutput || di->hasGraphicsQOS;
    /* I assume this is how this works for 1D limit */
    const cl_ulong maxWorkDim = (cl_ulong) di->maxWorkItemSizes[0] * di->maxWorkItemSizes[1] * di->maxWorkItemSizes[2];
    const cl_ulong r = (cl_ulong) ia->r_steps;
    const cl_ulong mu = (cl_ulong) ia->mu_steps;

    sizes->r = ia->r_steps;
    sizes->mu = ia->mu_steps;
    sizes->nu = ia->nu_steps;
    sizes->area = r * mu;
    /* NOTE(review): the product above is computed before this check;
       unsigned wrap is well-defined and the wrapped value is unused on
       the error path, but checking first would read more clearly */
    if (r > CL_ULONG_MAX / mu)
    {
        mw_printf("Integral area overflows cl_ulong\n");
        return CL_TRUE;
    }

    /* CPU devices: one chunk, one work item per area element */
    if (di->devType == CL_DEVICE_TYPE_CPU)
    {
        sizes->nChunk = sizes->nChunkEstimate = 1;
        sizes->chunkSize = sizes->effectiveArea = sizes->area;
        sizes->extra = 0;

        sizes->local[0] = 1;
        sizes->global[0] = sizes->area;

        return CL_FALSE;
    }

    err = mwGetWorkGroupInfo(_separationKernel, ci, &wgi);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to get work group info");
        return CL_TRUE;
    }

    if (clr->verbose)
    {
        mwPrintWorkGroupInfo(&wgi);
    }

    if (!mwDivisible(wgi.wgs, (size_t) di->warpSize))
    {
        mw_printf("Kernel reported work group size ("ZU") not a multiple of warp size (%u)\n",
                  wgi.wgs,
                  di->warpSize);
        return CL_TRUE;
    }

    /* This should give a good occupancy. If the global size isn't a
     * multiple of this bad performance things happen. */
    nWavefrontPerCU = wgi.wgs / di->warpSize;

    /* Since we don't use any workgroup features, it makes sense to
     * use the wavefront size as the workgroup size */
    sizes->local[0] = di->warpSize;

    /* For maximum efficiency, we want global work sizes to be multiples of
     * (warp size) * (number compute units) * (number of warps for good occupancy)
     * Then we throw in another factor since we can realistically do more work at once */
    blockSize = nWavefrontPerCU * di->warpSize * di->maxCompUnits;

    {
        cl_uint nBlockPerChunk = 1;
        sizes->nChunkEstimate = findNChunk(ap, ia, di, clr, &sizes->initialWait);

        /* Make a guess appropriate for the hardware. */
        /* m * b ~= area / n */
        nBlockPerChunk = sizes->area / (sizes->nChunkEstimate * blockSize);
        if (nBlockPerChunk == 0)
            nBlockPerChunk = 1;

        sizes->chunkSize = nBlockPerChunk * blockSize;
    }

    //sizes->effectiveArea = sizes->chunkSize * mwDivRoundup(sizes->area, sizes->chunkSize);
    sizes->effectiveArea = di->warpSize * mwDivRoundup(sizes->area, di->warpSize);
    sizes->nChunk = forceOneChunk ? 1 : mwDivRoundup(sizes->effectiveArea, sizes->chunkSize);
    /* extra = padding work items beyond the real area */
    sizes->extra = (cl_uint) (sizes->effectiveArea - sizes->area);

    if (sizes->nChunk == 1) /* BlockPerChunk factor probably too high or very small workunit, or nonresponsive */
    {
        /* Like using nBlockPerChunk == 1 */
        sizes->effectiveArea = blockSize * mwDivRoundup(sizes->area, blockSize);
        sizes->chunkSize = sizes->effectiveArea;
        sizes->extra = sizes->effectiveArea - sizes->area;
    }

    mw_printf("Using a target frequency of %.1f\n"
              "Using a block size of "ZU" with "ZU" blocks/chunk\n",
              clr->targetFrequency,
              blockSize,
              sizes->chunkSize / blockSize);

    printPollMode(ci, sizes);

    sizes->chunkSize = sizes->effectiveArea / sizes->nChunk;

    /* We should be hitting memory size limits before we ever get to this */
    if (sizes->chunkSize > maxWorkDim)
    {
        mw_printf("Warning: Area too large for one chunk (max size = "LLU")\n", maxWorkDim);

        /* Halve the chunk size (by doubling the chunk count) until it fits */
        while (sizes->chunkSize > maxWorkDim)
        {
            sizes->nChunk *= 2;
            sizes->chunkSize = sizes->effectiveArea / sizes->nChunk;
        }

        if (!mwDivisible(sizes->chunkSize, sizes->local[0]))
        {
            mw_printf("FIXME: I'm too lazy to handle very large workunits properly\n");
            return CL_TRUE;
        }
        else if (!mwDivisible(sizes->chunkSize, blockSize))
        {
            mw_printf("FIXME: Very large workunit potentially slower than it should be\n");
        }
    }

    sizes->global[0] = sizes->chunkSize;

    printRunSizes(sizes, ia);

    /* Sanity check: padding must never make the covered area shrink */
    if (sizes->effectiveArea < sizes->area)
    {
        mw_printf("Effective area less than actual area!\n");
        return CL_TRUE;
    }

    return CL_FALSE;
}
/* Pick a platform (explicit index from clr, else by preferred vendor
 * name, else platform 0), enumerate its devices, and select one into
 * ci. Returns CL_SUCCESS or an error code. */
static cl_int mwGetCLInfo(CLInfo* ci, const CLRequest* clr)
{
    cl_int rc = CL_SUCCESS;
    cl_uint nPlatform = 0;
    cl_uint nDev = 0;
    cl_platform_id* ids;
    cl_device_id* devs;
    cl_uint platformChoice;

    ids = mwGetAllPlatformIDs(&nPlatform);
    if (!ids)
        return MW_CL_ERROR;

    if (mwIsFirstRun())
    {
        mwPrintPlatforms(ids, nPlatform);
    }

    if (clr->platform < nPlatform)
    {
        /* clr->platform defaults to UINT_MAX, so an in-range value
           means the user explicitly specified a platform index */
        platformChoice = clr->platform;
    }
    else if (strcmp(clr->preferredPlatformVendor, ""))
    {
        /* Didn't specify platform by index, try picking one by name */
        platformChoice = choosePlatform(clr->preferredPlatformVendor, ids, nPlatform);
    }
    else
    {
        platformChoice = 0;
    }

    if (platformChoice >= nPlatform)
    {
        mw_printf("Didn't find preferred platform\n");
        platformChoice = 0;
    }

    mw_printf("Using device %u on platform %u\n", clr->devNum, platformChoice);

    ci->plat = ids[platformChoice];
    devs = mwGetAllDevices(ci->plat, &nDev);
    if (!devs)
    {
        mw_printf("Error getting devices\n");
        free(ids);
        return MW_CL_ERROR;
    }

    rc = mwSelectDevice(ci, devs, clr, nDev);
    if (rc != CL_SUCCESS)
    {
        mwPerrorCL(rc, "Failed to select a device");
        rc = -1;
    }

    free(ids);
    free(devs);

    return rc;
}
/* Populate di with everything we care about for the device: identity
 * strings, capability flags, memory/work-group limits, and derived
 * vendor-specific values (warp/wavefront size, compute capability,
 * ALUs per CU, double-precision throughput fraction, CAL target).
 * Individual query failures accumulate into err and are reported once
 * at the end. */
cl_int mwGetDevInfo(DevInfo* di, cl_device_id dev)
{
    const AMDGPUData* amdData;
    cl_int err = CL_SUCCESS;

    di->devID = dev;

    /* Identity and capability queries; err accumulates across all of them */
    err |= clGetDeviceInfo(dev, CL_DEVICE_TYPE, sizeof(di->devType), &di->devType, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_NAME, sizeof(di->devName), di->devName, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_VENDOR, sizeof(di->vendor), di->vendor, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_VENDOR_ID, sizeof(cl_uint), &di->vendorID, NULL);
    err |= clGetDeviceInfo(dev, CL_DRIVER_VERSION, sizeof(di->driver), di->driver, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_VERSION, sizeof(di->version), di->version, NULL);
    //err |= clGetDeviceInfo(dev, CL_DEVICE_OPENCL_C_VERSION, sizeof(di->clCVer), di->clCVer, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_ENDIAN_LITTLE, sizeof(cl_bool), &di->littleEndian, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(cl_bool), &di->errCorrect, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &di->imgSupport, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_ADDRESS_BITS, sizeof(cl_uint), &di->addrBits, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &di->maxCompUnits, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(cl_uint), &di->clockFreq, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &di->memSize, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &di->maxMemAlloc, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(cl_ulong), &di->gMemCache, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cl_uint), &di->cachelineSize, NULL);
    //err |= clGetDeviceInfo(dev, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_ulong), &unifiedMem, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(cl_device_local_mem_type), &di->localMemType, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(cl_device_fp_config), &di->doubleFPConfig, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(cl_device_fp_config), &di->floatFPConfig, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &di->localMemSize, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_ARGS, sizeof(cl_uint), &di->maxConstArgs, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(cl_ulong), &di->maxConstBufSize, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof(size_t), &di->maxParamSize, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &di->maxWorkGroupSize, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &di->maxWorkItemDim, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(di->maxWorkItemSizes), di->maxWorkItemSizes, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &di->memBaseAddrAlign, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, sizeof(cl_uint), &di->minAlignSize, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(size_t), &di->timerRes, NULL);
    err |= clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, sizeof(di->exts), &di->exts, NULL);

    di->computeCapabilityMajor = di->computeCapabilityMinor = 0;
    di->warpSize = 0;
    if (err == CL_SUCCESS)
    {
        /* NVIDIA exposes warp size and compute capability through its
           device attribute query extension */
        if (strstr(di->exts, "cl_nv_device_attribute_query") != NULL)
        {
            err |= clGetDeviceInfo(dev, CL_DEVICE_WARP_SIZE_NV, sizeof(di->warpSize), &di->warpSize, NULL);
            err |= clGetDeviceInfo(di->devID, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &di->computeCapabilityMajor, NULL);
            err |= clGetDeviceInfo(di->devID, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), &di->computeCapabilityMinor, NULL);
        }
        else
        {
            if (di->devType == CL_DEVICE_TYPE_CPU)
            {
                di->warpSize = 1;
            }
            else if (di->devType == CL_DEVICE_TYPE_GPU)
            {
                /* FIXME: How do I get this on AMD?
                   It's 64 for all of
                 * the high end stuff, but 32 for lower. I think it's
                 * 64 for all the GPUs that do have doubles */
                di->warpSize = 64;
            }
            else
            {
                mw_printf("Unknown device type, using warp size = 1\n");
                di->warpSize = 1;
            }
        }
    }

    di->nonOutput = mwDeviceIsNonOutput(di);
    di->hasGraphicsQOS = mwDeviceHasGraphicsQOS(di);

    if (mwIsNvidiaGPUDevice(di))
    {
        di->aluPerCU = mwCUDACoresPerComputeUnit(di);
        di->doubleFrac = mwCUDAEstimateDoubleFrac(di);
        di->calTarget = MW_CAL_TARGET_INVALID;

        /* NOTE(review): this repeats the cl_nv_device_attribute_query
           block above; harmless but redundant */
        if (strstr(di->exts, "cl_nv_device_attribute_query") != NULL)
        {
            err |= clGetDeviceInfo(dev, CL_DEVICE_WARP_SIZE_NV, sizeof(di->warpSize), &di->warpSize, NULL);
            err |= clGetDeviceInfo(di->devID, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &di->computeCapabilityMajor, NULL);
            err |= clGetDeviceInfo(di->devID, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), &di->computeCapabilityMinor, NULL);
        }
    }
    else if (mwIsAMDGPUDevice(di))
    {
        /* AMD details come from a lookup table keyed on the device */
        amdData = mwLookupAMDGPUInfo(di);

        di->aluPerCU = amdData->aluPerCU;
        di->doubleFrac = amdData->doubleFrac;
        di->calTarget = amdData->target;
        di->warpSize = amdData->wavefrontSize;
    }

    if (di->warpSize == 0)
    {
        mw_printf("Unknown device type, using warp size = 1\n");
        di->warpSize = 1;
    }

    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error getting device information");
    }
    else
    {
        di->doubleExts = mwGetDoubleExts(di->exts);
    }

    return err;
}