/* Return CL_TRUE on error */ static cl_bool setSummarizationWorkgroupSize(const CLInfo* ci) { size_t maxGroupSize; size_t groupSize; size_t nextMultiple; cl_int err; err = clGetKernelWorkGroupInfo(_summarizationKernel, ci->dev, CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxGroupSize), &maxGroupSize, NULL); if (err != CL_SUCCESS) { return CL_TRUE; } if (maxGroupSize == 1) { /* Seems to be a problem on OS X CPU implementation */ mw_printf("Workgroup size of 1 for summarization is not acceptable\n"); return CL_TRUE; } /* OpenCL 1.1 has CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE * which we probably should be using if available */ nextMultiple = mwNextMultiple(64, ci->di.warpSize); if (nextMultiple <= maxGroupSize) { groupSize = nextMultiple; } else { groupSize = maxGroupSize; } if (groupSize > 128) /* Just in case */ { groupSize = 128; } _summarizationWorkgroupSize = groupSize; return CL_FALSE; }
static cl_int runSummarization(CLInfo* ci, SeparationCLMem* cm, const IntegralArea* ia, cl_uint which, Kahan* resultOut) { cl_int err = CL_SUCCESS; cl_mem buf; cl_uint offset; size_t global[1]; size_t local[1]; real result[2] = { -1.0, -1.0 }; cl_uint nElements = ia->r_steps * ia->mu_steps; cl_mem sumBufs[2] = { cm->summarizationBufs[0], cm->summarizationBufs[1] }; if (which == 0) { buf = cm->outBg; offset = 0; } else { buf = cm->outStreams; offset = (which - 1) * nElements; } /* First call reads from an offset into one of the output buffers */ err |= clSetKernelArg(_summarizationKernel, 0, sizeof(cl_mem), &sumBufs[0]); err |= clSetKernelArg(_summarizationKernel, 1, sizeof(cl_mem), &buf); err |= clSetKernelArg(_summarizationKernel, 2, sizeof(cl_uint), &nElements); err |= clSetKernelArg(_summarizationKernel, 3, sizeof(cl_uint), &offset); if (err != CL_SUCCESS) { mwPerrorCL(err, "Error setting summarization kernel arguments"); return err; } local[0] = _summarizationWorkgroupSize; global[0] = mwNextMultiple(local[0], nElements); err = clEnqueueNDRangeKernel(ci->queue, _summarizationKernel, 1, NULL, global, local, 0, NULL, NULL); if (err != CL_SUCCESS) { mwPerrorCL(err, "Error enqueuing summarization kernel"); return err; } /* Why is this necessary? It seems to frequently break on the 7970 and nowhere else without it */ err = clFinish(ci->queue); //err = clFlush(ci->queue); if (err != CL_SUCCESS) { mwPerrorCL(err, "Error finishing summarization kernel"); return err; } /* Later calls swap between summarization buffers without an offset */ nElements = (cl_uint) mwDivRoundup(global[0], local[0]); offset = 0; err |= clSetKernelArg(_summarizationKernel, 3, sizeof(cl_uint), &offset); if (err != CL_SUCCESS) { mwPerrorCL(err, "Error setting summarization kernel offset argument"); return err; } while (nElements > 1) { /* Swap old summarization buffer to the input and shrink the range */ swapBuffers(sumBufs); global[0] = mwNextMultiple(local[0], nElements); err |= clSetKernelArg(_summarizationKernel, 0, sizeof(cl_mem), &sumBufs[0]); err |= clSetKernelArg(_summarizationKernel, 1, sizeof(cl_mem), &sumBufs[1]); err |= clSetKernelArg(_summarizationKernel, 2, sizeof(cl_uint), &nElements); if (err != CL_SUCCESS) { mwPerrorCL(err, "Error setting summarization kernel arguments"); return err; } /* err = clEnqueueBarrier(ci->queue); if (err != CL_SUCCESS) { mwPerrorCL(err, "Error enqueuing summarization barrier"); return err; } */ err = clEnqueueNDRangeKernel(ci->queue, _summarizationKernel, 1, NULL, global, local, 0, NULL, NULL); if (err != CL_SUCCESS) { mwPerrorCL(err, "Error enqueuing summarization kernel"); return err; } err = clFinish(ci->queue); if (err != CL_SUCCESS) { mwPerrorCL(err, "Error finishing summarization kernel"); return err; } nElements = (cl_uint) mwDivRoundup(global[0], local[0]); } err = clEnqueueBarrier(ci->queue); if (err != CL_SUCCESS) { mwPerrorCL(err, "Error enqueuing summarization barrier"); return err; } err = clEnqueueReadBuffer(ci->queue, sumBufs[0], CL_TRUE, 0, 2 * sizeof(real), result, 0, NULL, NULL); if (err != CL_SUCCESS) { mwPerrorCL(err, "Error reading summarization result buffer"); return err; } resultOut->sum = result[0]; resultOut->correction = result[1]; return CL_SUCCESS; }