// Wraps an already-created OpenCL context, its devices, and their command
// queues into a CLWContext.
//
// context       - raw OpenCL context to wrap.
// device        - array of numDevices raw device ids.
// commandQueues - array of numDevices raw command queues, one per device.
// numDevices    - number of entries in both arrays.
CLWContext CLWContext::Create(cl_context context, cl_device_id* device,
                              cl_command_queue* commandQueues, int numDevices)
{
    std::vector<CLWDevice> wrappedDevices;
    std::vector<CLWCommandQueue> wrappedQueues;
    wrappedDevices.reserve(numDevices);
    wrappedQueues.reserve(numDevices);

    for (int idx = 0; idx < numDevices; ++idx)
    {
        wrappedDevices.push_back(CLWDevice::Create(device[idx]));
        wrappedQueues.push_back(CLWCommandQueue::Create(commandQueues[idx]));
    }

    return CLWContext(context, wrappedDevices, wrappedQueues);
}
void OCLPerfDoubleDMA::run() { if (failed_) { return; } CPerfCounter timer; const int numQueues = (test_ % MaxQueues) + 1; const bool useKernel = ((test_ / MaxQueues) > 0); const int numBufs = numQueues; Profile profile(isProfilingEnabled_, numQueues); std::vector<cl_command_queue> cmdQueues(numQueues); int q; cl_command_queue_properties qProp = (isProfilingEnabled_) ? CL_QUEUE_PROFILING_ENABLE : 0; for (q = 0; q < numQueues; ++q) { cl_command_queue cmdQueue = clCreateCommandQueue( context_, devices_[deviceId_], qProp, &error_); CHECK_RESULT((error_), "clCreateCommandQueue() failed"); cmdQueues[q] = cmdQueue; } float *Data_s = (float*)clEnqueueMapBuffer(cmdQueues[0], buffers_[numBufs], CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, size_S, 0, NULL, NULL, &error_); CHECK_RESULT((error_), "clEnqueueMapBuffer failed"); memset(Data_s, 1, size_S); size_t gws[1] = { size_s / (4 * sizeof(float)) }; size_t lws[1] = { 256 }; // Warm-up for (q = 0; q < numQueues; ++q) { error_ |= clEnqueueWriteBuffer(cmdQueues[q], buffers_[q], CL_FALSE, 0, size_s, (char*)Data_s, 0, NULL, NULL); error_ |= clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*) &buffers_[q]); error_ |= clEnqueueNDRangeKernel(cmdQueues[q], kernel_, 1, NULL, gws, lws, 0, NULL, NULL); error_ |= clEnqueueReadBuffer(cmdQueues[q], buffers_[q], CL_FALSE, 0, size_s, (char*)Data_s, 0, NULL, NULL); error_ |= clFinish(cmdQueues[q]); } size_t s_done = 0; cl_event r[MaxQueues] = {0}, w[MaxQueues] = {0}, x[MaxQueues] = {0}; /*---------- pass2: copy Data_s to and from GPU Buffers ----------*/ s_done = 0; timer.Reset(); timer.Start(); int idx = numBufs - 1; // Start from the last so read/write won't go to the same DMA when kernel is executed q = numQueues - 1; size_t iter = 0; while( 1 ) { if (0 == r[idx]) { error_ |= clEnqueueWriteBuffer(cmdQueues[q], buffers_[idx], CL_FALSE, 0, size_s, (char*)Data_s+s_done, 0, NULL, &w[idx]); } else { error_ |= clEnqueueWriteBuffer(cmdQueues[q], buffers_[idx], CL_FALSE, 0, size_s, (char*)Data_s+s_done, 
1, &r[idx], &w[idx]); if (!isProfilingEnabled_) { error_ |= clReleaseEvent(r[idx]); } } profile.addEvent(q, ProfileQueue::Write, w[idx]); if (useKernel) { // Change the queue ++q %= numQueues; // Implicit flush of DMA engine on kernel start, because memory dependency error_ |= clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*) &buffers_[idx]); error_ |= clEnqueueNDRangeKernel(cmdQueues[q], kernel_, 1, NULL, gws, lws, 1, &w[idx], &x[idx]); if (!isProfilingEnabled_) { error_ |= clReleaseEvent(w[idx]); } profile.addEvent(q, ProfileQueue::Execute, x[idx]); } // Change the queue ++q %= numQueues; error_ |= clEnqueueReadBuffer(cmdQueues[q], buffers_[idx], CL_FALSE, 0, size_s, (char*)Data_s+s_done, 1, (useKernel) ? &x[idx] : &w[idx], &r[idx]); if (!isProfilingEnabled_) { error_ |= clReleaseEvent((useKernel) ? x[idx] : w[idx]); } profile.addEvent(q, ProfileQueue::Read, r[idx]); if ((s_done += size_s) >= size_S) { if (!isProfilingEnabled_) { error_ |= clReleaseEvent(r[idx]); } break; } ++iter; ++idx %= numBufs; ++q %= numQueues; } for (q = 0; q < numQueues; ++q) { error_ |= clFinish(cmdQueues[q]); } timer.Stop(); error_ = clEnqueueUnmapMemObject(cmdQueues[0], buffers_[numBufs], Data_s, 0, NULL, NULL); error_ |= clFinish(cmdQueues[0]); CHECK_RESULT((error_), "Execution failed"); cl_long gpuTimeFrame = profile.findExecTime(); cl_long oneIter = gpuTimeFrame / iter; // Display 4 iterations in the middle cl_long startFrame = oneIter * (iter/2 - 2); cl_long finishFrame = oneIter * (iter/2 + 2); profile.display(startFrame, finishFrame); for (q = 0; q < numQueues; ++q) { error_ = clReleaseCommandQueue(cmdQueues[q]); CHECK_RESULT((error_), "clReleaseCommandQueue() failed"); } double GBytes = (double)(2*size_S)/(double)(1024*1024*1024); std::stringstream stream; if (useKernel) { stream << "Write/Kernel/Read operation "; } else { stream << "Write/Read operation "; } stream << numQueues << " queue; profiling " << ((isProfilingEnabled_) ? "enabled" : "disabled"); stream << ((useUHP_) ? 
" using UHP" : " using AHP") << ": "; stream.flags(std::ios::right | std::ios::showbase); std::cout << stream.str() << static_cast<float>(GBytes / timer.GetElapsedTime()) << " GB/s\n"; }