Ejemplo n.º 1
0
// Factory: wraps an existing cl_context plus its devices and command queues
// into a CLWContext. The caller supplies parallel arrays of numDevices raw
// device handles and command-queue handles.
CLWContext CLWContext::Create(cl_context context, cl_device_id* device, cl_command_queue* commandQueues, int numDevices)
{
    std::vector<CLWDevice> wrappedDevices;
    std::vector<CLWCommandQueue> wrappedQueues;
    wrappedDevices.reserve(numDevices);
    wrappedQueues.reserve(numDevices);

    // Wrap each raw handle pair in its corresponding RAII wrapper type.
    for (int idx = 0; idx < numDevices; ++idx)
    {
        wrappedDevices.push_back(CLWDevice::Create(device[idx]));
        wrappedQueues.push_back(CLWCommandQueue::Create(commandQueues[idx]));
    }

    return CLWContext(context, wrappedDevices, wrappedQueues);
}
Ejemplo n.º 2
0
// Runs one configuration of the double-DMA benchmark: streams a large mapped
// host buffer (buffers_[numBufs], size_S bytes) to and from size_s-byte
// device buffers in a pipelined loop across 1..MaxQueues command queues,
// optionally inserting a kernel between each write and read, then prints the
// achieved bandwidth in GB/s. Errors accumulate into error_ and are checked
// via CHECK_RESULT.
void
OCLPerfDoubleDMA::run()
{
    // Bail out if an earlier stage of the test already failed.
    if (failed_) {
        return;
    }
    CPerfCounter timer;
    // test_ encodes the configuration: the low part selects the number of
    // queues (1..MaxQueues); the high part selects the Write/Kernel/Read
    // variant instead of plain Write/Read.
    const int   numQueues = (test_ % MaxQueues) + 1;
    const bool  useKernel = ((test_ / MaxQueues) > 0);
    // One device buffer per queue, cycled through by the pipeline below.
    const int   numBufs = numQueues;
    Profile     profile(isProfilingEnabled_, numQueues);

    // Create one command queue per pipeline stage, with profiling enabled
    // when event timestamps will be collected.
    std::vector<cl_command_queue> cmdQueues(numQueues);
    int q;
    cl_command_queue_properties qProp = (isProfilingEnabled_) ? CL_QUEUE_PROFILING_ENABLE : 0;
    for (q = 0; q < numQueues; ++q) {
        cl_command_queue cmdQueue = clCreateCommandQueue(
            context_, devices_[deviceId_], qProp, &error_);
        CHECK_RESULT((error_), "clCreateCommandQueue() failed");
        cmdQueues[q] = cmdQueue;
    }
    
    // Map the large staging buffer (the last entry in buffers_) into host
    // memory (blocking map) and fill it with a known byte pattern.
    float *Data_s = (float*)clEnqueueMapBuffer(cmdQueues[0],
        buffers_[numBufs], CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, size_S, 0, NULL, NULL, &error_);
    CHECK_RESULT((error_), "clEnqueueMapBuffer failed");
	memset(Data_s, 1, size_S);
    // Kernel geometry: one work-item per float4 of a size_s-byte buffer,
    // with a fixed workgroup size of 256.
    size_t  gws[1] = { size_s / (4 * sizeof(float)) };
    size_t  lws[1] = { 256 };

    // Warm-up: run one write/kernel/read round per queue so the timed pass
    // below excludes first-use costs (lazy allocation, kernel setup, etc.).
    for (q = 0; q < numQueues; ++q) {
        error_ |= clEnqueueWriteBuffer(cmdQueues[q],
            buffers_[q], CL_FALSE, 0, size_s, (char*)Data_s, 0, NULL, NULL);
        error_ |= clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*) &buffers_[q]);
        error_ |= clEnqueueNDRangeKernel(cmdQueues[q],
            kernel_, 1, NULL, gws, lws, 0, NULL, NULL);
        error_ |= clEnqueueReadBuffer(cmdQueues[q],
            buffers_[q], CL_FALSE, 0, size_s, (char*)Data_s, 0, NULL, NULL);
        error_ |= clFinish(cmdQueues[q]);
    }

    size_t s_done = 0;
    // Per-buffer event handles linking the pipeline stages:
    // w = write, x = kernel execute, r = read-back.
    cl_event r[MaxQueues] = {0}, w[MaxQueues] = {0}, x[MaxQueues] = {0};

    /*----------  pass2:  copy Data_s to and from GPU Buffers ----------*/
    s_done = 0;
    timer.Reset();
    timer.Start();
    int idx = numBufs - 1;
    // Start from the last so read/write won't go to the same DMA when kernel is executed
    q = numQueues - 1;
    size_t iter = 0;
    while( 1 )  {
        // Write the next size_s chunk of the staging buffer into device
        // buffer idx. After the first lap r[idx] is the previous read of
        // this buffer; wait on it so the write can't overwrite data still
        // being read back.
        if (0 == r[idx]) {
            error_ |= clEnqueueWriteBuffer(cmdQueues[q],
                buffers_[idx], CL_FALSE, 0, size_s, (char*)Data_s+s_done, 0, NULL, &w[idx]);
        }
        else {
            error_ |= clEnqueueWriteBuffer(cmdQueues[q],
                buffers_[idx], CL_FALSE, 0, size_s, (char*)Data_s+s_done, 1, &r[idx], &w[idx]);
            // When profiling, events stay alive for Profile to read their
            // timestamps; otherwise release as soon as they are consumed.
            if (!isProfilingEnabled_) { 
                error_ |= clReleaseEvent(r[idx]);
            }
        }
        profile.addEvent(q, ProfileQueue::Write, w[idx]);

        if (useKernel) {
            // Change the queue
            // (round-robin: advance q, wrapping at numQueues)
            ++q %= numQueues;
            // Implicit flush of DMA engine on kernel start, because memory dependency
            error_ |= clSetKernelArg(kernel_, 0, sizeof(cl_mem), (void*) &buffers_[idx]);
            error_ |= clEnqueueNDRangeKernel(cmdQueues[q],
                kernel_, 1, NULL, gws, lws, 1, &w[idx], &x[idx]);
            if (!isProfilingEnabled_) { 
                error_ |= clReleaseEvent(w[idx]);
            }
            profile.addEvent(q, ProfileQueue::Execute, x[idx]);
        }

        // Change the queue
        ++q %= numQueues;
        // Read the chunk back, depending on the kernel (if used) or the
        // write; r[idx] becomes this buffer's dependency for the next lap.
        error_ |= clEnqueueReadBuffer(cmdQueues[q],
            buffers_[idx], CL_FALSE, 0, size_s, (char*)Data_s+s_done, 1,
            (useKernel) ? &x[idx] : &w[idx], &r[idx]);
        if (!isProfilingEnabled_) { 
            error_ |= clReleaseEvent((useKernel) ? x[idx] : w[idx]);
        }
        profile.addEvent(q, ProfileQueue::Read, r[idx]);

        // Stop once the whole staging buffer has been transferred.
        if ((s_done += size_s) >= size_S) {
            if (!isProfilingEnabled_) { 
                error_ |= clReleaseEvent(r[idx]);
            }
            break;
        }
        // Note: iter counts completed (non-final) laps, so it is one less
        // than the number of chunks transferred.
        ++iter;
        ++idx %= numBufs;
        ++q %= numQueues;
    }

    // Drain every queue before stopping the timer so all async work is
    // included in the measurement.
    for (q = 0; q < numQueues; ++q) {
        error_ |= clFinish(cmdQueues[q]);
    }
    timer.Stop();

    // NOTE(review): plain '=' here discards any error accumulated by the
    // clFinish loop above — confirm this reset is intentional.
    error_ = clEnqueueUnmapMemObject(cmdQueues[0],
        buffers_[numBufs], Data_s, 0, NULL, NULL);

    error_ |= clFinish(cmdQueues[0]);
    CHECK_RESULT((error_), "Execution failed");

    // NOTE(review): divides by iter, which is 0 when the loop ran a single
    // chunk (size_S <= size_s) — presumably size_S >> size_s; verify.
    cl_long gpuTimeFrame = profile.findExecTime();
    cl_long oneIter = gpuTimeFrame / iter;

    // Display 4 iterations in the middle
    cl_long startFrame = oneIter * (iter/2 - 2);
    cl_long finishFrame = oneIter * (iter/2 + 2);
    profile.display(startFrame, finishFrame);

    for (q = 0; q < numQueues; ++q) {
        error_ = clReleaseCommandQueue(cmdQueues[q]);
        CHECK_RESULT((error_), "clReleaseCommandQueue() failed");
    }

    // Total bytes moved = size_S written + size_S read back.
    double GBytes = (double)(2*size_S)/(double)(1024*1024*1024);

    // Build the result line describing the configuration that was measured.
    std::stringstream stream;
    if (useKernel) {
        stream << "Write/Kernel/Read operation ";
    }
    else {
        stream << "Write/Read operation ";
    }
    stream << numQueues << " queue; profiling " <<
        ((isProfilingEnabled_) ? "enabled" : "disabled");

	stream << ((useUHP_) ? " using UHP" : " using AHP") << ": "; 
    
    stream.flags(std::ios::right | std::ios::showbase);
    std::cout << stream.str() << static_cast<float>(GBytes / timer.GetElapsedTime()) << " GB/s\n";
}