Beispiel #1
0
/**
 * Launches `kernel` repeatedly and returns the average time of one launch.
 *
 * Two untimed launches are issued first. After that `iters` launches are
 * timed: when `useEventTimer` is set, every launch is waited for and measured
 * individually through its event (timeInUS) and the results are summed;
 * otherwise one host-side Timer brackets the whole batch, with a single
 * finish() at the end.
 *
 * @param queue       queue the launches are enqueued on
 * @param kernel      kernel to launch (arguments must already be set)
 * @param globalSize  global work size used for every launch
 * @param localSize   work-group size used for every launch
 * @param iters       number of timed launches
 * @return accumulated time divided by `iters` (presumably microseconds, as the
 *         timeInUS name suggests -- confirm against Timer::stopAndTime)
 */
float clPeak::run_kernel(cl::CommandQueue &queue, cl::Kernel &kernel, cl::NDRange &globalSize, cl::NDRange &localSize, int iters)
{
    // Untimed warm-up launches
    for(int warm = 0; warm < 2; ++warm)
    {
        queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, localSize);
    }
    queue.finish();

    float total = 0;

    if(!useEventTimer)
    {
        // Host timer around the entire batch
        Timer hostTimer;

        hostTimer.start();
        int launched = 0;
        while(launched < iters)
        {
            queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, localSize);
            ++launched;
        }
        queue.finish();
        total = hostTimer.stopAndTime();
    }
    else
    {
        // One event per launch; each launch completes before the next starts
        int launched = 0;
        while(launched < iters)
        {
            cl::Event launchEvent;

            queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, localSize, NULL, &launchEvent);
            queue.finish();
            total += timeInUS(launchEvent);
            ++launched;
        }
    }

    return total / iters;
}
Beispiel #2
0
void procOCL_I2I(int texIn, int texOut, int w, int h)
{
    if(!haveOpenCL) return;

    LOGD("procOCL_I2I(%d, %d, %d, %d)", texIn, texOut, w, h);
    cl::ImageGL imgIn (theContext, CL_MEM_READ_ONLY,  GL_TEXTURE_2D, 0, texIn);
    cl::ImageGL imgOut(theContext, CL_MEM_WRITE_ONLY, GL_TEXTURE_2D, 0, texOut);
    std::vector < cl::Memory > images;
    images.push_back(imgIn);
    images.push_back(imgOut);

    int64_t t = getTimeMs();
    theQueue.enqueueAcquireGLObjects(&images);
    theQueue.finish();
    LOGD("enqueueAcquireGLObjects() costs %d ms", getTimeInterval(t));

    t = getTimeMs();
    cl::Kernel Laplacian(theProgI2I, "Laplacian"); //TODO: may be done once
    Laplacian.setArg(0, imgIn);
    Laplacian.setArg(1, imgOut);
    theQueue.finish();
    LOGD("Kernel() costs %d ms", getTimeInterval(t));

    t = getTimeMs();
    theQueue.enqueueNDRangeKernel(Laplacian, cl::NullRange, cl::NDRange(w, h), cl::NullRange);
    theQueue.finish();
    LOGD("enqueueNDRangeKernel() costs %d ms", getTimeInterval(t));

    t = getTimeMs();
    theQueue.enqueueReleaseGLObjects(&images);
    theQueue.finish();
    LOGD("enqueueReleaseGLObjects() costs %d ms", getTimeInterval(t));
}
Beispiel #3
0
/**
 * Measures the kernel launch latency: the gap between the QUEUED and START
 * profiling timestamps of a kernel launch, averaged over
 * devInfo.kernelLatencyIters launches.
 *
 * The result is printed and recorded in the XML log in microseconds.
 * Skipped (returns 0) when isKernelLatency is false.
 *
 * @param queue    command queue to launch on; its context is used for the buffers
 * @param prog     program providing "global_bandwidth_v1_local_offset"
 * @param devInfo  supplies maxWGSize, numCUs and kernelLatencyIters
 * @return 0 on success or when skipped, -1 when an OpenCL error was caught
 */
int clPeak::runKernelLatency(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo)
{
    if(!isKernelLatency)
        return 0;

    cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>();
    cl_uint numItems = (devInfo.maxWGSize) * (devInfo.numCUs) * FETCH_PER_WI;
    cl::NDRange globalSize = (numItems / FETCH_PER_WI);
    cl::NDRange localSize = devInfo.maxWGSize;
    int iters = devInfo.kernelLatencyIters;
    float latency;

    try
    {
        log->print(NEWLINE TAB TAB "Kernel launch latency : ");
        log->xmlOpenTag("kernel_launch_latency");
        log->xmlAppendAttribs("unit", "us");

        cl::Buffer inputBuf = cl::Buffer(ctx, CL_MEM_READ_ONLY, (numItems * sizeof(float)));
        cl::Buffer outputBuf = cl::Buffer(ctx, CL_MEM_WRITE_ONLY, (numItems * sizeof(float)));

        cl::Kernel kernel_v1(prog, "global_bandwidth_v1_local_offset");
        kernel_v1.setArg(0, inputBuf), kernel_v1.setArg(1, outputBuf);

        // Dummy calls
        queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize);
        queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize);
        queue.finish();

        latency = 0;
        for(int i=0; i<iters; i++)
        {
            cl::Event timeEvent;
            queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize, NULL, &timeEvent);
            queue.finish();

            // Profiling counters are 64-bit nanosecond values. Take the
            // difference in 64 bits first and only then scale to microseconds;
            // truncating each timestamp to int before subtracting can wrap and
            // throws away the sub-microsecond part.
            const cl_ulong queuedNs = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>();
            const cl_ulong startNs = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>();
            const long long deltaNs = static_cast<long long>(startNs - queuedNs);
            latency += static_cast<float>(deltaNs) / 1000.0f;
        }
        latency /= iters;

        log->print(latency);    log->print(" us" NEWLINE);
        log->xmlSetContent(latency);
        log->xmlCloseTag();
    }
    catch(const cl::Error &error)
    {
        // Format the report through a stream: adding the integer error code to
        // the NEWLINE literal would be pointer arithmetic, not concatenation.
        std::stringstream ss;
        ss << error.what() << " (" << error.err() << ")" NEWLINE
           << TAB TAB "Tests skipped" NEWLINE;
        log->print(ss.str());
        return -1;
    }

    return 0;
}
Beispiel #4
0
    void updateParticles(float timeDelta)
    {
        try
        {
            vector<cl::Memory> glBuffers;
            glBuffers.push_back(m_positions);
            glBuffers.push_back(m_colors);
            
            //this will update our system by calculating new velocity and updating the positions of our particles
            //Make sure OpenGL is done using our VBOs
            glFinish();
            
            // map OpenGL buffer object for writing from OpenCL
            // this passes in the vector of VBO buffer objects (position and color)
            m_queue.enqueueAcquireGLObjects(&glBuffers);
            
            m_particleKernel.setArg(5, timeDelta); //pass in the timestep
            
            //execute the kernel
            m_queue.enqueueNDRangeKernel(m_particleKernel, cl::NullRange, cl::NDRange(m_numParticles),
                                         cl::NullRange);
            //Release the VBOs so OpenGL can play with them
            m_queue.enqueueReleaseGLObjects(&glBuffers, NULL);

            m_queue.finish();
        }
        catch(cl::Error &error)
        {
            LOG_ERROR << error.what() << "(" << oclErrorString(error.err()) << ")";
        }
    }
Beispiel #5
0
/**
 * Processes a GL texture in place through OpenCV's OpenCL (UMat) path.
 *
 * Steps, each timed and logged with LOGD:
 *   1. wrap the texture as a read-only cl::ImageGL, acquire it and convert it
 *      into a cv::UMat;
 *   2. run cv::Laplacian and scale the result by 10;
 *   3. wrap the same texture as a write-only cl::ImageGL, acquire it and copy
 *      the result buffer into it on OpenCV's default queue.
 *
 * @param tex  GL texture id used as both input and output
 * @param w    width of the region copied back into the texture
 * @param h    height of the region copied back into the texture
 */
void procOCL_OCV(int tex, int w, int h)
{
    int64_t t = getTimeMs();
    cl::ImageGL imgIn (theContext, CL_MEM_READ_ONLY,  GL_TEXTURE_2D, 0, tex);
    std::vector < cl::Memory > images(1, imgIn);
    theQueue.enqueueAcquireGLObjects(&images);
    theQueue.finish();
    cv::UMat uIn, uOut, uTmp;
    cv::ocl::convertFromImage(imgIn(), uIn);
    LOGD("loading texture data to OpenCV UMat costs %d ms", getTimeInterval(t));
    theQueue.enqueueReleaseGLObjects(&images);

    t = getTimeMs();
    //cv::blur(uIn, uOut, cv::Size(5, 5));
    cv::Laplacian(uIn, uTmp, CV_8U);
    // Was "cv:multiply": a label named cv followed by an unqualified call.
    cv::multiply(uTmp, 10, uOut);
    cv::ocl::finish();
    LOGD("OpenCV processing costs %d ms", getTimeInterval(t));

    t = getTimeMs();
    cl::ImageGL imgOut(theContext, CL_MEM_WRITE_ONLY, GL_TEXTURE_2D, 0, tex);
    images.clear();
    images.push_back(imgOut);
    theQueue.enqueueAcquireGLObjects(&images);
    cl_mem clBuffer = (cl_mem)uOut.handle(cv::ACCESS_READ);
    cl_command_queue q = (cl_command_queue)cv::ocl::Queue::getDefault().ptr();
    size_t offset = 0;
    size_t origin[3] = { 0, 0, 0 };
    // Explicit casts: brace-initializing size_t from int is a narrowing conversion.
    size_t region[3] = { static_cast<size_t>(w), static_cast<size_t>(h), 1 };
    // NOTE(review): the acquire is enqueued on theQueue while the copy runs on
    // OpenCV's default queue -- confirm these are the same queue or otherwise
    // ordered with respect to each other.
    CV_Assert(clEnqueueCopyBufferToImage (q, clBuffer, imgOut(), offset, origin, region, 0, NULL, NULL) == CL_SUCCESS);
    theQueue.enqueueReleaseGLObjects(&images);
    cv::ocl::finish();
    LOGD("uploading results to texture costs %d ms", getTimeInterval(t));
}
void simulationStep() {
    try {
        // copy
        auto buffer = cl::Buffer(context, CL_MEM_READ_ONLY,
                                 sizeof(unsigned char) * 4 * fieldWidth * fieldHeight,
                                 nullptr, nullptr);
        queue.enqueueWriteBuffer(buffer, CL_TRUE, 0,
                                 sizeof(unsigned char) * 4 * fieldWidth * fieldHeight,
                                 visualizationBufferCPU, NULL, NULL);

        // enque
        stepKernel.setArg(2, buffer);
        cl::NDRange global((size_t) (fieldWidth * fieldHeight));
        queue.enqueueNDRangeKernel(stepKernel, cl::NullRange, global, cl::NullRange);

        // read back
        queue.enqueueReadBuffer(visualizationBufferGPU, CL_TRUE, 0,
                                sizeof(unsigned char) * 4 * fieldWidth * fieldHeight,
                                visualizationBufferCPU, NULL, NULL);

        // finish
        queue.finish();
    } catch (cl::Error err) {
        std::cout << "Error: " << err.what() << "(" << err.err() << ")" << std::endl;
        exit(3);
    }

    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, fieldWidth, fieldHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE,
                 visualizationBufferCPU);
}
/**
 * Renders one frame: updates the pinhole camera buffer, runs the kernel over
 * the image with the shared GL image acquired, prints the kernel execution
 * time taken from the profiling event, and draws the resulting texture to the
 * window.
 */
void PTWeekend::draw()
{
    /*
     * BEGIN - each frame part
     */

    // Build the camera frame from the current field of view and aspect ratio.
    glm::vec3 origin,lower_left, hor, ver;

    float theta = camera.getFov() * M_PI / 180.0f;   // field of view in radians
    float half_height = tan(theta / 2.0f);
    float half_width = camera.getAspectRatio() * half_height;

    origin = camera.getEyePoint();
    glm::vec3 u, v, w;

    // w is the negated, normalized view direction; u and v complete the basis
    // using (0,1,0) as the up hint.
    w = -glm::normalize(camera.getViewDirection());
    u = glm::normalize(glm::cross(glm::vec3(0,1,0), w));
    v = glm::cross(w, u);

    lower_left = origin - half_width * u - half_height * v - w;
    hor = 2.0f * half_width * u;
    ver = 2.0f * half_height * v;

    pt_assert(cl_set_pinhole_cam_arg(origin, lower_left, hor, ver, cam_buffer, cmd_queue), "Could not fill camera buffer");

    clStatus = cmd_queue.enqueueAcquireGLObjects(&img_buffer, NULL, NULL);
    pt_assert(clStatus, "Could not acquire gl objects");

    /* Enqueue kernel for execution */
    cl::Event profiling_evt;

    clStatus = cmd_queue.enqueueNDRangeKernel(kernel,
                                              cl::NDRange(0,0),
                                              cl::NDRange(img_width, img_height),
                                              cl::NDRange(local_width,local_height),
                                              NULL,
                                              &profiling_evt);
    // Check the enqueue result before touching the event: if the enqueue
    // failed, the event was never set and waiting on it is itself an error.
    // NOTE(review): this assumes pt_assert stops execution on failure -- confirm.
    pt_assert(clStatus, "Could not enqueue the kernel");
    profiling_evt.wait();

    clStatus = cmd_queue.enqueueReleaseGLObjects(&img_buffer, NULL, NULL);
    pt_assert(clStatus, "Could not release gl objects");
    cmd_queue.finish();

    // Profiling counters are in nanoseconds; scale to milliseconds for output.
    cl_ulong time_start = profiling_evt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
    cl_ulong time_end = profiling_evt.getProfilingInfo<CL_PROFILING_COMMAND_END>();
    cl_ulong total_time = time_end - time_start;
    std::cout << "Total time: " << total_time * 0.001 * 0.001 << " ms \n";

    /*
     * END - each frame part
     */

    gl::draw(imgTex, Rectf(0, 0, getWindowWidth(), getWindowHeight()));
}
/**
 * Enqueues a marker command on `queue`, waits for the queue to drain and
 * returns the marker's event.
 *
 * The deprecated enqueueMarker is used when OpenCL 1.2 is unavailable or the
 * deprecated 1.1 APIs are explicitly enabled; otherwise
 * enqueueMarkerWithWaitList is used with no wait list.
 *
 * @param queue  command queue to place the marker on
 * @return event associated with the marker
 */
cl::Event RuntimeMeasurementsManager::enqueueNewMarker(cl::CommandQueue queue) {
    cl::Event event;
#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
    // Use deprecated API
    queue.enqueueMarker(&event);
#else
    // The terminating semicolon was missing, so this branch did not compile.
    queue.enqueueMarkerWithWaitList(NULL, &event);
#endif
    queue.finish();

    return event;
}
Beispiel #9
0
/**
 * Measures host <-> device transfer bandwidth for one buffer and reports each
 * result through the logger (text and XML).
 *
 * Tests, in order: enqueueWriteBuffer, enqueueReadBuffer, enqueueMapBuffer for
 * read, memcpy from the mapped pointer, enqueueUnmapMemObject after a write
 * mapping, and memcpy to the mapped pointer. Each test runs
 * devInfo.transferBWIters iterations and reports the average. The write, read,
 * map and unmap tests use event timing when useEventTimer is set and a host
 * Timer otherwise; the two memcpy tests always use the host Timer.
 *
 * Skipped (returns 0) when isTransferBW is false.
 *
 * @param queue    command queue used for all transfers
 * @param prog     not used by this test
 * @param devInfo  supplies maxAllocSize, deviceType and transferBWIters
 * @return 0 on success or when skipped, -1 when an OpenCL error was caught
 */
int clPeak::runTransferBandwidthTest(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo)
{
  if(!isTransferBW)
    return 0;

  float timed, gbps;
  cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>();
  int iters = devInfo.transferBWIters;
  Timer timer;          // host timer shared by the two memcpy tests
  float *arr = NULL;

  cl_uint maxItems = devInfo.maxAllocSize / sizeof(float) / 2;
  cl_uint numItems;

  // Set an upper limit for CPU devices
  if(devInfo.deviceType & CL_DEVICE_TYPE_CPU) {
    numItems = roundToPowOf2(maxItems, 26);
  } else {
    numItems = roundToPowOf2(maxItems);
  }

  // Size in bytes of every transfer below
  const size_t numBytes = numItems * sizeof(float);

  try
  {
    arr = new float[numItems];
    cl::Buffer clBuffer = cl::Buffer(ctx, (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), numBytes);

    log->print(NEWLINE TAB TAB "Transfer bandwidth (GBPS)" NEWLINE);
    log->xmlOpenTag("transfer_bandwidth");
    log->xmlAppendAttribs("unit", "gbps");

    ///////////////////////////////////////////////////////////////////////////
    // enqueueWriteBuffer
    log->print(TAB TAB TAB "enqueueWriteBuffer         : ");

    // Dummy warm-up
    queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, numBytes, arr);
    queue.finish();

    timed = 0;

    if(useEventTimer)
    {
      for(int i=0; i<iters; i++)
      {
        cl::Event timeEvent;
        queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, numBytes, arr, NULL, &timeEvent);
        queue.finish();
        timed += timeInUS(timeEvent);
      }
    } else
    {
      Timer batchTimer;

      batchTimer.start();
      for(int i=0; i<iters; i++)
      {
        queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, numBytes, arr);
      }
      queue.finish();
      timed = batchTimer.stopAndTime();
    }
    timed /= iters;

    // bytes / time / 1e3 (GB/s if `timed` is in microseconds -- TODO confirm)
    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
    log->print(gbps);   log->print(NEWLINE);
    log->xmlRecord("enqueuewritebuffer", gbps);
    ///////////////////////////////////////////////////////////////////////////
    // enqueueReadBuffer
    log->print(TAB TAB TAB "enqueueReadBuffer          : ");

    // Dummy warm-up
    queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, numBytes, arr);
    queue.finish();

    timed = 0;
    if(useEventTimer)
    {
      for(int i=0; i<iters; i++)
      {
        cl::Event timeEvent;
        queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, numBytes, arr, NULL, &timeEvent);
        queue.finish();
        timed += timeInUS(timeEvent);
      }
    } else
    {
      Timer batchTimer;

      batchTimer.start();
      for(int i=0; i<iters; i++)
      {
        queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, numBytes, arr);
      }
      queue.finish();
      timed = batchTimer.stopAndTime();
    }
    timed /= iters;

    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
    log->print(gbps);   log->print(NEWLINE);
    log->xmlRecord("enqueuereadbuffer", gbps);
    ///////////////////////////////////////////////////////////////////////////
    // enqueueMapBuffer
    log->print(TAB TAB TAB "enqueueMapBuffer(for read) : ");

    queue.finish();

    timed = 0;
    if(useEventTimer)
    {
      for(int i=0; i<iters; i++)
      {
        cl::Event timeEvent;
        void *mapPtr;

        // Only the map is timed; the unmap merely restores the state
        mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, numBytes, NULL, &timeEvent);
        queue.finish();
        queue.enqueueUnmapMemObject(clBuffer, mapPtr);
        queue.finish();
        timed += timeInUS(timeEvent);
      }
    } else
    {
      for(int i=0; i<iters; i++)
      {
        Timer iterTimer;
        void *mapPtr;

        iterTimer.start();
        mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, numBytes);
        queue.finish();
        timed += iterTimer.stopAndTime();

        queue.enqueueUnmapMemObject(clBuffer, mapPtr);
        queue.finish();
      }
    }
    timed /= iters;

    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
    log->print(gbps);   log->print(NEWLINE);
    log->xmlRecord("enqueuemapbuffer", gbps);
    ///////////////////////////////////////////////////////////////////////////

    // memcpy from mapped ptr
    log->print(TAB TAB TAB TAB "memcpy from mapped ptr   : ");
    queue.finish();

    timed = 0;
    for(int i=0; i<iters; i++)
    {
      void *mapPtr;

      mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, numBytes);
      queue.finish();

      // Only the host-side copy is timed
      timer.start();
      memcpy(arr, mapPtr, numBytes);
      timed += timer.stopAndTime();

      queue.enqueueUnmapMemObject(clBuffer, mapPtr);
      queue.finish();
    }
    timed /= iters;

    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
    log->print(gbps);   log->print(NEWLINE);
    log->xmlRecord("memcpy_from_mapped_ptr", gbps);

    ///////////////////////////////////////////////////////////////////////////

    // enqueueUnmap
    log->print(TAB TAB TAB "enqueueUnmap(after write)  : ");

    queue.finish();

    timed = 0;
    if(useEventTimer)
    {
      for(int i=0; i<iters; i++)
      {
        cl::Event timeEvent;
        void *mapPtr;

        // Only the unmap is timed here
        mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, numBytes);
        queue.finish();
        queue.enqueueUnmapMemObject(clBuffer, mapPtr, NULL, &timeEvent);
        queue.finish();
        timed += timeInUS(timeEvent);
      }
    } else
    {
      for(int i=0; i<iters; i++)
      {
        Timer iterTimer;
        void *mapPtr;

        mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, numBytes);
        queue.finish();

        iterTimer.start();
        queue.enqueueUnmapMemObject(clBuffer, mapPtr);
        queue.finish();
        timed += iterTimer.stopAndTime();
      }
    }
    timed /= iters;
    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;

    log->print(gbps);   log->print(NEWLINE);
    log->xmlRecord("enqueueunmap", gbps);
    ///////////////////////////////////////////////////////////////////////////

    // memcpy to mapped ptr
    log->print(TAB TAB TAB TAB "memcpy to mapped ptr     : ");
    queue.finish();

    timed = 0;
    for(int i=0; i<iters; i++)
    {
      void *mapPtr;

      mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, numBytes);
      queue.finish();

      // Only the host-side copy is timed
      timer.start();
      memcpy(mapPtr, arr, numBytes);
      timed += timer.stopAndTime();

      queue.enqueueUnmapMemObject(clBuffer, mapPtr);
      queue.finish();
    }
    timed /= iters;

    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
    log->print(gbps);   log->print(NEWLINE);
    log->xmlRecord("memcpy_to_mapped_ptr", gbps);

    ///////////////////////////////////////////////////////////////////////////
    log->xmlCloseTag();     // transfer_bandwidth
  }
  catch(const cl::Error &error)
  {
    // Free the host buffer first: building the message below may itself throw.
    delete [] arr;

    // NOTE(review): the "transfer_bandwidth" XML tag may still be open here.
    stringstream ss;
    ss << error.what() << " (" << error.err() << ")" NEWLINE
       << TAB TAB TAB "Tests skipped" NEWLINE;
    log->print(ss.str());
    return -1;
  }
  catch(...)
  {
    // Any other exception (e.g. an allocation failure) must not leak the
    // host buffer; release it and let the caller deal with the exception.
    delete [] arr;
    throw;
  }

  delete [] arr;
  return 0;
}
int MaxValueSimple::maxValueCL(int* values, size_t len) {
	try {
		cl_int status = CL_SUCCESS;

		/*** Ausgabe von Informationen ueber gewaehltes OpenCL-Device ***/
		/* TODO logging
		 Logger::logDebug(
		 METHOD,
		 Logger::sStream << "max compute units: " << devices[0].getInfo<
		 CL_DEVICE_MAX_COMPUTE_UNITS> ());
		 Logger::logDebug(
		 METHOD,
		 Logger::sStream << "max work item sizes: "
		 << devices[0].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES> ()[0]);
		 Logger::logDebug(
		 METHOD,
		 Logger::sStream << "max work group sizes: "
		 << devices[0].getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE> ());
		 Logger::logDebug(
		 METHOD,
		 Logger::sStream << "max global mem size (KB): "
		 << devices[0].getInfo<CL_DEVICE_GLOBAL_MEM_SIZE> ()
		 / 1024);
		 Logger::logDebug(
		 METHOD,
		 Logger::sStream << "max local mem size (KB): "
		 << devices[0].getInfo<CL_DEVICE_LOCAL_MEM_SIZE> ()
		 / 1024);
		 */

		/*** Erstellen und Vorbereiten der Daten ***/
		cl::Buffer vBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
				sizeof(cl_int) * len, &values[0], &status);
		if (status != CL_SUCCESS) {
			throw cl::Error(status, "cl::Buffer values");
		}
		cmdQ.finish();

		/*** Arbeitsgroeszen berechnen ***/
		// Anzahl der Work-Items = globalSize
		// Work-Items pro Work-Group = localSize
		const size_t MAX_GROUP_SIZE = devices[0].getInfo<
				CL_DEVICE_MAX_WORK_GROUP_SIZE> ();
		size_t globalSize;
		size_t localSize;

		do {
			globalSize = len;
			localSize = MaxValueSimple::calcWorkGroupSize(globalSize,
					MAX_GROUP_SIZE);
			if (localSize == 1) {
				globalSize = ceil((double) len / WG_FAC) * WG_FAC;
				localSize = MaxValueSimple::calcWorkGroupSize(globalSize,
						MAX_GROUP_SIZE);
				/* TODO logging
				 Logger::logDebug(
				 METHOD,
				 Logger::sStream << "GlobalSize has been extended to "
				 << globalSize);
				 */
			}
			/* TODO logging
			 Logger::logDebug(METHOD,
			 Logger::sStream << "globalSize: " << globalSize);
			 Logger::logDebug(METHOD,
			 Logger::sStream << "localSize: " << localSize);
			 */

			/*** Kernel-Argumente setzen  ***/
			status = kernel.setArg(0, vBuffer);
			if (status != CL_SUCCESS) {
				throw cl::Error(status, "Kernel.SetArg");
			}

			status = kernel.setArg(1, sizeof(cl_int) * localSize, NULL);
			if (status != CL_SUCCESS) {
				throw cl::Error(status, "Kernel.SetArg");
			}

			/*** Kernel ausfuehren und auf Abarbeitung warten ***/
			cl::KernelFunctor func = kernel.bind(cmdQ, cl::NDRange(globalSize),
					cl::NDRange(localSize));

			event = func();

			event.wait();
			cmdQ.finish();

			/*
			 runtimeKernel
			 += event.getProfilingInfo<CL_PROFILING_COMMAND_END> ();
			 runtimeKernel
			 -= event.getProfilingInfo<CL_PROFILING_COMMAND_START> ();
			 */
			len = globalSize / localSize;
		} while (globalSize > localSize && localSize > 1);

		/*** Daten vom OpenCL-Device holen ***/
		// TODO nur 1. element auslesen
		status = cmdQ.enqueueReadBuffer(vBuffer, true, 0, sizeof(cl_int) * 1,
				&values[0]);
		if (status != CL_SUCCESS) {
			throw cl::Error(status, "CommandQueue.enqueueReadBuffer");
		}

		/* TODO logging
		 Logger::log(
		 METHOD,
		 TIME,
		 Logger::sStream << "timeKernel=" << 1.0e-9 * runtimeKernel
		 << ";");
		 */
		return values[0];
	} catch (cl::Error& err) {
		// TODO Logger::logError(METHOD, Logger::sStream << err.what());
		std::cerr << "[ERROR] MaxValueSimple::maxValueCL(int*, size_t): "
				<< err.what() << " (" << err.err() << ")" << std::endl;
		return MaxValueSimple::MAX_FAILURE;
	} catch (std::exception& err) {
		// TODO Logger::logError(METHOD, Logger::sStream << err.what());
		std::cerr << "[ERROR] MaxValueSimple::maxValueCL(int*, size_t): "
				<< err.what() << std::endl;
		return MaxValueSimple::MAX_FAILURE;
	}
}
Beispiel #11
0
/**
 * Measures host <-> device transfer bandwidth for one buffer and prints each
 * result to cout.
 *
 * Tests, in order: enqueueWriteBuffer, enqueueReadBuffer, enqueueMapBuffer for
 * read, memcpy from the mapped pointer, enqueueUnmapMemObject after a write
 * mapping, and memcpy to the mapped pointer. Each test runs
 * devInfo.transferBWIters iterations and reports the average. The write, read,
 * map and unmap tests use event timing when useEventTimer is set and a host
 * Timer otherwise; the two memcpy tests always use the host Timer.
 *
 * Skipped (returns 0) when isTransferBW is false.
 *
 * @param queue    command queue used for all transfers
 * @param prog     not used by this test
 * @param devInfo  supplies maxAllocSize, deviceType and transferBWIters
 * @return 0 on success or when skipped, -1 when an OpenCL error was caught
 */
int clPeak::runTransferBandwidthTest(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo)
{
    if(!isTransferBW)
        return 0;

    float timed, gbps;
    cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>();
    int iters = devInfo.transferBWIters;
    Timer timer;        // host timer shared by the two memcpy tests

    cl_uint maxItems = devInfo.maxAllocSize / sizeof(float) / 2;
    cl_uint numItems;

    // Set an upper limit for CPU devices
    if(devInfo.deviceType & CL_DEVICE_TYPE_CPU) {
        numItems = roundToPowOf2(maxItems, 26);
    } else {
        numItems = roundToPowOf2(maxItems);
    }

    // Size in bytes of every transfer below
    const size_t numBytes = numItems * sizeof(float);

    float *arr = new float[numItems];

    try
    {
        cl::Buffer clBuffer = cl::Buffer(ctx, (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), numBytes);

        cout << NEWLINE TAB TAB "Transfer bandwidth (GBPS)" << endl;
        cout << setprecision(2) << fixed;

        ///////////////////////////////////////////////////////////////////////////
        // enqueueWriteBuffer
        cout << TAB TAB TAB "enqueueWriteBuffer         : ";    cout.flush();

        // Dummy warm-up
        queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, numBytes, arr);
        queue.finish();

        timed = 0;

        if(useEventTimer)
        {
            for(int i=0; i<iters; i++)
            {
                cl::Event timeEvent;
                queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, numBytes, arr, NULL, &timeEvent);
                queue.finish();
                timed += timeInUS(timeEvent);
            }
        } else
        {
            Timer batchTimer;

            batchTimer.start();
            for(int i=0; i<iters; i++)
            {
                queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, numBytes, arr);
            }
            queue.finish();
            timed = batchTimer.stopAndTime();
        }
        timed /= iters;

        // bytes / time / 1e3 (GB/s if `timed` is in microseconds -- TODO confirm)
        gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
        cout << gbps << endl;
        ///////////////////////////////////////////////////////////////////////////
        // enqueueReadBuffer
        cout << TAB TAB TAB "enqueueReadBuffer          : ";    cout.flush();

        // Dummy warm-up
        queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, numBytes, arr);
        queue.finish();

        timed = 0;
        if(useEventTimer)
        {
            for(int i=0; i<iters; i++)
            {
                cl::Event timeEvent;
                queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, numBytes, arr, NULL, &timeEvent);
                queue.finish();
                timed += timeInUS(timeEvent);
            }
        } else
        {
            Timer batchTimer;

            batchTimer.start();
            for(int i=0; i<iters; i++)
            {
                queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, numBytes, arr);
            }
            queue.finish();
            timed = batchTimer.stopAndTime();
        }
        timed /= iters;

        gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
        cout << gbps << endl;
        ///////////////////////////////////////////////////////////////////////////
        // enqueueMapBuffer
        cout << TAB TAB TAB "enqueueMapBuffer(for read) : ";    cout.flush();

        queue.finish();

        timed = 0;
        if(useEventTimer)
        {
            for(int i=0; i<iters; i++)
            {
                cl::Event timeEvent;
                void *mapPtr;

                // Only the map is timed; the unmap merely restores the state
                mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, numBytes, NULL, &timeEvent);
                queue.finish();
                queue.enqueueUnmapMemObject(clBuffer, mapPtr);
                queue.finish();
                timed += timeInUS(timeEvent);
            }
        } else
        {
            for(int i=0; i<iters; i++)
            {
                Timer iterTimer;
                void *mapPtr;

                iterTimer.start();
                mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, numBytes);
                queue.finish();
                timed += iterTimer.stopAndTime();

                queue.enqueueUnmapMemObject(clBuffer, mapPtr);
                queue.finish();
            }
        }
        timed /= iters;

        gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
        cout << gbps << endl;
        ///////////////////////////////////////////////////////////////////////////

        // memcpy from mapped ptr
        cout << TAB TAB TAB TAB "memcpy from mapped ptr   : ";  cout.flush();
        queue.finish();

        timed = 0;
        for(int i=0; i<iters; i++)
        {
            void *mapPtr;

            mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, numBytes);
            queue.finish();

            // Only the host-side copy is timed
            timer.start();
            memcpy(arr, mapPtr, numBytes);
            timed += timer.stopAndTime();

            queue.enqueueUnmapMemObject(clBuffer, mapPtr);
            queue.finish();
        }
        timed /= iters;

        gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
        cout << gbps << endl;

        ///////////////////////////////////////////////////////////////////////////

        // enqueueUnmap
        cout << TAB TAB TAB "enqueueUnmap(after write)  : ";    cout.flush();

        queue.finish();

        timed = 0;
        if(useEventTimer)
        {
            for(int i=0; i<iters; i++)
            {
                cl::Event timeEvent;
                void *mapPtr;

                // Only the unmap is timed here
                mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, numBytes);
                queue.finish();
                queue.enqueueUnmapMemObject(clBuffer, mapPtr, NULL, &timeEvent);
                queue.finish();
                timed += timeInUS(timeEvent);
            }
        } else
        {
            for(int i=0; i<iters; i++)
            {
                Timer iterTimer;
                void *mapPtr;

                mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, numBytes);
                queue.finish();

                iterTimer.start();
                queue.enqueueUnmapMemObject(clBuffer, mapPtr);
                queue.finish();
                timed += iterTimer.stopAndTime();
            }
        }
        timed /= iters;
        gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;

        cout << gbps << endl;
        ///////////////////////////////////////////////////////////////////////////

        // memcpy to mapped ptr
        cout << TAB TAB TAB TAB "memcpy to mapped ptr     : ";  cout.flush();
        queue.finish();

        timed = 0;
        for(int i=0; i<iters; i++)
        {
            void *mapPtr;

            mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, numBytes);
            queue.finish();

            // Only the host-side copy is timed
            timer.start();
            memcpy(mapPtr, arr, numBytes);
            timed += timer.stopAndTime();

            queue.enqueueUnmapMemObject(clBuffer, mapPtr);
            queue.finish();
        }
        timed /= iters;

        gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
        cout << gbps << endl;

        ///////////////////////////////////////////////////////////////////////////
    }
    catch(const cl::Error &error)
    {
        // Free the host buffer before reporting, so it is released even if
        // the output itself fails.
        delete [] arr;

        cerr << error.what() << "(" << error.err() << ")" << endl;
        cerr << TAB TAB TAB "Tests skipped" << endl;
        return -1;
    }
    catch(...)
    {
        // Any other exception must not leak the host buffer; release it and
        // let the caller deal with the exception.
        delete [] arr;
        throw;
    }

    delete [] arr;
    return 0;
}
Beispiel #12
0
/**
 * Evaluates the residual on the OpenCL device.
 *
 * The arrays behind Prim, dPrim_dt and F are wrapped as OpenCL buffers that
 * use the host memory directly (CL_MEM_USE_HOST_PTR), bound as kernel
 * arguments 0, 1 and 2, and the kernel is run over an N1 x N2 range with
 * TILE_SIZE_X1 x TILE_SIZE_X2 work-groups. The output buffer is then mapped
 * for reading, the queue is drained and the PETSc arrays are restored.
 *
 * @param ts        not used in this function body
 * @param t         not used in this function body
 * @param Prim      input vector, bound as kernel argument 0
 * @param dPrim_dt  input vector, bound as kernel argument 1
 * @param F         output vector, bound as kernel argument 2
 * @param ptr       not used in this function body
 * @return 0 on success, or the PETSc error propagated through CHKERRQ
 *
 * NOTE(review): every OpenCL status is stored in clErr but never inspected,
 * and the mapping of the output buffer is never explicitly unmapped -- confirm
 * both are intended.
 */
PetscErrorCode ComputeResidual(TS ts,
                               PetscScalar t,
                               Vec Prim, Vec dPrim_dt,
                               Vec F, void *ptr)
{
    PetscErrorCode ierr;
    PetscScalar *prim;
    PetscScalar *dprim_dt;
    PetscScalar *f;

    // Borrow the raw arrays behind the PETSc vectors.
    ierr = VecGetArray(Prim, &prim);
    CHKERRQ(ierr);
    ierr = VecGetArray(dPrim_dt, &dprim_dt);
    CHKERRQ(ierr);
    ierr = VecGetArray(F, &f);
    CHKERRQ(ierr);

    // Bytes per vector: DOF values for each of the N1 x N2 cells.
    PetscInt nbytes = DOF*N1*N2*sizeof(PetscScalar);

    // Wrap the host arrays: the two inputs read-only, the output write-only.
    cl::Buffer primBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                          nbytes, prim, &clErr);
    cl::Buffer dprimBuffer_dt(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                              nbytes, dprim_dt, &clErr);
    cl::Buffer fbuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
                       nbytes, f, &clErr);

    // Bind the kernel arguments.
    clErr = kernel.setArg(0, primBuffer);
    clErr = kernel.setArg(1, dprimBuffer_dt);
    clErr = kernel.setArg(2, fbuffer);

    // Launch over the whole grid, one work-group per tile.
    cl::NDRange gridRange(N1, N2);
    cl::NDRange tileRange(TILE_SIZE_X1, TILE_SIZE_X2);
    clErr = queue.enqueueNDRangeKernel(kernel, cl::NullRange,
                                       gridRange, tileRange,
                                       NULL, NULL);

    // Non-blocking map of the output buffer for reading, so that the host
    // array holds the kernel's results once the queue has finished.
    // Presumably this matters for devices that do not operate on host memory
    // directly -- TODO confirm.
    f = (PetscScalar*)queue.enqueueMapBuffer(fbuffer, CL_FALSE, CL_MAP_READ,
                                             0, nbytes,
                                             NULL, NULL, &clErr);

    // Wait for the kernel and the map to complete.
    clErr = queue.finish();

    // Return the arrays to PETSc.
    ierr = VecRestoreArray(Prim, &prim);
    CHKERRQ(ierr);
    ierr = VecRestoreArray(dPrim_dt, &dprim_dt);
    CHKERRQ(ierr);
    ierr = VecRestoreArray(F, &f);
    CHKERRQ(ierr);

    return 0;
}
/**
 * Program entry point for the inclusive-scan example.
 *
 * Sets up the global OpenCL `context`, `queue` and `program` from the
 * platform/device returned by selectPlatform()/selectDevice(), builds the
 * kernels found in "inclusive_scan.cl", reads an element count followed by
 * that many floats from "input.txt", runs inclusive_scan() on the device,
 * hands input and output to cpu_check(), and writes the results separated by
 * spaces to "output.txt".
 *
 * @return 0 on success and on OpenCL errors (which are reported on stdout),
 *         1 when one of the required files cannot be opened or the element
 *         count cannot be parsed.
 */
int main()
{
    try {
        std::vector<cl::Device> devices;

        // select platform
        cl::Platform platform = selectPlatform();

        // select device
        platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
        cl::Device device = selectDevice(devices);

        // create context
        context = cl::Context(devices);

        // create command queue
        queue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE);

        // load opencl source
        std::ifstream cl_file("inclusive_scan.cl");
        if (!cl_file) {
            std::cout << "Error: cannot open inclusive_scan.cl" << std::endl;
            return 1;
        }

        std::string cl_string{std::istreambuf_iterator<char>(cl_file),
                    std::istreambuf_iterator<char>()};

        // The length passed along includes the terminating '\0' of c_str().
        cl::Program::Sources source(1,
                                    std::make_pair(cl_string.c_str(),
                                                   cl_string.length() + 1));

        // create programm
        program = cl::Program(context, source);

        // compile opencl source
        try {
            program.build(devices);

            // Initialise the count and verify the read: a stream that failed
            // to open would otherwise leave the count indeterminate and the
            // vector below would be sized from garbage.
            size_t input_size = 0;
            std::ifstream input_file("input.txt");
            if (!(input_file >> input_size)) {
                std::cout << "Error: cannot read element count from input.txt" << std::endl;
                return 1;
            }

            // Elements the file does not provide remain 0.
            std::vector<float> input(input_size);
            for (size_t i = 0; i < input_size; ++i) {
                input_file >> input[i];
            }

            std::vector<float> output(input_size, 0);

            const size_t byte_count = sizeof(float) * input_size;

            cl::Buffer dev_input (context, CL_MEM_READ_ONLY, byte_count);
            // Blocking write: the host vector is fully copied when this returns.
            queue.enqueueWriteBuffer(dev_input, CL_TRUE, 0, byte_count, input.data());

            cl::Buffer dev_output = inclusive_scan(dev_input, input_size);

            // Blocking read of the scan result back into host memory.
            queue.enqueueReadBuffer(dev_output, CL_TRUE, 0, byte_count, output.data());
            queue.finish();

            cpu_check(input, output);

            std::ofstream output_file("output.txt");
            if (!output_file) {
                std::cout << "Error: cannot open output.txt" << std::endl;
                return 1;
            }
            for (size_t i = 0; i < input_size; ++i) {
                output_file << output[i] << " ";
            }

        }
        catch (cl::Error const & e) {
            // Every cl::Error thrown inside the block above ends up here, not
            // only build failures; the build log is printed in all cases.
            std::string log_str = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
            std::cout << std::endl << e.what() << " : " << e.err() << std::endl;
            std::cout << log_str;
            return 0;
        }


    }
    catch (cl::Error const & e) {
        std::cout << "Error: " << e.what() << " #" << e.err() << std::endl;
    }

    return 0;
}
Beispiel #14
0
/**
 * Executes a single frame of the render loop.
 *
 * Sequence: upload the current image descriptor, acquire the shared GL
 * objects for OpenCL, launch the kernel over a WIDTH x HEIGHT range, release
 * the GL objects again, blit the CL target into the accumulator and into the
 * default framebuffer, process window events and camera input, and finally
 * swap the window buffers.
 *
 * @param queue      command queue all OpenCL work is enqueued on
 * @param context    not used inside this function
 * @param kernel     kernel launched once per call
 * @param clImgDesc  device buffer receiving the global `imgDesc`
 * @param clCamera   device buffer receiving the camera when the scene changed
 */
void mainLoop( cl::CommandQueue& queue, cl::Context& context, cl::Kernel kernel, cl::Buffer clImgDesc, cl::Buffer clCamera ){
  cl::Event acquireDone, releaseDone, kernelDone;
  cl_int status;

  // Let all outstanding GL commands complete first.
  glFinish();
  checkGLErr( "glFinish()" );

  // Blocking upload of the image descriptor.
  queue.enqueueWriteBuffer( clImgDesc, CL_TRUE, 0, sizeof(ImageDescriptor), static_cast<const void*>(&imgDesc) );

  // Take the shared GL objects over and wait until that has happened.
  status = queue.enqueueAcquireGLObjects( vSharedUnits, NULL, &acquireDone );
  checkErr(status, "CommandQueue::enqueueAcquireGLObjects()");
  acquireDone.wait();

  // Launch the kernel; clock() is sampled around the wait on its event.
  status = queue.enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(WIDTH, HEIGHT), cl::NullRange, NULL, &kernelDone );
  checkErr(status, "CommandQueue::enqueueNDRangeKernel()");

  const clock_t tickBefore = clock();
  kernelDone.wait();
  const clock_t tickAfter = clock();

  // Hand the shared objects back once the queue is drained.
  queue.finish();
  status = queue.enqueueReleaseGLObjects( vSharedUnits, NULL, &releaseDone );
  checkErr(status, "CommandQueue::enqueueReleaseGLObjects()");
  releaseDone.wait();

  imgDesc.numSamples += SAMPLES;

  // First blit: CL target -> accumulator.
  pAccumulator->glBind( GL_DRAW_FRAMEBUFFER );
  checkGLErr( "glBind GL_DRAW_FRAMEBUFFER, Accumulator " );
  pCLTarget->glBind( GL_READ_FRAMEBUFFER );
  checkGLErr( "glBind GL_READ_FRAMEBUFFER, Main Target " );
  glBlitFramebuffer( 0, 0, WIDTH, HEIGHT, 0, 0, WIDTH, HEIGHT, GL_COLOR_BUFFER_BIT, GL_NEAREST );
  checkGLErr( "glBlitFramebuffer" );

  // Second blit: CL target -> framebuffer 0.
  glBindFramebuffer( GL_DRAW_FRAMEBUFFER, 0 );
  checkGLErr( "glBind GL_DRAW_FRAMEBUFFER, 0 " );
  pCLTarget->glBind( GL_READ_FRAMEBUFFER );
  checkGLErr( "glBind GL_READ_FRAMEBUFFER, something " );
  glBlitFramebuffer( 0, 0, WIDTH, HEIGHT, 0, 0, WIDTH, HEIGHT, GL_COLOR_BUFFER_BIT, GL_NEAREST );
  checkGLErr( "glBlitFramebuffer" );

  glfwPollEvents();

  // Clock ticks spent across the kernel wait, converted to seconds.
  const float elapsedSeconds = static_cast<float>(tickAfter - tickBefore) / (CLOCKS_PER_SEC * 1.0f);
  pCamera->glfwHandleCursor( elapsedSeconds );

  if( sceneChanged() ){
    // Restart sample accumulation and push the new camera to the device.
    imgDesc.numSamples = 0;
    CLCamera* freshCamera = pCamera->getCLCamera();
    queue.enqueueWriteBuffer( clCamera, CL_TRUE, 0, sizeof(CLCamera), static_cast<const void*>(freshCamera) );
    delete freshCamera;
  }

  glfwSwapBuffers( window );
  checkGLErr( "glSwapBuffers" );

  // Progress output whenever the sample count is a multiple of ten.
  if( imgDesc.numSamples % 10 == 0 ){
    std::cout<<"numSamples: "<<imgDesc.numSamples<<std::endl;
  }
}