float clPeak::run_kernel(cl::CommandQueue &queue, cl::Kernel &kernel, cl::NDRange &globalSize, cl::NDRange &localSize, int iters) { float timed = 0; // Dummy calls queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, localSize); queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, localSize); queue.finish(); if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, localSize, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else // std timer { Timer timer; timer.start(); for(int i=0; i<iters; i++) { queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, localSize); } queue.finish(); timed = timer.stopAndTime(); } return (timed / iters); }
void procOCL_I2I(int texIn, int texOut, int w, int h) { if(!haveOpenCL) return; LOGD("procOCL_I2I(%d, %d, %d, %d)", texIn, texOut, w, h); cl::ImageGL imgIn (theContext, CL_MEM_READ_ONLY, GL_TEXTURE_2D, 0, texIn); cl::ImageGL imgOut(theContext, CL_MEM_WRITE_ONLY, GL_TEXTURE_2D, 0, texOut); std::vector < cl::Memory > images; images.push_back(imgIn); images.push_back(imgOut); int64_t t = getTimeMs(); theQueue.enqueueAcquireGLObjects(&images); theQueue.finish(); LOGD("enqueueAcquireGLObjects() costs %d ms", getTimeInterval(t)); t = getTimeMs(); cl::Kernel Laplacian(theProgI2I, "Laplacian"); //TODO: may be done once Laplacian.setArg(0, imgIn); Laplacian.setArg(1, imgOut); theQueue.finish(); LOGD("Kernel() costs %d ms", getTimeInterval(t)); t = getTimeMs(); theQueue.enqueueNDRangeKernel(Laplacian, cl::NullRange, cl::NDRange(w, h), cl::NullRange); theQueue.finish(); LOGD("enqueueNDRangeKernel() costs %d ms", getTimeInterval(t)); t = getTimeMs(); theQueue.enqueueReleaseGLObjects(&images); theQueue.finish(); LOGD("enqueueReleaseGLObjects() costs %d ms", getTimeInterval(t)); }
int clPeak::runKernelLatency(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo) { if(!isKernelLatency) return 0; cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>(); cl_uint numItems = (devInfo.maxWGSize) * (devInfo.numCUs) * FETCH_PER_WI; cl::NDRange globalSize = (numItems / FETCH_PER_WI); cl::NDRange localSize = devInfo.maxWGSize; int iters = devInfo.kernelLatencyIters; float latency; try { log->print(NEWLINE TAB TAB "Kernel launch latency : "); log->xmlOpenTag("kernel_launch_latency"); log->xmlAppendAttribs("unit", "us"); cl::Buffer inputBuf = cl::Buffer(ctx, CL_MEM_READ_ONLY, (numItems * sizeof(float))); cl::Buffer outputBuf = cl::Buffer(ctx, CL_MEM_WRITE_ONLY, (numItems * sizeof(float))); cl::Kernel kernel_v1(prog, "global_bandwidth_v1_local_offset"); kernel_v1.setArg(0, inputBuf), kernel_v1.setArg(1, outputBuf); // Dummy calls queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize); queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize); queue.finish(); latency = 0; for(int i=0; i<iters; i++) { cl::Event timeEvent; queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize, NULL, &timeEvent); queue.finish(); cl_ulong start = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>() / 1000; cl_ulong end = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000; latency += (float)((int)end - (int)start); } latency /= iters; log->print(latency); log->print(" us" NEWLINE); log->xmlSetContent(latency); log->xmlCloseTag(); } catch(cl::Error error) { log->print(error.err() + NEWLINE); log->print(TAB TAB "Tests skipped" NEWLINE); return -1; } return 0; }
void updateParticles(float timeDelta) { try { vector<cl::Memory> glBuffers; glBuffers.push_back(m_positions); glBuffers.push_back(m_colors); //this will update our system by calculating new velocity and updating the positions of our particles //Make sure OpenGL is done using our VBOs glFinish(); // map OpenGL buffer object for writing from OpenCL // this passes in the vector of VBO buffer objects (position and color) m_queue.enqueueAcquireGLObjects(&glBuffers); m_particleKernel.setArg(5, timeDelta); //pass in the timestep //execute the kernel m_queue.enqueueNDRangeKernel(m_particleKernel, cl::NullRange, cl::NDRange(m_numParticles), cl::NullRange); //Release the VBOs so OpenGL can play with them m_queue.enqueueReleaseGLObjects(&glBuffers, NULL); m_queue.finish(); } catch(cl::Error &error) { LOG_ERROR << error.what() << "(" << oclErrorString(error.err()) << ")"; } }
void procOCL_OCV(int tex, int w, int h) { int64_t t = getTimeMs(); cl::ImageGL imgIn (theContext, CL_MEM_READ_ONLY, GL_TEXTURE_2D, 0, tex); std::vector < cl::Memory > images(1, imgIn); theQueue.enqueueAcquireGLObjects(&images); theQueue.finish(); cv::UMat uIn, uOut, uTmp; cv::ocl::convertFromImage(imgIn(), uIn); LOGD("loading texture data to OpenCV UMat costs %d ms", getTimeInterval(t)); theQueue.enqueueReleaseGLObjects(&images); t = getTimeMs(); //cv::blur(uIn, uOut, cv::Size(5, 5)); cv::Laplacian(uIn, uTmp, CV_8U); cv:multiply(uTmp, 10, uOut); cv::ocl::finish(); LOGD("OpenCV processing costs %d ms", getTimeInterval(t)); t = getTimeMs(); cl::ImageGL imgOut(theContext, CL_MEM_WRITE_ONLY, GL_TEXTURE_2D, 0, tex); images.clear(); images.push_back(imgOut); theQueue.enqueueAcquireGLObjects(&images); cl_mem clBuffer = (cl_mem)uOut.handle(cv::ACCESS_READ); cl_command_queue q = (cl_command_queue)cv::ocl::Queue::getDefault().ptr(); size_t offset = 0; size_t origin[3] = { 0, 0, 0 }; size_t region[3] = { w, h, 1 }; CV_Assert(clEnqueueCopyBufferToImage (q, clBuffer, imgOut(), offset, origin, region, 0, NULL, NULL) == CL_SUCCESS); theQueue.enqueueReleaseGLObjects(&images); cv::ocl::finish(); LOGD("uploading results to texture costs %d ms", getTimeInterval(t)); }
void simulationStep() { try { // copy auto buffer = cl::Buffer(context, CL_MEM_READ_ONLY, sizeof(unsigned char) * 4 * fieldWidth * fieldHeight, nullptr, nullptr); queue.enqueueWriteBuffer(buffer, CL_TRUE, 0, sizeof(unsigned char) * 4 * fieldWidth * fieldHeight, visualizationBufferCPU, NULL, NULL); // enque stepKernel.setArg(2, buffer); cl::NDRange global((size_t) (fieldWidth * fieldHeight)); queue.enqueueNDRangeKernel(stepKernel, cl::NullRange, global, cl::NullRange); // read back queue.enqueueReadBuffer(visualizationBufferGPU, CL_TRUE, 0, sizeof(unsigned char) * 4 * fieldWidth * fieldHeight, visualizationBufferCPU, NULL, NULL); // finish queue.finish(); } catch (cl::Error err) { std::cout << "Error: " << err.what() << "(" << err.err() << ")" << std::endl; exit(3); } glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, fieldWidth, fieldHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, visualizationBufferCPU); }
void PTWeekend::draw() { /* * BEGIN - each frame part */ /* Enqueue kernel for execution */ glm::vec3 origin,lower_left, hor, ver; float theta = camera.getFov() * M_PI / 180.0f; float half_height = tan(theta / 2.0f); float half_width = camera.getAspectRatio() * half_height; origin = camera.getEyePoint(); glm::vec3 u, v, w; w = -glm::normalize(camera.getViewDirection()); //odd... u = glm::normalize(glm::cross(glm::vec3(0,1,0), w)); v = glm::cross(w, u); lower_left = origin - half_width * u - half_height * v - w; hor = 2.0f * half_width * u; ver = 2.0f * half_height * v; pt_assert(cl_set_pinhole_cam_arg(origin, lower_left, hor, ver, cam_buffer, cmd_queue), "Could not fill camera buffer"); clStatus = cmd_queue.enqueueAcquireGLObjects(&img_buffer, NULL, NULL); pt_assert(clStatus, "Could not acquire gl objects"); cl::Event profiling_evt; clStatus = cmd_queue.enqueueNDRangeKernel(kernel, cl::NDRange(0,0), cl::NDRange(img_width, img_height), cl::NDRange(local_width,local_height), NULL, &profiling_evt); profiling_evt.wait(); pt_assert(clStatus, "Could not enqueue the kernel"); clStatus = cmd_queue.enqueueReleaseGLObjects(&img_buffer, NULL, NULL); pt_assert(clStatus, "Could not release gl objects"); cmd_queue.finish(); cl_ulong time_start = profiling_evt.getProfilingInfo<CL_PROFILING_COMMAND_START>(); cl_ulong time_end = profiling_evt.getProfilingInfo<CL_PROFILING_COMMAND_END>(); cl_ulong total_time = time_end - time_start; std::cout << "Total time: " << total_time * 0.001 * 0.001 << " ms \n"; /* * END - each frame part */ gl::draw(imgTex, Rectf(0, 0, getWindowWidth(), getWindowHeight())); }
/**
 * Enqueues a marker command on `queue`, waits for the queue to drain and
 * returns the marker's event.
 *
 * The deprecated enqueueMarker() is used when the headers predate OpenCL 1.2
 * or when CL_USE_DEPRECATED_OPENCL_1_1_APIS is defined. Otherwise
 * enqueueMarkerWithWaitList() is called with no wait list.
 *
 * @param queue command queue to place the marker on (taken by value)
 * @return the event associated with the marker; the queue has been finished
 *         by the time it is returned
 */
cl::Event RuntimeMeasurementsManager::enqueueNewMarker(cl::CommandQueue queue)
{
    cl::Event event;

#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
    // Use deprecated API
    queue.enqueueMarker(&event);
#else
    // The terminating semicolon was missing here, which broke the build on
    // this preprocessor branch.
    queue.enqueueMarkerWithWaitList(NULL, &event);
#endif

    queue.finish();
    return event;
}
int clPeak::runTransferBandwidthTest(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo) { if(!isTransferBW) return 0; float timed, gbps; cl::NDRange globalSize, localSize; cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>(); int iters = devInfo.transferBWIters; Timer timer; float *arr = NULL; cl_uint maxItems = devInfo.maxAllocSize / sizeof(float) / 2; cl_uint numItems; // Set an upper-limit for cpu devies if(devInfo.deviceType & CL_DEVICE_TYPE_CPU) { numItems = roundToPowOf2(maxItems, 26); } else { numItems = roundToPowOf2(maxItems); } try { arr = new float[numItems]; cl::Buffer clBuffer = cl::Buffer(ctx, (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), (numItems * sizeof(float))); log->print(NEWLINE TAB TAB "Transfer bandwidth (GBPS)" NEWLINE); log->xmlOpenTag("transfer_bandwidth"); log->xmlAppendAttribs("unit", "gbps"); /////////////////////////////////////////////////////////////////////////// // enqueueWriteBuffer log->print(TAB TAB TAB "enqueueWriteBuffer : "); // Dummy warm-up queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { Timer timer; timer.start(); for(int i=0; i<iters; i++) { queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); } queue.finish(); timed = timer.stopAndTime(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("enqueuewritebuffer", gbps); /////////////////////////////////////////////////////////////////////////// // enqueueReadBuffer log->print(TAB TAB TAB "enqueueReadBuffer : "); // Dummy warm-up queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; 
i++) { cl::Event timeEvent; queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { Timer timer; timer.start(); for(int i=0; i<iters; i++) { queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); } queue.finish(); timed = timer.stopAndTime(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("enqueuereadbuffer", gbps); /////////////////////////////////////////////////////////////////////////// // enqueueMapBuffer log->print(TAB TAB TAB "enqueueMapBuffer(for read) : "); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float)), NULL, &timeEvent); queue.finish(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); timed += timeInUS(timeEvent); } } else { for(int i=0; i<iters; i++) { Timer timer; void *mapPtr; timer.start(); mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float))); queue.finish(); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("enqueuemapbuffer", gbps); /////////////////////////////////////////////////////////////////////////// // memcpy from mapped ptr log->print(TAB TAB TAB TAB "memcpy from mapped ptr : "); queue.finish(); timed = 0; for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); memcpy(arr, mapPtr, (numItems * sizeof(float))); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } timed /= iters; gbps = 
((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("memcpy_from_mapped_ptr", gbps); /////////////////////////////////////////////////////////////////////////// // enqueueUnmap log->print(TAB TAB TAB "enqueueUnmap(after write) : "); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); queue.enqueueUnmapMemObject(clBuffer, mapPtr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { for(int i=0; i<iters; i++) { Timer timer; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); timed += timer.stopAndTime(); } } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("enqueueunmap", gbps); /////////////////////////////////////////////////////////////////////////// // memcpy to mapped ptr log->print(TAB TAB TAB TAB "memcpy to mapped ptr : "); queue.finish(); timed = 0; for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); memcpy(mapPtr, arr, (numItems * sizeof(float))); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("memcpy_to_mapped_ptr", gbps); /////////////////////////////////////////////////////////////////////////// log->xmlCloseTag(); // transfer_bandwidth if(arr) delete [] arr; } catch(cl::Error error) { stringstream ss; ss << error.what() << " (" << error.err() << ")" NEWLINE << TAB TAB TAB 
"Tests skipped" NEWLINE; log->print(ss.str()); if(arr) delete [] arr; return -1; } return 0; }
int MaxValueSimple::maxValueCL(int* values, size_t len) { try { cl_int status = CL_SUCCESS; /*** Ausgabe von Informationen ueber gewaehltes OpenCL-Device ***/ /* TODO logging Logger::logDebug( METHOD, Logger::sStream << "max compute units: " << devices[0].getInfo< CL_DEVICE_MAX_COMPUTE_UNITS> ()); Logger::logDebug( METHOD, Logger::sStream << "max work item sizes: " << devices[0].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES> ()[0]); Logger::logDebug( METHOD, Logger::sStream << "max work group sizes: " << devices[0].getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE> ()); Logger::logDebug( METHOD, Logger::sStream << "max global mem size (KB): " << devices[0].getInfo<CL_DEVICE_GLOBAL_MEM_SIZE> () / 1024); Logger::logDebug( METHOD, Logger::sStream << "max local mem size (KB): " << devices[0].getInfo<CL_DEVICE_LOCAL_MEM_SIZE> () / 1024); */ /*** Erstellen und Vorbereiten der Daten ***/ cl::Buffer vBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_int) * len, &values[0], &status); if (status != CL_SUCCESS) { throw cl::Error(status, "cl::Buffer values"); } cmdQ.finish(); /*** Arbeitsgroeszen berechnen ***/ // Anzahl der Work-Items = globalSize // Work-Items pro Work-Group = localSize const size_t MAX_GROUP_SIZE = devices[0].getInfo< CL_DEVICE_MAX_WORK_GROUP_SIZE> (); size_t globalSize; size_t localSize; do { globalSize = len; localSize = MaxValueSimple::calcWorkGroupSize(globalSize, MAX_GROUP_SIZE); if (localSize == 1) { globalSize = ceil((double) len / WG_FAC) * WG_FAC; localSize = MaxValueSimple::calcWorkGroupSize(globalSize, MAX_GROUP_SIZE); /* TODO logging Logger::logDebug( METHOD, Logger::sStream << "GlobalSize has been extended to " << globalSize); */ } /* TODO logging Logger::logDebug(METHOD, Logger::sStream << "globalSize: " << globalSize); Logger::logDebug(METHOD, Logger::sStream << "localSize: " << localSize); */ /*** Kernel-Argumente setzen ***/ status = kernel.setArg(0, vBuffer); if (status != CL_SUCCESS) { throw cl::Error(status, "Kernel.SetArg"); } status = 
kernel.setArg(1, sizeof(cl_int) * localSize, NULL); if (status != CL_SUCCESS) { throw cl::Error(status, "Kernel.SetArg"); } /*** Kernel ausfuehren und auf Abarbeitung warten ***/ cl::KernelFunctor func = kernel.bind(cmdQ, cl::NDRange(globalSize), cl::NDRange(localSize)); event = func(); event.wait(); cmdQ.finish(); /* runtimeKernel += event.getProfilingInfo<CL_PROFILING_COMMAND_END> (); runtimeKernel -= event.getProfilingInfo<CL_PROFILING_COMMAND_START> (); */ len = globalSize / localSize; } while (globalSize > localSize && localSize > 1); /*** Daten vom OpenCL-Device holen ***/ // TODO nur 1. element auslesen status = cmdQ.enqueueReadBuffer(vBuffer, true, 0, sizeof(cl_int) * 1, &values[0]); if (status != CL_SUCCESS) { throw cl::Error(status, "CommandQueue.enqueueReadBuffer"); } /* TODO logging Logger::log( METHOD, TIME, Logger::sStream << "timeKernel=" << 1.0e-9 * runtimeKernel << ";"); */ return values[0]; } catch (cl::Error& err) { // TODO Logger::logError(METHOD, Logger::sStream << err.what()); std::cerr << "[ERROR] MaxValueSimple::maxValueCL(int*, size_t): " << err.what() << " (" << err.err() << ")" << std::endl; return MaxValueSimple::MAX_FAILURE; } catch (std::exception& err) { // TODO Logger::logError(METHOD, Logger::sStream << err.what()); std::cerr << "[ERROR] MaxValueSimple::maxValueCL(int*, size_t): " << err.what() << std::endl; return MaxValueSimple::MAX_FAILURE; } }
int clPeak::runTransferBandwidthTest(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo) { if(!isTransferBW) return 0; float timed, gbps; cl::NDRange globalSize, localSize; cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>(); int iters = devInfo.transferBWIters; Timer timer; cl_uint maxItems = devInfo.maxAllocSize / sizeof(float) / 2; cl_uint numItems; // Set an upper-limit for cpu devies if(devInfo.deviceType & CL_DEVICE_TYPE_CPU) { numItems = roundToPowOf2(maxItems, 26); } else { numItems = roundToPowOf2(maxItems); } float *arr = new float[numItems]; try { cl::Buffer clBuffer = cl::Buffer(ctx, (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), (numItems * sizeof(float))); cout << NEWLINE TAB TAB "Transfer bandwidth (GBPS)" << endl; cout << setprecision(2) << fixed; /////////////////////////////////////////////////////////////////////////// // enqueueWriteBuffer cout << TAB TAB TAB "enqueueWriteBuffer : "; cout.flush(); // Dummy warm-up queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { Timer timer; timer.start(); for(int i=0; i<iters; i++) { queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); } queue.finish(); timed = timer.stopAndTime(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// // enqueueReadBuffer cout << TAB TAB TAB "enqueueReadBuffer : "; cout.flush(); // Dummy warm-up queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * 
sizeof(float)), arr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { Timer timer; timer.start(); for(int i=0; i<iters; i++) { queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); } queue.finish(); timed = timer.stopAndTime(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// // enqueueMapBuffer cout << TAB TAB TAB "enqueueMapBuffer(for read) : "; cout.flush(); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float)), NULL, &timeEvent); queue.finish(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); timed += timeInUS(timeEvent); } } else { for(int i=0; i<iters; i++) { Timer timer; void *mapPtr; timer.start(); mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float))); queue.finish(); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// // memcpy from mapped ptr cout << TAB TAB TAB TAB "memcpy from mapped ptr : "; cout.flush(); queue.finish(); timed = 0; for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); memcpy(arr, mapPtr, (numItems * sizeof(float))); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// // enqueueUnmap cout << TAB TAB TAB 
"enqueueUnmap(after write) : "; cout.flush(); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); queue.enqueueUnmapMemObject(clBuffer, mapPtr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { for(int i=0; i<iters; i++) { Timer timer; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); timed += timer.stopAndTime(); } } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// // memcpy to mapped ptr cout << TAB TAB TAB TAB "memcpy to mapped ptr : "; cout.flush(); queue.finish(); timed = 0; for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); memcpy(mapPtr, arr, (numItems * sizeof(float))); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// } catch(cl::Error error) { cerr << error.what() << "(" << error.err() << ")" << endl; cerr << TAB TAB TAB "Tests skipped" << endl; if(arr) delete [] arr; return -1; } if(arr) delete [] arr; return 0; }
/**
 * Residual callback: evaluates the OpenCL kernel on the data of the PETSc
 * vectors Prim and dPrim_dt and stores the result in F.
 *
 * The raw arrays of the three vectors are wrapped as OpenCL buffers with
 * CL_MEM_USE_HOST_PTR, so the kernel works on the PETSc storage itself. The
 * kernel runs over an N1 x N2 global range with TILE_SIZE_X1 x TILE_SIZE_X2
 * work-groups.
 *
 * NOTE(review): the OpenCL status values stored in clErr are not inspected
 * here, as in the original code -- confirm whether they should be turned
 * into PETSc errors.
 *
 * @param ts        unused in this function
 * @param t         unused in this function
 * @param Prim      input vector (kernel argument 0)
 * @param dPrim_dt  input vector (kernel argument 1)
 * @param F         output vector (kernel argument 2)
 * @param ptr       unused in this function
 * @return 0 on success, or the PETSc error code propagated by CHKERRQ
 */
PetscErrorCode ComputeResidual(TS ts, PetscScalar t, Vec Prim, Vec dPrim_dt, Vec F, void *ptr)
{
    PetscErrorCode ierr;
    PetscScalar *prim, *dprim_dt, *f;

    // Get pointers to Petsc Vecs so that we can access the data.
    ierr = VecGetArray(Prim, &prim); CHKERRQ(ierr);
    ierr = VecGetArray(dPrim_dt, &dprim_dt); CHKERRQ(ierr);
    ierr = VecGetArray(F, &f); CHKERRQ(ierr);

    // OpenCL buffers.
    cl::Buffer primBuffer, dprimBuffer_dt, fbuffer;

    // Byte size of each vector. Kept as size_t (the type the OpenCL calls
    // take) so that it is not narrowed into PetscInt.
    const size_t size = DOF*N1*N2*sizeof(PetscScalar);

    // Create OpenCL buffers from the data pointers to Petsc Vecs.
    primBuffer = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY,
                            size, &(prim[0]), &clErr);
    dprimBuffer_dt = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY,
                                size, &(dprim_dt[0]), &clErr);
    fbuffer = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY,
                         size, &(f[0]), &clErr);

    // Set kernel args.
    clErr = kernel.setArg(0, primBuffer);
    clErr = kernel.setArg(1, dprimBuffer_dt);
    clErr = kernel.setArg(2, fbuffer);

    // Kernel launch parameters and execution.
    cl::NDRange global(N1, N2);
    cl::NDRange local(TILE_SIZE_X1, TILE_SIZE_X2);
    clErr = queue.enqueueNDRangeKernel(kernel, cl::NullRange, global, local, NULL, NULL);

    // Map the output buffer for reading so that the host memory behind F
    // holds the kernel's results. According to the original author this is
    // needed when the device is a GPU and costs nothing on a CPU for buffers
    // created with CL_MEM_USE_HOST_PTR.
    // The mapped pointer is kept in its own variable: `f` must stay the
    // pointer obtained from VecGetArray because it is handed back to
    // VecRestoreArray below.
    void *mappedF = queue.enqueueMapBuffer(fbuffer, CL_FALSE, CL_MAP_READ,
                                           0, size, NULL, NULL, &clErr);

    // Global sync point: the kernel and the (non-blocking) map are complete.
    clErr = queue.finish();

    // Every successful map needs a matching unmap before the buffer goes
    // away; the host memory keeps the contents it received from the map.
    if (mappedF != NULL)
    {
        clErr = queue.enqueueUnmapMemObject(fbuffer, mappedF, NULL, NULL);
        clErr = queue.finish();
    }

    // Restore the pointers.
    ierr = VecRestoreArray(Prim, &prim); CHKERRQ(ierr);
    ierr = VecRestoreArray(dPrim_dt, &dprim_dt); CHKERRQ(ierr);
    ierr = VecRestoreArray(F, &f); CHKERRQ(ierr);

    return(0);
}
int main() { try { std::vector<cl::Device> devices; // select platform cl::Platform platform = selectPlatform(); // select device platform.getDevices(CL_DEVICE_TYPE_ALL, &devices); cl::Device device = selectDevice(devices); // create context context = cl::Context(devices); // create command queue queue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE); // load opencl source std::ifstream cl_file("inclusive_scan.cl"); std::string cl_string{std::istreambuf_iterator<char>(cl_file), std::istreambuf_iterator<char>()}; cl::Program::Sources source(1, std::make_pair(cl_string.c_str(), cl_string.length() + 1)); // create programm program = cl::Program(context, source); // compile opencl source try { program.build(devices); size_t input_size; std::ifstream input_file("input.txt"); input_file >> input_size; std::vector<float> input(input_size); // for (size_t i = 0; i < input_size; ++i) { // input[i] = i % 10; // } for (int i = 0; i < input_size; i++) { input_file >> input[i]; } std::vector<float> output(input_size, 0); cl::Buffer dev_input (context, CL_MEM_READ_ONLY, sizeof(float) * input_size); queue.enqueueWriteBuffer(dev_input, CL_TRUE, 0, sizeof(float) * input_size, &input[0]); cl::Buffer dev_output = inclusive_scan(dev_input, input_size); queue.enqueueReadBuffer(dev_output, CL_TRUE, 0, sizeof(float) * input_size, &output[0]); queue.finish(); cpu_check(input, output); std::ofstream output_file("output.txt"); for (int i = 0; i < input_size; i++) { output_file << output[i] << " "; } } catch (cl::Error const & e) { std::string log_str = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device); std::cout << std::endl << e.what() << " : " << e.err() << std::endl; std::cout << log_str; return 0; } } catch (cl::Error const & e) { std::cout << "Error: " << e.what() << " #" << e.err() << std::endl; } return 0; }
void mainLoop( cl::CommandQueue& queue, cl::Context& context, cl::Kernel kernel, cl::Buffer clImgDesc, cl::Buffer clCamera ){ cl::Event eAcquire, eRelease, eExecute; cl_int err; glFinish(); checkGLErr( "glFinish()" ); queue.enqueueWriteBuffer( clImgDesc, CL_TRUE, 0, 1 * sizeof(ImageDescriptor), (const void*)&imgDesc); err = queue.enqueueAcquireGLObjects( vSharedUnits, NULL, &eAcquire ); checkErr(err, "CommandQueue::enqueueAcquireGLObjects()"); eAcquire.wait(); err = queue.enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(WIDTH, HEIGHT), cl::NullRange, NULL, &eExecute); checkErr(err, "CommandQueue::enqueueNDRangeKernel()"); //std::cout<<"Kernel executing"<< std::endl ; clock_t ti = clock(); eExecute.wait(); clock_t tf = clock(); queue.finish(); err = queue.enqueueReleaseGLObjects( vSharedUnits, NULL, &eRelease ); checkErr(err, "CommandQueue::enqueueReleaseGLObjects()"); eRelease.wait(); imgDesc.numSamples += SAMPLES; pAccumulator->glBind( GL_DRAW_FRAMEBUFFER ); checkGLErr( "glBind GL_DRAW_FRAMEBUFFER, Accumulator " ); pCLTarget->glBind( GL_READ_FRAMEBUFFER ); checkGLErr( "glBind GL_READ_FRAMEBUFFER, Main Target " ); glBlitFramebuffer( 0, 0, WIDTH, HEIGHT, 0, 0, WIDTH, HEIGHT, GL_COLOR_BUFFER_BIT, GL_NEAREST ); checkGLErr( "glBlitFramebuffer" ); glBindFramebuffer( GL_DRAW_FRAMEBUFFER, 0 ); checkGLErr( "glBind GL_DRAW_FRAMEBUFFER, 0 " ); pCLTarget->glBind( GL_READ_FRAMEBUFFER ); checkGLErr( "glBind GL_READ_FRAMEBUFFER, something " ); glBlitFramebuffer( 0, 0, WIDTH, HEIGHT, 0, 0, WIDTH, HEIGHT, GL_COLOR_BUFFER_BIT, GL_NEAREST ); checkGLErr( "glBlitFramebuffer" ); glfwPollEvents(); pCamera->glfwHandleCursor( ((float)(tf - ti))/(CLOCKS_PER_SEC * 1.0f) ); if( sceneChanged() ){ //printf("scene changed..!"); imgDesc.numSamples = 0; CLCamera* cam = pCamera->getCLCamera(); queue.enqueueWriteBuffer( clCamera, CL_TRUE, 0, 1 * sizeof(CLCamera), (const void*)cam ); delete cam; } glfwSwapBuffers( window ); checkGLErr( "glSwapBuffers" ); //Block for a while. 
//int i; //std::cin >> i; //float timeTaken = ( (float)(tf - ti) ) / (float)CLOCKS_PER_SEC; //std::cout<<"Time taken: "<< timeTaken * 1000 << "ms" << std::endl; //std::cout<<"Predicted FPS: "<< 1 / timeTaken << " FPS"<< std::endl; if( imgDesc.numSamples % 10 == 0 ) std::cout<<"numSamples: "<<imgDesc.numSamples<<std::endl; //handleFrameCounter(); }