void initParticles(uint32_t num_particles) { m_geom = gl::Geometry::create(); m_geom->setPrimitiveType(GL_POINTS); m_mesh = gl::Mesh::create(m_geom, m_pointMaterial); m_numParticles = num_particles; GLsizei numBytes = m_numParticles * sizeof(vec4); m_geom->vertices().resize(m_numParticles, vec3(0)); m_geom->colors().resize(m_numParticles, vec4(1)); m_geom->point_sizes().resize(m_numParticles, 9.f); m_geom->createGLBuffers(); m_mesh->material()->setPointSize(2.f); scene().addObject(m_mesh); try { // shared position buffer for OpenGL / OpenCL m_positions = cl::BufferGL(m_context, CL_MEM_READ_WRITE, m_geom->vertexBuffer().id()); m_colors = cl::BufferGL(m_context, CL_MEM_READ_WRITE, m_geom->colorBuffer().id()); //create the OpenCL only arrays m_velocities = cl::Buffer( m_context, CL_MEM_WRITE_ONLY, numBytes ); m_positionGen = cl::Buffer( m_context, CL_MEM_WRITE_ONLY, numBytes ); m_velocityGen = cl::Buffer( m_context, CL_MEM_WRITE_ONLY, numBytes ); vector<vec4> posGen, velGen; for (int i = 0; i < m_numParticles; i++) { posGen.push_back( vec4(glm::ballRand(20.0f), 1.f) ); vec2 tmp = glm::linearRand(vec2(-100), vec2(100)); float life = kinski::random(2.f, 5.f); float yVel = kinski::random<float>(5, 15); velGen.push_back(vec4(tmp.x, yVel, tmp.y, life)); m_geom->point_sizes()[i] = kinski::random(5.f, 15.f); } m_geom->createGLBuffers(); m_queue.enqueueWriteBuffer(m_velocities, CL_TRUE, 0, numBytes, &velGen[0]); m_queue.enqueueWriteBuffer(m_positionGen, CL_TRUE, 0, numBytes, &posGen[0]); m_queue.enqueueWriteBuffer(m_velocityGen, CL_TRUE, 0, numBytes, &velGen[0]); m_particleKernel.setArg(0, m_positions); m_particleKernel.setArg(1, m_colors); m_particleKernel.setArg(2, m_velocities); m_particleKernel.setArg(3, m_positionGen); m_particleKernel.setArg(4, m_velocityGen); } catch(cl::Error &error) { LOG_ERROR << error.what() << "(" << oclErrorString(error.err()) << ")"; } }
/**
 * Uploads the host-side elements to the device buffer with a blocking write.
 *
 * Writes m_cb bytes starting at m_pElts into m_buffer (offset 0) and returns
 * once the write call has returned.
 *
 * @param queue command queue the write is enqueued on
 * @throws cl::Error (CL_INVALID_MEM_OBJECT) when m_pElts is null
 */
void copyToDevice(cl::CommandQueue &queue)
{
    if (!m_pElts)
    {
        throw cl::Error(CL_INVALID_MEM_OBJECT, "copyToDevice - Buffer is not initialised.");
    }

    queue.enqueueWriteBuffer(m_buffer, CL_TRUE, 0, m_cb, m_pElts);
}
void simulationStep() { try { // copy auto buffer = cl::Buffer(context, CL_MEM_READ_ONLY, sizeof(unsigned char) * 4 * fieldWidth * fieldHeight, nullptr, nullptr); queue.enqueueWriteBuffer(buffer, CL_TRUE, 0, sizeof(unsigned char) * 4 * fieldWidth * fieldHeight, visualizationBufferCPU, NULL, NULL); // enque stepKernel.setArg(2, buffer); cl::NDRange global((size_t) (fieldWidth * fieldHeight)); queue.enqueueNDRangeKernel(stepKernel, cl::NullRange, global, cl::NullRange); // read back queue.enqueueReadBuffer(visualizationBufferGPU, CL_TRUE, 0, sizeof(unsigned char) * 4 * fieldWidth * fieldHeight, visualizationBufferCPU, NULL, NULL); // finish queue.finish(); } catch (cl::Error err) { std::cout << "Error: " << err.what() << "(" << err.err() << ")" << std::endl; exit(3); } glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, fieldWidth, fieldHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, visualizationBufferCPU); }
/**
 * Copies a fixed-size host array into a newly created read/write device buffer
 * and binds that buffer as kernel argument i.
 *
 * The write is enqueued with CL_FALSE (non-blocking), so this function can
 * return before the copy has happened.
 * NOTE(review): arg presumably has to stay alive until the queue has executed
 * the write -- confirm that all callers guarantee this.
 *
 * @param i      index of the kernel argument to set
 * @param arg    host array of N elements of type T
 * @param kernel kernel whose argument is set
 * @param quene  command queue used for the upload
 */
inline void OpenCL::addkernelarg(std::size_t i, T const (& arg)[N], cl::Kernel & kernel,cl::CommandQueue &quene) const
{
    std::size_t const bytes = N * sizeof(T);

    cl::Buffer devicebuffer(this->context, CL_MEM_READ_WRITE, bytes);
    quene.enqueueWriteBuffer(devicebuffer, CL_FALSE, 0, bytes, &arg);

    kernel.setArg(i, devicebuffer);
}
inline void OpenCL::addkernelarg(std::size_t i, std::vector<T> const & arg, cl::Kernel & kernel,cl::CommandQueue &quene) const { cl::Buffer buffer(this->context,CL_MEM_READ_WRITE,arg.size()*sizeof(T)); // std::cout << "enqeue\n"; quene.enqueueWriteBuffer(buffer,CL_FALSE,0,sizeof(T)*arg.size(),&(arg[0])); kernel.setArg(i,buffer); }
/**
 * Starts a non-blocking upload of the host-side elements to the device buffer.
 *
 * Enqueues a write of m_cb bytes from m_pElts into m_buffer with CL_FALSE and
 * returns the event associated with that write.
 *
 * @param queue command queue the write is enqueued on
 * @return event that identifies the enqueued write
 * @throws cl::Error (CL_INVALID_MEM_OBJECT) when m_pElts is null
 */
cl::Event copyToDeviceAsync(cl::CommandQueue &queue)
{
    if (!m_pElts)
    {
        throw cl::Error(CL_INVALID_MEM_OBJECT, "copyToDevice - Buffer is not initialised.");
    }

    cl::Event writeDone;
    queue.enqueueWriteBuffer(m_buffer, CL_FALSE, 0, m_cb, m_pElts, NULL, &writeDone);
    return writeDone;
}
/**
 * Writes `size` elements from host memory into the device buffer.
 *
 * Offsets and sizes are given in elements of T and converted to bytes here.
 * Nothing is enqueued when size is 0.
 *
 * @param q        command queue the write is enqueued on
 * @param offset   index of the first element to overwrite in the device buffer
 * @param size     number of elements to write
 * @param host     source pointer in host memory
 * @param blocking when true the write is enqueued as a blocking write (CL_TRUE)
 */
void write(const cl::CommandQueue &q, size_t offset, size_t size, const T *host, bool blocking = false) const
{
    // nothing to transfer
    if (size == 0)
    {
        return;
    }

    const size_t byteOffset = sizeof(T) * offset;
    const size_t byteCount = sizeof(T) * size;

    q.enqueueWriteBuffer(buffer, blocking ? CL_TRUE : CL_FALSE, byteOffset, byteCount, host);
}
/**
 * Recomputes the bind-to-current transforms and uploads them to transformBuffer.
 *
 * bindPose fills bindToCurrentTransforms with the offsets to currentPose; each
 * entry is then replaced by transpose(world * entry) and the whole array is
 * written to the device with a non-blocking write.
 *
 * @param _queue command queue the write is enqueued on
 */
void Skeleton::updateBuffer(cl::CommandQueue _queue) const
{
    bindPose->calculateOffsetTo(currentPose, bindToCurrentTransforms);

    // move every transform into world space and transpose it
    // (presumably to match the matrix layout the kernel expects -- TODO confirm)
    for (auto it = bindToCurrentTransforms.begin(); it != bindToCurrentTransforms.end(); ++it)
    {
        *it = glm::transpose(world * (*it));
    }

    const auto byteCount = sizeof(glm::mat4) * bindToCurrentTransforms.size();
    _queue.enqueueWriteBuffer(transformBuffer, false, 0, byteCount, bindToCurrentTransforms.data());
}
/**
 * Starts a non-blocking upload of the host-side elements to the device buffer
 * that waits for a prior event.
 *
 * Enqueues a write of m_cb bytes from m_pElts into m_buffer with CL_FALSE,
 * using `prior` as the only entry of the event wait list.
 *
 * @param queue command queue the write is enqueued on
 * @param prior event the write has to wait for
 * @return event that identifies the enqueued write
 * @throws cl::Error (CL_INVALID_MEM_OBJECT) when m_pElts is null
 */
cl::Event copyToDeviceAsync(cl::CommandQueue &queue, const cl::Event &prior)
{
    if (m_pElts == NULL)
        throw cl::Error(CL_INVALID_MEM_OBJECT, "copyToDevice - Buffer is not initialised.");

    cl::Event complete;
    std::vector<cl::Event> srcs(1, prior);

    // The wait list is taken by pointer (the overload without `prior` passes NULL
    // in this position), so the vector's address must be passed, not the vector.
    queue.enqueueWriteBuffer(m_buffer, CL_FALSE, 0, m_cb, m_pElts, &srcs, &complete);
    return complete;
}
/**
 * Fills the CPU-side field with a random pattern and uploads it to the device.
 *
 * Every cell occupies four consecutive entries of visualizationBufferCPU. The
 * first three are cleared and the fourth is set to (rand() % 5) * 255. The
 * buffer is then copied into visualizationBufferGPU with a blocking write.
 */
void randomizeField()
{
    const auto cellCount = fieldHeight * fieldWidth;

    for (int cell = 0; cell < cellCount; ++cell)
    {
        const int base = cell * 4;

        visualizationBufferCPU[base + 0] = 0;
        visualizationBufferCPU[base + 1] = 0;
        visualizationBufferCPU[base + 2] = 0;
        // NOTE(review): the value can exceed 255; if the elements are bytes it
        // wraps on assignment -- confirm this is intended.
        visualizationBufferCPU[base + 3] = (rand() % 5) * 255;
    }

    queue.enqueueWriteBuffer(visualizationBufferGPU,
                             CL_TRUE,
                             0,
                             sizeof(unsigned char) * 4 * fieldWidth * fieldHeight,
                             visualizationBufferCPU,
                             NULL,
                             NULL);
}
void OpenCL::addkernelarg(std::size_t i, std::vector<T> const & arg, cl::Kernel & kernel,std::vector<cl::Buffer> &outputbuffer,cl::CommandQueue &quene)const { cl::Buffer buffer(this->context,CL_MEM_READ_WRITE,arg.size()*sizeof(T)); outputbuffer.push_back(buffer); // std::cout << "enqeue\n"; cl_int err = quene.enqueueWriteBuffer(buffer,CL_FALSE,0,sizeof(T)*arg.size(),&(arg[0])); if (err){ std::cerr << "Error while pushing Vector. Errorcode: " << err << std::endl; } kernel.setArg(i,buffer); }
/**
 * Uploads this argument's host data to its device buffer when the argument is
 * flagged for host-to-device copying.
 *
 * Asserts that the buffer has been initialized. When myCopyTo is set, mySize
 * bytes from myPtr are written into myBuffer with a blocking write; otherwise
 * nothing happens.
 *
 * @param queue command queue the write is enqueued on
 */
void CLArgument::copyToDevice( cl::CommandQueue &queue )
{
    assert( myBufferInitialized );

    if ( myCopyTo )
    {
        queue.enqueueWriteBuffer( myBuffer, CL_TRUE, 0, mySize, myPtr );
    }
}
/** * initialize tinymt status in device global memory * using 1 parameter for all generators. *@param tinymt_status device global memories *@param total total number of work items *@param local_item number of local work items *@param seed_array seeds for initialization *@param seed_size size of seed_array */ static void initialize_by_array(Buffer& tinymt_status, int total, int local_item, uint64_t seed_array[], int seed_size) { #if defined(DEBUG) cout << "initialize_by_array start" << endl; #endif Buffer seed_array_buffer(context, CL_MEM_READ_WRITE, seed_size * sizeof(uint64_t)); queue.enqueueWriteBuffer(seed_array_buffer, CL_TRUE, 0, seed_size * sizeof(uint64_t), seed_array); Kernel init_kernel(program, "tinymt_init_array_kernel"); init_kernel.setArg(0, tinymt_status); init_kernel.setArg(1, seed_array_buffer); init_kernel.setArg(2, seed_size); NDRange global(total); NDRange local(local_item); Event event; queue.enqueueNDRangeKernel(init_kernel, NullRange, global, local, NULL, &event); double time = get_time(event); tinymt64j_t status[total]; queue.enqueueReadBuffer(tinymt_status, CL_TRUE, 0, sizeof(tinymt64j_t) * total, status); cout << "initializing time = " << time * 1000 << "ms" << endl; check_status(status, total); #if defined(DEBUG) cout << "initialize_by_array end" << endl; #endif }
void MetaBallsApp::updateMarching() { static const cl_int3 size{ VOLUME_WIDTH, VOLUME_HEIGHT, VOLUME_DEPTH }; mClCommandQueue.enqueueNDRangeKernel( mKernWriteClear, cl::NullRange, cl::NDRange( VOLUME_SIZE ) ); /* Update volumes */ mClCommandQueue.enqueueNDRangeKernel( mKernWriteMetaballs, cl::NullRange, cl::NDRange( VOLUME_SIZE ) ); /* End */ int zero = 0; auto kernelRange = (size.s[0]-1) * (size.s[1]-1) * (size.s[2]-1); mClCommandQueue.enqueueWriteBuffer( mClVertIndex, true, 0, sizeof(int), &zero ); mClCommandQueue.enqueueNDRangeKernel( mKernConstructSurface, cl::NullRange, cl::NDRange( kernelRange ) ); mClCommandQueue.enqueueReadBuffer( mClVertIndex, true, 0, sizeof(cl_int), &mMarchingVertsWritten ); /* Generate Normals */ if (mMarchingVertsWritten > 0) { bool smooth = true; if( ! smooth ) mClCommandQueue.enqueueNDRangeKernel( mKernGenNormals, cl::NullRange, cl::NDRange( mMarchingVertsWritten ) ); else mClCommandQueue.enqueueNDRangeKernel( mKernGenNormalsSmooth, cl::NullRange, cl::NDRange( mMarchingVertsWritten ) ); } //if( mDebugDraw ) mClCommandQueue.enqueueNDRangeKernel( mKernWritePointColorBack, cl::NullRange, cl::NDRange( VOLUME_SIZE ) ); }
int clPeak::runTransferBandwidthTest(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo) { if(!isTransferBW) return 0; float timed, gbps; cl::NDRange globalSize, localSize; cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>(); int iters = devInfo.transferBWIters; Timer timer; float *arr = NULL; cl_uint maxItems = devInfo.maxAllocSize / sizeof(float) / 2; cl_uint numItems; // Set an upper-limit for cpu devies if(devInfo.deviceType & CL_DEVICE_TYPE_CPU) { numItems = roundToPowOf2(maxItems, 26); } else { numItems = roundToPowOf2(maxItems); } try { arr = new float[numItems]; cl::Buffer clBuffer = cl::Buffer(ctx, (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), (numItems * sizeof(float))); log->print(NEWLINE TAB TAB "Transfer bandwidth (GBPS)" NEWLINE); log->xmlOpenTag("transfer_bandwidth"); log->xmlAppendAttribs("unit", "gbps"); /////////////////////////////////////////////////////////////////////////// // enqueueWriteBuffer log->print(TAB TAB TAB "enqueueWriteBuffer : "); // Dummy warm-up queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { Timer timer; timer.start(); for(int i=0; i<iters; i++) { queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); } queue.finish(); timed = timer.stopAndTime(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("enqueuewritebuffer", gbps); /////////////////////////////////////////////////////////////////////////// // enqueueReadBuffer log->print(TAB TAB TAB "enqueueReadBuffer : "); // Dummy warm-up queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; 
i++) { cl::Event timeEvent; queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { Timer timer; timer.start(); for(int i=0; i<iters; i++) { queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); } queue.finish(); timed = timer.stopAndTime(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("enqueuereadbuffer", gbps); /////////////////////////////////////////////////////////////////////////// // enqueueMapBuffer log->print(TAB TAB TAB "enqueueMapBuffer(for read) : "); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float)), NULL, &timeEvent); queue.finish(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); timed += timeInUS(timeEvent); } } else { for(int i=0; i<iters; i++) { Timer timer; void *mapPtr; timer.start(); mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float))); queue.finish(); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("enqueuemapbuffer", gbps); /////////////////////////////////////////////////////////////////////////// // memcpy from mapped ptr log->print(TAB TAB TAB TAB "memcpy from mapped ptr : "); queue.finish(); timed = 0; for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); memcpy(arr, mapPtr, (numItems * sizeof(float))); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } timed /= iters; gbps = 
((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("memcpy_from_mapped_ptr", gbps); /////////////////////////////////////////////////////////////////////////// // enqueueUnmap log->print(TAB TAB TAB "enqueueUnmap(after write) : "); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); queue.enqueueUnmapMemObject(clBuffer, mapPtr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { for(int i=0; i<iters; i++) { Timer timer; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); timed += timer.stopAndTime(); } } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("enqueueunmap", gbps); /////////////////////////////////////////////////////////////////////////// // memcpy to mapped ptr log->print(TAB TAB TAB TAB "memcpy to mapped ptr : "); queue.finish(); timed = 0; for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); memcpy(mapPtr, arr, (numItems * sizeof(float))); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("memcpy_to_mapped_ptr", gbps); /////////////////////////////////////////////////////////////////////////// log->xmlCloseTag(); // transfer_bandwidth if(arr) delete [] arr; } catch(cl::Error error) { stringstream ss; ss << error.what() << " (" << error.err() << ")" NEWLINE << TAB TAB TAB 
"Tests skipped" NEWLINE; log->print(ss.str()); if(arr) delete [] arr; return -1; } return 0; }
void PTWeekend::setup() { /* Scene data */ camera.lookAt(glm::vec3(-2,1,1), vec3(0, 0, -1.0f), vec3(0,1,0)); camera.setPerspective( 45.0f, getWindowAspectRatio(), 0.01f, 100.0f ); cameraUI = CameraUi(&camera, getWindow()); glm::vec3 bottom_sky_color(1.0, 1.0, 1.0); glm::vec3 top_sky_color(0.5, 0.7, 1.0); CGLContextObj glContext = CGLGetCurrentContext(); CGLShareGroupObj shareGroup = CGLGetShareGroup(glContext); GLuint imgTexName; const char* program_file_str = "../../../assets/path_tracing.cl"; /* Obtain a platform */ std::vector<cl::Platform> platforms; clStatus = cl::Platform::get(&platforms); pt_assert(clStatus, "Could not find an OpenCL platform."); /* Obtain a device and determinte max local size */ std::vector<cl::Device> devices; clStatus = platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices); pt_assert(clStatus, "Could not find a GPU device."); device = devices[0]; /* Create an OpenCL context for the device */ cl_context_properties properties[] = { CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, (cl_context_properties)shareGroup, 0 }; context = cl::Context({device}, properties, NULL, NULL, &clStatus); pt_assert(clStatus, "Could not create a context for device."); /* Load and build a program */ std::ifstream program_file(program_file_str); std::string program_str(std::istreambuf_iterator<char>(program_file), (std::istreambuf_iterator<char>())); cl::Program::Sources sources(1, std::make_pair(program_str.c_str(), program_str.length() + 1)); program = cl::Program(context, sources); clStatus = program.build({device}, "-I ../../../assets/ -cl-denorms-are-zero"); if (clStatus != CL_SUCCESS) { std::string log = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device); std::cerr << log << "\n"; exit(EXIT_FAILURE); } /* Create command queue */ cmd_queue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &clStatus); pt_assert(clStatus, "Could not create command queue"); /* create kernel and set the kernel arguments */ kernel = cl::Kernel(program, "path_tracing", 
&clStatus); pt_assert(clStatus, "Could not create kernel"); img_width = getWindowWidth(); img_height = getWindowHeight(); true_img_width = getWindowWidth(); true_img_height = getWindowHeight(); local_size = device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>(); //TODO: throws local_width = (size_t)pow(2, ceilf(log2f((floorf(sqrtf(local_size)))))); local_height = local_size / local_width; img_width = ceilf((float)img_width / (float)local_width) * local_width; img_height = ceilf((float)img_height / (float)local_height) * local_height; unsigned int samples = 16; /* Create GL texture and CL wrapper */ glGenTextures(1, &imgTexName); glBindTexture(GL_TEXTURE_2D, imgTexName); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, img_width, img_height, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0); glBindTexture(GL_TEXTURE_2D, 0); imgTex = gl::Texture2d::create(GL_TEXTURE_2D, imgTexName, img_width, img_height, true); glFinish(); img_buffer.push_back(cl::Image2DGL(context, CL_MEM_WRITE_ONLY, GL_TEXTURE_2D, 0, imgTexName, &clStatus)); pt_assert(clStatus, "Could not create buffer"); /* Create all buffers */ cam_buffer = cl::Buffer(context, CL_MEM_READ_ONLY, sizeof(cl_pinhole_cam), NULL, &clStatus); pt_assert(clStatus, "Could not create camera buffer"); primitive_buffer = cl::Buffer (context, CL_MEM_READ_ONLY, MAX_PRIMITIVES * sizeof(cl_sphere), NULL, &clStatus); pt_assert(clStatus, "Could not create primitive buffer"); material_buffer = cl::Buffer (context, CL_MEM_READ_ONLY, MAX_PRIMITIVES * sizeof(cl_material), NULL, &clStatus); pt_assert(clStatus, "Could not create primitive buffer"); sky_buffer = cl::Buffer(context, CL_MEM_READ_ONLY, sizeof(cl_sky_material), NULL, &clStatus); pt_assert(clStatus, "Could not create sky buffer"); /* Upload scene (static) */ 
size_t sceneObjectCount = 5; cl_sphere* primitive_array = (cl_sphere*)malloc(sceneObjectCount * sizeof(cl_sphere)); cl_material* material_array = (cl_material*)malloc(sceneObjectCount * sizeof(cl_material)); primitive_array[0] = cl_make_sphere(glm::vec3(1, 0, -1), 0.5f); material_array[0] = cl_make_material(pt::ColorHex_to_RGBfloat<float>("0x730202"), 0, MAT_LAMBERTIAN); primitive_array[1] = cl_make_sphere(glm::vec3(-1, 0, -1), 0.5f); material_array[1] = cl_make_material(pt::ColorHex_to_RGBfloat<float>("0xF89000"), 0, MAT_LAMBERTIAN); primitive_array[2] = cl_make_sphere(glm::vec3(0, 0, 0), 0.5f); material_array[2] = cl_make_material(pt::ColorHex_to_RGBfloat<float>("0x97A663"), 0.1f, MAT_METALLIC); primitive_array[3] = cl_make_sphere(glm::vec3(0, 0, -2), 0.5f); material_array[3] = cl_make_material(glm::vec3(0.8f, 0.6f, 0.2f), 0.3f, MAT_METALLIC); primitive_array[4] = cl_make_sphere(glm::vec3(0,-100.5f, 1.0f), 100.0f); material_array[4] = cl_make_material(glm::vec3(0.5f), 0, MAT_LAMBERTIAN); clStatus = cmd_queue.enqueueWriteBuffer(primitive_buffer, CL_TRUE, 0, sceneObjectCount * sizeof(cl_sphere), primitive_array, NULL, NULL); pt_assert(clStatus, "Could not fill primitive buffer"); clStatus = cmd_queue.enqueueWriteBuffer(material_buffer, CL_TRUE, 0, sceneObjectCount * sizeof(cl_material), material_array, NULL, NULL); pt_assert(clStatus, "Could not fill material buffer"); pt_assert(cl_set_skycolors(bottom_sky_color, top_sky_color, sky_buffer, cmd_queue), "Could not fill sky buffer"); clStatus = kernel.setArg(1, primitive_buffer); pt_assert(clStatus, "Could not set primitive buffer argument"); clStatus = kernel.setArg(2, material_buffer); pt_assert(clStatus, "Could not set material buffer argument"); clStatus = kernel.setArg(3, sky_buffer); pt_assert(clStatus, "Could not set sky buffer argument"); clStatus = kernel.setArg(4, sceneObjectCount); pt_assert(clStatus, "Could not set primitive count count argument"); clStatus = kernel.setArg(5, img_buffer[0]); 
pt_assert(clStatus, "Could not set img buffer argument"); clStatus = kernel.setArg(6, samples); pt_assert(clStatus, "Could not set samples argument"); clStatus = kernel.setArg(0, cam_buffer); pt_assert(clStatus, "Could not set camera buffer argument"); }
/*
 * Runs the watershed segmentation on the device.
 *
 * Five kernels from the "Watershed" program are executed on `queue`, in this
 * order: descent, increment, minima, plateau and flood. The minima, plateau
 * and flood kernels are repeated until the value read back from a one-int
 * device counter no longer changes between two consecutive iterations.
 *
 * Parameters:
 *   width, height - image dimensions passed to the kernels
 *   src           - device buffer passed to the descent and plateau kernels
 *   labeled       - device buffer passed to every kernel
 *   cache         - supplies the compiled program (built with -DBLOCK_SIZE=<BLOCK_SIZE>)
 *   queue         - command queue all work is enqueued on
 *
 * With OPENCL_PROFILE defined, per-kernel times are collected from event
 * profiling info into the watershed_*_kernel_time variables (divided by
 * TIME_DIVISOR at the end), and the span from the first buffer write to the
 * last counter read is passed to setLastExecutionTime().
 * With DEBUG_PRINT defined, launch sizes and iteration counts are printed.
 */
void watershed(int width, int height, cl::Buffer& src, cl::Buffer& labeled, ProgramCache& cache, cl::CommandQueue& queue) {
#ifdef OPENCL_PROFILE
    watershed_descent_kernel_time = 0;
    watershed_increment_kernel_time = 0;
    watershed_minima_kernel_time = 0;
    watershed_plateau_kernel_time = 0;
    watershed_flood_kernel_time = 0;
#endif

    cl::Context context = queue.getInfo<CL_QUEUE_CONTEXT>();

    // BLOCK_SIZE is handed to the OpenCL compiler as a preprocessor definition
    std::stringstream params_stream;
    params_stream << "-DBLOCK_SIZE=";
    params_stream << BLOCK_SIZE;

    std::string program_params = params_stream.str();

    cl::Program& program = cache.getProgram("Watershed", program_params);

    cl::Kernel descent_kernel(program, "descent_kernel");
    cl::Kernel increment_kernel(program, "increment_kernel");
    cl::Kernel minima_kernel(program, "minima_kernel");
    cl::Kernel plateau_kernel(program, "plateau_kernel");
    cl::Kernel flood_kernel(program, "flood_kernel");

    // upload the neighbourhood offset tables into read-only device buffers
    cl::Buffer cl_neighbourhood_x = cl::Buffer(context,CL_MEM_READ_ONLY, sizeof(neighbourhood_x));
    cl::Buffer cl_neighbourhood_y = cl::Buffer(context, CL_MEM_READ_ONLY, sizeof(neighbourhood_y));

#ifdef OPENCL_PROFILE
    // first_event marks the start of the overall profiled time span
    cl::Event first_event;
    queue.enqueueWriteBuffer(cl_neighbourhood_x, CL_TRUE, 0, sizeof(neighbourhood_x), neighbourhood_x, __null, &first_event);
#else
    queue.enqueueWriteBuffer(cl_neighbourhood_x, CL_TRUE, 0, sizeof(neighbourhood_x), neighbourhood_x);
#endif
    queue.enqueueWriteBuffer(cl_neighbourhood_y, CL_TRUE, 0, sizeof(neighbourhood_y), neighbourhood_y);

    //const size_t block_size = 6;
    //cl::LocalSpaceArg local_mem = cl::__local(block_size * block_size * sizeof(float));
    // local memory argument: one float per work-item of a BLOCK_SIZE x BLOCK_SIZE group
    cl::LocalSpaceArg local_mem = cl::__local(BLOCK_SIZE * BLOCK_SIZE * sizeof(float));

    // setting args for descent_kernel
    descent_kernel.setArg(0, src);
    descent_kernel.setArg(1, labeled);
    descent_kernel.setArg(2, cl_neighbourhood_x);
    descent_kernel.setArg(3, cl_neighbourhood_y);
    descent_kernel.setArg(4, local_mem);
    descent_kernel.setArg(5, width);
    descent_kernel.setArg(6, height);

    // Global size: one BLOCK_SIZE-wide group per (BLOCK_SIZE - 2) pixels, plus one extra group.
    // (presumably each group has a one-pixel border on every side -- TODO confirm against the kernel)
    size_t global_width = (width / (BLOCK_SIZE - 2) + 1) * BLOCK_SIZE;
    size_t global_height = (height / (BLOCK_SIZE - 2) + 1) * BLOCK_SIZE;

#ifdef DEBUG_PRINT
    std::cout << "global width=" << global_width << " global height=" << global_height << std::endl;
#endif

    cl::NDRange global(global_width, global_height);
    cl::NDRange local(BLOCK_SIZE, BLOCK_SIZE);

    // status of the most recent kernel enqueue; only printed under DEBUG_PRINT
    cl_int status;

#ifdef OPENCL_PROFILE
    {
        VECTOR_CLASS<cl::Event> events_vector(1);
        status = queue.enqueueNDRangeKernel(descent_kernel, cl::NullRange, global, local, __null, &events_vector[0]);

        cl::WaitForEvents(events_vector);

        cl::Event& event = events_vector[0];

        cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
        cl_ulong end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
        cl_ulong total_time = end - start;

        watershed_descent_kernel_time = total_time;
    }
#else
    status = queue.enqueueNDRangeKernel(descent_kernel, cl::NullRange, global, local);
#endif

#ifdef DEBUG_PRINT
    std::cout << "kernel execution " << status << std::endl;
#endif

    // queue.flush();
    // queue.enqueueBarrier();

    /* PREPARING INCREMENT KERNEL */

    increment_kernel.setArg(0, labeled);
    increment_kernel.setArg(1, width);
    increment_kernel.setArg(2, height);

#ifdef OPENCL_PROFILE
    {
        VECTOR_CLASS<cl::Event> events_vector(1);
        status = queue.enqueueNDRangeKernel(increment_kernel, cl::NullRange, global, local, __null, &events_vector[0]);

        cl::WaitForEvents(events_vector);

        cl::Event& event = events_vector[0];

        cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
        cl_ulong end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
        cl_ulong total_time = end - start;

        watershed_increment_kernel_time = total_time;
    }
#else
    status = queue.enqueueNDRangeKernel(increment_kernel, cl::NullRange, global, local);
#endif

    // queue.enqueueBarrier();

    /* PREPARING MINIMA KERNEL */

    // device-side counter, zeroed here and again before the plateau and flood stages
    int counter_tmp = 0;
    cl::Buffer counter(context, CL_MEM_READ_WRITE, sizeof(int));
    queue.enqueueWriteBuffer(counter, CL_TRUE, 0, sizeof(int), &counter_tmp);

    queue.enqueueBarrier();

    minima_kernel.setArg(0, counter);
    minima_kernel.setArg(1, labeled);
    minima_kernel.setArg(2, cl_neighbourhood_x);
    minima_kernel.setArg(3, cl_neighbourhood_y);
    minima_kernel.setArg(4, local_mem);
    minima_kernel.setArg(5, width);
    minima_kernel.setArg(6, height);

    // old_val / new_val start out different so the loop body runs at least once;
    // c counts the iterations (only reported under DEBUG_PRINT)
    int old_val = -1;
    int new_val = -2;
    int c = 0;

    // repeat the kernel until the counter read back equals the previous read
    while(old_val != new_val) {
        old_val = new_val;
#ifdef OPENCL_PROFILE
        {
            VECTOR_CLASS<cl::Event> events_vector(1);
            status = queue.enqueueNDRangeKernel(minima_kernel, cl::NullRange, global, local, __null, &events_vector[0]);

            cl::WaitForEvents(events_vector);

            cl::Event& event = events_vector[0];

            cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
            cl_ulong end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
            cl_ulong total_time = end - start;

            watershed_minima_kernel_time += total_time;
        }
#else
        status = queue.enqueueNDRangeKernel(minima_kernel, cl::NullRange, global, local);
#endif
        queue.enqueueReadBuffer(counter, CL_TRUE, 0, sizeof(int), &new_val);
        c++;
    }

#ifdef DEBUG_PRINT
    std::cout << "step 2: " << c << " iterations" << std::endl;
#endif

    /* PREPARING PLATEAU KERNEL */

    queue.enqueueWriteBuffer(counter, CL_TRUE, 0, sizeof(int), &counter_tmp);

    queue.enqueueBarrier();

    plateau_kernel.setArg(0, counter);
    plateau_kernel.setArg(1, src);
    plateau_kernel.setArg(2, labeled);
    plateau_kernel.setArg(3, cl_neighbourhood_x);
    plateau_kernel.setArg(4, cl_neighbourhood_y);
    plateau_kernel.setArg(5, local_mem);
    plateau_kernel.setArg(6, width);
    plateau_kernel.setArg(7, height);

    old_val = -1;
    new_val = -2;
    c = 0;

#ifdef OPENCL_PROFILE
    watershed_plateau_kernel_time = 0;
#endif

    // same convergence loop as for the minima kernel
    while(old_val != new_val) {
        old_val = new_val;
#ifdef OPENCL_PROFILE
        {
            VECTOR_CLASS<cl::Event> events_vector(1);
            status = queue.enqueueNDRangeKernel(plateau_kernel, cl::NullRange, global, local, __null, &events_vector[0]);

            cl::WaitForEvents(events_vector);

            cl::Event& event = events_vector[0];

            cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
            cl_ulong end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
            cl_ulong total_time = end - start;

            watershed_plateau_kernel_time += total_time;
        }
#else
        status = queue.enqueueNDRangeKernel(plateau_kernel, cl::NullRange, global, local);
#endif
        queue.enqueueReadBuffer(counter, CL_TRUE, 0, sizeof(int), &new_val);
        c++;
    }

#ifdef DEBUG_PRINT
    std::cout << "step 3: " << c << " iterations" << std::endl;
#endif

    // preparing flood kernel

    queue.enqueueWriteBuffer(counter, CL_TRUE, 0, sizeof(int), &counter_tmp);

    queue.enqueueBarrier();

    flood_kernel.setArg(0, counter);
    flood_kernel.setArg(1, labeled);
    flood_kernel.setArg(2, width);
    flood_kernel.setArg(3, height);

    old_val = -1;
    new_val = -2;
    c = 0;

    // the flood kernel uses its own launch geometry: 16x16 work-groups and a
    // global size of ((dim - 1) / 16 + 2) groups per dimension
    int new_block_size = 16;
    local = cl::NDRange(new_block_size, new_block_size);
    int n_width = ((width - 1) / new_block_size + 2) * new_block_size;
    int n_height = ((height - 1) / new_block_size + 2) * new_block_size;
    global = cl::NDRange(n_width, n_height);

#ifdef DEBUG_PRINT
    std::cout << "flood kernel invocation params:" << std::endl;
    std::cout << "local: " << local[0] << ", " << local[1] << std::endl;
    std::cout << "global: " << global[0] << ", " << global[1] << std::endl;
#endif

#ifdef OPENCL_PROFILE
    // last_event marks the end of the overall profiled time span
    cl::Event last_event;
#endif

#ifdef OPENCL_PROFILE
    watershed_flood_kernel_time = 0;
#endif

    // same convergence loop as above; under OPENCL_PROFILE the counter read also
    // records last_event
    while(old_val != new_val) {
        old_val = new_val;
#ifdef OPENCL_PROFILE
        {
            VECTOR_CLASS<cl::Event> events_vector(1);
            status = queue.enqueueNDRangeKernel(flood_kernel, cl::NullRange, global, local, __null, &events_vector[0]);

            cl::WaitForEvents(events_vector);

            cl::Event& event = events_vector[0];

            cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
            cl_ulong end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
            cl_ulong total_time = end - start;

            watershed_flood_kernel_time += total_time;
        }
#else
        status = queue.enqueueNDRangeKernel(flood_kernel, cl::NullRange, global, local);
#endif

#ifdef OPENCL_PROFILE
        queue.enqueueReadBuffer(counter, CL_TRUE, 0, sizeof(int), &new_val, __null, &last_event);
#else
        queue.enqueueReadBuffer(counter, CL_TRUE, 0, sizeof(int), &new_val);
#endif
        c++;
    }

#ifdef OPENCL_PROFILE
    watershed_descent_kernel_time /= TIME_DIVISOR;
    watershed_increment_kernel_time /= TIME_DIVISOR;
    watershed_minima_kernel_time /= TIME_DIVISOR;
    watershed_plateau_kernel_time /= TIME_DIVISOR;
    watershed_flood_kernel_time /= TIME_DIVISOR;
#endif

#ifdef DEBUG_PRINT
    std::cout << "step 4: " << c << " iterations" << std::endl;
#endif

#ifdef OPENCL_PROFILE
    // overall time: start of the first buffer write to end of the last counter read
    last_event.wait();

    cl_ulong start = first_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
    cl_ulong end = last_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
    cl_ulong total_time = end - start;
    setLastExecutionTime(total_time/TIME_DIVISOR);
#endif
}
void mainLoop( cl::CommandQueue& queue, cl::Context& context, cl::Kernel kernel, cl::Buffer clImgDesc, cl::Buffer clCamera ){ cl::Event eAcquire, eRelease, eExecute; cl_int err; glFinish(); checkGLErr( "glFinish()" ); queue.enqueueWriteBuffer( clImgDesc, CL_TRUE, 0, 1 * sizeof(ImageDescriptor), (const void*)&imgDesc); err = queue.enqueueAcquireGLObjects( vSharedUnits, NULL, &eAcquire ); checkErr(err, "CommandQueue::enqueueAcquireGLObjects()"); eAcquire.wait(); err = queue.enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(WIDTH, HEIGHT), cl::NullRange, NULL, &eExecute); checkErr(err, "CommandQueue::enqueueNDRangeKernel()"); //std::cout<<"Kernel executing"<< std::endl ; clock_t ti = clock(); eExecute.wait(); clock_t tf = clock(); queue.finish(); err = queue.enqueueReleaseGLObjects( vSharedUnits, NULL, &eRelease ); checkErr(err, "CommandQueue::enqueueReleaseGLObjects()"); eRelease.wait(); imgDesc.numSamples += SAMPLES; pAccumulator->glBind( GL_DRAW_FRAMEBUFFER ); checkGLErr( "glBind GL_DRAW_FRAMEBUFFER, Accumulator " ); pCLTarget->glBind( GL_READ_FRAMEBUFFER ); checkGLErr( "glBind GL_READ_FRAMEBUFFER, Main Target " ); glBlitFramebuffer( 0, 0, WIDTH, HEIGHT, 0, 0, WIDTH, HEIGHT, GL_COLOR_BUFFER_BIT, GL_NEAREST ); checkGLErr( "glBlitFramebuffer" ); glBindFramebuffer( GL_DRAW_FRAMEBUFFER, 0 ); checkGLErr( "glBind GL_DRAW_FRAMEBUFFER, 0 " ); pCLTarget->glBind( GL_READ_FRAMEBUFFER ); checkGLErr( "glBind GL_READ_FRAMEBUFFER, something " ); glBlitFramebuffer( 0, 0, WIDTH, HEIGHT, 0, 0, WIDTH, HEIGHT, GL_COLOR_BUFFER_BIT, GL_NEAREST ); checkGLErr( "glBlitFramebuffer" ); glfwPollEvents(); pCamera->glfwHandleCursor( ((float)(tf - ti))/(CLOCKS_PER_SEC * 1.0f) ); if( sceneChanged() ){ //printf("scene changed..!"); imgDesc.numSamples = 0; CLCamera* cam = pCamera->getCLCamera(); queue.enqueueWriteBuffer( clCamera, CL_TRUE, 0, 1 * sizeof(CLCamera), (const void*)cam ); delete cam; } glfwSwapBuffers( window ); checkGLErr( "glSwapBuffers" ); //Block for a while. 
//int i; //std::cin >> i; //float timeTaken = ( (float)(tf - ti) ) / (float)CLOCKS_PER_SEC; //std::cout<<"Time taken: "<< timeTaken * 1000 << "ms" << std::endl; //std::cout<<"Predicted FPS: "<< 1 / timeTaken << " FPS"<< std::endl; if( imgDesc.numSamples % 10 == 0 ) std::cout<<"numSamples: "<<imgDesc.numSamples<<std::endl; //handleFrameCounter(); }
int main() { try { std::vector<cl::Device> devices; // select platform cl::Platform platform = selectPlatform(); // select device platform.getDevices(CL_DEVICE_TYPE_ALL, &devices); cl::Device device = selectDevice(devices); // create context context = cl::Context(devices); // create command queue queue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE); // load opencl source std::ifstream cl_file("inclusive_scan.cl"); std::string cl_string{std::istreambuf_iterator<char>(cl_file), std::istreambuf_iterator<char>()}; cl::Program::Sources source(1, std::make_pair(cl_string.c_str(), cl_string.length() + 1)); // create programm program = cl::Program(context, source); // compile opencl source try { program.build(devices); size_t input_size; std::ifstream input_file("input.txt"); input_file >> input_size; std::vector<float> input(input_size); // for (size_t i = 0; i < input_size; ++i) { // input[i] = i % 10; // } for (int i = 0; i < input_size; i++) { input_file >> input[i]; } std::vector<float> output(input_size, 0); cl::Buffer dev_input (context, CL_MEM_READ_ONLY, sizeof(float) * input_size); queue.enqueueWriteBuffer(dev_input, CL_TRUE, 0, sizeof(float) * input_size, &input[0]); cl::Buffer dev_output = inclusive_scan(dev_input, input_size); queue.enqueueReadBuffer(dev_output, CL_TRUE, 0, sizeof(float) * input_size, &output[0]); queue.finish(); cpu_check(input, output); std::ofstream output_file("output.txt"); for (int i = 0; i < input_size; i++) { output_file << output[i] << " "; } } catch (cl::Error const & e) { std::string log_str = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device); std::cout << std::endl << e.what() << " : " << e.err() << std::endl; std::cout << log_str; return 0; } } catch (cl::Error const & e) { std::cout << "Error: " << e.what() << " #" << e.err() << std::endl; } return 0; }
/// Exercises magnet::CL::radixSortAMD<T> twice: once sorting keys only and
/// once sorting keys together with a cl_uint payload array.
///
/// The keys are generated in descending order (size-1 ... 0) and the payload
/// as 0 ... size-1, so after a correct key+data sort the payload must come
/// back reversed. Key ordering is judged by testOutput(), defined elsewhere
/// in this file.
///
/// @param context OpenCL context the buffers are created in.
/// @param queue   Command queue used for the sort and the transfers.
/// @return true if ANY of the three checks failed, false if all passed.
bool runTestType(cl::Context context, cl::CommandQueue queue)
{
  cl_uint size = (1 << 16) + 16384;

  std::vector<T> input(size);

  std::cout << "##Testing AMD radix sort for " << input.size()
            << " elements and type "
            << magnet::CL::detail::traits<T>::kernel_type();

  for(size_t i = 0; i < input.size(); ++i)
    input[i] = input.size() - i - 1;

  // create input buffer using pinned memory
  cl::Buffer bufferIn(context,
                      CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE,
                      sizeof(T) * input.size(), &input[0]);

  magnet::CL::radixSortAMD<T> radixSortFunctor;
  radixSortFunctor.build(queue, context);

  // Pass 1: keys only, sorted in place.
  radixSortFunctor(bufferIn, bufferIn);

  std::vector<T> output(size);
  queue.enqueueReadBuffer(bufferIn, CL_TRUE, 0, input.size() * sizeof(T), &output[0]);

  bool failed = !testOutput(input, output);

  std::cout << " key(only) " << (failed ? "FAILED" : "PASSED") << ", ";

  //Now test with some data!
  //Refresh the input array
  queue.enqueueWriteBuffer(bufferIn, CL_TRUE, 0, input.size() * sizeof(T), &input[0]);

  //Write a data array
  std::vector<cl_uint> data(size);
  for(size_t i = 0; i < input.size(); ++i)
    data[i] = i;

  cl::Buffer dataIn(context,
                    CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE,
                    sizeof(cl_uint) * data.size(), &data[0]);

  // Pass 2: keys and payload, both sorted in place.
  radixSortFunctor(bufferIn, dataIn, bufferIn, dataIn);

  // Fetch the keys produced by THIS pass; without this read the check below
  // would merely re-test the host copy left over from pass 1.
  queue.enqueueReadBuffer(bufferIn, CL_TRUE, 0, input.size() * sizeof(T), &output[0]);
  queue.enqueueReadBuffer(dataIn, CL_TRUE, 0, data.size() * sizeof(cl_uint), &data[0]);

  bool keyfail = !testOutput(input, output);

  std::cout << " key " << (keyfail ? "FAILED" : "PASSED");

  // Descending keys paired with ascending payload: a correct sort reverses
  // the payload.
  bool datafail = false;
  for(size_t i = 0; i < input.size(); ++i)
    if (data[i] != input.size() - 1 - i)
      datafail = true;

  std::cout << " data " << (datafail ? "FAILED" : "PASSED") << std::endl;

  return failed || keyfail || datafail;
}
/// Runs the generic magnet::CL::sort<T> functor on 64*256 descending keys,
/// first on the keys alone and then on the keys with a cl_uint payload, and
/// prints PASSED/FAILED for each stage together with the algorithm the
/// functor reports via getMode().
///
/// @param context OpenCL context the buffers are created in.
/// @param queue   Command queue used for the sort and the transfers.
/// @return true if any stage failed, false otherwise.
bool runTestType(cl::Context context, cl::CommandQueue queue)
{
  const cl_uint elementCount = 64 * 256;

  // Keys: elementCount-1 down to 0.
  std::vector<T> keys(elementCount);
  for (size_t idx = 0; idx != keys.size(); ++idx)
    keys[idx] = keys.size() - idx - 1;

  // create input buffer using pinned memory
  cl::Buffer keyBuffer(context,
                       CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE,
                       sizeof(T) * keys.size(), &keys[0]);

  magnet::CL::sort<T> sortFunctor;
  sortFunctor.build(queue, context);
  sortFunctor(keyBuffer);

  // Banner: the opening text is emitted before the mode is inspected, so it
  // is already on the stream if the mode turns out to be unknown.
  std::cout << "##Testing generic sort (";

  const char *algorithmName = "";
  switch (sortFunctor.getMode())
    {
    case magnet::CL::sort<T>::CPU:
      algorithmName = "HeapSort";
      break;
    case magnet::CL::sort<T>::NVIDIA:
      algorithmName = "radixNVIDIA";
      break;
    case magnet::CL::sort<T>::AMD:
      algorithmName = "radixAMD";
      break;
    default:
      M_throw() << "Could not determine which sorting algorithm is being used";
    }

  std::cout << algorithmName;
  std::cout << ") for " << keys.size()
            << " elements and type "
            << magnet::CL::detail::traits<T>::kernel_type();

  // Stage 1: verify the key-only sort.
  std::vector<T> sortedKeys(elementCount);
  queue.enqueueReadBuffer(keyBuffer, CL_TRUE, 0, keys.size() * sizeof(T), &sortedKeys[0]);

  const bool keyOnlyFailed = !testOutput(keys, sortedKeys);
  std::cout << " key(only) ";
  std::cout << (keyOnlyFailed ? "FAILED" : "PASSED");
  std::cout << ", ";

  // Stage 2: restore the unsorted keys on the device and attach a payload
  // holding each element's original position.
  queue.enqueueWriteBuffer(keyBuffer, CL_TRUE, 0, keys.size() * sizeof(T), &keys[0]);

  std::vector<cl_uint> payload(elementCount);
  for (size_t idx = 0; idx != keys.size(); ++idx)
    payload[idx] = idx;

  cl::Buffer payloadBuffer(context,
                           CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE,
                           sizeof(cl_uint) * payload.size(), &payload[0]);

  sortFunctor(keyBuffer, payloadBuffer);

  queue.enqueueReadBuffer(payloadBuffer, CL_TRUE, 0,
                          payload.size() * sizeof(cl_uint), &payload[0]);

  // The key check for this stage is switched off; it always reports PASSED.
  const bool keyWithDataFailed = false;
  std::cout << " key " << (keyWithDataFailed ? "FAILED" : "PASSED");

  // Descending keys with ascending payload: sorted payload must be reversed.
  bool payloadFailed = false;
  for (size_t idx = 0; idx < keys.size() && !payloadFailed; ++idx)
    payloadFailed = (payload[idx] != keys.size() - 1 - idx);

  std::cout << " data " << (payloadFailed ? "FAILED" : "PASSED") << std::endl;

  return keyOnlyFailed || keyWithDataFailed || payloadFailed;
}
int clPeak::runTransferBandwidthTest(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo) { if(!isTransferBW) return 0; float timed, gbps; cl::NDRange globalSize, localSize; cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>(); int iters = devInfo.transferBWIters; Timer timer; cl_uint maxItems = devInfo.maxAllocSize / sizeof(float) / 2; cl_uint numItems; // Set an upper-limit for cpu devies if(devInfo.deviceType & CL_DEVICE_TYPE_CPU) { numItems = roundToPowOf2(maxItems, 26); } else { numItems = roundToPowOf2(maxItems); } float *arr = new float[numItems]; try { cl::Buffer clBuffer = cl::Buffer(ctx, (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), (numItems * sizeof(float))); cout << NEWLINE TAB TAB "Transfer bandwidth (GBPS)" << endl; cout << setprecision(2) << fixed; /////////////////////////////////////////////////////////////////////////// // enqueueWriteBuffer cout << TAB TAB TAB "enqueueWriteBuffer : "; cout.flush(); // Dummy warm-up queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { Timer timer; timer.start(); for(int i=0; i<iters; i++) { queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); } queue.finish(); timed = timer.stopAndTime(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// // enqueueReadBuffer cout << TAB TAB TAB "enqueueReadBuffer : "; cout.flush(); // Dummy warm-up queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * 
sizeof(float)), arr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { Timer timer; timer.start(); for(int i=0; i<iters; i++) { queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); } queue.finish(); timed = timer.stopAndTime(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// // enqueueMapBuffer cout << TAB TAB TAB "enqueueMapBuffer(for read) : "; cout.flush(); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float)), NULL, &timeEvent); queue.finish(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); timed += timeInUS(timeEvent); } } else { for(int i=0; i<iters; i++) { Timer timer; void *mapPtr; timer.start(); mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float))); queue.finish(); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// // memcpy from mapped ptr cout << TAB TAB TAB TAB "memcpy from mapped ptr : "; cout.flush(); queue.finish(); timed = 0; for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); memcpy(arr, mapPtr, (numItems * sizeof(float))); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// // enqueueUnmap cout << TAB TAB TAB 
"enqueueUnmap(after write) : "; cout.flush(); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); queue.enqueueUnmapMemObject(clBuffer, mapPtr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { for(int i=0; i<iters; i++) { Timer timer; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); timed += timer.stopAndTime(); } } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// // memcpy to mapped ptr cout << TAB TAB TAB TAB "memcpy to mapped ptr : "; cout.flush(); queue.finish(); timed = 0; for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); memcpy(mapPtr, arr, (numItems * sizeof(float))); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// } catch(cl::Error error) { cerr << error.what() << "(" << error.err() << ")" << endl; cerr << TAB TAB TAB "Tests skipped" << endl; if(arr) delete [] arr; return -1; } if(arr) delete [] arr; return 0; }