// Map the whole device buffer into host memory for read/write access.
// The returned mapped_array owns the mapping and releases it through
// buffer_unmapper when it goes out of scope.
mapped_array map(const cl::CommandQueue &q) {
    return mapped_array(
            static_cast<T*>(q.enqueueMapBuffer(
                buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
                0, size() * sizeof(T))),
            buffer_unmapper(q, buffer));
}
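The map() above hands the mapped pointer to a buffer_unmapper deleter that is not shown in this snippet. A minimal sketch of the shape it would need, assuming mapped_array invokes the deleter with the mapped host pointer when it is destroyed (the real type in the source library may differ):

// Hypothetical sketch of the deleter assumed by map() above.
struct buffer_unmapper {
    cl::CommandQueue queue;
    cl::Buffer       buffer;

    buffer_unmapper(const cl::CommandQueue &q, const cl::Buffer &b)
        : queue(q), buffer(b) {}

    // Invoked by mapped_array when the mapping is released.
    void operator()(void *ptr) const {
        queue.enqueueUnmapMemObject(buffer, ptr);
        queue.finish();
    }
};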
reference_base(Container &rhs, difference_type index,
               difference_type range, cl::CommandQueue queue)
    : container(rhs), index(index), range(range), queue(queue)
{
    cl_int status = CL_SUCCESS;

    // Should we throw, or clamp the mapping to container.size()?
    // Mapping `range` elements starting at `index` touches bytes up to
    // (index + range) * sizeof(value_type), so the bound must be <=,
    // not < (the original assert rejected mapping the full container).
    assert(index + range <= container.size());

    host_buffer = reinterpret_cast<naked_pointer>(
        queue.enqueueMapBuffer(container.data(), CL_TRUE,
                               CL_MAP_READ | CL_MAP_WRITE,
                               index * sizeof(value_type),
                               range * sizeof(value_type),
                               NULL, NULL, &status));

    CLSPARSE_V(status, "Mapping device buffer on host failed");
}
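The constructor above owns the mapping, so a matching unmap is expected when the reference is destroyed. A minimal sketch of such a destructor, assuming the member names used above (queue, container, host_buffer); the actual implementation in the source library may differ:

~reference_base()
{
    // Release the host mapping acquired in the constructor; any pending
    // writes through host_buffer are flushed back to the device buffer.
    queue.enqueueUnmapMemObject(container.data(), host_buffer);
    queue.finish();
}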
int ShowU8ImageGPUBuffer(cl::CommandQueue command_queue, cl::Buffer buffer,
                         int width, int height)
{
    // The host only reads the pixels here, so map with CL_MAP_READ
    // (the original mapped with CL_MAP_WRITE, which signals the wrong
    // intent and forces an unnecessary write-back on unmap).
    unsigned char *cpu = static_cast<unsigned char *>(
        command_queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_READ,
                                       0, width * height));

    // Wrap the mapped pointer in an OpenCV Mat without copying.
    Mat img_to_show(height, width, CV_8U, cpu);
    ResizeImage(img_to_show, width, height);
    imshow("u8", img_to_show);
    waitKey();

    command_queue.enqueueUnmapMemObject(buffer, cpu);
    return (0);
}
NumList mathViaOpenCL(cl::Context& context, cl::CommandQueue& queue,
                      cl::Kernel& kernel, cl::KernelFunctor& vadd,
                      const std::vector<Type>& vec1,
                      const std::vector<Type>& vec2)
{
    const cl_mem_flags ROFlags = CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR;
    const cl_mem_flags RWFlags = CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR;
    // Use the element type actually stored in the vectors; the original
    // mixed sizeof(double) with float* casts on std::vector<Type>.
    const size_t byteSize = vec1.size() * sizeof(Type);

    // Create buffers for vec1 and vec2 and copy the host contents.
    cl::Buffer vec1Buffer(context, ROFlags, byteSize, const_cast<Type*>(vec1.data()));
    cl::Buffer vec2Buffer(context, ROFlags, byteSize, const_cast<Type*>(vec2.data()));

    // Create a buffer for the result vector on top of its host storage.
    NumList result(vec1.size());
    cl::Buffer resBuffer(context, RWFlags, byteSize, &result[0]);

    // Run the OpenCL kernel via the functor.
    vadd(vec1Buffer, vec2Buffer, resBuffer);

    // Sync the OpenCL buffer with the host buffer: a blocking map with
    // CL_MAP_READ pulls the device results into `result`, and the matching
    // unmap releases the mapping (the original discarded the mapped
    // pointer and never unmapped).
    void *mapped = queue.enqueueMapBuffer(resBuffer, CL_TRUE, CL_MAP_READ, 0, byteSize);
    queue.enqueueUnmapMemObject(resBuffer, mapped);
    queue.finish();

    return result;
}
int clPeak::runTransferBandwidthTest(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo)
{
  if (!isTransferBW)
    return 0;

  float timed, gbps;
  cl::NDRange globalSize, localSize;
  cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>();
  int iters = devInfo.transferBWIters;
  Timer timer;
  float *arr = NULL;

  cl_uint maxItems = devInfo.maxAllocSize / sizeof(float) / 2;
  cl_uint numItems;

  // Set an upper limit for CPU devices
  if (devInfo.deviceType & CL_DEVICE_TYPE_CPU) {
    numItems = roundToPowOf2(maxItems, 26);
  } else {
    numItems = roundToPowOf2(maxItems);
  }

  try {
    arr = new float[numItems];
    cl::Buffer clBuffer = cl::Buffer(ctx, (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), (numItems * sizeof(float)));

    log->print(NEWLINE TAB TAB "Transfer bandwidth (GBPS)" NEWLINE);
    log->xmlOpenTag("transfer_bandwidth");
    log->xmlAppendAttribs("unit", "gbps");

    ///////////////////////////////////////////////////////////////////////////
    // enqueueWriteBuffer
    log->print(TAB TAB TAB "enqueueWriteBuffer : ");

    // Dummy warm-up
    queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr);
    queue.finish();

    timed = 0;
    if (useEventTimer) {
      // Time each transfer via event profiling
      for (int i = 0; i < iters; i++) {
        cl::Event timeEvent;
        queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr, NULL, &timeEvent);
        queue.finish();
        timed += timeInUS(timeEvent);
      }
    } else {
      // Host-side wall-clock timing over the whole loop
      Timer timer;
      timer.start();
      for (int i = 0; i < iters; i++) {
        queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr);
      }
      queue.finish();
      timed = timer.stopAndTime();
    }
    timed /= iters;
    // timed is in microseconds: bytes / us / 1e3 == GB/s
    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
    log->print(gbps);
    log->print(NEWLINE);
    log->xmlRecord("enqueuewritebuffer", gbps);

    ///////////////////////////////////////////////////////////////////////////
    // enqueueReadBuffer
    log->print(TAB TAB TAB "enqueueReadBuffer : ");

    // Dummy warm-up
    queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr);
    queue.finish();

    timed = 0;
    if (useEventTimer) {
      for (int i = 0; i < iters; i++) {
        cl::Event timeEvent;
        queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr, NULL, &timeEvent);
        queue.finish();
        timed += timeInUS(timeEvent);
      }
    } else {
      Timer timer;
      timer.start();
      for (int i = 0; i < iters; i++) {
        queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr);
      }
      queue.finish();
      timed = timer.stopAndTime();
    }
    timed /= iters;
    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
    log->print(gbps);
    log->print(NEWLINE);
    log->xmlRecord("enqueuereadbuffer", gbps);

    ///////////////////////////////////////////////////////////////////////////
    // enqueueMapBuffer
    log->print(TAB TAB TAB "enqueueMapBuffer(for read) : ");
    queue.finish();

    timed = 0;
    if (useEventTimer) {
      for (int i = 0; i < iters; i++) {
        cl::Event timeEvent;
        void *mapPtr;
        mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float)), NULL, &timeEvent);
        queue.finish();
        queue.enqueueUnmapMemObject(clBuffer, mapPtr);
        queue.finish();
        timed += timeInUS(timeEvent);
      }
    } else {
      // Time only the map; the unmap happens outside the timed region
      for (int i = 0; i < iters; i++) {
        Timer timer;
        void *mapPtr;
        timer.start();
        mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float)));
        queue.finish();
        timed += timer.stopAndTime();
        queue.enqueueUnmapMemObject(clBuffer, mapPtr);
        queue.finish();
      }
    }
    timed /= iters;
    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
    log->print(gbps);
    log->print(NEWLINE);
    log->xmlRecord("enqueuemapbuffer", gbps);

    ///////////////////////////////////////////////////////////////////////////
    // memcpy from mapped ptr
    log->print(TAB TAB TAB TAB "memcpy from mapped ptr : ");
    queue.finish();

    timed = 0;
    for (int i = 0; i < iters; i++) {
      void *mapPtr;
      mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float)));
      queue.finish();
      timer.start();
      memcpy(arr, mapPtr, (numItems * sizeof(float)));
      timed += timer.stopAndTime();
      queue.enqueueUnmapMemObject(clBuffer, mapPtr);
      queue.finish();
    }
    timed /= iters;
    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
    log->print(gbps);
    log->print(NEWLINE);
    log->xmlRecord("memcpy_from_mapped_ptr", gbps);

    ///////////////////////////////////////////////////////////////////////////
    // enqueueUnmap
    log->print(TAB TAB TAB "enqueueUnmap(after write) : ");
    queue.finish();

    timed = 0;
    if (useEventTimer) {
      for (int i = 0; i < iters; i++) {
        cl::Event timeEvent;
        void *mapPtr;
        mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float)));
        queue.finish();
        queue.enqueueUnmapMemObject(clBuffer, mapPtr, NULL, &timeEvent);
        queue.finish();
        timed += timeInUS(timeEvent);
      }
    } else {
      for (int i = 0; i < iters; i++) {
        Timer timer;
        void *mapPtr;
        mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float)));
        queue.finish();
        timer.start();
        queue.enqueueUnmapMemObject(clBuffer, mapPtr);
        queue.finish();
        timed += timer.stopAndTime();
      }
    }
    timed /= iters;
    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
    log->print(gbps);
    log->print(NEWLINE);
    log->xmlRecord("enqueueunmap", gbps);

    ///////////////////////////////////////////////////////////////////////////
    // memcpy to mapped ptr
    log->print(TAB TAB TAB TAB "memcpy to mapped ptr : ");
    queue.finish();

    timed = 0;
    for (int i = 0; i < iters; i++) {
      void *mapPtr;
      mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float)));
      queue.finish();
      timer.start();
      memcpy(mapPtr, arr, (numItems * sizeof(float)));
      timed += timer.stopAndTime();
      queue.enqueueUnmapMemObject(clBuffer, mapPtr);
      queue.finish();
    }
    timed /= iters;
    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
    log->print(gbps);
    log->print(NEWLINE);
    log->xmlRecord("memcpy_to_mapped_ptr", gbps);

    ///////////////////////////////////////////////////////////////////////////

    log->xmlCloseTag();     // transfer_bandwidth

    if (arr)
      delete[] arr;
  } catch (cl::Error &error) {
    stringstream ss;
    ss << error.what() << " (" << error.err() << ")" NEWLINE
       << TAB TAB TAB "Tests skipped" NEWLINE;
    log->print(ss.str());
    if (arr)
      delete[] arr;
    return -1;
  }

  return 0;
}
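The event-timer paths in the benchmark above (and in its iostream variant below) rely on a timeInUS helper. A plausible sketch, assuming the queue was created with CL_QUEUE_PROFILING_ENABLE so the event carries profiling counters; clPeak's actual helper may differ in detail:

// Returns the device-side duration of an event in microseconds.
static float timeInUS(cl::Event &timeEvent)
{
  cl_ulong start = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>();
  cl_ulong end   = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>();
  return (float)(end - start) / 1000.0f;   // profiling counters are in ns
}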
// Variant of the same benchmark that prints with iostream instead of the logger.
int clPeak::runTransferBandwidthTest(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo)
{
  if (!isTransferBW)
    return 0;

  float timed, gbps;
  cl::NDRange globalSize, localSize;
  cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>();
  int iters = devInfo.transferBWIters;
  Timer timer;

  cl_uint maxItems = devInfo.maxAllocSize / sizeof(float) / 2;
  cl_uint numItems;

  // Set an upper limit for CPU devices
  if (devInfo.deviceType & CL_DEVICE_TYPE_CPU) {
    numItems = roundToPowOf2(maxItems, 26);
  } else {
    numItems = roundToPowOf2(maxItems);
  }

  float *arr = new float[numItems];

  try {
    cl::Buffer clBuffer = cl::Buffer(ctx, (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), (numItems * sizeof(float)));

    cout << NEWLINE TAB TAB "Transfer bandwidth (GBPS)" << endl;
    cout << setprecision(2) << fixed;

    ///////////////////////////////////////////////////////////////////////////
    // enqueueWriteBuffer
    cout << TAB TAB TAB "enqueueWriteBuffer : ";
    cout.flush();

    // Dummy warm-up
    queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr);
    queue.finish();

    timed = 0;
    if (useEventTimer) {
      for (int i = 0; i < iters; i++) {
        cl::Event timeEvent;
        queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr, NULL, &timeEvent);
        queue.finish();
        timed += timeInUS(timeEvent);
      }
    } else {
      Timer timer;
      timer.start();
      for (int i = 0; i < iters; i++) {
        queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr);
      }
      queue.finish();
      timed = timer.stopAndTime();
    }
    timed /= iters;
    // timed is in microseconds: bytes / us / 1e3 == GB/s
    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
    cout << gbps << endl;

    ///////////////////////////////////////////////////////////////////////////
    // enqueueReadBuffer
    cout << TAB TAB TAB "enqueueReadBuffer : ";
    cout.flush();

    // Dummy warm-up
    queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr);
    queue.finish();

    timed = 0;
    if (useEventTimer) {
      for (int i = 0; i < iters; i++) {
        cl::Event timeEvent;
        queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr, NULL, &timeEvent);
        queue.finish();
        timed += timeInUS(timeEvent);
      }
    } else {
      Timer timer;
      timer.start();
      for (int i = 0; i < iters; i++) {
        queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr);
      }
      queue.finish();
      timed = timer.stopAndTime();
    }
    timed /= iters;
    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
    cout << gbps << endl;

    ///////////////////////////////////////////////////////////////////////////
    // enqueueMapBuffer
    cout << TAB TAB TAB "enqueueMapBuffer(for read) : ";
    cout.flush();
    queue.finish();

    timed = 0;
    if (useEventTimer) {
      for (int i = 0; i < iters; i++) {
        cl::Event timeEvent;
        void *mapPtr;
        mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float)), NULL, &timeEvent);
        queue.finish();
        queue.enqueueUnmapMemObject(clBuffer, mapPtr);
        queue.finish();
        timed += timeInUS(timeEvent);
      }
    } else {
      for (int i = 0; i < iters; i++) {
        Timer timer;
        void *mapPtr;
        timer.start();
        mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float)));
        queue.finish();
        timed += timer.stopAndTime();
        queue.enqueueUnmapMemObject(clBuffer, mapPtr);
        queue.finish();
      }
    }
    timed /= iters;
    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
    cout << gbps << endl;

    ///////////////////////////////////////////////////////////////////////////
    // memcpy from mapped ptr
    cout << TAB TAB TAB TAB "memcpy from mapped ptr : ";
    cout.flush();
    queue.finish();

    timed = 0;
    for (int i = 0; i < iters; i++) {
      void *mapPtr;
      mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float)));
      queue.finish();
      timer.start();
      memcpy(arr, mapPtr, (numItems * sizeof(float)));
      timed += timer.stopAndTime();
      queue.enqueueUnmapMemObject(clBuffer, mapPtr);
      queue.finish();
    }
    timed /= iters;
    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
    cout << gbps << endl;

    ///////////////////////////////////////////////////////////////////////////
    // enqueueUnmap
    cout << TAB TAB TAB "enqueueUnmap(after write) : ";
    cout.flush();
    queue.finish();

    timed = 0;
    if (useEventTimer) {
      for (int i = 0; i < iters; i++) {
        cl::Event timeEvent;
        void *mapPtr;
        mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float)));
        queue.finish();
        queue.enqueueUnmapMemObject(clBuffer, mapPtr, NULL, &timeEvent);
        queue.finish();
        timed += timeInUS(timeEvent);
      }
    } else {
      for (int i = 0; i < iters; i++) {
        Timer timer;
        void *mapPtr;
        mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float)));
        queue.finish();
        timer.start();
        queue.enqueueUnmapMemObject(clBuffer, mapPtr);
        queue.finish();
        timed += timer.stopAndTime();
      }
    }
    timed /= iters;
    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
    cout << gbps << endl;

    ///////////////////////////////////////////////////////////////////////////
    // memcpy to mapped ptr
    cout << TAB TAB TAB TAB "memcpy to mapped ptr : ";
    cout.flush();
    queue.finish();

    timed = 0;
    for (int i = 0; i < iters; i++) {
      void *mapPtr;
      mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float)));
      queue.finish();
      timer.start();
      memcpy(mapPtr, arr, (numItems * sizeof(float)));
      timed += timer.stopAndTime();
      queue.enqueueUnmapMemObject(clBuffer, mapPtr);
      queue.finish();
    }
    timed /= iters;
    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
    cout << gbps << endl;

    ///////////////////////////////////////////////////////////////////////////
  } catch (cl::Error &error) {
    cerr << error.what() << "(" << error.err() << ")" << endl;
    cerr << TAB TAB TAB "Tests skipped" << endl;
    if (arr)
      delete[] arr;
    return -1;
  }

  if (arr)
    delete[] arr;
  return 0;
}
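Both benchmark variants also depend on a roundToPowOf2 helper that is not shown. A hedged sketch consistent with how it is used above, reading the second argument as a 2^n cap on the item count (the real clPeak helper may be implemented differently):

// Round `value` down to the largest power of two that does not exceed it;
// if maxPower >= 0, additionally cap the result at 2^maxPower.
static cl_uint roundToPowOf2(cl_uint value, int maxPower = -1)
{
  cl_uint result = 1;
  while ((result << 1) != 0 && (result << 1) <= value)
    result <<= 1;
  if (maxPower >= 0 && result > (1u << maxPower))
    result = (1u << maxPower);
  return result;
}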
PetscErrorCode ComputeResidual(TS ts, PetscScalar t, Vec Prim, Vec dPrim_dt, Vec F, void *ptr)
{
    // context, queue, kernel and clErr are assumed to be file-scope state.
    PetscErrorCode ierr;
    PetscScalar *prim, *dprim_dt, *f;

    // Get pointers to the PETSc Vecs so that we can access the data.
    ierr = VecGetArray(Prim, &prim); CHKERRQ(ierr);
    ierr = VecGetArray(dPrim_dt, &dprim_dt); CHKERRQ(ierr);
    ierr = VecGetArray(F, &f); CHKERRQ(ierr);

    // OpenCL buffers.
    cl::Buffer primBuffer, dprimBuffer_dt, fbuffer;
    PetscInt size = DOF*N1*N2*sizeof(PetscScalar);

    // Create zero-copy OpenCL buffers on top of the PETSc Vec storage.
    primBuffer = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY,
                            size, &(prim[0]), &clErr);
    dprimBuffer_dt = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY,
                                size, &(dprim_dt[0]), &clErr);
    fbuffer = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY,
                         size, &(f[0]), &clErr);

    // Set kernel args.
    clErr = kernel.setArg(0, primBuffer);
    clErr = kernel.setArg(1, dprimBuffer_dt);
    clErr = kernel.setArg(2, fbuffer);

    // Kernel launch parameters and execution.
    cl::NDRange global(N1, N2);
    cl::NDRange local(TILE_SIZE_X1, TILE_SIZE_X2);
    clErr = queue.enqueueNDRangeKernel(kernel, cl::NullRange, global, local, NULL, NULL);

    // The following buffer mapping is not needed when running on a CPU, but
    // it is needed when the OpenCL device executing the kernel is a GPU, in
    // order to sync the results back to host memory. For CPU devices the map
    // is essentially free when the buffer was created with
    // CL_MEM_USE_HOST_PTR, as above. Note that VecRestoreArray() must be
    // given the pointer returned by VecGetArray(), so the mapped pointer is
    // kept in its own variable (the original overwrote f with it) and the
    // mapping is released once the queue has drained.
    void *mapped = queue.enqueueMapBuffer(fbuffer, CL_FALSE, CL_MAP_READ, 0, size,
                                          NULL, NULL, &clErr);

    // Global sync point: ensures the kernel and the (non-blocking) map
    // have completed before the host touches the data.
    clErr = queue.finish();

    // Release the mapping before handing the arrays back to PETSc
    // (the original never unmapped).
    clErr = queue.enqueueUnmapMemObject(fbuffer, mapped);
    clErr = queue.finish();

    // Restore the pointers.
    ierr = VecRestoreArray(Prim, &prim); CHKERRQ(ierr);
    ierr = VecRestoreArray(dPrim_dt, &dprim_dt); CHKERRQ(ierr);
    ierr = VecRestoreArray(F, &f); CHKERRQ(ierr);

    return(0);
}
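The comment inside ComputeResidual describes why the map/unmap pair matters for CL_MEM_USE_HOST_PTR buffers. A self-contained sketch of that zero-copy round trip, with a hypothetical element count and a kernel assumed to take the buffer as its first argument:

#include <CL/cl.hpp>   // legacy C++ wrapper, as used in the examples above
#include <vector>

void zeroCopyRoundTrip(cl::Context &context, cl::CommandQueue &queue,
                       cl::Kernel &kernel)
{
    const size_t n = 1024;                 // hypothetical element count
    std::vector<float> host(n, 0.0f);

    // Device buffer aliasing the host vector: no separate host copy.
    cl::Buffer buf(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE,
                   n * sizeof(float), host.data());

    kernel.setArg(0, buf);
    queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(n),
                               cl::NullRange);

    // A blocking map guarantees host[] holds the kernel's output...
    void *p = queue.enqueueMapBuffer(buf, CL_TRUE, CL_MAP_READ, 0,
                                     n * sizeof(float));
    // ... read host[] here ...

    // ...and the unmap returns ownership of the region to the device.
    queue.enqueueUnmapMemObject(buf, p);
    queue.finish();
}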