bool runTestType(cl::Context context, cl::CommandQueue queue) { cl_uint size = 1024 * 2 + 15; std::vector<T> input(size); std::cout << "##Testing scan for " << input.size() << " elements and type " << magnet::CL::detail::traits<T>::kernel_type(); for(size_t i = 0; i < input.size(); ++i) input[i] = i+1; // create input buffer using pinned memory cl::Buffer bufferIn(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sizeof(T) * input.size(), &input[0]) ; magnet::CL::scan<T> scanFunctor; scanFunctor.build(queue, context); scanFunctor(bufferIn, bufferIn); std::vector<T> output(size); queue.enqueueReadBuffer(bufferIn, CL_TRUE, 0, input.size() * sizeof(T), &output[0]); bool failed = !testOutput(input, output); std::cout << (failed ? " FAILED" : " PASSED") << std::endl; return failed; }
void simulationStep() {
    try {
        // Copy the host-side visualization data into a device buffer.
        auto buffer = cl::Buffer(context, CL_MEM_READ_ONLY,
                                 sizeof(unsigned char) * 4 * fieldWidth * fieldHeight, nullptr, nullptr);
        queue.enqueueWriteBuffer(buffer, CL_TRUE, 0,
                                 sizeof(unsigned char) * 4 * fieldWidth * fieldHeight,
                                 visualizationBufferCPU, NULL, NULL);

        // Enqueue the simulation step kernel over the whole field.
        stepKernel.setArg(2, buffer);
        cl::NDRange global((size_t) (fieldWidth * fieldHeight));
        queue.enqueueNDRangeKernel(stepKernel, cl::NullRange, global, cl::NullRange);

        // Read the result back to the host.
        queue.enqueueReadBuffer(visualizationBufferGPU, CL_TRUE, 0,
                                sizeof(unsigned char) * 4 * fieldWidth * fieldHeight,
                                visualizationBufferCPU, NULL, NULL);

        // Wait until all queued commands have completed.
        queue.finish();
    } catch (const cl::Error &err) {
        std::cout << "Error: " << err.what() << "(" << err.err() << ")" << std::endl;
        exit(3);
    }
    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, fieldWidth, fieldHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, visualizationBufferCPU);
}
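// The step above allocates and fills a new cl::Buffer on every call. Assuming the field
// dimensions are fixed after initialization, the staging buffer could instead be created once
// and reused. A minimal sketch of that variant (all names here are function parameters or
// local assumptions, not the original globals):
#include <CL/cl.hpp>

void stepWithPersistentBuffer(cl::CommandQueue &queue, cl::Kernel &stepKernel,
                              cl::Buffer &staging, cl::Buffer &resultBuffer,
                              unsigned char *hostPixels, size_t width, size_t height)
{
    const size_t bytes = 4 * width * height; // RGBA, one byte per channel

    // Upload the current host image into the persistent staging buffer.
    queue.enqueueWriteBuffer(staging, CL_TRUE, 0, bytes, hostPixels);

    // The argument could equally be bound once at initialization time.
    stepKernel.setArg(2, staging);
    queue.enqueueNDRangeKernel(stepKernel, cl::NullRange, cl::NDRange(width * height), cl::NullRange);

    // Blocking read-back; no separate finish() is needed before the host uses the pixels.
    queue.enqueueReadBuffer(resultBuffer, CL_TRUE, 0, bytes, hostPixels);
}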
void copyFromDevice(cl::CommandQueue &queue) { if(m_pElts==NULL) throw cl::Error(CL_INVALID_MEM_OBJECT, "copyFromDevice - Buffer is not initialised."); queue.enqueueReadBuffer(m_buffer, CL_TRUE, 0, m_cb, m_pElts); }
void runTestType(cl::Context context, cl::CommandQueue queue) { cl_uint size = 2 << 10; std::vector<T> input(size); std::cout << "##Testing bitonic sort for " << input.size() << " elements and type " << magnet::CL::detail::traits<T>::kernel_type() << std::endl; for(size_t i = 0; i < input.size(); ++i) input[i] = input.size() - i - 1; // create input buffer using pinned memory cl::Buffer bufferIn(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sizeof(T) * input.size(), &input[0]) ; magnet::CL::bitonicSort<T> bitonicSortFunctor; bitonicSortFunctor.build(queue, context); bitonicSortFunctor(bufferIn); std::vector<T> output(size); queue.enqueueReadBuffer(bufferIn, CL_TRUE, 0, input.size() * sizeof(T), &output[0]); if (!testOutput(input, output)) M_throw() << "Incorrect output for size " << input.size() << " and type " << magnet::CL::detail::traits<T>::kernel_type(); }
void read(const cl::CommandQueue &q, size_t offset, size_t size, T *host, bool blocking = false) const { if (size) q.enqueueReadBuffer( buffer, blocking ? CL_TRUE : CL_FALSE, sizeof(T) * offset, sizeof(T) * size, host ); }
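// Because the read() helper above defaults to a non-blocking transfer, the host memory is
// only safe to touch after an explicit synchronization point. A minimal usage sketch of that
// calling pattern (the queue, buffer and element count are assumed inputs, not taken from the
// original class):
#include <vector>
#include <CL/cl.hpp>

std::vector<float> readBackWhenNeeded(const cl::CommandQueue &queue, const cl::Buffer &buf, size_t n)
{
    std::vector<float> host(n);

    // Non-blocking read: the call returns before the copy has necessarily finished.
    queue.enqueueReadBuffer(buf, CL_FALSE, 0, n * sizeof(float), &host[0]);

    // Unrelated host work could overlap with the transfer here.

    // The contents of 'host' are only valid once the queue has drained.
    queue.finish();
    return host;
}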
cl::Event copyFromDeviceAsync(cl::CommandQueue &queue) { if(m_pElts==NULL) throw cl::Error(CL_INVALID_MEM_OBJECT, "copyFromDevice - Buffer is not initialised."); cl::Event complete; queue.enqueueReadBuffer(m_buffer, CL_FALSE, 0, m_cb, m_pElts, NULL, &complete); return complete; }
cl::Event copyFromDeviceAsync(cl::CommandQueue &queue, cl::Event &prior) { if(m_pElts==NULL) throw cl::Error(CL_INVALID_MEM_OBJECT, "copyFromDevice - Buffer is not initialised."); cl::Event complete; std::vector<cl::Event> srcs; srcs.push_back(prior); queue.enqueueReadBuffer(m_buffer, CL_FALSE, 0, m_cb, m_pElts, &srcs, &complete); return complete; }
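// The copyFromDeviceAsync() overloads above return the read's completion event and can take a
// prior event as a dependency. A self-contained sketch of how such an event chain is typically
// used with the plain cl:: API (kernel, queue and buffer are assumed inputs; this is not the
// original class):
#include <vector>
#include <CL/cl.hpp>

std::vector<float> readAfterKernel(cl::CommandQueue &queue, cl::Kernel &kernel,
                                   cl::Buffer &deviceBuf, size_t n)
{
    // Launch the kernel and capture its completion event.
    cl::Event kernelDone;
    queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(n), cl::NullRange, NULL, &kernelDone);

    // Make the non-blocking read depend on the kernel's completion.
    std::vector<cl::Event> deps(1, kernelDone);
    std::vector<float> host(n);
    cl::Event readDone;
    queue.enqueueReadBuffer(deviceBuf, CL_FALSE, 0, n * sizeof(float), &host[0], &deps, &readDone);

    // Other host work may overlap with the transfer; wait only when the data is needed.
    readDone.wait();
    return host;
}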
void helper(uint32_t* out, int osize, uint8_t* in, int isize, int w, int h, int choice) { int set_size=8; try { cl::Buffer bufferIn = cl::Buffer(gContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, isize*sizeof(cl_uchar), in, NULL); cl::Buffer bufferOut = cl::Buffer(gContext, CL_MEM_READ_WRITE, osize*sizeof(cl_uchar4)); cl::Buffer bufferOut2= cl::Buffer(gContext, CL_MEM_READ_WRITE, osize*sizeof(cl_uchar4)); gNV21Kernel.setArg(2,w); gNV21Kernel.setArg(3,h); gNV21Kernel.setArg(1,bufferIn); gNV21Kernel.setArg(0,bufferOut); gQueue.enqueueNDRangeKernel(gNV21Kernel, cl::NullRange, cl::NDRange( (int)ceil((float)w/16.0f)*16,(int)ceil((float)h/16.0f)*16), cl::NDRange(set_size,set_size), NULL, NULL); if (choice==1) { gLaplacianK.setArg(2,w); gLaplacianK.setArg(3,h); gLaplacianK.setArg(1,bufferOut); gLaplacianK.setArg(0,bufferOut2); gQueue.enqueueNDRangeKernel(gLaplacianK, cl::NullRange, cl::NDRange( (int)ceil((float)w/16.0f)*16,(int)ceil((float)h/16.0f)*16), cl::NDRange(set_size,set_size), NULL, NULL); } else if (choice>1) { gNegative.setArg(2,w); gNegative.setArg(3,h); gNegative.setArg(1,bufferOut); gNegative.setArg(0,bufferOut2); gQueue.enqueueNDRangeKernel(gNegative, cl::NullRange, cl::NDRange( (int)ceil((float)w/16.0f)*16,(int)ceil((float)h/16.0f)*16), cl::NDRange(set_size,set_size), NULL, NULL); } gQueue.enqueueReadBuffer(bufferOut2, CL_TRUE, 0, osize*sizeof(cl_uchar4), out); } catch (cl::Error e) { LOGI("@oclDecoder: %s %d \n",e.what(),e.err()); } }
/** * generate 64 bit unsigned random numbers in device global memory *@param tinymt_status device global memories *@param total_num total number of work items *@param local_num number of local work items *@param data_size number of data to generate */ static void generate_uint64(Buffer& tinymt_status, int total_num, int local_num, int data_size) { #if defined(DEBUG) cout << "generate_uint64 start" << endl; #endif int min_size = total_num; if (data_size % min_size != 0) { data_size = (data_size / min_size + 1) * min_size; } Kernel uint_kernel(program, "tinymt_uint64_kernel"); Buffer output_buffer(context, CL_MEM_READ_WRITE, data_size * sizeof(uint64_t)); uint_kernel.setArg(0, tinymt_status); uint_kernel.setArg(1, output_buffer); uint_kernel.setArg(2, data_size / total_num); NDRange global(total_num); NDRange local(local_num); Event generate_event; #if defined(DEBUG) cout << "generate_uint64 enque kernel start" << endl; #endif queue.enqueueNDRangeKernel(uint_kernel, NullRange, global, local, NULL, &generate_event); uint64_t * output = new uint64_t[data_size]; generate_event.wait(); queue.enqueueReadBuffer(output_buffer, CL_TRUE, 0, data_size * sizeof(uint64_t), output); check_data(output, data_size, total_num); #if defined(DEBUG) print_uint64(output, data_size, total_num); #endif double time = get_time(generate_event); cout << "generate time:" << time * 1000 << "ms" << endl; delete[] output; #if defined(DEBUG) cout << "generate_uint64 end" << endl; #endif }
void CLArgument::copyFromDevice( cl::CommandQueue &queue ) { assert( myBufferInitialized ); // If we own the memory, nobody else can read it anyways. if ( myCopy ) return; if ( !myCopyBack ) return; queue.enqueueReadBuffer( myBuffer, CL_TRUE, 0, mySize, myPtr ); }
real L2Norm(const Buffer3D & in,cl::CommandQueue & q) { cl::Buffer ans (CLContextLoader::getContext(),CL_MEM_READ_WRITE,sizeof(real)*in.width()*in.height()*in.depth()); CLContextLoader::getRedL2NormKer().setArg(0,in()); CLContextLoader::getRedL2NormKer().setArg(1,ans()); q.enqueueNDRangeKernel(CLContextLoader::getRedL2NormKer(), cl::NDRange(0), cl::NDRange(in.width()*in.height()*in.depth()), getBestWorkspaceDim(cl::NDRange(in.width()*in.height()*in.depth()))); ans = performReduction(ans,CLContextLoader::getRedSumAllKer(),q,in.width()*in.height()*in.depth()); real res; q.enqueueReadBuffer(ans,true,0,sizeof(real),&res); return sqrt(res); }
/** * initialize tinymt status in device global memory * using 1 parameter for 1 generator. *@param tinymt_status internal state of kernel side tinymt *@param total total number of work items *@param local_item number of local work items *@param seed seed for initialization */ static void initialize_by_seed(Buffer& tinymt_status, int total, int local_item, uint32_t seed) { #if defined(DEBUG) cout << "initialize_by_seed start" << endl; #endif Kernel init_kernel(program, "tinymt_init_seed_kernel"); init_kernel.setArg(0, tinymt_status); init_kernel.setArg(1, seed); NDRange global(total); NDRange local(local_item); Event event; #if defined(DEBUG) cout << "global:" << dec << total << endl; cout << "group:" << dec << (total / local_item) << endl; cout << "local:" << dec << local_item << endl; #endif queue.enqueueNDRangeKernel(init_kernel, NullRange, global, local, NULL, &event); double time = get_time(event); tinymt32j_t status[total]; queue.enqueueReadBuffer(tinymt_status, CL_TRUE, 0, sizeof(tinymt32j_t) * total, status); cout << "initializing time = " << time * 1000 << "ms" << endl; #if defined(DEBUG) cout << "status[0].s0:" << hex << status[0].s0 << endl; cout << "status[0].s1:" << hex << status[0].s1 << endl; cout << "status[0].s2:" << hex << status[0].s2 << endl; cout << "status[0].s3:" << hex << status[0].s3 << endl; #endif check_status(status, total); #if defined(DEBUG) cout << "initialize_by_seed end" << endl; #endif }
/** * initialize tinymt status in device global memory * using 1 parameter for all generators. *@param tinymt_status device global memories *@param total total number of work items *@param local_item number of local work items *@param seed_array seeds for initialization *@param seed_size size of seed_array */ static void initialize_by_array(Buffer& tinymt_status, int total, int local_item, uint64_t seed_array[], int seed_size) { #if defined(DEBUG) cout << "initialize_by_array start" << endl; #endif Buffer seed_array_buffer(context, CL_MEM_READ_WRITE, seed_size * sizeof(uint64_t)); queue.enqueueWriteBuffer(seed_array_buffer, CL_TRUE, 0, seed_size * sizeof(uint64_t), seed_array); Kernel init_kernel(program, "tinymt_init_array_kernel"); init_kernel.setArg(0, tinymt_status); init_kernel.setArg(1, seed_array_buffer); init_kernel.setArg(2, seed_size); NDRange global(total); NDRange local(local_item); Event event; queue.enqueueNDRangeKernel(init_kernel, NullRange, global, local, NULL, &event); double time = get_time(event); tinymt64j_t status[total]; queue.enqueueReadBuffer(tinymt_status, CL_TRUE, 0, sizeof(tinymt64j_t) * total, status); cout << "initializing time = " << time * 1000 << "ms" << endl; check_status(status, total); #if defined(DEBUG) cout << "initialize_by_array end" << endl; #endif }
void run(T* buf) {
    cl_int err;
    cl::Buffer outbuf(m_context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, N * N * sizeof(T), buf, &err);
    checkErr(err, "Buffer::Buffer()");

    err = m_kernel.setArg(0, outbuf);
    checkErr(err, "Kernel::setArg(0)");
    err = m_kernel.setArg(1, N);
    checkErr(err, "Kernel::setArg(1)");
    err = m_kernel.setArg(2, depth);
    checkErr(err, "Kernel::setArg(2)");
    err = m_kernel.setArg(3, escape2);
    checkErr(err, "Kernel::setArg(3)");

    cl::Event event;
    err = m_cmdq.enqueueNDRangeKernel(m_kernel, cl::NullRange, cl::NDRange(N * N), cl::NDRange(N, 1), NULL, &event);
    checkErr(err, "CommandQueue::enqueueNDRangeKernel()");
    event.wait();

    err = m_cmdq.enqueueReadBuffer(outbuf, CL_TRUE, 0, N * N * sizeof(T), buf);
    checkErr(err, "CommandQueue::enqueueReadBuffer()");
}
/** * generate double precision floating point numbers in the range [0, 1) * in device global memory *@param tinymt_status device global memories *@param total_num total number of work items *@param local_num number of local work items *@param data_size number of data to generate */ static void generate_double01(Buffer& tinymt_status, int total_num, int local_num, int data_size) { int min_size = total_num; if (data_size % min_size != 0) { data_size = (data_size / min_size + 1) * min_size; } Kernel double_kernel(program, "tinymt_double01_kernel"); Buffer output_buffer(context, CL_MEM_READ_WRITE, data_size * sizeof(double)); double_kernel.setArg(0, tinymt_status); double_kernel.setArg(1, output_buffer); double_kernel.setArg(2, data_size / total_num); NDRange global(total_num); NDRange local(local_num); Event generate_event; queue.enqueueNDRangeKernel(double_kernel, NullRange, global, local, NULL, &generate_event); double * output = new double[data_size]; generate_event.wait(); queue.enqueueReadBuffer(output_buffer, CL_TRUE, 0, data_size * sizeof(double), &output[0]); check_data01(output, data_size, total_num); #if defined(DEBUG) print_double(&output[0], data_size, local_num); #endif double time = get_time(generate_event); delete[] output; cout << "generate time:" << time * 1000 << "ms" << endl; }
void MetaBallsApp::updateMarching() { static const cl_int3 size{ VOLUME_WIDTH, VOLUME_HEIGHT, VOLUME_DEPTH }; mClCommandQueue.enqueueNDRangeKernel( mKernWriteClear, cl::NullRange, cl::NDRange( VOLUME_SIZE ) ); /* Update volumes */ mClCommandQueue.enqueueNDRangeKernel( mKernWriteMetaballs, cl::NullRange, cl::NDRange( VOLUME_SIZE ) ); /* End */ int zero = 0; auto kernelRange = (size.s[0]-1) * (size.s[1]-1) * (size.s[2]-1); mClCommandQueue.enqueueWriteBuffer( mClVertIndex, true, 0, sizeof(int), &zero ); mClCommandQueue.enqueueNDRangeKernel( mKernConstructSurface, cl::NullRange, cl::NDRange( kernelRange ) ); mClCommandQueue.enqueueReadBuffer( mClVertIndex, true, 0, sizeof(cl_int), &mMarchingVertsWritten ); /* Generate Normals */ if (mMarchingVertsWritten > 0) { bool smooth = true; if( ! smooth ) mClCommandQueue.enqueueNDRangeKernel( mKernGenNormals, cl::NullRange, cl::NDRange( mMarchingVertsWritten ) ); else mClCommandQueue.enqueueNDRangeKernel( mKernGenNormalsSmooth, cl::NullRange, cl::NDRange( mMarchingVertsWritten ) ); } //if( mDebugDraw ) mClCommandQueue.enqueueNDRangeKernel( mKernWritePointColorBack, cl::NullRange, cl::NDRange( VOLUME_SIZE ) ); }
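// The mClVertIndex read-back above follows a common counter pattern: zero a device-side
// counter, let the kernel bump it atomically while emitting vertices, then read the final
// count back to size the follow-up dispatches. A self-contained sketch of that pattern
// (the kernel and counter buffer are assumed to be created elsewhere):
#include <CL/cl.hpp>

int runAndCountVerts(cl::CommandQueue &queue, cl::Kernel &surfaceKernel,
                     cl::Buffer &counter, size_t cells)
{
    int zero = 0;
    queue.enqueueWriteBuffer(counter, CL_TRUE, 0, sizeof(int), &zero);
    queue.enqueueNDRangeKernel(surfaceKernel, cl::NullRange, cl::NDRange(cells), cl::NullRange);

    int written = 0;
    // The blocking read also acts as the synchronization point for the kernel above.
    queue.enqueueReadBuffer(counter, CL_TRUE, 0, sizeof(int), &written);
    return written;
}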
void Reduce::enqueue( const cl::CommandQueue &commandQueue, bool blocking, const cl::Buffer &inBuffer, void *out, ::size_t first, ::size_t elements, const VECTOR_CLASS<cl::Event> *events, cl::Event *event, cl::Event *reduceEvent) { VECTOR_CLASS<cl::Event> reduceEvents(1); cl::Event readEvent; if (out == NULL) throw cl::Error(CL_INVALID_VALUE, "clogs::Reduce::enqueue: out is NULL"); //{inviwo::ScopedClockCPU clock("Reduce", "Reduce", -0.1f); enqueue(commandQueue, inBuffer, sums, first, elements, reduceBlocks, events, &reduceEvents[0]); //auto buf = commandQueue.enqueueMapBuffer(sums, true, CL_MAP_READ, reduceBlocks * elementSize, elementSize, &reduceEvents, // &readEvent); //memcpy(buf, out, elementSize); //commandQueue.enqueueUnmapMemObject(sums, buf); commandQueue.enqueueReadBuffer( sums, blocking, reduceBlocks * elementSize, elementSize, out, &reduceEvents, &readEvent); //} doEventCallback(readEvent); if (event != NULL) *event = readEvent; if (reduceEvent != NULL) *reduceEvent = reduceEvents[0]; }
bool runTestType(cl::Context context, cl::CommandQueue queue) { cl_uint size = (1 << 16) + 16384; std::vector<T> input(size); std::cout << "##Testing AMD radix sort for " << input.size() << " elements and type " << magnet::CL::detail::traits<T>::kernel_type(); for(size_t i = 0; i < input.size(); ++i) input[i] = input.size() - i - 1; // create input buffer using pinned memory cl::Buffer bufferIn(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sizeof(T) * input.size(), &input[0]); magnet::CL::radixSortAMD<T> radixSortFunctor; radixSortFunctor.build(queue, context); radixSortFunctor(bufferIn, bufferIn); std::vector<T> output(size); queue.enqueueReadBuffer(bufferIn, CL_TRUE, 0, input.size() * sizeof(T), &output[0]); bool failed = !testOutput(input, output); std::cout << " key(only) " << (failed ? "FAILED" : "PASSED") << ", "; //Now test with some data! //Refresh the input array queue.enqueueWriteBuffer(bufferIn, CL_TRUE, 0, input.size() * sizeof(T), &input[0]); //Write a data array std::vector<cl_uint> data(size); for(size_t i = 0; i < input.size(); ++i) data[i] = i; cl::Buffer dataIn(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_uint) * data.size(), &data[0]) ; radixSortFunctor(bufferIn, dataIn, bufferIn, dataIn); queue.enqueueReadBuffer(dataIn, CL_TRUE, 0, data.size() * sizeof(cl_uint), &data[0]); bool keyfail = !testOutput(input, output); std::cout << " key " << (keyfail ? "FAILED" : "PASSED"); bool datafail = false; for(size_t i = 0; i < input.size(); ++i) if (data[i] != input.size() - 1 - i) datafail = true; std::cout << " data " << (datafail ? "FAILED" : "PASSED") << std::endl; return failed || keyfail || datafail; }
int clPeak::runTransferBandwidthTest(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo) { if(!isTransferBW) return 0; float timed, gbps; cl::NDRange globalSize, localSize; cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>(); int iters = devInfo.transferBWIters; Timer timer; cl_uint maxItems = devInfo.maxAllocSize / sizeof(float) / 2; cl_uint numItems; // Set an upper-limit for cpu devies if(devInfo.deviceType & CL_DEVICE_TYPE_CPU) { numItems = roundToPowOf2(maxItems, 26); } else { numItems = roundToPowOf2(maxItems); } float *arr = new float[numItems]; try { cl::Buffer clBuffer = cl::Buffer(ctx, (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), (numItems * sizeof(float))); cout << NEWLINE TAB TAB "Transfer bandwidth (GBPS)" << endl; cout << setprecision(2) << fixed; /////////////////////////////////////////////////////////////////////////// // enqueueWriteBuffer cout << TAB TAB TAB "enqueueWriteBuffer : "; cout.flush(); // Dummy warm-up queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { Timer timer; timer.start(); for(int i=0; i<iters; i++) { queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); } queue.finish(); timed = timer.stopAndTime(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// // enqueueReadBuffer cout << TAB TAB TAB "enqueueReadBuffer : "; cout.flush(); // Dummy warm-up queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { Timer timer; timer.start(); for(int i=0; i<iters; i++) { queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); } queue.finish(); timed = timer.stopAndTime(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// // enqueueMapBuffer cout << TAB TAB TAB "enqueueMapBuffer(for read) : "; cout.flush(); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float)), NULL, &timeEvent); queue.finish(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); timed += timeInUS(timeEvent); } } else { for(int i=0; i<iters; i++) { Timer timer; void *mapPtr; timer.start(); mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float))); queue.finish(); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// // memcpy from mapped ptr cout << TAB TAB TAB TAB "memcpy from mapped ptr : "; cout.flush(); queue.finish(); timed = 0; for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, 
CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); memcpy(arr, mapPtr, (numItems * sizeof(float))); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// // enqueueUnmap cout << TAB TAB TAB "enqueueUnmap(after write) : "; cout.flush(); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); queue.enqueueUnmapMemObject(clBuffer, mapPtr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { for(int i=0; i<iters; i++) { Timer timer; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); timed += timer.stopAndTime(); } } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// // memcpy to mapped ptr cout << TAB TAB TAB TAB "memcpy to mapped ptr : "; cout.flush(); queue.finish(); timed = 0; for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); memcpy(mapPtr, arr, (numItems * sizeof(float))); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// } catch(cl::Error error) { cerr << error.what() << "(" << error.err() << ")" << endl; cerr << TAB TAB TAB "Tests skipped" << endl; if(arr) delete [] arr; return -1; } if(arr) delete [] arr; return 0; }
void helper(uint8_t* in, int isize, int choice[]) {
    int set_NDRange_size = 16;
    char* filePathptr;
    int result = 0;

    // Opening file and closing. Appended to in later locations.
    FILE *log = fopen("logfile.txt", "w");
    if (!log) {
        printf("\nCannot open logfile.txt for writing.\n");
        return; // bail out if we can't log
    }

    try {
        cl::Buffer bufferIn  = cl::Buffer(gContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, isize * sizeof(cl_uchar), in, NULL);
        cl::Buffer bufferOut = cl::Buffer(gContext, CL_MEM_READ_WRITE, isize * sizeof(cl_uchar4));

        //int result = mainRunBFS(char * argv[]); // need a C string to pass it
        /*
        gLaplacianK.setArg(2, w);
        gLaplacianK.setArg(3, h);
        gLaplacianK.setArg(1, bufferOut);
        gLaplacianK.setArg(0, bufferOut2);
        gQueue.enqueueNDRangeKernel(gLaplacianK, cl::NullRange,
            cl::NDRange((int)ceil((float)w/16.0f)*16, (int)ceil((float)h/16.0f)*16),
            cl::NDRange(set_NDRange_size, set_NDRange_size), NULL, NULL);
        */

        if (choice[0]==1)  { /* leukocyte */ }
        if (choice[1]==1)  { /* heartwall */ }
        if (choice[2]==1)  { /* CFD */ }
        if (choice[3]==1)  { /* LUdecomp */ }
        if (choice[4]==1)  { /* hotspot */ }
        if (choice[5]==1)  { /* backprop */ }
        if (choice[6]==1)  { /* needleman */ }
        if (choice[7]==1)  { /* kmeans */ }
        if (choice[8]==1)  { // BFS
            fclose(log);
            log = NULL; // prevent the double fclose() at the end of the function
            char filePath[] = "../assets/graph4096.txt";
            filePathptr = &filePath[0];
            result = mainRunBFS(&filePathptr);
            if (result < 0) {
                FILE *errlog = fopen("logfile.txt", "a");
                fprintf(errlog, "\n\n----------\nError in running mainRunBFS\n----------\n\n");
                fclose(errlog);
            }
        }
        if (choice[9]==1)  { /* srad */ }
        if (choice[10]==1) { /* streamcluster */ }
        if (choice[11]==1) { /* particle filter */ }
        if (choice[12]==1) { /* pathfinder */ }
        if (choice[13]==1) { /* gaussian */ }
        if (choice[14]==1) { /* k nearest */ }
        if (choice[15]==1) { /* lava */ }
        if (choice[16]==1) { /* myocyte */ }
        if (choice[17]==1) { /* btree */ }
        if (choice[18]==1) { /* gpudwt */ }
        if (choice[19]==1) { /* hybrid sort */ }

        // The original passed NULL as the destination pointer, which is invalid; this function
        // has no output parameter, so read into a local scratch buffer instead.
        std::vector<cl_uchar> hostOut(isize * sizeof(cl_uchar4));
        gQueue.enqueueReadBuffer(bufferOut, CL_TRUE, 0, isize * sizeof(cl_uchar4), &hostOut[0]);
    } catch (const cl::Error &e) {
        LOGI("@oclDecoder: %s %d \n", e.what(), e.err());
    }

    if (log) {
        fclose(log);
    }
}
void watershed(int width, int height, cl::Buffer& src, cl::Buffer& labeled, ProgramCache& cache, cl::CommandQueue& queue) { #ifdef OPENCL_PROFILE watershed_descent_kernel_time = 0; watershed_increment_kernel_time = 0; watershed_minima_kernel_time = 0; watershed_plateau_kernel_time = 0; watershed_flood_kernel_time = 0; #endif cl::Context context = queue.getInfo<CL_QUEUE_CONTEXT>(); std::stringstream params_stream; params_stream << "-DBLOCK_SIZE="; params_stream << BLOCK_SIZE; std::string program_params = params_stream.str(); cl::Program& program = cache.getProgram("Watershed", program_params); cl::Kernel descent_kernel(program, "descent_kernel"); cl::Kernel increment_kernel(program, "increment_kernel"); cl::Kernel minima_kernel(program, "minima_kernel"); cl::Kernel plateau_kernel(program, "plateau_kernel"); cl::Kernel flood_kernel(program, "flood_kernel"); //setting constant memory with neigbourhood cl::Buffer cl_neighbourhood_x = cl::Buffer(context,CL_MEM_READ_ONLY, sizeof(neighbourhood_x)); cl::Buffer cl_neighbourhood_y = cl::Buffer(context, CL_MEM_READ_ONLY, sizeof(neighbourhood_y)); #ifdef OPENCL_PROFILE cl::Event first_event; queue.enqueueWriteBuffer(cl_neighbourhood_x, CL_TRUE, 0, sizeof(neighbourhood_x), neighbourhood_x, __null, &first_event); #else queue.enqueueWriteBuffer(cl_neighbourhood_x, CL_TRUE, 0, sizeof(neighbourhood_x), neighbourhood_x); #endif queue.enqueueWriteBuffer(cl_neighbourhood_y, CL_TRUE, 0, sizeof(neighbourhood_y), neighbourhood_y); //const size_t block_size = 6; //cl::LocalSpaceArg local_mem = cl::__local(block_size * block_size * sizeof(float)); cl::LocalSpaceArg local_mem = cl::__local(BLOCK_SIZE * BLOCK_SIZE * sizeof(float)); //setting args for descent_kernel descent_kernel.setArg(0, src); descent_kernel.setArg(1, labeled); descent_kernel.setArg(2, cl_neighbourhood_x); descent_kernel.setArg(3, cl_neighbourhood_y); descent_kernel.setArg(4, local_mem); descent_kernel.setArg(5, width); descent_kernel.setArg(6, height); size_t global_width = (width / (BLOCK_SIZE - 2) + 1) * BLOCK_SIZE; size_t global_height = (height / (BLOCK_SIZE - 2) + 1) * BLOCK_SIZE; #ifdef DEBUG_PRINT std::cout << "global width=" << global_width << " global height=" << global_height << std::endl; #endif cl::NDRange global(global_width, global_height); cl::NDRange local(BLOCK_SIZE, BLOCK_SIZE); cl_int status; #ifdef OPENCL_PROFILE { VECTOR_CLASS<cl::Event> events_vector(1); status = queue.enqueueNDRangeKernel(descent_kernel, cl::NullRange, global, local, __null, &events_vector[0]); cl::WaitForEvents(events_vector); cl::Event& event = events_vector[0]; cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>(); cl_ulong end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>(); cl_ulong total_time = end - start; watershed_descent_kernel_time = total_time; } #else status = queue.enqueueNDRangeKernel(descent_kernel, cl::NullRange, global, local); #endif #ifdef DEBUG_PRINT std::cout << "kernel execution " << status << std::endl; #endif // queue.flush(); // queue.enqueueBarrier(); /* PREPARING INCREMENT KERNEL */ increment_kernel.setArg(0, labeled); increment_kernel.setArg(1, width); increment_kernel.setArg(2, height); #ifdef OPENCL_PROFILE { VECTOR_CLASS<cl::Event> events_vector(1); status = queue.enqueueNDRangeKernel(increment_kernel, cl::NullRange, global, local, __null, &events_vector[0]); cl::WaitForEvents(events_vector); cl::Event& event = events_vector[0]; cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>(); cl_ulong end = 
event.getProfilingInfo<CL_PROFILING_COMMAND_END>(); cl_ulong total_time = end - start; watershed_increment_kernel_time = total_time; } #else status = queue.enqueueNDRangeKernel(increment_kernel, cl::NullRange, global, local); #endif // queue.enqueueBarrier(); /* PREPARING MINIMA KERNEL */ int counter_tmp = 0; cl::Buffer counter(context, CL_MEM_READ_WRITE, sizeof(int)); queue.enqueueWriteBuffer(counter, CL_TRUE, 0, sizeof(int), &counter_tmp); queue.enqueueBarrier(); minima_kernel.setArg(0, counter); minima_kernel.setArg(1, labeled); minima_kernel.setArg(2, cl_neighbourhood_x); minima_kernel.setArg(3, cl_neighbourhood_y); minima_kernel.setArg(4, local_mem); minima_kernel.setArg(5, width); minima_kernel.setArg(6, height); int old_val = -1; int new_val = -2; int c = 0; while(old_val != new_val) { old_val = new_val; #ifdef OPENCL_PROFILE { VECTOR_CLASS<cl::Event> events_vector(1); status = queue.enqueueNDRangeKernel(minima_kernel, cl::NullRange, global, local, __null, &events_vector[0]); cl::WaitForEvents(events_vector); cl::Event& event = events_vector[0]; cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>(); cl_ulong end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>(); cl_ulong total_time = end - start; watershed_minima_kernel_time += total_time; } #else status = queue.enqueueNDRangeKernel(minima_kernel, cl::NullRange, global, local); #endif queue.enqueueReadBuffer(counter, CL_TRUE, 0, sizeof(int), &new_val); c++; } #ifdef DEBUG_PRINT std::cout << "step 2: " << c << " iterations" << std::endl; #endif /* PREPARING PLATEAU KERNEL */ queue.enqueueWriteBuffer(counter, CL_TRUE, 0, sizeof(int), &counter_tmp); queue.enqueueBarrier(); plateau_kernel.setArg(0, counter); plateau_kernel.setArg(1, src); plateau_kernel.setArg(2, labeled); plateau_kernel.setArg(3, cl_neighbourhood_x); plateau_kernel.setArg(4, cl_neighbourhood_y); plateau_kernel.setArg(5, local_mem); plateau_kernel.setArg(6, width); plateau_kernel.setArg(7, height); old_val = -1; new_val = -2; c = 0; #ifdef OPENCL_PROFILE watershed_plateau_kernel_time = 0; #endif while(old_val != new_val) { old_val = new_val; #ifdef OPENCL_PROFILE { VECTOR_CLASS<cl::Event> events_vector(1); status = queue.enqueueNDRangeKernel(plateau_kernel, cl::NullRange, global, local, __null, &events_vector[0]); cl::WaitForEvents(events_vector); cl::Event& event = events_vector[0]; cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>(); cl_ulong end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>(); cl_ulong total_time = end - start; watershed_plateau_kernel_time += total_time; } #else status = queue.enqueueNDRangeKernel(plateau_kernel, cl::NullRange, global, local); #endif queue.enqueueReadBuffer(counter, CL_TRUE, 0, sizeof(int), &new_val); c++; } #ifdef DEBUG_PRINT std::cout << "step 3: " << c << " iterations" << std::endl; #endif //preparing flood kernel queue.enqueueWriteBuffer(counter, CL_TRUE, 0, sizeof(int), &counter_tmp); queue.enqueueBarrier(); flood_kernel.setArg(0, counter); flood_kernel.setArg(1, labeled); flood_kernel.setArg(2, width); flood_kernel.setArg(3, height); old_val = -1; new_val = -2; c = 0; int new_block_size = 16; local = cl::NDRange(new_block_size, new_block_size); int n_width = ((width - 1) / new_block_size + 2) * new_block_size; int n_height = ((height - 1) / new_block_size + 2) * new_block_size; global = cl::NDRange(n_width, n_height); #ifdef DEBUG_PRINT std::cout << "flood kernel invocation params:" << std::endl; std::cout << "local: " << local[0] << ", " << local[1] << std::endl; std::cout << "global: 
" << global[0] << ", " << global[1] << std::endl; #endif #ifdef OPENCL_PROFILE cl::Event last_event; #endif #ifdef OPENCL_PROFILE watershed_flood_kernel_time = 0; #endif while(old_val != new_val) { old_val = new_val; #ifdef OPENCL_PROFILE { VECTOR_CLASS<cl::Event> events_vector(1); status = queue.enqueueNDRangeKernel(flood_kernel, cl::NullRange, global, local, __null, &events_vector[0]); cl::WaitForEvents(events_vector); cl::Event& event = events_vector[0]; cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>(); cl_ulong end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>(); cl_ulong total_time = end - start; watershed_flood_kernel_time += total_time; } #else status = queue.enqueueNDRangeKernel(flood_kernel, cl::NullRange, global, local); #endif #ifdef OPENCL_PROFILE queue.enqueueReadBuffer(counter, CL_TRUE, 0, sizeof(int), &new_val, __null, &last_event); #else queue.enqueueReadBuffer(counter, CL_TRUE, 0, sizeof(int), &new_val); #endif c++; } #ifdef OPENCL_PROFILE watershed_descent_kernel_time /= TIME_DIVISOR; watershed_increment_kernel_time /= TIME_DIVISOR; watershed_minima_kernel_time /= TIME_DIVISOR; watershed_plateau_kernel_time /= TIME_DIVISOR; watershed_flood_kernel_time /= TIME_DIVISOR; #endif #ifdef DEBUG_PRINT std::cout << "step 4: " << c << " iterations" << std::endl; #endif #ifdef OPENCL_PROFILE last_event.wait(); cl_ulong start = first_event.getProfilingInfo<CL_PROFILING_COMMAND_START>(); cl_ulong end = last_event.getProfilingInfo<CL_PROFILING_COMMAND_END>(); cl_ulong total_time = end - start; setLastExecutionTime(total_time/TIME_DIVISOR); #endif }
inline void OpenCL::readarg(std::vector<T> &arg, cl::Buffer &buf, cl::CommandQueue &queue) const {
    queue.enqueueReadBuffer(buf, CL_FALSE, 0, arg.size() * sizeof(T), &(arg[0]));
}
inline void OpenCL::readarg(T (&arg)[N], cl::Buffer &buf, cl::CommandQueue &queue) const {
    queue.enqueueReadBuffer(buf, CL_FALSE, 0, N * sizeof(T), arg);
}
int MaxValueSimple::maxValueCL(int* values, size_t len) {
    try {
        cl_int status = CL_SUCCESS;

        /*** Print information about the selected OpenCL device ***/
        /* TODO logging
        Logger::logDebug(METHOD, Logger::sStream << "max compute units: " << devices[0].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>());
        Logger::logDebug(METHOD, Logger::sStream << "max work item sizes: " << devices[0].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0]);
        Logger::logDebug(METHOD, Logger::sStream << "max work group sizes: " << devices[0].getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
        Logger::logDebug(METHOD, Logger::sStream << "max global mem size (KB): " << devices[0].getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() / 1024);
        Logger::logDebug(METHOD, Logger::sStream << "max local mem size (KB): " << devices[0].getInfo<CL_DEVICE_LOCAL_MEM_SIZE>() / 1024);
        */

        /*** Create and prepare the data ***/
        cl::Buffer vBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                           sizeof(cl_int) * len, &values[0], &status);
        if (status != CL_SUCCESS) {
            throw cl::Error(status, "cl::Buffer values");
        }
        cmdQ.finish();

        /*** Compute the work sizes ***/
        // number of work items      = globalSize
        // work items per work group = localSize
        const size_t MAX_GROUP_SIZE = devices[0].getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
        size_t globalSize;
        size_t localSize;
        do {
            globalSize = len;
            localSize = MaxValueSimple::calcWorkGroupSize(globalSize, MAX_GROUP_SIZE);
            if (localSize == 1) {
                globalSize = ceil((double) len / WG_FAC) * WG_FAC;
                localSize = MaxValueSimple::calcWorkGroupSize(globalSize, MAX_GROUP_SIZE);
                /* TODO logging
                Logger::logDebug(METHOD, Logger::sStream << "GlobalSize has been extended to " << globalSize);
                */
            }
            /* TODO logging
            Logger::logDebug(METHOD, Logger::sStream << "globalSize: " << globalSize);
            Logger::logDebug(METHOD, Logger::sStream << "localSize: " << localSize);
            */

            /*** Set the kernel arguments ***/
            status = kernel.setArg(0, vBuffer);
            if (status != CL_SUCCESS) {
                throw cl::Error(status, "Kernel.SetArg");
            }
            status = kernel.setArg(1, sizeof(cl_int) * localSize, NULL);
            if (status != CL_SUCCESS) {
                throw cl::Error(status, "Kernel.SetArg");
            }

            /*** Run the kernel and wait for it to finish ***/
            cl::KernelFunctor func = kernel.bind(cmdQ, cl::NDRange(globalSize), cl::NDRange(localSize));
            event = func();
            event.wait();
            cmdQ.finish();
            /*
            runtimeKernel += event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
            runtimeKernel -= event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
            */

            len = globalSize / localSize;
        } while (globalSize > localSize && localSize > 1);

        /*** Fetch the result from the OpenCL device ***/
        // TODO read only the first element
        status = cmdQ.enqueueReadBuffer(vBuffer, true, 0, sizeof(cl_int) * 1, &values[0]);
        if (status != CL_SUCCESS) {
            throw cl::Error(status, "CommandQueue.enqueueReadBuffer");
        }
        /* TODO logging
        Logger::log(METHOD, TIME, Logger::sStream << "timeKernel=" << 1.0e-9 * runtimeKernel << ";");
        */

        return values[0];
    } catch (cl::Error& err) {
        // TODO Logger::logError(METHOD, Logger::sStream << err.what());
        std::cerr << "[ERROR] MaxValueSimple::maxValueCL(int*, size_t): "
                  << err.what() << " (" << err.err() << ")" << std::endl;
        return MaxValueSimple::MAX_FAILURE;
    } catch (std::exception& err) {
        // TODO Logger::logError(METHOD, Logger::sStream << err.what());
        std::cerr << "[ERROR] MaxValueSimple::maxValueCL(int*, size_t): "
                  << err.what() << std::endl;
        return MaxValueSimple::MAX_FAILURE;
    }
}
int main() { try { std::vector<cl::Device> devices; // select platform cl::Platform platform = selectPlatform(); // select device platform.getDevices(CL_DEVICE_TYPE_ALL, &devices); cl::Device device = selectDevice(devices); // create context context = cl::Context(devices); // create command queue queue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE); // load opencl source std::ifstream cl_file("inclusive_scan.cl"); std::string cl_string{std::istreambuf_iterator<char>(cl_file), std::istreambuf_iterator<char>()}; cl::Program::Sources source(1, std::make_pair(cl_string.c_str(), cl_string.length() + 1)); // create programm program = cl::Program(context, source); // compile opencl source try { program.build(devices); size_t input_size; std::ifstream input_file("input.txt"); input_file >> input_size; std::vector<float> input(input_size); // for (size_t i = 0; i < input_size; ++i) { // input[i] = i % 10; // } for (int i = 0; i < input_size; i++) { input_file >> input[i]; } std::vector<float> output(input_size, 0); cl::Buffer dev_input (context, CL_MEM_READ_ONLY, sizeof(float) * input_size); queue.enqueueWriteBuffer(dev_input, CL_TRUE, 0, sizeof(float) * input_size, &input[0]); cl::Buffer dev_output = inclusive_scan(dev_input, input_size); queue.enqueueReadBuffer(dev_output, CL_TRUE, 0, sizeof(float) * input_size, &output[0]); queue.finish(); cpu_check(input, output); std::ofstream output_file("output.txt"); for (int i = 0; i < input_size; i++) { output_file << output[i] << " "; } } catch (cl::Error const & e) { std::string log_str = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device); std::cout << std::endl << e.what() << " : " << e.err() << std::endl; std::cout << log_str; return 0; } } catch (cl::Error const & e) { std::cout << "Error: " << e.what() << " #" << e.err() << std::endl; } return 0; }
int clPeak::runTransferBandwidthTest(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo) { if(!isTransferBW) return 0; float timed, gbps; cl::NDRange globalSize, localSize; cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>(); int iters = devInfo.transferBWIters; Timer timer; float *arr = NULL; cl_uint maxItems = devInfo.maxAllocSize / sizeof(float) / 2; cl_uint numItems; // Set an upper-limit for cpu devies if(devInfo.deviceType & CL_DEVICE_TYPE_CPU) { numItems = roundToPowOf2(maxItems, 26); } else { numItems = roundToPowOf2(maxItems); } try { arr = new float[numItems]; cl::Buffer clBuffer = cl::Buffer(ctx, (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), (numItems * sizeof(float))); log->print(NEWLINE TAB TAB "Transfer bandwidth (GBPS)" NEWLINE); log->xmlOpenTag("transfer_bandwidth"); log->xmlAppendAttribs("unit", "gbps"); /////////////////////////////////////////////////////////////////////////// // enqueueWriteBuffer log->print(TAB TAB TAB "enqueueWriteBuffer : "); // Dummy warm-up queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { Timer timer; timer.start(); for(int i=0; i<iters; i++) { queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); } queue.finish(); timed = timer.stopAndTime(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("enqueuewritebuffer", gbps); /////////////////////////////////////////////////////////////////////////// // enqueueReadBuffer log->print(TAB TAB TAB "enqueueReadBuffer : "); // Dummy warm-up queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { Timer timer; timer.start(); for(int i=0; i<iters; i++) { queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); } queue.finish(); timed = timer.stopAndTime(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("enqueuereadbuffer", gbps); /////////////////////////////////////////////////////////////////////////// // enqueueMapBuffer log->print(TAB TAB TAB "enqueueMapBuffer(for read) : "); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float)), NULL, &timeEvent); queue.finish(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); timed += timeInUS(timeEvent); } } else { for(int i=0; i<iters; i++) { Timer timer; void *mapPtr; timer.start(); mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float))); queue.finish(); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("enqueuemapbuffer", gbps); /////////////////////////////////////////////////////////////////////////// // memcpy from mapped 
ptr log->print(TAB TAB TAB TAB "memcpy from mapped ptr : "); queue.finish(); timed = 0; for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); memcpy(arr, mapPtr, (numItems * sizeof(float))); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("memcpy_from_mapped_ptr", gbps); /////////////////////////////////////////////////////////////////////////// // enqueueUnmap log->print(TAB TAB TAB "enqueueUnmap(after write) : "); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); queue.enqueueUnmapMemObject(clBuffer, mapPtr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { for(int i=0; i<iters; i++) { Timer timer; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); timed += timer.stopAndTime(); } } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("enqueueunmap", gbps); /////////////////////////////////////////////////////////////////////////// // memcpy to mapped ptr log->print(TAB TAB TAB TAB "memcpy to mapped ptr : "); queue.finish(); timed = 0; for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); memcpy(mapPtr, arr, (numItems * sizeof(float))); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("memcpy_to_mapped_ptr", gbps); /////////////////////////////////////////////////////////////////////////// log->xmlCloseTag(); // transfer_bandwidth if(arr) delete [] arr; } catch(cl::Error error) { stringstream ss; ss << error.what() << " (" << error.err() << ")" NEWLINE << TAB TAB TAB "Tests skipped" NEWLINE; log->print(ss.str()); if(arr) delete [] arr; return -1; } return 0; }
bool runTestType(cl::Context context, cl::CommandQueue queue) { cl_uint size = 64 * 256; std::vector<T> input(size); for(size_t i = 0; i < input.size(); ++i) input[i] = input.size() - i - 1; // create input buffer using pinned memory cl::Buffer bufferIn(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sizeof(T) * input.size(), &input[0]) ; magnet::CL::sort<T> sortFunctor; sortFunctor.build(queue, context); sortFunctor(bufferIn); std::cout << "##Testing generic sort ("; switch(sortFunctor.getMode()) { case magnet::CL::sort<T>::CPU: std::cout << "HeapSort"; break; case magnet::CL::sort<T>::NVIDIA: std::cout << "radixNVIDIA"; break; case magnet::CL::sort<T>::AMD: std::cout << "radixAMD"; break; default: M_throw() << "Could not determine which sorting algorithm is being used"; } std::cout << ") for " << input.size() << " elements and type " << magnet::CL::detail::traits<T>::kernel_type(); std::vector<T> output(size); queue.enqueueReadBuffer(bufferIn, CL_TRUE, 0, input.size() * sizeof(T), &output[0]); bool failed = !testOutput(input, output); std::cout << " key(only) " << (failed ? "FAILED" : "PASSED") << ", "; //Now test with some data! //Refresh the input array queue.enqueueWriteBuffer(bufferIn, CL_TRUE, 0, input.size() * sizeof(T), &input[0]); //Write a data array std::vector<cl_uint> data(size); for(size_t i = 0; i < input.size(); ++i) data[i] = i; cl::Buffer dataIn(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_uint) * data.size(), &data[0]) ; sortFunctor(bufferIn, dataIn); queue.enqueueReadBuffer(dataIn, CL_TRUE, 0, data.size() * sizeof(cl_uint), &data[0]); bool keyfail = false;//!testOutput(input, output); std::cout << " key " << (keyfail ? "FAILED" : "PASSED"); bool datafail = false; for(size_t i = 0; i < input.size(); ++i) if (data[i] != input.size() - 1 - i) datafail = true; std::cout << " data " << (datafail ? "FAILED" : "PASSED") << std::endl; return failed || keyfail || datafail; }