int clPeak::runComputeDP(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo) { float timed, gflops; cl_uint workPerWI; cl::NDRange globalSize, localSize; cl_double A = 1.3f; int iters = devInfo.computeIters; if(!isComputeDP) return 0; if(!devInfo.doubleSupported) { cout << NEWLINE TAB TAB "No double precision support! Skipped" << endl; return 0; } try { cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>(); uint globalWIs = (devInfo.numCUs) * (devInfo.computeWgsPerCU) * (devInfo.maxWGSize); uint t = MIN((globalWIs * sizeof(cl_double)), devInfo.maxAllocSize); t = roundToPowOf2(t); globalWIs = t / sizeof(cl_double); cl::Buffer outputBuf = cl::Buffer(ctx, CL_MEM_WRITE_ONLY, (globalWIs * sizeof(cl_double))); globalSize = globalWIs; localSize = devInfo.maxWGSize; cl::Kernel kernel_v1(prog, "compute_dp_v1"); kernel_v1.setArg(0, outputBuf), kernel_v1.setArg(1, A); cl::Kernel kernel_v2(prog, "compute_dp_v2"); kernel_v2.setArg(0, outputBuf), kernel_v2.setArg(1, A); cl::Kernel kernel_v4(prog, "compute_dp_v4"); kernel_v4.setArg(0, outputBuf), kernel_v4.setArg(1, A); cl::Kernel kernel_v8(prog, "compute_dp_v8"); kernel_v8.setArg(0, outputBuf), kernel_v8.setArg(1, A); cl::Kernel kernel_v16(prog, "compute_dp_v16"); kernel_v16.setArg(0, outputBuf), kernel_v16.setArg(1, A); cout << NEWLINE TAB TAB "Double-precision compute (GFLOPS)" << endl; cout << setprecision(2) << fixed; /////////////////////////////////////////////////////////////////////////// // Vector width 1 cout << TAB TAB TAB "double : "; cout.flush(); workPerWI = 4096; // Indicates flops executed per work-item timed = run_kernel(queue, kernel_v1, globalSize, localSize, iters); gflops = ((float)globalWIs * workPerWI) / timed / 1e3f; cout << gflops << endl; /////////////////////////////////////////////////////////////////////////// // Vector width 2 cout << TAB TAB TAB "double2 : "; cout.flush(); workPerWI = 4096; timed = run_kernel(queue, kernel_v2, globalSize, localSize, iters); gflops = ((float)globalWIs * workPerWI) / timed / 1e3f; cout << gflops << endl; /////////////////////////////////////////////////////////////////////////// // Vector width 4 cout << TAB TAB TAB "double4 : "; cout.flush(); workPerWI = 4096; timed = run_kernel(queue, kernel_v4, globalSize, localSize, iters); gflops = ((float)globalWIs * workPerWI) / timed / 1e3f; cout << gflops << endl; /////////////////////////////////////////////////////////////////////////// // Vector width 8 cout << TAB TAB TAB "double8 : "; cout.flush(); workPerWI = 4096; timed = run_kernel(queue, kernel_v8, globalSize, localSize, iters); gflops = ((float)globalWIs * workPerWI) / timed / 1e3f; cout << gflops << endl; /////////////////////////////////////////////////////////////////////////// // Vector width 16 cout << TAB TAB TAB "double16 : "; cout.flush(); workPerWI = 4096; timed = run_kernel(queue, kernel_v16, globalSize, localSize, iters); gflops = ((float)globalWIs * workPerWI) / timed / 1e3f; cout << gflops << endl; /////////////////////////////////////////////////////////////////////////// } catch(cl::Error error) { cerr << error.what() << "(" << error.err() << ")" << endl; cerr << TAB TAB TAB "Tests skipped" << endl; return -1; } return 0; }
int clPeak::runTransferBandwidthTest(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo) { if(!isTransferBW) return 0; float timed, gbps; cl::NDRange globalSize, localSize; cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>(); int iters = devInfo.transferBWIters; Timer timer; float *arr = NULL; cl_uint maxItems = devInfo.maxAllocSize / sizeof(float) / 2; cl_uint numItems; // Set an upper-limit for cpu devies if(devInfo.deviceType & CL_DEVICE_TYPE_CPU) { numItems = roundToPowOf2(maxItems, 26); } else { numItems = roundToPowOf2(maxItems); } try { arr = new float[numItems]; cl::Buffer clBuffer = cl::Buffer(ctx, (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), (numItems * sizeof(float))); log->print(NEWLINE TAB TAB "Transfer bandwidth (GBPS)" NEWLINE); log->xmlOpenTag("transfer_bandwidth"); log->xmlAppendAttribs("unit", "gbps"); /////////////////////////////////////////////////////////////////////////// // enqueueWriteBuffer log->print(TAB TAB TAB "enqueueWriteBuffer : "); // Dummy warm-up queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { Timer timer; timer.start(); for(int i=0; i<iters; i++) { queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); } queue.finish(); timed = timer.stopAndTime(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("enqueuewritebuffer", gbps); /////////////////////////////////////////////////////////////////////////// // enqueueReadBuffer log->print(TAB TAB TAB "enqueueReadBuffer : "); // Dummy warm-up queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { Timer timer; timer.start(); for(int i=0; i<iters; i++) { queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); } queue.finish(); timed = timer.stopAndTime(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("enqueuereadbuffer", gbps); /////////////////////////////////////////////////////////////////////////// // enqueueMapBuffer log->print(TAB TAB TAB "enqueueMapBuffer(for read) : "); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float)), NULL, &timeEvent); queue.finish(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); timed += timeInUS(timeEvent); } } else { for(int i=0; i<iters; i++) { Timer timer; void *mapPtr; timer.start(); mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float))); queue.finish(); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("enqueuemapbuffer", gbps); /////////////////////////////////////////////////////////////////////////// // memcpy from mapped ptr log->print(TAB TAB TAB TAB "memcpy from mapped ptr : "); queue.finish(); timed = 0; for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); memcpy(arr, mapPtr, (numItems * sizeof(float))); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("memcpy_from_mapped_ptr", gbps); /////////////////////////////////////////////////////////////////////////// // enqueueUnmap log->print(TAB TAB TAB "enqueueUnmap(after write) : "); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); queue.enqueueUnmapMemObject(clBuffer, mapPtr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { for(int i=0; i<iters; i++) { Timer timer; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); timed += timer.stopAndTime(); } } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("enqueueunmap", gbps); /////////////////////////////////////////////////////////////////////////// // memcpy to mapped ptr log->print(TAB TAB TAB TAB "memcpy to mapped ptr : "); queue.finish(); timed = 0; for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); memcpy(mapPtr, arr, (numItems * sizeof(float))); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; log->print(gbps); log->print(NEWLINE); log->xmlRecord("memcpy_to_mapped_ptr", gbps); /////////////////////////////////////////////////////////////////////////// log->xmlCloseTag(); // transfer_bandwidth if(arr) delete [] arr; } catch(cl::Error error) { stringstream ss; ss << error.what() << " (" << error.err() << ")" NEWLINE << TAB TAB TAB "Tests skipped" NEWLINE; log->print(ss.str()); if(arr) delete [] arr; return -1; } return 0; }
int clPeak::runTransferBandwidthTest(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo) { if(!isTransferBW) return 0; float timed, gbps; cl::NDRange globalSize, localSize; cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>(); int iters = devInfo.transferBWIters; Timer timer; cl_uint maxItems = devInfo.maxAllocSize / sizeof(float) / 2; cl_uint numItems; // Set an upper-limit for cpu devies if(devInfo.deviceType & CL_DEVICE_TYPE_CPU) { numItems = roundToPowOf2(maxItems, 26); } else { numItems = roundToPowOf2(maxItems); } float *arr = new float[numItems]; try { cl::Buffer clBuffer = cl::Buffer(ctx, (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), (numItems * sizeof(float))); cout << NEWLINE TAB TAB "Transfer bandwidth (GBPS)" << endl; cout << setprecision(2) << fixed; /////////////////////////////////////////////////////////////////////////// // enqueueWriteBuffer cout << TAB TAB TAB "enqueueWriteBuffer : "; cout.flush(); // Dummy warm-up queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { Timer timer; timer.start(); for(int i=0; i<iters; i++) { queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); } queue.finish(); timed = timer.stopAndTime(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// // enqueueReadBuffer cout << TAB TAB TAB "enqueueReadBuffer : "; cout.flush(); // Dummy warm-up queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { Timer timer; timer.start(); for(int i=0; i<iters; i++) { queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr); } queue.finish(); timed = timer.stopAndTime(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// // enqueueMapBuffer cout << TAB TAB TAB "enqueueMapBuffer(for read) : "; cout.flush(); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float)), NULL, &timeEvent); queue.finish(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); timed += timeInUS(timeEvent); } } else { for(int i=0; i<iters; i++) { Timer timer; void *mapPtr; timer.start(); mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float))); queue.finish(); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// // memcpy from mapped ptr cout << TAB TAB TAB TAB "memcpy from mapped ptr : "; cout.flush(); queue.finish(); timed = 0; for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); memcpy(arr, mapPtr, (numItems * sizeof(float))); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// // enqueueUnmap cout << TAB TAB TAB "enqueueUnmap(after write) : "; cout.flush(); queue.finish(); timed = 0; if(useEventTimer) { for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); queue.enqueueUnmapMemObject(clBuffer, mapPtr, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } } else { for(int i=0; i<iters; i++) { Timer timer; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); timed += timer.stopAndTime(); } } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// // memcpy to mapped ptr cout << TAB TAB TAB TAB "memcpy to mapped ptr : "; cout.flush(); queue.finish(); timed = 0; for(int i=0; i<iters; i++) { cl::Event timeEvent; void *mapPtr; mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float))); queue.finish(); timer.start(); memcpy(mapPtr, arr, (numItems * sizeof(float))); timed += timer.stopAndTime(); queue.enqueueUnmapMemObject(clBuffer, mapPtr); queue.finish(); } timed /= iters; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; cout << gbps << endl; /////////////////////////////////////////////////////////////////////////// } catch(cl::Error error) { cerr << error.what() << "(" << error.err() << ")" << endl; cerr << TAB TAB TAB "Tests skipped" << endl; if(arr) delete [] arr; return -1; } if(arr) delete [] arr; return 0; }