// Computes the per-node local split counts on the device.
//
// internalBRTNodes - buffer passed to the kernel as argument 1.
// localSplits      - (out) buffer obtained from CLFW under the name
//                    "localSplits"; zeroed, then passed as kernel argument 0.
// size             - passed to the kernel as argument 2; the launch itself
//                    covers nextPow2(size) work items.
//
// Returns the bitwise OR of every OpenCL status produced along the way, so
// any failing step yields a non-CL_SUCCESS result.
cl_int ComputeLocalSplits_p(cl::Buffer &internalBRTNodes, cl::Buffer &localSplits, cl_int size) {
  startBenchmark("ComputeLocalSplits_p");
  cl_int globalSize = nextPow2(size);
  cl::Kernel &kernel = CLFW::Kernels["ComputeLocalSplitsKernel"];
  cl::CommandQueue &queue = CLFW::DefaultQueue;

  // Initialised so that we never branch on an indeterminate value should
  // CLFW::get fail before writing the flag. NOTE(review): the flag is
  // presumably set to true when the named buffer already existed - confirm.
  bool isOld = false;
  cl::Buffer zeroBuffer;

  cl_int error = CLFW::get(localSplits, "localSplits", sizeof(cl_int) * globalSize);
  error |= CLFW::get(zeroBuffer, "zeroBuffer", sizeof(cl_int) * globalSize, isOld);

  // Fill any new zero buffers with zero. Then initialize localSplits with zero.
  if (!isOld) {
    cl_int zero = 0;
    error |= queue.enqueueFillBuffer<cl_int>(zeroBuffer, { zero }, 0, sizeof(cl_int) * globalSize);
  }
  error |= queue.enqueueCopyBuffer(zeroBuffer, localSplits, 0, 0, sizeof(cl_int) * globalSize);

  error |= kernel.setArg(0, localSplits);
  error |= kernel.setArg(1, internalBRTNodes);
  error |= kernel.setArg(2, size);

  // Accumulate (|=) rather than assign: a plain assignment here would discard
  // every failure reported by the preceding calls.
  error |= queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(globalSize), cl::NullRange);

  stopBenchmark();
  return error;
}
// Radix-sorts the BigUnsigned values held in `input`, one bit per pass.
//
// input - (in/out) buffer to sort; after each pass it is swapped with the
//         scratch buffer, so on return it refers to whichever buffer holds
//         the result of the last pass.
// size  - element count; every pass operates on nextPow2(size) elements.
// mbits - number of bit positions to process, starting at bit 0. Values
//         <= 0 perform no passes.
//
// Returns the bitwise OR of all statuses; returns early (before any pass and
// before the benchmark starts) if the scratch buffers cannot be obtained.
cl_int RadixSortBigUnsigned(cl::Buffer &input, cl_int size, cl_int mbits) {
  cl_int error = 0;
  const size_t globalSize = nextPow2(size);
  cl::Buffer predicate, address, bigUnsignedTemp, temp;

  error |= CLFW::get(address, "address", sizeof(cl_int)*(globalSize));
  error |= CLFW::get(bigUnsignedTemp, "bigUnsignedTemp", sizeof(BigUnsigned)*globalSize);
  if (error != CL_SUCCESS)
    return error;

  // Clamp once so the loop compares unsigned with unsigned. Comparing the
  // unsigned index directly against a negative cl_int would convert it to a
  // huge unsigned value and run billions of passes.
  const unsigned int bitCount = (mbits > 0) ? static_cast<unsigned int>(mbits) : 0u;

  // For each bit
  startBenchmark("RadixSortBigUnsigned");
  for (unsigned int index = 0; index < bitCount; index++) {
    // Predicate the 0's and 1's
    error |= BitPredicate(input, predicate, index, 0, globalSize);

    // Scan the predication buffers.
    error |= StreamScan_p(predicate, address, globalSize);

    // Compacting
    error |= DoubleCompact(input, bigUnsignedTemp, predicate, address, globalSize);

    // Swap result with input.
    temp = input;
    input = bigUnsignedTemp;
    bigUnsignedTemp = temp;
  }
  stopBenchmark();
  return error;
}
/*-----------------------------------------------------------------------------
 * Function: calcOffset
 * Parameters: Benchmark* - Benchmark to calculate the offset for
 *
 * Description:
 * Calculates the time it takes to call the start and stop so it can be
 * subtracted later. The result is stored in pB->t_offset as the difference
 * between pB->t_stop and pB->t_start, normalised so that tv_nsec is never
 * negative.
 *-----------------------------------------------------------------------------*/
void calcOffset(Benchmark *pB)
{
    /* Back-to-back start/stop: the elapsed time is the call overhead. */
    startBenchmark(pB);
    stopBenchmark(pB);

    pB->t_offset.tv_sec = pB->t_stop.tv_sec - pB->t_start.tv_sec;
    pB->t_offset.tv_nsec = pB->t_stop.tv_nsec - pB->t_start.tv_nsec;

    /*
     * If the stop sample crossed a second boundary, its nanosecond field is
     * smaller than the start's and the raw difference is negative. Borrow one
     * second so the pair stays a well-formed duration; the total elapsed time
     * (sec * 1e9 + nsec) is unchanged by this.
     */
    if (pB->t_offset.tv_nsec < 0) {
        pB->t_offset.tv_sec -= 1;
        pB->t_offset.tv_nsec += 1000000000L;
    }
}
/*
 * Runs the software 3x3 filter once over the image stored at `input`,
 * times the filter call only, prints the timing and writes the filtered
 * image to "software_3x3.tif".
 *
 * A non-zero result from filter_Execute is reported on stderr; the timing is
 * still printed and the output image is still written afterwards.
 */
void software_3x3_filter(const char *input)
{
    filter_params params;
    Image source = IMAGE_INITIALIZER;
    Image result = IMAGE_INITIALIZER;
    Benchmark timer;
    int status = 0;

    /* Set-up: none of this is included in the measured interval. */
    initBenchmark(&timer, "Software 3x3 Filter", "");
    filter_Init(&params, 4, 2, 1, 4);
    ImageRead(input, &source);

    /* Measured interval: exactly one filter execution. */
    startBenchmark(&timer);
    status = filter_Execute(&params, &source, &result);
    stopBenchmark(&timer);

    if (status)
        fprintf(stderr, "software_3x3_filter: ERROR: Filter failed.\n");

    printBenchmark(&timer);
    ImageWrite("software_3x3.tif", &result);

    /* Release the output first, then the input. */
    ImageCleanup(&result);
    ImageCleanup(&source);
}
/*
 * Runs the hardware filter and then the software filter nRuns (500) times
 * each over the image stored at `input`, and prints the average time per run
 * for both.
 *
 * Only available when built with ZYNQ defined; otherwise a message is
 * printed to stderr and nothing else happens.
 */
void software_hardware_exhaustive(const char *input)
{
#ifdef ZYNQ
    const int nRuns = 500;
    int i = 0;
    hardware_config hard_config;
    filter_params filter;
    Image iImage = IMAGE_INITIALIZER;
    Image oImage = IMAGE_INITIALIZER;
    int val = 0;
    Benchmark b_software;
    Benchmark b_hardware;

    initBenchmark(&b_software, "Software 3x3 filter", "");
    initBenchmark(&b_hardware, "Hardware 3x3 filter", "");

    ImageRead(input, &iImage);
    filter_Init(&filter, 4, 2, 1, 4);

    /* Do not time (or run) the hardware path against a driver that failed
     * to initialise. */
    if (hardware_filter_init(&iImage, &hard_config) != 0) {
        fprintf(stderr, "software_hardware_exhaustive: ERROR: Failed to initialize hardware driver\n");
        ImageCleanup(&iImage);
        return;
    }

    fprintf(stdout, "Running hardware %d times\n", nRuns);
    startBenchmark(&b_hardware);
    for (i = 0; i < nRuns; i++) {
        val = hardware_filter_execute(&hard_config);
    }
    stopBenchmark(&b_hardware);
    val = hardware_filter_cleanup(&iImage, &oImage, &hard_config);
    fprintf(stdout, "Hardware runs complete\n");

    fprintf(stdout, "Runnning software %d times\n", nRuns);
    /* The software interval must be started explicitly; without this the
     * matching stopBenchmark below measures from an unset start time. */
    startBenchmark(&b_software);
    for (i = 0; i < nRuns; i++) {
        val = filter_Execute(&filter, &iImage, &oImage);
    }
    stopBenchmark(&b_software);
    fprintf(stdout, "Software runs complete\n");

    printBenchmarkAvg(&b_hardware, nRuns);
    printBenchmarkAvg(&b_software, nRuns);

    /* Release both images, as the single-run filter functions do. */
    ImageCleanup(&oImage);
    ImageCleanup(&iImage);
    (void)val;
#else
    fprintf(stderr, "Hardware exhaustive run not supported on x86 platform\n");
#endif
}
// Serial (host-side) construction of the binary radix tree.
//
// Invokes BuildBinaryRadixTree once for every index in [0, size - 1), in
// ascending order, forwarding the node array, the z-ordered points, mbits
// and size unchanged. Always returns CL_SUCCESS.
cl_int BuildBinaryRadixTree_s(BigUnsigned* zpoints, BrtNode* internalBRTNodes, cl_int size, cl_int mbits) {
  startBenchmark("BuildBinaryRadixTree_s");

  int node = 0;
  while (node < size - 1) {
    BuildBinaryRadixTree(internalBRTNodes, zpoints, mbits, size, node);
    ++node;
  }

  stopBenchmark();
  return CL_SUCCESS;
}
// Copies `points` from host memory into a device buffer.
//
// points       - host-side points to upload.
// pointsBuffer - (out) buffer obtained from CLFW under the name
//                "pointsBuffer", sized for nextPow2(points.size()) elements.
//
// The write is blocking (CL_TRUE), so `points` may be modified as soon as
// this function returns. Returns the bitwise OR of the statuses of the buffer
// request and the write.
cl_int UploadPoints(const vector<intn> &points, cl::Buffer &pointsBuffer) {
  startBenchmark("Uploading points");
  cl_int error = 0;
  cl_int roundSize = nextPow2(points.size());

  error |= CLFW::get(pointsBuffer, "pointsBuffer", sizeof(intn)*roundSize);

  // Size the transfer with the vector's own element type. The buffer above is
  // allocated in units of intn, so using a different element type here would
  // copy the wrong number of bytes whenever the two types differ in size.
  error |= CLFW::DefaultQueue.enqueueWriteBuffer(pointsBuffer, CL_TRUE, 0, sizeof(intn) * points.size(), points.data());

  stopBenchmark();
  return error;
}
// Serial (host-side) computation of the local split counts.
//
// When size is positive, entry 0 of local_splits is first seeded with
// 1 + I[0].lcp_length / DIM. ComputeLocalSplits is then called for every
// index in [0, size - 1), in ascending order. Always returns CL_SUCCESS.
cl_int ComputeLocalSplits_s(vector<BrtNode> &I, vector<cl_uint> &local_splits, const cl_int size) {
  startBenchmark("ComputeLocalSplits_s");

  const bool hasNodes = size > 0;
  if (hasNodes) {
    const auto rootSplits = 1 + I[0].lcp_length / DIM;
    local_splits[0] = rootSplits;
  }

  int node = 0;
  while (node < size - 1) {
    ComputeLocalSplits(local_splits.data(), I.data(), node);
    ++node;
  }

  stopBenchmark();
  return CL_SUCCESS;
}
// Serial (host-side) conversion of points to their z-order values.
//
// Writes nextPow2(size) entries into `result`: entry gid is produced by
// xyz2z from points[gid] for gid < size, and by initBlkBU(..., 0) for the
// remaining padding entries. Always returns 0.
cl_int PointsToMorton_s(cl_int size, cl_int bits, cl_int2* points, BigUnsigned* result) {
  startBenchmark("PointsToMorton_s");

  const int paddedCount = nextPow2(size);
  int gid = 0;

  // Real points: encode each one.
  for (; gid < paddedCount && gid < size; ++gid)
    xyz2z(&result[gid], points[gid], bits);

  // Padding up to the next power of two: fill with zero.
  for (; gid < paddedCount; ++gid)
    initBlkBU(&result[gid], 0);

  stopBenchmark();
  return 0;
}
// Converts points to their z-order values on the device.
//
// points  - buffer passed to the kernel as argument 1.
// zpoints - (out) buffer obtained from CLFW under the name "zpoints", sized
//           for nextPow2(size) BigUnsigned values; kernel argument 0.
// size    - kernel argument 2; the launch covers nextPow2(size) work items.
// bits    - kernel argument 3.
//
// Only the kernel enqueue is inside the benchmark interval. Returns the
// bitwise OR of every status produced along the way.
cl_int PointsToMorton_p(cl::Buffer &points, cl::Buffer &zpoints, cl_int size, cl_int bits) {
  cl_int error = 0;
  const size_t globalSize = nextPow2(size);

  error |= CLFW::get(zpoints, "zpoints", globalSize * sizeof(BigUnsigned));

  // Bind by reference, as the other *_p functions do, instead of copying the
  // kernel handle.
  cl::Kernel &kernel = CLFW::Kernels["PointsToMortonKernel"];
  error |= kernel.setArg(0, zpoints);
  error |= kernel.setArg(1, points);
  error |= kernel.setArg(2, size);
  error |= kernel.setArg(3, bits);

  startBenchmark("PointsToMorton_p");
  // Reuse globalSize so the launch size cannot drift from the allocation size.
  error |= CLFW::DefaultQueue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(globalSize), cl::NullRange);
  stopBenchmark();

  return error;
}
// Builds the binary radix tree on the device.
//
// zpoints          - kernel argument 1.
// internalBRTNodes - (out) buffer obtained from CLFW under the name
//                    "internalBRTNodes", sized for nextPow2(size) BrtNode
//                    entries; kernel argument 0.
// size             - kernel argument 3; the launch covers nextPow2(size)
//                    work items.
// mbits            - kernel argument 2.
//
// Returns the bitwise OR of every status produced along the way.
cl_int BuildBinaryRadixTree_p(cl::Buffer &zpoints, cl::Buffer &internalBRTNodes, cl_int size, cl_int mbits) {
  startBenchmark("BuildBinaryRadixTree_p");

  cl::Kernel &brtKernel = CLFW::Kernels["BuildBinaryRadixTreeKernel"];
  cl::CommandQueue &commandQueue = CLFW::DefaultQueue;
  cl_int paddedSize = nextPow2(size);

  // Output storage for the tree nodes.
  cl_int status = CLFW::get(internalBRTNodes, "internalBRTNodes", sizeof(BrtNode)* (paddedSize));

  // Kernel arguments, in slot order.
  status |= brtKernel.setArg(0, internalBRTNodes);
  status |= brtKernel.setArg(1, zpoints);
  status |= brtKernel.setArg(2, mbits);
  status |= brtKernel.setArg(3, size);

  const cl::NDRange launchRange(paddedSize);
  status |= commandQueue.enqueueNDRangeKernel(brtKernel, cl::NullRange, launchRange, cl::NullRange);

  stopBenchmark();
  return status;
}
// Launches the "BRT2OctreeKernel_init" kernel over nextPow2(octreeSize)
// work items.
//
// The five kernel arguments are, in slot order: internalBRTNodes, octree,
// localSplits, scannedSplits and size. No buffer is allocated here; all four
// buffers are used as passed in.
//
// Returns the bitwise OR of the setArg and enqueue statuses.
cl_int InitOctree(cl::Buffer &internalBRTNodes, cl::Buffer &octree, cl::Buffer &localSplits, cl::Buffer &scannedSplits, cl_int size, cl_int octreeSize) {
  startBenchmark("InitOctree");

  cl_int paddedOctreeSize = nextPow2(octreeSize);
  cl::Kernel &initKernel = CLFW::Kernels["BRT2OctreeKernel_init"];
  cl::CommandQueue &commandQueue = CLFW::DefaultQueue;

  cl_int status = 0;
  status |= initKernel.setArg(0, internalBRTNodes);
  status |= initKernel.setArg(1, octree);
  status |= initKernel.setArg(2, localSplits);
  status |= initKernel.setArg(3, scannedSplits);
  status |= initKernel.setArg(4, size);

  const cl::NDRange launchRange(paddedOctreeSize);
  status |= commandQueue.enqueueNDRangeKernel(initKernel, cl::NullRange, launchRange, cl::NullRange);

  stopBenchmark();
  return status;
}
// Serial (host-side) conversion of a binary radix tree into an octree.
//
// internalBRTNodes - the radix tree nodes.
// octree           - (out) resized to the number of octree nodes and filled.
// size             - number of entries to process. For size <= 0 the octree
//                    is left empty and CL_SUCCESS is returned.
//
// Steps: compute the local splits, scan them, take the last scanned value as
// the octree size, initialise every octree node, then call brt2octree for
// every index in [1, size - 1). Always returns CL_SUCCESS.
cl_int BinaryRadixToOctree_s(vector<BrtNode> &internalBRTNodes, vector<OctNode> &octree, cl_int size) {
  startBenchmark("BinaryRadixToOctree_s");

  // Nothing to convert. Without this guard a zero size would index
  // prefixSums[-1] and a negative size would request an enormous vector.
  if (size <= 0) {
    octree.clear();
    stopBenchmark();
    return CL_SUCCESS;
  }

  vector<unsigned int> localSplits(size);
  ComputeLocalSplits_s(internalBRTNodes, localSplits, size);

  vector<unsigned int> prefixSums(size);
  StreamScan_s(localSplits.data(), prefixSums.data(), size);

  // The final scanned value gives the total number of octree nodes.
  const int octreeSize = prefixSums[size - 1];
  octree.resize(octreeSize);

  for (int i = 0; i < octreeSize; ++i)
    brt2octree_init(i, octree.data());

  // Index 0 and the last index are deliberately outside this range, exactly
  // as in the original loop bounds.
  for (int brt_i = 1; brt_i < size - 1; ++brt_i)
    brt2octree(brt_i, internalBRTNodes.data(), octree.data(), localSplits.data(), prefixSums.data(), size, octreeSize);

  stopBenchmark();
  return CL_SUCCESS;
}
// Constructs the main window: builds the generated UI, creates the
// Environment, fills the platform selector, registers the six available
// benchmarks and wires the widgets' signals to this window's slots.
//
// parent - forwarded to the QMainWindow base constructor.
MainWindow::MainWindow(QWidget *parent) : QMainWindow(parent) {
  ui.setupUi(this);

  // The environment receives this window as its constructor argument
  // (presumably as its Qt parent - confirm in Environment).
  _environment = new Environment(this);
  // No benchmark is selected until the user picks one.
  _currentBenchmark = NULL;
  _platforms = _environment->getPlatformsMap();

  // The platform box is populated before its change signal is connected
  // below, so this initial fill does not invoke platformBoxChanged.
  setPlatformBox();
  connect(ui.platformBox, SIGNAL(currentIndexChanged(const QString &)), this, SLOT(platformBoxChanged(const QString &)));
  connect(ui.deviceBox, SIGNAL(currentIndexChanged(const QString &)), this, SLOT(deviceBoxChanged(const QString &)));

  // Register every benchmark under the name it reports for itself.
  addBenchmark(FlopsBenchmark::getName(), new FlopsBenchmark(_environment, this));
  addBenchmark(ReadWrite::getName(), new ReadWrite(_environment, this));
  addBenchmark(Galaxy::getName(), new Galaxy(_environment, this));
  addBenchmark(Mandelbrot::getName(), new Mandelbrot(_environment, this));
  addBenchmark(Raytracing::getName(), new Raytracing(_environment, this));
  addBenchmark(IoThroughput::getName(), new IoThroughput(_environment, this));

  // Show the registered benchmark names in the list widget.
  ui.benchmarkList->addItems(_benchmarks.keys());

  // Central area: a vertical layout, horizontally centred.
  ui.centralwidget->setLayout(new QVBoxLayout());
  ui.centralwidget->layout()->setAlignment(Qt::AlignHCenter);

  // Selecting a benchmark, start/stop buttons and menu actions.
  connect(ui.benchmarkList, SIGNAL(currentTextChanged(const QString &)), this, SLOT(setBenchmarkWidgets(const QString &)));
  connect(ui.startButton, SIGNAL(clicked()), this, SLOT(launchBenchmark()));
  connect(ui.stopButton, SIGNAL(clicked()), this, SLOT(stopBenchmark()));
  connect(ui.actionAbout, SIGNAL(triggered()), this, SLOT(showAbout()));
  connect(ui.actionQuit, SIGNAL(triggered()), this, SLOT(close()));
}
// Compacts `input` down to its unique elements on the device.
//
// input - (in/out) buffer of BigUnsigned values; replaced by the compacted
//         result buffer. NOTE(review): the name implies the input must
//         already be sorted - confirm against UniquePredicate.
// size  - (in/out) on entry the element count; on success it is overwritten
//         with the last element of the scanned address buffer (presumably
//         the number of unique elements - confirm). Left untouched if any
//         step failed.
//
// Returns the bitwise OR of every status produced along the way.
cl_int UniqueSorted(cl::Buffer &input, cl_int &size) {
  startBenchmark("UniqueSorted");
  int globalSize = nextPow2(size);
  cl_int error = 0;

  cl::Buffer predicate, address, result;
  error = CLFW::get(predicate, "predicate", sizeof(cl_int)*(globalSize));
  error |= CLFW::get(address, "address", sizeof(cl_int)*(globalSize));
  error |= CLFW::get(result, "result", sizeof(BigUnsigned) * globalSize);

  // Flag, scan, then compact.
  error |= UniquePredicate(input, predicate, globalSize);
  error |= StreamScan_p(predicate, address, globalSize);
  error |= SingleCompact(input, result, predicate, address, globalSize);

  input = result;

  // Blocking read of the final scanned address. It goes into a local first so
  // that a failure anywhere above cannot corrupt the caller's size.
  cl_int lastAddress = 0;
  error |= CLFW::DefaultQueue.enqueueReadBuffer(address, CL_TRUE, (sizeof(cl_int)*globalSize - (sizeof(cl_int))), sizeof(cl_int), &lastAddress);
  if (error == CL_SUCCESS)
    size = lastAddress;

  stopBenchmark();
  return error;
}
/*
 * Runs the hardware 3x3 filter once over the image stored at `input`,
 * times the execute call only, prints the timing and writes the result to
 * "hardware_3x3.tif".
 *
 * Only available when built with ZYNQ defined; otherwise a message is
 * printed to stderr and nothing else happens. If the hardware driver cannot
 * be initialised, an error is printed and the function returns without
 * running the filter.
 */
void hardware_3x3_filter(const char *input)
{
#ifdef ZYNQ
    Image iImage = IMAGE_INITIALIZER;
    Image oImage = IMAGE_INITIALIZER;
    hardware_config hard_config;
    Benchmark b;
    int val = 0;

    initBenchmark(&b, "Hardware 3x3 Filter", "");
    ImageRead(input, &iImage);

    if (hardware_filter_init(&iImage, &hard_config) != 0) {
        fprintf(stderr, "hardware_3x3_filter: ERROR: Failed to initialize hardware driver\n");
        /* The input image has already been read; release it before bailing
         * out so the early return does not leak it. */
        ImageCleanup(&iImage);
        return;
    }

    /* Measured interval: exactly one hardware execution. */
    startBenchmark(&b);
    val = hardware_filter_execute(&hard_config);
    stopBenchmark(&b);
    if (val != 0) {
        fprintf(stderr, "hardware_3x3_filter: ERROR: Filter failed.\n");
    }

    val = hardware_filter_cleanup(&iImage, &oImage, &hard_config);
    if (val != 0) {
        fprintf(stderr, "hardware_3x3_filter: ERROR: Hardware filter failed to clean up.\n");
    }

    printBenchmark(&b);
    ImageWrite("hardware_3x3.tif", &oImage);

    ImageCleanup(&oImage);
    ImageCleanup(&iImage);
#else
    fprintf(stderr, "Hardware 3x3 filter not supported on x86 platform\n");
#endif
}
// Converts a binary radix tree into an octree on the device and reads the
// result back to the host.
//
// internalBRTNodes - device buffer holding the radix tree nodes.
// octree_vec       - (out) resized to the octree size and filled from the
//                    device. Left untouched if the octree size could not be
//                    determined.
// size             - number of entries; kernels run over nextPow2(size)
//                    work items.
//
// Returns the bitwise OR of every status produced along the way.
cl_int BinaryRadixToOctree_p(cl::Buffer &internalBRTNodes, vector<OctNode> &octree_vec, cl_int size) {
  startBenchmark("BinaryRadixToOctree_p");
  int globalSize = nextPow2(size);
  cl::Kernel &kernel = CLFW::Kernels["BRT2OctreeKernel"];
  cl::CommandQueue &queue = CLFW::DefaultQueue;

  cl::Buffer localSplits, scannedSplits, octree;
  cl_int error = CLFW::get(scannedSplits, "scannedSplits", sizeof(cl_int) * globalSize);

  error |= ComputeLocalSplits_p(internalBRTNodes, localSplits, size);
  error |= StreamScan_p(localSplits, scannedSplits, globalSize);

  // Read in the required octree size (blocking read of element size - 2 of
  // the scanned splits).
  cl_int octreeSize = 0;
  error |= queue.enqueueReadBuffer(scannedSplits, CL_TRUE, sizeof(int)*(size - 2), sizeof(int), &octreeSize);

  // octreeSize drives an allocation, a kernel launch and a host resize. If
  // anything failed up to here it is not trustworthy, so stop now instead of
  // sizing buffers from a value that was never written.
  if (error != CL_SUCCESS) {
    stopBenchmark();
    return error;
  }

  cl_int roundOctreeSize = nextPow2(octreeSize);

  // Create an octree buffer.
  error |= CLFW::get(octree, "octree", sizeof(OctNode) * roundOctreeSize);

  // Use the scanned splits & brt to create octree. The init status is folded
  // into the result so a failed initialisation is reported to the caller.
  error |= InitOctree(internalBRTNodes, octree, localSplits, scannedSplits, size, octreeSize);

  error |= kernel.setArg(0, internalBRTNodes);
  error |= kernel.setArg(1, octree);
  error |= kernel.setArg(2, localSplits);
  error |= kernel.setArg(3, scannedSplits);
  error |= kernel.setArg(4, size);

  error |= queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(globalSize), cl::NullRange);

  // Blocking read of the finished octree back to the host.
  octree_vec.resize(octreeSize);
  error |= queue.enqueueReadBuffer(octree, CL_TRUE, 0, sizeof(OctNode)*octreeSize, octree_vec.data());

  stopBenchmark();
  return error;
}