Ejemplo n.º 1
0
// Zero-initialises `localSplits` and enqueues "ComputeLocalSplitsKernel" over
// nextPow2(size) work items.
//
//   internalBRTNodes - buffer bound to kernel argument 1.
//   localSplits      - obtained from CLFW under the name "localSplits" and
//                      bound to kernel argument 0.
//   size             - passed to the kernel as argument 2.
//
// Returns the bitwise OR of every status code collected along the way, so any
// non-zero value means at least one step failed (the value itself is not
// necessarily a single valid OpenCL error code).
cl_int ComputeLocalSplits_p(cl::Buffer &internalBRTNodes, cl::Buffer &localSplits, cl_int size) {
    startBenchmark("ComputeLocalSplits_p");
    cl_int globalSize = nextPow2(size);
    cl::Kernel &kernel = CLFW::Kernels["ComputeLocalSplitsKernel"];
    cl::CommandQueue &queue = CLFW::DefaultQueue;

    // Default to "not old" so a failed lookup never leaves the flag
    // indeterminate; the fill below then simply reports its own error.
    bool isOld = false;
    cl::Buffer zeroBuffer;

    cl_int error  = CLFW::get(localSplits, "localSplits", sizeof(cl_int) * globalSize);
    error |= CLFW::get(zeroBuffer, "zeroBuffer", sizeof(cl_int) * globalSize, isOld);

    // Fill any new zero buffers with zero. Then initialize localSplits with zero.
    if (!isOld) {
        cl_int zero = 0;
        error |= queue.enqueueFillBuffer<cl_int>(zeroBuffer, { zero }, 0, sizeof(cl_int) * globalSize);
    }
    error |= queue.enqueueCopyBuffer(zeroBuffer, localSplits, 0, 0, sizeof(cl_int) * globalSize);

    error |= kernel.setArg(0, localSplits);
    error |= kernel.setArg(1, internalBRTNodes);
    error |= kernel.setArg(2, size);

    // Accumulate (not overwrite) so that earlier failures are still reported.
    error |= queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(globalSize), cl::NullRange);
    stopBenchmark();
    return error;
}
Ejemplo n.º 2
0
// Runs one predicate / scan / compact pass per bit index in [0, mbits) over
// `input`, working on nextPow2(size) elements.
//
//   input - buffer to process. Its handle is swapped with a scratch buffer
//           after every pass, so on return it may refer to the buffer that
//           was obtained from CLFW under the name "bigUnsignedTemp".
//   size  - element count; rounded up with nextPow2 for all passes.
//   mbits - number of bit indices to process. Values <= 0 perform no passes.
//
// Returns the bitwise OR of all collected status codes (non-zero on failure).
// If the scratch buffers cannot be obtained the function returns immediately.
cl_int RadixSortBigUnsigned(cl::Buffer &input, cl_int size, cl_int mbits) {
    cl_int error = 0;
    const size_t globalSize = nextPow2(size);

    cl::Buffer predicate, address, bigUnsignedTemp, temp;
    error |= CLFW::get(address, "address", sizeof(cl_int)*(globalSize));
    error |= CLFW::get(bigUnsignedTemp, "bigUnsignedTemp", sizeof(BigUnsigned)*globalSize);

    if (error != CL_SUCCESS) return error;

    // mbits is signed. Comparing it directly against an unsigned counter would
    // turn a negative value into a huge pass count, so clamp it once up front.
    const unsigned int bitCount = (mbits > 0) ? static_cast<unsigned int>(mbits) : 0u;

    //For each bit
    startBenchmark("RadixSortBigUnsigned");
    for (unsigned int index = 0; index < bitCount; index++) {
        //Predicate the 0's and 1's
        error |= BitPredicate(input, predicate, index, 0, globalSize);

        //Scan the predication buffers.
        error |= StreamScan_p(predicate, address, globalSize);

        //Compacting
        error |= DoubleCompact(input, bigUnsignedTemp, predicate, address, globalSize);

        //Swap result with input.
        temp = input;
        input = bigUnsignedTemp;
        bigUnsignedTemp = temp;
    }
    stopBenchmark();
    return error;
}
Ejemplo n.º 3
0
/*-----------------------------------------------------------------------------
 *  Function: calcOffset
 *  Parameters: Benchmark*  - Benchmark to calculate the offset for
 *
 *  Description:
 *    Calculates the time it takes to call the start and stop so it can be
 *    subtracted later. The result is stored in pB->t_offset as a seconds /
 *    nanoseconds pair; when the nanosecond difference is negative (start and
 *    stop fall on either side of a second boundary) one second is borrowed so
 *    that tv_nsec is never left negative.
 *-----------------------------------------------------------------------------*/
void calcOffset(Benchmark *pB)
{
  startBenchmark(pB);
  stopBenchmark(pB);
  pB->t_offset.tv_sec = pB->t_stop.tv_sec - pB->t_start.tv_sec;
  pB->t_offset.tv_nsec = pB->t_stop.tv_nsec - pB->t_start.tv_nsec;
  if (pB->t_offset.tv_nsec < 0) {
    /* Borrow one second: 1 s == 1,000,000,000 ns. */
    pB->t_offset.tv_sec -= 1;
    pB->t_offset.tv_nsec += 1000000000L;
  }
}
Ejemplo n.º 4
0
void software_3x3_filter(const char *input)
{
    filter_params filter;
    Image iImage = IMAGE_INITIALIZER;
    Image oImage = IMAGE_INITIALIZER;
    Benchmark b;
    int val = 0;

    initBenchmark(&b, "Software 3x3 Filter", "");

    filter_Init(&filter, 4, 2, 1, 4);
    ImageRead(input, &iImage);

    startBenchmark(&b);
    val = filter_Execute(&filter, &iImage, &oImage);
    stopBenchmark(&b);

    if(val != 0) {
        fprintf(stderr, "software_3x3_filter: ERROR: Filter failed.\n");
    }

    printBenchmark(&b);
    ImageWrite("software_3x3.tif",&oImage);
    ImageCleanup(&oImage);
    ImageCleanup(&iImage);
}
Ejemplo n.º 5
0
/*
 * Runs the hardware filter and then the software filter nRuns times each on
 * the image named by `input`, and prints the average time per run for both.
 *
 * Only available when built with ZYNQ defined; otherwise a message is printed
 * on stderr and nothing else happens.
 */
void software_hardware_exhaustive(const char *input)
{
#ifdef ZYNQ
    const int nRuns = 500;
    int i = 0;
    hardware_config hard_config;
    filter_params filter;
    Image iImage = IMAGE_INITIALIZER;
    Image oImage = IMAGE_INITIALIZER;
    int val = 0;
    Benchmark b_software;
    Benchmark b_hardware;

    initBenchmark(&b_software, "Software 3x3 filter", "");
    initBenchmark(&b_hardware, "Hardware 3x3 filter", "");

    ImageRead(input, &iImage);

    filter_Init(&filter, 4, 2, 1, 4);

    /* Do not run the hardware loop against a driver that failed to start. */
    val = hardware_filter_init(&iImage, &hard_config);
    if(val != 0) {
        fprintf(stderr, "software_hardware_exhaustive: ERROR: Failed to initialize hardware driver\n");
        ImageCleanup(&iImage);
        return;
    }

    fprintf(stdout, "Running hardware %d times\n", nRuns);
    startBenchmark(&b_hardware);
    for(i = 0; i < nRuns; i++) {
        val = hardware_filter_execute(&hard_config);
    }
    stopBenchmark(&b_hardware);
    val = hardware_filter_cleanup(&iImage, &oImage, &hard_config);
    fprintf(stdout, "Hardware runs complete\n");

    fprintf(stdout, "Runnning software %d times\n", nRuns);
    /* The software timer must be started here; without it the matching
     * stopBenchmark() below measures from an unset start time. */
    startBenchmark(&b_software);
    for(i = 0; i < nRuns; i++) {
        val = filter_Execute(&filter, &iImage, &oImage);
    }
    stopBenchmark(&b_software);
    fprintf(stdout, "Software runs complete\n");

    printBenchmarkAvg(&b_hardware,nRuns);
    printBenchmarkAvg(&b_software,nRuns);

    /* Release both images, as the single-run filter functions do. */
    ImageCleanup(&oImage);
    ImageCleanup(&iImage);

#else
    fprintf(stderr, "Hardware exhaustive run not supported on x86 platform\n");
#endif


}
Ejemplo n.º 6
0
// Host-side loop: calls BuildBinaryRadixTree once for every index in
// [0, size - 1), inside a "BuildBinaryRadixTree_s" benchmark region.
// Always returns CL_SUCCESS.
cl_int BuildBinaryRadixTree_s(BigUnsigned* zpoints, BrtNode* internalBRTNodes, cl_int size, cl_int mbits) {
    startBenchmark("BuildBinaryRadixTree_s");

    const int endIndex = size - 1;
    int node = 0;
    while (node < endIndex) {
        BuildBinaryRadixTree(internalBRTNodes, zpoints, mbits, size, node);
        ++node;
    }

    stopBenchmark();
    return CL_SUCCESS;
}
Ejemplo n.º 7
0
// Obtains a device buffer named "pointsBuffer" holding nextPow2(points.size())
// elements of type intn and copies `points` into its start with a blocking
// write.
//
// Returns the bitwise OR of the collected status codes (non-zero on failure).
cl_int UploadPoints(const vector<intn> &points, cl::Buffer &pointsBuffer) {
    startBenchmark("Uploading points");
    cl_int error = 0;
    cl_int roundSize = nextPow2(points.size());
    error |= CLFW::get(pointsBuffer, "pointsBuffer", sizeof(intn)*roundSize);
    // Size the upload with the vector's own element type so it always matches
    // both the allocation above and the host data being copied.
    error |= CLFW::DefaultQueue.enqueueWriteBuffer(pointsBuffer, CL_TRUE, 0, sizeof(intn) * points.size(), points.data());
    stopBenchmark();
    return error;
}
Ejemplo n.º 8
0
// Host-side loop: seeds local_splits[0] from the first node's lcp_length
// (1 + lcp_length / DIM) and then calls ComputeLocalSplits for every index in
// [0, size - 1). Nothing is written when size <= 0.
// Always returns CL_SUCCESS.
cl_int ComputeLocalSplits_s(vector<BrtNode> &I, vector<cl_uint> &local_splits, const cl_int size) {
    startBenchmark("ComputeLocalSplits_s");

    if (size > 0) {
        local_splits[0] = 1 + I[0].lcp_length / DIM;

        const int endIndex = size - 1;
        int node = 0;
        while (node < endIndex) {
            ComputeLocalSplits(local_splits.data(), I.data(), node);
            ++node;
        }
    }

    stopBenchmark();
    return CL_SUCCESS;
}
Ejemplo n.º 9
0
// Host-side loop over nextPow2(size) output slots: slots below `size` are
// filled by xyz2z from the matching point, the remaining padding slots are
// set with initBlkBU(..., 0).
// Always returns 0.
cl_int PointsToMorton_s(cl_int size, cl_int bits, cl_int2* points, BigUnsigned* result) {
    startBenchmark("PointsToMorton_s");

    const int paddedCount = nextPow2(size);
    for (int slot = 0; slot < paddedCount; ++slot) {
        BigUnsigned *out = result + slot;
        if (slot >= size) {
            // Padding slot beyond the real input.
            initBlkBU(out, 0);
            continue;
        }
        xyz2z(out, points[slot], bits);
    }

    stopBenchmark();
    return 0;
}
Ejemplo n.º 10
0
// Obtains the "zpoints" buffer (nextPow2(size) BigUnsigned elements), binds
// the four arguments of "PointsToMortonKernel" and enqueues it over
// nextPow2(size) work items. Only the enqueue call is inside the benchmark
// region.
//
// Returns the bitwise OR of the collected status codes (non-zero on failure).
cl_int PointsToMorton_p(cl::Buffer &points, cl::Buffer &zpoints, cl_int size, cl_int bits) {
    const size_t paddedCount = nextPow2(size);

    cl_int status = 0;
    status |= CLFW::get(zpoints, "zpoints", paddedCount * sizeof(BigUnsigned));

    cl::Kernel mortonKernel = CLFW::Kernels["PointsToMortonKernel"];
    status |= mortonKernel.setArg(0, zpoints);
    status |= mortonKernel.setArg(1, points);
    status |= mortonKernel.setArg(2, size);
    status |= mortonKernel.setArg(3, bits);

    startBenchmark("PointsToMorton_p");
    status |= CLFW::DefaultQueue.enqueueNDRangeKernel(
        mortonKernel, cl::NullRange, cl::NDRange(paddedCount), cl::NullRange);
    stopBenchmark();

    return status;
}
Ejemplo n.º 11
0
// Obtains the "internalBRTNodes" buffer (nextPow2(size) BrtNode elements),
// binds the arguments of "BuildBinaryRadixTreeKernel" and enqueues it over
// nextPow2(size) work items.
//
// Returns the bitwise OR of the collected status codes (non-zero on failure).
cl_int BuildBinaryRadixTree_p(cl::Buffer &zpoints, cl::Buffer &internalBRTNodes, cl_int size, cl_int mbits) {
    startBenchmark("BuildBinaryRadixTree_p");

    cl::Kernel &brtKernel = CLFW::Kernels["BuildBinaryRadixTreeKernel"];
    cl::CommandQueue &commands = CLFW::DefaultQueue;
    cl_int paddedCount = nextPow2(size);

    cl_int status = CLFW::get(internalBRTNodes, "internalBRTNodes", sizeof(BrtNode) * paddedCount);

    // Kernel argument order: nodes, z-points, bit count, element count.
    status |= brtKernel.setArg(0, internalBRTNodes);
    status |= brtKernel.setArg(1, zpoints);
    status |= brtKernel.setArg(2, mbits);
    status |= brtKernel.setArg(3, size);

    status |= commands.enqueueNDRangeKernel(
        brtKernel, cl::NullRange, cl::NDRange(paddedCount), cl::NullRange);

    stopBenchmark();
    return status;
}
Ejemplo n.º 12
0
// Binds the five arguments of "BRT2OctreeKernel_init" and enqueues it over
// nextPow2(octreeSize) work items.
//
// Returns the bitwise OR of the collected status codes (non-zero on failure).
cl_int InitOctree(cl::Buffer &internalBRTNodes, cl::Buffer &octree, cl::Buffer &localSplits, cl::Buffer &scannedSplits, cl_int size, cl_int octreeSize) {
    startBenchmark("InitOctree");

    cl_int paddedCount = nextPow2(octreeSize);
    cl::Kernel &initKernel = CLFW::Kernels["BRT2OctreeKernel_init"];
    cl::CommandQueue &commands = CLFW::DefaultQueue;

    cl_int status = 0;
    status |= initKernel.setArg(0, internalBRTNodes);
    status |= initKernel.setArg(1, octree);
    status |= initKernel.setArg(2, localSplits);
    status |= initKernel.setArg(3, scannedSplits);
    status |= initKernel.setArg(4, size);

    status |= commands.enqueueNDRangeKernel(
        initKernel, cl::NullRange, cl::NDRange(paddedCount), cl::NullRange);

    stopBenchmark();
    return status;
}
Ejemplo n.º 13
0
// Host-side conversion: computes local splits for the given nodes, scans them
// with StreamScan_s, resizes `octree` to the last scanned value, initialises
// every octree entry with brt2octree_init and then calls brt2octree for the
// indices in [1, size - 1).
//
//   internalBRTNodes - input nodes.
//   octree           - output; resized by this function.
//   size             - element count. For size <= 0 the octree is emptied and
//                      nothing else is done.
//
// Always returns CL_SUCCESS.
cl_int BinaryRadixToOctree_s(vector<BrtNode> &internalBRTNodes, vector<OctNode> &octree, cl_int size) {
    startBenchmark("BinaryRadixToOctree_s");

    // Without this guard, size == 0 would read prefixSums[-1] and a negative
    // size would be used as a vector length.
    if (size <= 0) {
        octree.clear();
        stopBenchmark();
        return CL_SUCCESS;
    }

    vector<unsigned int> localSplits(size);
    ComputeLocalSplits_s(internalBRTNodes, localSplits, size);

    vector<unsigned int> prefixSums(size);
    StreamScan_s(localSplits.data(), prefixSums.data(), size);

    // The last scanned value is used as the required octree size.
    const int octreeSize = prefixSums[size - 1];
    octree.resize(octreeSize);
    for (int i = 0; i < octreeSize; ++i)
        brt2octree_init(i, octree.data());
    for (int brt_i = 1; brt_i < size - 1; ++brt_i)
        brt2octree(brt_i, internalBRTNodes.data(), octree.data(), localSplits.data(), prefixSums.data(), size, octreeSize);
    stopBenchmark();
    return CL_SUCCESS;
}
Ejemplo n.º 14
0
// Builds the main window: sets up the generated UI, creates the Environment,
// registers the available benchmarks and wires the widgets' signals to this
// window's slots. Statement order is kept as-is because later steps read state
// set up by earlier ones (e.g. the benchmark list is filled from _benchmarks).
MainWindow::MainWindow(QWidget *parent)
    : QMainWindow(parent)
{
    ui.setupUi(this);
    // NOTE(review): `this` is presumably passed as the Qt parent/owner of the
    // Environment and of the benchmark objects below - confirm against their
    // constructors.
    _environment = new Environment(this);
    // No benchmark is selected yet.
    _currentBenchmark = NULL;

    // Cache the environment's platform map before setPlatformBox() runs.
    _platforms = _environment->getPlatformsMap();
    setPlatformBox();
    connect(ui.platformBox, SIGNAL(currentIndexChanged(const QString &)),
            this, SLOT(platformBoxChanged(const QString &)));

    connect(ui.deviceBox, SIGNAL(currentIndexChanged(const QString &)),
            this, SLOT(deviceBoxChanged(const QString &)));

    // Register every benchmark under the name reported by its static getName().
    addBenchmark(FlopsBenchmark::getName(),
            new FlopsBenchmark(_environment, this));
    addBenchmark(ReadWrite::getName(),
            new ReadWrite(_environment, this));
    addBenchmark(Galaxy::getName(),
            new Galaxy(_environment, this));
    addBenchmark(Mandelbrot::getName(),
            new Mandelbrot(_environment, this));
    addBenchmark(Raytracing::getName(),
            new Raytracing(_environment, this));
    addBenchmark(IoThroughput::getName(),
            new IoThroughput(_environment, this));

    // List the registered benchmark names and give the central widget a
    // vertical layout aligned to the horizontal centre.
    ui.benchmarkList->addItems(_benchmarks.keys());
    ui.centralwidget->setLayout(new QVBoxLayout());
    ui.centralwidget->layout()->setAlignment(Qt::AlignHCenter);

    // Selection, start/stop buttons and menu actions.
    connect(ui.benchmarkList, SIGNAL(currentTextChanged(const QString &)),
            this, SLOT(setBenchmarkWidgets(const QString &)));
    connect(ui.startButton, SIGNAL(clicked()),
            this, SLOT(launchBenchmark()));
    connect(ui.stopButton, SIGNAL(clicked()),
            this, SLOT(stopBenchmark()));
    connect(ui.actionAbout, SIGNAL(triggered()),
            this, SLOT(showAbout()));
    connect(ui.actionQuit, SIGNAL(triggered()),
            this, SLOT(close()));
}
Ejemplo n.º 15
0
// Runs UniquePredicate, StreamScan_p and SingleCompact over `input` (padded to
// nextPow2(size) elements), then rebinds `input` to the compacted "result"
// buffer and overwrites `size` with the last element of the scanned address
// buffer, fetched with a blocking read.
//
// Returns the bitwise OR of the collected status codes (non-zero on failure).
cl_int UniqueSorted(cl::Buffer &input, cl_int &size) {
    startBenchmark("UniqueSorted");

    int paddedCount = nextPow2(size);

    cl::Buffer predicate, address, result;
    cl_int status = CLFW::get(predicate, "predicate", sizeof(cl_int) * paddedCount);
    status |= CLFW::get(address, "address", sizeof(cl_int) * paddedCount);
    status |= CLFW::get(result, "result", sizeof(BigUnsigned) * paddedCount);

    status |= UniquePredicate(input, predicate, paddedCount);
    status |= StreamScan_p(predicate, address, paddedCount);
    status |= SingleCompact(input, result, predicate, address, paddedCount);

    // Hand the compacted buffer back to the caller.
    input = result;

    // Byte offset of the final cl_int in the address buffer.
    const size_t lastOffset = sizeof(cl_int) * paddedCount - sizeof(cl_int);
    status |= CLFW::DefaultQueue.enqueueReadBuffer(address, CL_TRUE, lastOffset, sizeof(cl_int), &size);

    stopBenchmark();
    return status;
}
Ejemplo n.º 16
0
/*
 * Reads the image named by `input`, runs the hardware filter on it once while
 * timing only the hardware_filter_execute() call, prints the timing, and
 * writes the result to "hardware_3x3.tif".
 *
 * Only available when built with ZYNQ defined; otherwise a message is printed
 * on stderr. If the hardware driver cannot be initialised the function
 * reports the error, releases the input image and returns without filtering.
 */
void hardware_3x3_filter(const char *input)
{
#ifdef ZYNQ
    Image iImage = IMAGE_INITIALIZER;
    Image oImage = IMAGE_INITIALIZER;
    hardware_config hard_config;
    Benchmark b;
    int val = 0;

    initBenchmark(&b, "Hardware 3x3 Filter", "");
    ImageRead(input, &iImage);
    if(hardware_filter_init(&iImage, &hard_config) != 0) {
        fprintf(stderr, "hardware_3x3_filter: ERROR: Failed to initialize hardware driver\n");
        /* Release the image loaded above; nothing else has been set up. */
        ImageCleanup(&iImage);
        return;
    }

    /* Only the execute call itself sits inside the timed region. */
    startBenchmark(&b);
    val = hardware_filter_execute(&hard_config);
    stopBenchmark(&b);
    if(val != 0) {
        fprintf(stderr, "hardware_3x3_filter: ERROR: Filter failed.\n");
    }

    val = hardware_filter_cleanup(&iImage, &oImage, &hard_config);
    if(val != 0) {
        fprintf(stderr, "hardware_3x3_filter: ERROR: Hardware filter failed to clean up.\n");
    }

    printBenchmark(&b);
    ImageWrite("hardware_3x3.tif",&oImage);
    ImageCleanup(&oImage);
    ImageCleanup(&iImage);
#else
    fprintf(stderr, "Hardware 3x3 filter not supported on x86 platform\n");
#endif

}
Ejemplo n.º 17
0
// Device-side conversion: computes and scans the local splits, reads the
// required octree size back from the scanned buffer, obtains an "octree"
// buffer, runs InitOctree followed by "BRT2OctreeKernel", and finally copies
// the octree into `octree_vec` with a blocking read.
//
//   internalBRTNodes - input node buffer.
//   octree_vec       - output; resized to the octree size read from the device.
//   size             - element count; kernels run over nextPow2(size) items.
//
// Returns the bitwise OR of the collected status codes (non-zero on failure).
// If anything fails before the octree size is known, the function returns
// early and leaves `octree_vec` untouched.
cl_int BinaryRadixToOctree_p(cl::Buffer &internalBRTNodes, vector<OctNode> &octree_vec, cl_int size) {
    startBenchmark("BinaryRadixToOctree_p");
    int globalSize = nextPow2(size);
    cl::Kernel &kernel = CLFW::Kernels["BRT2OctreeKernel"];
    cl::CommandQueue &queue = CLFW::DefaultQueue;

    cl::Buffer localSplits, scannedSplits, octree;
    cl_int error = CLFW::get(scannedSplits, "scannedSplits", sizeof(cl_int) * globalSize);

    error |= ComputeLocalSplits_p(internalBRTNodes, localSplits, size);
    error |= StreamScan_p(localSplits, scannedSplits, globalSize);

    //Read in the required octree size
    cl_int octreeSize = 0;
    error |= queue.enqueueReadBuffer(scannedSplits, CL_TRUE, sizeof(int)*(size - 2), sizeof(int), &octreeSize);

    // octreeSize is only meaningful if every step so far succeeded; do not
    // size buffers or the output vector from it otherwise.
    if (error != CL_SUCCESS) {
        stopBenchmark();
        return error;
    }

    cl_int roundOctreeSize = nextPow2(octreeSize);

    //Create an octree buffer.
    error |= CLFW::get(octree, "octree", sizeof(OctNode) * roundOctreeSize);

    //use the scanned splits & brt to create octree.
    error |= InitOctree(internalBRTNodes, octree, localSplits, scannedSplits, size, octreeSize);

    error |= kernel.setArg(0, internalBRTNodes);
    error |= kernel.setArg(1, octree);
    error |= kernel.setArg(2, localSplits);
    error |= kernel.setArg(3, scannedSplits);
    error |= kernel.setArg(4, size);

    error |= queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(globalSize), cl::NullRange);

    octree_vec.resize(octreeSize);
    error |= queue.enqueueReadBuffer(octree, CL_TRUE, 0, sizeof(OctNode)*octreeSize, octree_vec.data());
    stopBenchmark();
    return error;
}