Example #1
// ****************************************************************************
// Function: RunBenchmark
//
// Purpose:
//   Measures the floating point capability of the device for a variety of 
//   combinations of arithmetic operations.
//
// Arguments:
//   op: the options parser / parameter database
//
// Returns:  nothing
//
// Programmer: Zhi Ying([email protected])
//             Jun Jin([email protected])
//
// Creation: May 23, 2011
//
// Modifications:
// 12/12/12 - Kyle Spafford - Code style and minor integration updates
//
// ****************************************************************************
void RunBenchmark(OptionParser &op, ResultDatabase &resultDB)
{
    const bool verbose = op.getOptionBool("verbose");
    // Quiet == no progress bar.
    const bool quiet   = op.getOptionBool("quiet");
    const unsigned int passes = op.getOptionInt("passes");
    const int micdev = op.getOptionInt("target");

    double repeatF = 3;
    cout << "Adjust repeat factor = " << repeatF << "\n";

    // Initialize progress bar
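    // (totalRuns assumes 16 kernel variants per pass, run once for single and
    //  once for double precision by the two RunTest calls below.)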
    int totalRuns = 16*passes*2;
    ProgressBar pb(totalRuns);
    if (!verbose && !quiet) 
    {
        pb.Show(stdout);
    }

    RunTest<float>(resultDB, passes, verbose, quiet,
                   repeatF, pb, "-SP", micdev);
    RunTest<double>(resultDB, passes, verbose, quiet,
                    repeatF, pb, "-DP", micdev);

    if (!verbose) cout << endl;
}
Example #2
void
RunBenchmark(cl::Device& devcpp,
                  cl::Context& ctxcpp,
                  cl::CommandQueue& queuecpp,
                  ResultDatabase &resultDB,
                  OptionParser &op)
{
    // convert from C++ bindings to C bindings
    // TODO propagate use of C++ bindings
    cl_device_id dev = devcpp();
    cl_context ctx = ctxcpp();
    cl_command_queue queue = queuecpp();

    if (getMaxWorkGroupSize(dev) < 64) {
        cout << "FFT requires MaxWorkGroupSize of at least 64" << endl;
        fillResultDB("SP-FFT", "MaxWorkGroupSize<64", op, resultDB);
        fillResultDB("DP-FFT", "MaxWorkGroupSize<64", op, resultDB);
        return;
    }
    
    bool has_dp = checkExtension(dev, "cl_khr_fp64") ||
        checkExtension(dev, "cl_amd_fp64");

    if (op.getOptionBool("dump-sp")) {
        dump<cplxflt>(op);
    }
    else if (op.getOptionBool("dump-dp")) {
        if (!has_dp) {
            cout << "dump-dp: no double precision support!\n";
            return;
        }
        dump<cplxdbl>(op);
    }
    else {
        // Always run single precision test
        runTest<cplxflt>("SP-FFT", dev, ctx, queue, resultDB, op);

        // If double precision is supported, run the DP test
        if (has_dp) {
            cout << "DP Supported\n";
            runTest<cplxdbl>("DP-FFT", dev, ctx, queue, resultDB, op);
        }
        else {
            cout << "DP Not Supported\n";
            fillResultDB("DP-FFT", "DP_Not_Supported", op, resultDB);
        }
    }
}
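
For reference, a minimal sketch of what an extension check such as checkExtension could look like using only the standard OpenCL C API; the actual SHOC helper and its signature may differ:

// Hypothetical helper, shown only to illustrate the extension query.
#include <CL/cl.h>
#include <string>
#include <vector>

static bool deviceHasExtension(cl_device_id dev, const std::string& ext)
{
    size_t size = 0;
    if (clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, 0, NULL, &size) != CL_SUCCESS)
        return false;
    std::vector<char> buf(size + 1, '\0');
    if (clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, size, &buf[0], NULL) != CL_SUCCESS)
        return false;
    // Extension names are space-separated, so a substring match suffices here.
    return std::string(&buf[0]).find(ext) != std::string::npos;
}
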
Example #3
// ****************************************************************************
// Function: GPUSetup
//
// Purpose:
//   Do the necessary OpenCL setup for the GPU part of the test
//
// Arguments:
//   op: the options parser / parameter database
//   mympirank: for printing errors in case of failure
//   mynoderank: this is typically the device ID (the mapping done in main)
//
// Returns: success/failure
//
// Creation: 2009
//
// Modifications:
//
// ****************************************************************************
//
int GPUSetup(OptionParser &op, int mympirank, int mynoderank)
{
    addBenchmarkSpecOptions(op);
    
    if (op.getOptionBool("infoDevices"))
    {
        OpenCLNodePlatformContainer ndc1;
        ndc1.Print (cout);
        return (0);
    }
    
    // The device option supports specifying more than one device
    int platform = op.getOptionInt("platform");
    int deviceIdx = mynoderank;
    if( deviceIdx >= op.getOptionVecInt( "device" ).size() )
    {
        std::ostringstream estr;
        estr << "Warning: not enough devices specified with --device flag for task "
            << mympirank
            << " ( node rank " << mynoderank
            << ") to claim its own device; forcing to use first device ";
        std::cerr << estr.str() << std::endl;
        deviceIdx = 0;
    }
    int device = op.getOptionVecInt("device")[deviceIdx];

    // Initialization
    _mpicontention_ocldev = new cl::Device( ListDevicesAndGetDevice(platform, device) );
    std::vector<cl::Device> ctxDevices;
    ctxDevices.push_back( *_mpicontention_ocldev );
    _mpicontention_ocldriver_ctx   = new cl::Context( ctxDevices );
    _mpicontention_ocldriver_queue = new cl::CommandQueue( *_mpicontention_ocldriver_ctx, *_mpicontention_ocldev, CL_QUEUE_PROFILING_ENABLE );
    _mpicontention_gpuop = op;
    return 0;
}
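
For context, a rough sketch (not taken from the SHOC sources) of how GPUSetup might be driven from an MPI program, using the NodeInfo helper that appears in the main() examples below; option registration and error handling are elided:

// Hypothetical driver; OptionParser, NodeInfo and GPUSetup come from SHOC headers.
#include <mpi.h>
#include <iostream>

int main(int argc, char* argv[])
{
    MPI_Init(&argc, &argv);

    int mympirank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &mympirank);

    NodeInfo ni;                     // see Example #8 / #13
    int mynoderank = ni.nodeRank();  // rank within this node, used as device index

    OptionParser op;
    // ... register the shared options (platform, device, infoDevices, ...) and
    //     parse argv here, as the main() examples below do ...

    if (GPUSetup(op, mympirank, mynoderank) != 0)
        std::cerr << "GPU setup failed for task " << mympirank << std::endl;

    MPI_Finalize();
    return 0;
}
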
Example #4
void
RunBenchmark(ResultDatabase &resultDB, OptionParser &op)
{
    // Test to see if this device supports double precision
    cudaGetDevice(&fftDevice);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, fftDevice);
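    // Double precision requires compute capability 1.3 or higher.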
    bool has_dp = (deviceProp.major == 1 && deviceProp.minor >= 3) ||
        (deviceProp.major >= 2);

    if (op.getOptionBool("dump-sp")) {
        dump<float2>(op);
    }
    else if (op.getOptionBool("dump-dp")) {
        if (!has_dp) {
            cout << "dump-dp: no double precision support!\n";
            return;
        }
        dump<double2>(op);
    }
    else {
        cout << "Running single precision test" << endl;
        runTest<float2>("SP-FFT", resultDB, op);
        if (has_dp) {
            cout << "Running double precision test" << endl;
            runTest<double2>("DP-FFT", resultDB, op);
        } 
        else {
            cout << "Skipping double precision test" << endl;
            char atts[32] = "DP_Not_Supported";
            // resultDB requires an entry for every possible result, so record
            // sentinel (FLT_MAX) values for the unsupported DP tests
            int passes = op.getOptionInt("passes");
            for (int k=0; k<passes; k++) {
                resultDB.AddResult("DP-FFT" , atts, "GB/s", FLT_MAX);
                resultDB.AddResult("DP-FFT_PCIe" , atts, "GB/s", FLT_MAX);
                resultDB.AddResult("DP-FFT_Parity" , atts, "GB/s", FLT_MAX);
                resultDB.AddResult("DP-FFT-INV" , atts, "GB/s", FLT_MAX);
                resultDB.AddResult("DP-FFT-INV_PCIe" , atts, "GB/s", FLT_MAX);
                resultDB.AddResult("DP-FFT-INV_Parity" , atts, "GB/s", FLT_MAX);
            }
        }
    }
}
Example #5
void
RunBenchmark(OptionParser &op)
{
    if (op.getOptionBool("2D"))
        dump2D<cplxflt>(op);
    else
        dump1D<cplxflt>(op);
}
Example #6
void
RunBenchmark(OptionParser &op, ResultDatabase &resultDB)
{
    const bool verbose = op.getOptionBool("verbose");

    if (verbose) // print MKL version info
    {
        static char mklver[200];
        char *p;
        MKL_Get_Version_String(mklver,sizeof(mklver));
        mklver[sizeof(mklver)-1] = 0;
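        // Trim any trailing blanks left in the MKL version string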
        p = strrchr(mklver,' ');
        if (p) while (p[0]==' ' && p[1]==0) *p-- = 0;
        printf("SHOC FFT benchmark using MKL verison %s\n",mklver);
    }
    RunTest<cplxflt>("SP-FFT", resultDB, op);
    RunTest<cplxdbl>("DP-FFT", resultDB, op);
}
Example #7
void
DoTest( const char* timerDesc, ResultDatabase& resultDB, OptionParser& opts )
{
    StencilFactory<T>* stdStencilFactory = NULL;
    Stencil<T>* stdStencil = NULL;
    StencilFactory<T>* testStencilFactory = NULL;
    Stencil<T>* testStencil = NULL;

    try
    {
#if defined(PARALLEL)
        stdStencilFactory = new MPIHostStencilFactory<T>;
        testStencilFactory = new MPICUDAStencilFactory<T>;
#else
        stdStencilFactory = new HostStencilFactory<T>;
        testStencilFactory = new CUDAStencilFactory<T>;
#endif // defined(PARALLEL)
        assert( (stdStencilFactory != NULL) && (testStencilFactory != NULL) );

        // do a sanity check on option values
        CheckOptions( opts );
        stdStencilFactory->CheckOptions( opts );
        testStencilFactory->CheckOptions( opts );

        // extract and validate options
        std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" );
        if( arrayDims.size() != 2 )
        {
            cerr << "Dim size: " << arrayDims.size() << "\n";
            throw InvalidArgValue( "all overall dimensions must be positive" );
        }
        if (arrayDims[0] == 0) // User has not specified a custom size
        {
            int sizeClass = opts.getOptionInt("size");
            arrayDims = StencilFactory<T>::GetStandardProblemSize( sizeClass );
        }
        long int seed = (long)opts.getOptionInt( "seed" );
        bool beVerbose = opts.getOptionBool( "verbose" );
        unsigned int nIters = (unsigned int)opts.getOptionInt( "num-iters" );
        double valErrThreshold = (double)opts.getOptionFloat( "val-threshold" );
        unsigned int nValErrsToPrint = (unsigned int)opts.getOptionInt( "val-print-limit" );

#if defined(PARALLEL)
        unsigned int haloWidth = (unsigned int)opts.getOptionInt( "iters-per-exchange" );
#else
        unsigned int haloWidth = 1;
#endif // defined(PARALLEL)

        float haloVal = (float)opts.getOptionFloat( "haloVal" );

        // build a description of this experiment
        std::vector<long long> lDims = opts.getOptionVecInt( "lsize" );
        assert( lDims.size() == 2 );
        std::ostringstream experimentDescriptionStr;
        experimentDescriptionStr
            << nIters << ':'
            << arrayDims[0] << 'x' << arrayDims[1] << ':'
            << lDims[0] << 'x' << lDims[1];

        unsigned int nPasses = (unsigned int)opts.getOptionInt( "passes" );
        unsigned int nWarmupPasses = (unsigned int)opts.getOptionInt( "warmupPasses" );


        // compute the expected result on the host
        // or read it from a pre-existing file
        std::string matrixFilenameBase = (std::string)opts.getOptionString( "expMatrixFile" );
#if defined(PARALLEL)
        int cwrank;
        MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        if( !matrixFilenameBase.empty() )
        {
            std::cout << "\nReading expected stencil operation result from file for later comparison with CUDA output\n"
                << std::endl;
        }
        else
        {
            std::cout << "\nPerforming stencil operation on host for later comparison with CUDA output\n"
                << "Depending on host capabilities, this may take a while."
                << std::endl;
        }
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)
        Matrix2D<T> expected( arrayDims[0] + 2*haloWidth,
                            arrayDims[1] + 2*haloWidth );
        Initialize<T> init( seed, haloWidth, haloVal );

        bool haveExpectedData = false;
        if( ! matrixFilenameBase.empty() )
        {
            bool readOK = ReadMatrixFromFile( expected, GetMatrixFileName<T>( matrixFilenameBase ) );
            if( readOK )
            {

                if( (expected.GetNumRows() != arrayDims[0] + 2*haloWidth) ||
                    (expected.GetNumColumns() != arrayDims[1] + 2*haloWidth) )
                {
                    std::cerr << "The matrix read from file \'"
                        << GetMatrixFileName<T>( matrixFilenameBase )
                        << "\' does not match the matrix size specified on the command line.\n";
                    expected.Reset( arrayDims[0] + 2*haloWidth, arrayDims[1] + 2*haloWidth );
                }
                else
                {
                    haveExpectedData = true;
                }
            }

            if( !haveExpectedData )
            {
                std::cout << "\nSince we could not read the expected matrix values,\nperforming stencil operation on host for later comparison with CUDA output.\n"
                    << "Depending on host capabilities, this may take a while."
                    << std::endl;
            }
        }
        if( !haveExpectedData )
        {
            init( expected );
            haveExpectedData = true;
            if( beVerbose )
            {
                std::cout << "initial state:\n" << expected << std::endl;
            }
            stdStencil = stdStencilFactory->BuildStencil( opts );
            (*stdStencil)( expected, nIters );
        }
        if( beVerbose )
        {
            std::cout << "expected result:\n" << expected << std::endl;
        }

        // determine whether we are to save the expected matrix values to a file
        // to speed up future runs
        matrixFilenameBase = (std::string)opts.getOptionString( "saveExpMatrixFile" );
        if( !matrixFilenameBase.empty() )
        {
            SaveMatrixToFile( expected, GetMatrixFileName<T>( matrixFilenameBase ) );
        }
        assert( haveExpectedData );


        // compute the result on the CUDA device
        Matrix2D<T> data( arrayDims[0] + 2*haloWidth,
                            arrayDims[1] + 2*haloWidth );
        testStencil = testStencilFactory->BuildStencil( opts );

        // Compute the number of floating point operations we will perform.
        //
        // Note: in the truly-parallel case, we count flops for redundant
        // work due to the need for a halo.
        // But we do not add to the count for the local 1-wide halo since
        // we aren't computing new values for those items.
        unsigned long npts = (arrayDims[0] + 2*haloWidth - 2) *
                            (arrayDims[1] + 2*haloWidth - 2);
#if defined(PARALLEL)
        MPICUDAStencil<T>* mpiTestStencil = static_cast<MPICUDAStencil<T>*>( testStencil );
        assert( mpiTestStencil != NULL );
        int participating = mpiTestStencil->ParticipatingInProgram() ? 1 : 0;
        int numParticipating = 0;
        MPI_Allreduce( &participating,      // src
                        &numParticipating,  // dest
                        1,                  // count
                        MPI_INT,            // type
                        MPI_SUM,            // op
                        MPI_COMM_WORLD );   // communicator
        npts *= numParticipating;
#endif // defined(PARALLEL)

        // In our 9-point stencil, there are 11 floating point operations
        // per point (3 multiplies and 8 adds):
        //
        // newval = weight_center * centerval +
        //      weight_cardinal * (northval + southval + eastval + westval) +
        //      weight_diagonal * (neval + nwval + seval + swval)
        //
        // we do this stencil operation 'nIters' times
        unsigned long nflops = npts * 11 * nIters;

#if defined(PARALLEL)
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        std::cout << "Performing " << nWarmupPasses << " warmup passes...";
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)

        for( unsigned int pass = 0; pass < nWarmupPasses; pass++ )
        {
            init(data);
            (*testStencil)( data, nIters );
        }
#if defined(PARALLEL)
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        std::cout << "done." << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)

#if defined(PARALLEL)
        MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        std::cout << "\nPerforming stencil operation on chosen device, "
            << nPasses << " passes.\n"
            << "Depending on chosen device, this may take a while."
            << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)

#if !defined(PARALLEL)
        std::cout << "At the end of each pass the number of validation\nerrors observed will be printed to the standard output."
            << std::endl;
#endif // !defined(PARALLEL)


        for( unsigned int pass = 0; pass < nPasses; pass++ )
        {
#if !defined(PARALLEL)
            std::cout << "pass " << pass << ": ";
#endif // !defined(PARALLEL)
            init( data );

            int timerHandle = Timer::Start();
            (*testStencil)( data, nIters );
            double elapsedTime = Timer::Stop( timerHandle, "CUDA stencil" );


            // find and report the computation rate
            double gflops = (nflops / elapsedTime) / 1e9;

            resultDB.AddResult( timerDesc,
                                    experimentDescriptionStr.str(),
                                    "GFLOPS",
                                    gflops );
            if( beVerbose )
            {
                std::cout << "observed result, pass " << pass << ":\n"
                    << data
                    << std::endl;
            }

            // validate the result
#if defined(PARALLEL)
            StencilValidater<T>* validater = new MPIStencilValidater<T>;
#else
            StencilValidater<T>* validater = new SerialStencilValidater<T>;
#endif // defined(PARALLEL)
            validater->ValidateResult( expected,
                            data,
                            valErrThreshold,
                            nValErrsToPrint );
            delete validater;
        }
    }
    catch( ... )
    {
        // clean up - abnormal termination
        // wish we didn't have to do this, but C++ exceptions do not
        // support a try-catch-finally approach
        delete stdStencil;
        delete stdStencilFactory;
        delete testStencil;
        delete testStencilFactory;
        throw;
    }

    // clean up - normal termination
    delete stdStencil;
    delete stdStencilFactory;
    delete testStencil;
    delete testStencilFactory;
}
Example #8
File: main.cpp Project: ManavA/shoc
// ****************************************************************************
// Method:  main()
//
// Purpose:
//   serial and parallel main for OpenCL level0 benchmarks
//
// Arguments:
//   argc, argv
//
// Programmer:  SHOC Team
// Creation:    The Epoch
//
// Modifications:
//   Jeremy Meredith, Tue Jan 12 15:09:33 EST 2010
//   Changed the way device selection works.  It now defaults to the device
//   index corresponding to the process's rank within a node if no devices
//   are specified on the command line, and otherwise round-robins
//   the list of devices among the tasks.
//
//   Gabriel Marin, Tue Jun 01 15:38 EST 2010
//   Check that we have valid (not NULL) context and queue objects before
//   running the benchmarks. Errors inside CreateContextFromSingleDevice or
//   CreateCommandQueueForContextAndDevice were not propagated out to the main
//   program.
//
//   Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010
//   Split timing reports into detailed and summary.  For serial code, we
//   report all trial values, and for parallel, skip the per-process vals.
//   Also detect and print outliers from parallel runs.
//
// ****************************************************************************
int main(int argc, char *argv[])
{
    int ret = 0;

    try
    {
#ifdef PARALLEL
        int rank, size;
        MPI_Init(&argc,&argv);
        MPI_Comm_size(MPI_COMM_WORLD, &size);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        cout << "MPI Task "<< rank << "/" << size - 1 << " starting....\n";
#endif

        OptionParser op;

        //Add shared options to the parser
        op.addOption("platform", OPT_INT, "0", "specify OpenCL platform to use",
                'p');
        op.addOption("device", OPT_VECINT, "", "specify device(s) to run on", 'd');
        op.addOption("passes", OPT_INT, "10", "specify number of passes", 'n');
        op.addOption("size", OPT_VECINT, "1", "specify problem size", 's');
        op.addOption("infoDevices", OPT_BOOL, "",
                "show info for available platforms and devices", 'i');
        op.addOption("verbose", OPT_BOOL, "", "enable verbose output", 'v');
        op.addOption("quiet", OPT_BOOL, "", "write minimum necessary to standard output", 'q');

        addBenchmarkSpecOptions(op);

        if (!op.parse(argc, argv))
        {
#ifdef PARALLEL
            if (rank == 0)
                op.usage();
            MPI_Finalize();
#else
            op.usage();
#endif
            return (op.HelpRequested() ? 0 : 1 );
        }

        if (op.getOptionBool("infoDevices"))
        {
#define DEBUG_DEVICE_CONTAINER 0
#ifdef PARALLEL
            // execute following code only if I am the process of lowest
            // rank on this node
            NodeInfo NI;
            int mynoderank = NI.nodeRank();
            if (mynoderank==0)
            {
                int nlrrank, nlrsize;
                MPI_Comm nlrcomm = NI.getNLRComm();
                MPI_Comm_size(nlrcomm, &nlrsize);
                MPI_Comm_rank(nlrcomm, &nlrrank);

                OpenCLNodePlatformContainer ndc1;
                OpenCLMultiNodeContainer localMnc(ndc1);
                localMnc.doMerge (nlrrank, nlrsize, nlrcomm);
                if (rank==0)  // I am the global rank 0, print all configurations
                    localMnc.Print (cout);
            }
#else
            OpenCLNodePlatformContainer ndc1;
            ndc1.Print (cout);
#if DEBUG_DEVICE_CONTAINER
            OpenCLMultiNodeContainer mnc1(ndc1), mnc2;
            mnc1.Print (cout);
            ostringstream oss;
            mnc1.writeObject (oss);
            std::string temp(oss.str());
            cout << "Serialized MultiNodeContainer:\n" << temp;
            istringstream iss(temp);
            mnc2.readObject (iss);
            cout << "Unserialized object2:\n";
            mnc2.Print (cout);
            mnc1.merge (mnc2);
            cout << "==============\nObject1 after merging 1:\n";
            mnc1.Print (cout);
            mnc1.merge (mnc2);
            cout << "==============\nObject1 after merging 2:\n";
            mnc1.Print (cout);
#endif  // DEBUG
#endif  // PARALLEL
            return (0);
        }

        bool verbose = op.getOptionBool("verbose");

        // The device option supports specifying more than one device
        // for now, just choose the first one.
        int platform = op.getOptionInt("platform");

#ifdef PARALLEL
        NodeInfo ni;
        int myNodeRank = ni.nodeRank();
        if (verbose)
        cout << "Global rank "<<rank<<" is local rank "<<myNodeRank << endl;
#else
        int myNodeRank = 0;
#endif

        // If they haven't specified any devices, assume they
        // want the process with in-node rank N to use device N
        int deviceIdx = myNodeRank;

        // If they have, then round-robin the list of devices
        // among the processes on a node.
        vector<long long> deviceVec = op.getOptionVecInt("device");
        if (deviceVec.size() > 0)
        {
            int len = deviceVec.size();
            deviceIdx = deviceVec[myNodeRank % len];
        }

        // Check for an erroneous device
        if (deviceIdx >= GetNumOclDevices(platform)) {
            cerr << "Warning: device index: " << deviceIdx
                 << " out of range, defaulting to device 0.\n";
            deviceIdx = 0;
        }

        // Initialization
        if (verbose) cout << ">> initializing\n";
        cl_device_id devID = ListDevicesAndGetDevice(platform, deviceIdx);
        cl_int clErr;
        cl_context ctx = clCreateContext( NULL,     // properties
                                            1,      // number of devices
                                            &devID, // device
                                            NULL,   // notification function
                                            NULL,
                                            &clErr );
        CL_CHECK_ERROR(clErr);
        cl_command_queue queue = clCreateCommandQueue( ctx,
                                                        devID,
                                                        CL_QUEUE_PROFILING_ENABLE,
                                                        &clErr );
        CL_CHECK_ERROR(clErr);
        ResultDatabase resultDB;

        // Run the benchmark
        RunBenchmark(devID, ctx, queue, resultDB, op);

        clReleaseCommandQueue( queue );
        clReleaseContext( ctx );


#ifndef PARALLEL
        resultDB.DumpDetailed(cout);
#else
        ParallelResultDatabase pardb;
        pardb.MergeSerialDatabases(resultDB,MPI_COMM_WORLD);
        if (rank==0)
        {
            pardb.DumpSummary(cout);
            pardb.DumpOutliers(cout);
        }
#endif
    }
    catch( std::exception& e )
    {
        std::cerr << e.what() << std::endl;
        ret = 1;
    }
    catch( ... )
    {
        std::cerr << "unrecognized exception caught" << std::endl;
        ret = 1;
    }

#ifdef PARALLEL
    MPI_Finalize();
#endif

    return ret;
}
Example #9
void RunBenchmark(OptionParser &op, ResultDatabase &resultDB)
{
    const bool verbose = op.getOptionBool("verbose");

    // Sizes are in kb
    const int nSizes  = 17;
    int sizes[nSizes] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,
        32768, 65536};
    
    long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;

    // Create host memory
    hostMem = (float*)_mm_malloc(numMaxFloats*sizeof(float),ALIGN);

    if(hostMem==NULL)
    {
        cerr << "Couldn't allocate CPU memory! \n";
        cerr << "Test failed." << endl;
        return;
    }
    // Initialize memory with some pattern.
    for (int i = 0; i < numMaxFloats; i++)
    {
        hostMem[i] = i % 77;
    }

    const unsigned int passes = op.getOptionInt("passes");
    int micdev = op.getOptionInt("target");

    // Allocate memory on the card
    #pragma offload target(mic:micdev) \
    nocopy(hostMem:length(numMaxFloats) alloc_if(1) free_if(0) align(ALIGN) ) 
    {
    }

    // Run the requested number of passes, alternating direction each pass
    for (int pass = 0; pass < passes; pass++)
    {
        // Step through sizes forward on even passes and backward on odd
        for (int i = 0; i < nSizes; i++)
        {
            int sizeIndex;
            if ((pass % 2) == 0)
            {
                sizeIndex = i;
            }
            else
            {
                sizeIndex = (nSizes - 1) - i;
            }

            int nbytes = sizes[sizeIndex] * 1024;

            //  D->H test
            double start = curr_second();

            #pragma offload target(mic:micdev) \
            out(hostMem:length((1024*sizes[sizeIndex]/4)) \
            free_if(0) alloc_if(0)  )
            {
            }
            double t = curr_second()-start;

            if (verbose)
            {
                cerr << "Size " << sizes[sizeIndex] << "k took " << t <<
                    " sec\n";
            }
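            // sizes[] is in kB, so bytes = sizes[sizeIndex]*1024; dividing by
            // 1e9 and by the elapsed time in seconds gives decimal GB/s.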

            double speed = (double(sizes[sizeIndex]) * 1024 / 
                    (1000. * 1000. * 1000.)) / t;
            char sizeStr[256];
            sprintf(sizeStr, "% 6dkB", sizes[sizeIndex]);
            resultDB.AddResult("ReadbackSpeed", sizeStr, "GB/sec", speed);
            resultDB.AddResult("ReadbackTime", sizeStr, "ms", t*1000);
        }
    }
    // Free memory allocated on the mic
    #pragma offload target(mic:micdev) \
    in(hostMem:length(numMaxFloats) alloc_if(0)  )
    {
    }

    // Cleanup
    _mm_free(hostMem);
}
Example #10
void
DoTest( const char* timerDesc, ResultDatabase& resultDB, OptionParser& opts )
{
    StencilFactory<T>* stdStencilFactory = NULL;
    Stencil<T>* stdStencil = NULL;
    StencilFactory<T>* testStencilFactory = NULL;
    Stencil<T>* testStencil = NULL;

    //try
    {
        stdStencilFactory = new HostStencilFactory<T>;
        testStencilFactory = new MICStencilFactory<T>;
        assert( (stdStencilFactory != NULL) && (testStencilFactory != NULL) );

        // do a sanity check on option values
        CheckOptions( opts );
        stdStencilFactory->CheckOptions( opts );
        testStencilFactory->CheckOptions( opts );

        // extract and validate options
        std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" );
        if( arrayDims.size() != 2 )
        {
            cerr << "Dim size: " << arrayDims.size() << "\n";
            //throw InvalidArgValue( "all overall dimensions must be positive" );
        }
        if (arrayDims[0] == 0) // User has not specified a custom size
        {
            const int probSizes[4] = { 768, 1408, 2048, 4096 };
            int sizeClass = opts.getOptionInt("size");
            if (!(sizeClass >= 1 && sizeClass <= 4))
            {
                //throw InvalidArgValue( "Size class must be between 1-4" );
            }
            arrayDims[0] = arrayDims[1] = probSizes[sizeClass - 1];
        }

        long int seed = (long)opts.getOptionInt( "seed" );
        bool beVerbose = opts.getOptionBool( "verbose" );
        unsigned int nIters = (unsigned int)opts.getOptionInt( "num-iters" );
        double valErrThreshold = (double)opts.getOptionFloat( "val-threshold" );
        unsigned int nValErrsToPrint = (unsigned int)opts.getOptionInt( "val-print-limit" );

#if defined(PARALLEL)
        unsigned int haloWidth = (unsigned int)opts.getOptionInt( "iters-per-exchange" );
#else
        unsigned int haloWidth = 1;
#endif // defined(PARALLEL)

        float haloVal = (float)opts.getOptionFloat( "haloVal" );

        // build a description of this experiment
        std::ostringstream experimentDescriptionStr;
        experimentDescriptionStr 
            << nIters << ':'
            << arrayDims[0] << 'x' << arrayDims[1] << ':'
            << LROWS << 'x' << LCOLS;

        unsigned int nPasses = (unsigned int)opts.getOptionInt( "passes" );
        unsigned long npts = (arrayDims[0] + 2*haloWidth - 2) *
                             (arrayDims[1] + 2*haloWidth - 2);

        unsigned long nflops = npts * 11 * nIters;
        cout << "flops are = " << nflops << endl;

        // compute the expected result on the host
#if defined(PARALLEL)
        int cwrank;
        MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        std::cout << "\nPerforming stencil operation on host for later comparison with MIC output\n"
            << "Depending on host capabilities, this may take a while."
            << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)
        Matrix2D<T> exp( arrayDims[0] + 2*haloWidth, 
                            arrayDims[1] + 2*haloWidth );
        Initialize<T> init( seed,
                        haloWidth,
                        haloVal );	

        init( exp );
        if( beVerbose )
        {
            std::cout << "initial state:\n" << exp << std::endl;
        }
        stdStencil = stdStencilFactory->BuildStencil( opts );
        (*stdStencil)( exp, nIters );
        if( beVerbose )
        {
            std::cout << "expected result:\n" << exp << std::endl;
        }
	

        // compute the result on the MIC device
        Matrix2D<T> data( arrayDims[0] + 2*haloWidth, 
                            arrayDims[1] + 2*haloWidth );
        testStencil = testStencilFactory->BuildStencil( opts );

#if defined(PARALLEL)
        MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        std::cout << "\nPerforming stencil operation on chosen device, " 
            << nPasses << " passes.\n"
            << "Depending on chosen device, this may take a while."
            << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)

#if !defined(PARALLEL)
        std::cout << "At the end of each pass the number of validation\nerrors observed will be printed to the standard output." 
            << std::endl;
#endif // !defined(PARALLEL)
        std::cout << "Passes: " << nPasses << endl;
        for( unsigned int pass = 0; pass < nPasses; pass++ )
        {
            init( data );

            double start = curr_second();
            (*testStencil)( data, nIters );
            double elapsedTime = curr_second()-start;
            double gflops = (nflops / elapsedTime) / 1e9;
            resultDB.AddResult( timerDesc,
                                    experimentDescriptionStr.str(),
                                    "GFLOPS",
                                    gflops );
            if( beVerbose )
            {
                std::cout << "observed result, pass " << pass << ":\n" 
                    << data 
                    << std::endl;
            }

            // validate the result
#if defined(PARALLEL)
            //StencilValidater<T>* validater = new MPIStencilValidater<T>;
#else
            //StencilValidater<T>* validater = new SerialStencilValidater<T>;            
#endif // defined(PARALLEL)
            MICValidate( exp, data, valErrThreshold, nValErrsToPrint );
            /*validater->ValidateResult( exp,
                            data,
                            valErrThreshold,
                            nValErrsToPrint );*/
        }
    }
    /*
    catch( ... )
    {
        // clean up - abnormal termination
        // wish we didn't have to do this, but C++ exceptions do not 
        // support a try-catch-finally approach
        delete stdStencil;
        delete stdStencilFactory;
        delete testStencil;
        delete testStencilFactory;
        throw;
    }*/

    // clean up - normal termination
    delete stdStencil;
    delete stdStencilFactory;
    delete testStencil;
    delete testStencilFactory;
}
Example #11
// ****************************************************************************
// Function: RunBenchmark
//
// Purpose:
//   Executes a series of arithmetic benchmarks for OpenCL devices. 
//   OpenCL kernels are auto-generated based on the values in the
//   _benchmark_type structures.
//   The benchmark tests throughput for add, multiply, multiply-add and 
//   multiply+multiply-add series of operations, for 1, 2, 4 and 8
//   independent streams.
//
// Arguments:
//   ctx: the opencl context to use for the benchmark
//   queue: the opencl command queue to issue commands to
//   resultDB: results from the benchmark are stored in this db
//   op: the options parser (contains input parameters)
//
// Returns:  nothing
//
// Programmer: Gabriel Marin
// Creation: June 26, 2009
//
// Modifications:
//
// ****************************************************************************
void RunBenchmark(cl::Device& devcpp,
                  cl::Context& ctxcpp,
                  cl::CommandQueue& queuecpp,
                  ResultDatabase &resultDB,
                  OptionParser &op)
{
    // convert from C++ bindings to C bindings
    // TODO propagate use of C++ bindings
    cl_device_id id = devcpp();
    cl_context ctx = ctxcpp();
    cl_command_queue queue = queuecpp();

    int npasses  = op.getOptionInt("passes");
    bool verbose = op.getOptionBool("verbose");
    bool quiet = op.getOptionBool("quiet");


    int err;
    cl_mem mem1;
    float *hostMem, *hostMem2;
    size_t maxGroupSize = 1;
    size_t localWorkSize = 1;

    // Seed the random number generator
    srand48(8650341L);
    
    // To prevent this benchmark from taking too long to run, we 
    // calibrate how many repetitions of each test to execute. To do this we
    // run one pass through a multiply-add benchmark and then adjust
    // the repeat factor based on runtime. Use MAdd4 for this.
    int aIdx = 0;
    float repeatF = 1.0f;
    // Find the index of the MAdd4 benchmark
    while ((aTests!=0) && (aTests[aIdx].name!=0) && 
        strcmp(aTests[aIdx].name,"MAdd4"))
    {   
       aIdx += 1;
    }
    if (aTests && aTests[aIdx].name)  // we found a benchmark with that name
    {
       struct _benchmark_type temp = aTests[aIdx];
       // Use a small, fixed number of repetitions for the calibration run
       temp.numRepeats = 10;

       // Kernel will be generated into this stream
       ostringstream oss;
       generateKernel (oss, temp, "float", "");
       std::string kernelCode(oss.str());
       
       // Allocate host memory
       int halfNumFloatsMax = temp.halfBufSizeMax*1024/4;
       int numFloatsMax = 2*halfNumFloatsMax;
       hostMem = new float[numFloatsMax];
       hostMem2 = new float[numFloatsMax];
       
       // Allocate device memory
       mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
                                 sizeof(float)*numFloatsMax, NULL, &err);
       CL_CHECK_ERROR(err);

       err = clEnqueueWriteBuffer(queue, mem1, true, 0,
                               numFloatsMax*sizeof(float), hostMem,
                               0, NULL, NULL);
       CL_CHECK_ERROR(err);
       
       // Create the kernel program
       const char* progSource[] = {kernelCode.c_str()};
       cl_program prog = clCreateProgramWithSource(ctx, 1, progSource, 
                                                   NULL, &err);
       CL_CHECK_ERROR(err);
       
       // Compile the kernel
       err = clBuildProgram(prog, 0, NULL, opts, NULL, NULL);
       CL_CHECK_ERROR(err);
       
       // Extract out madd kernel 
       cl_kernel kernel_madd = clCreateKernel(prog, temp.name, &err);
       CL_CHECK_ERROR(err);
       
       // Set kernel arguments
       err = clSetKernelArg (kernel_madd, 0, sizeof(cl_mem), (void*)&mem1);
       CL_CHECK_ERROR (err);
       err = clSetKernelArg (kernel_madd, 1, sizeof(cl_int), 
                             (void*)&temp.numRepeats);
       CL_CHECK_ERROR (err);
       
       // Determine the maximum work group size for this kernel
       maxGroupSize = getMaxWorkGroupSize(id);
       // use min(maxWorkGroupSize, 128)
       localWorkSize = maxGroupSize < 128 ? maxGroupSize : 128;
       
       // Initialize host data, with the first half the same as the second
       for (int j=0; j<halfNumFloatsMax; ++j) 
       {
           hostMem[j] = hostMem[numFloatsMax-j-1] = (float)(drand48()*5.0);
       }
       // Set global work size
       size_t globalWorkSize = numFloatsMax;
       
       Event evCopyMem("CopyMem");
       err = clEnqueueWriteBuffer (queue, mem1, true, 0,
                                   numFloatsMax*sizeof(float), hostMem,
                                   0, NULL, &evCopyMem.CLEvent());
       CL_CHECK_ERROR (err);
       // Wait for transfer to finish      
       err = clWaitForEvents (1, &evCopyMem.CLEvent());
       CL_CHECK_ERROR (err);

       Event evKernel(temp.name);
       err = clEnqueueNDRangeKernel (queue, kernel_madd, 1, NULL,
                                 &globalWorkSize, &localWorkSize,
                                 0, NULL, &evKernel.CLEvent());
       CL_CHECK_ERROR (err);
       // Wait for kernel to finish      
       err = clWaitForEvents (1, &evKernel.CLEvent());
       CL_CHECK_ERROR (err);
             
       evKernel.FillTimingInfo();
       // Calculate repeat factor based on kernel runtime
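       // (SubmitEndRuntime presumably reports OpenCL profiling time in
       //  nanoseconds, so this targets roughly 11 ms of work per kernel.)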
       double tt = double(evKernel.SubmitEndRuntime());
       repeatF = 1.1e07 / tt;
       cout << "Adjust repeat factor = " << repeatF << endl;
       
       // Clean up
       err = clReleaseKernel (kernel_madd);
       CL_CHECK_ERROR(err);
       err = clReleaseProgram (prog);
       CL_CHECK_ERROR(err);
       err = clReleaseMemObject(mem1);
       CL_CHECK_ERROR(err);

       delete[] hostMem;
       delete[] hostMem2;
    }
    
    // Compute total number of kernel runs
    int totalRuns = 0;
    aIdx = 0;
    while ((aTests!=0) && (aTests[aIdx].name!=0))
    {
        for (int halfNumFloats=aTests[aIdx].halfBufSizeMin*1024 ; 
             halfNumFloats<=aTests[aIdx].halfBufSizeMax*1024 ;
             halfNumFloats*=aTests[aIdx].halfBufSizeStride)
        {
            totalRuns += npasses;
        }
        aIdx += 1;
    }
    // Account for custom kernels
    totalRuns += 2 * npasses; 
    
    // check for double precision support
    int hasDoubleFp = 0;
    string doublePragma = "";
    if (checkExtension(id, "cl_khr_fp64")) {
        hasDoubleFp = 1;
        doublePragma = "#pragma OPENCL EXTENSION cl_khr_fp64: enable";
    } else
    if (checkExtension(id, "cl_amd_fp64")) {
        hasDoubleFp = 1;
        doublePragma = "#pragma OPENCL EXTENSION cl_amd_fp64: enable";
    }
    
    // Double the number of passes if double precision support found
    if (hasDoubleFp) {
        cout << "DP Supported" << endl;
        totalRuns <<= 1;
    } else
        cout << "DP Not Supported" << endl;

    ProgressBar pb(totalRuns);
    if (!verbose && !quiet)
        pb.Show(stdout);

    RunTest<float> (id, ctx, queue, resultDB, npasses, verbose, quiet,
             repeatF, localWorkSize, pb, "float", "-SP", "");

    if (hasDoubleFp)
        RunTest<double> (id, ctx, queue, resultDB, npasses, verbose, quiet,
             repeatF, localWorkSize, pb, "double", "-DP", doublePragma.c_str());
    else
    {
        aIdx = 0;
        const char atts[] = "DP_Not_Supported";
        while ((aTests!=0) && (aTests[aIdx].name!=0))
        {   
            for (int pas=0 ; pas<npasses ; ++pas) 
            {
                resultDB.AddResult(string(aTests[aIdx].name)+"-DP" , atts, "GFLOPS", FLT_MAX);
            }
            aIdx += 1;
        }
        for (int pas=0 ; pas<npasses ; ++pas) 
        {
            resultDB.AddResult("MulMAddU-DP", atts, "GFLOPS", FLT_MAX);
            resultDB.AddResult("MAddU-DP", atts, "GFLOPS", FLT_MAX);
        }
    }
    if (!verbose)
        fprintf (stdout, "\n\n");
}
Example #12
void RunTest(const string& name, ResultDatabase &resultDB, OptionParser &op)
{
    static __declspec(target(mic)) T2 *source;
    int chk;
    unsigned long bytes = 0;
    const int micdev = op.getOptionInt("target");
    const bool verbose = op.getOptionBool("verbose");
    
    // Get problem size
    if (op.getOptionInt("MB") == 0) {
        int probSizes[4] = { 1, 8, 96, 256 };
        int sizeIndex = op.getOptionInt("size")-1;
        if (sizeIndex < 0 || sizeIndex >= 4) {
            cerr << "Invalid size index specified\n";
            exit(-1);
        }
        bytes = probSizes[sizeIndex];
    } else {
        bytes = op.getOptionInt("MB");
    }
    // Convert to MiB
    bytes *= 1024 * 1024;

    int passes = op.getOptionInt("passes");

    // The size of the transform computed is fixed at 512 complex elements
    int fftsz = 512;        
    int N = (bytes)/sizeof(T2);
    int n_ffts = N/fftsz;

    // Allocate space (aligned)
    source = (T2*) MKL_malloc(bytes,  4096);

    //allocate buffers and create FFT plans
    #pragma offload target(mic:micdev) in(fftsz, n_ffts)       \
                                       nocopy(source:length(N) \
                                       align(4096) alloc_if(1) free_if(0))
    {
        forward((T2*)NULL, fftsz, n_ffts);
        inverse((T2*)NULL, fftsz, n_ffts);
    }

    const char *sizeStr;
    stringstream ss;
    ss << "N=" << (long)N;
    sizeStr = strdup(ss.str().c_str());

    for(int k = 0; k < passes; k++)
    {
        init<T2>( source, fftsz, n_ffts );
        // Warmup
        if (k==0)
        {
            #pragma offload target(mic:micdev) in(fftsz, n_ffts)   \
                                               in(source:length(N) \
                                               alloc_if(0)  free_if(0))
            {
                forward(source, fftsz, n_ffts);
            }
        }

        // Time forward fft with data transfer over PCIe
        double time_fwd_pcie = -curr_second();
        
        // Using in rather than inout to be consistent with CUDA version.
        #pragma offload target(mic:micdev) in(fftsz, n_ffts)               \
                                           in(source:length(N) alloc_if(0) \
                                           free_if(0))
        {
            forward(source, fftsz, n_ffts);
        }
        time_fwd_pcie += curr_second();
        #pragma offload target(mic:micdev) out(source:length(N) alloc_if(0) \
                free_if(0))
        {
        }

        // Time inverse fft with data transfer over PCIe
        double time_inv_pcie = -curr_second();
        #pragma offload target(mic:micdev) in(fftsz, n_ffts)   \
                                           in(source:length(N) \
                                           alloc_if(0)  free_if(0))
        {
            inverse(source, fftsz, n_ffts);
        }
        time_inv_pcie += curr_second();

        #pragma offload target(mic:micdev) out(source:length(N) alloc_if(0) \
            free_if(0))
        {}

        // Check result
        #pragma offload target(mic:micdev) in(fftsz,n_ffts) nocopy(source) \
            out(chk)
        {
            chk = checkDiff(source, fftsz, n_ffts);
        }
        if (verbose || chk)
        {
            cout << "Test " << k << ((chk) ? ": Failed\n" : ": Passed\n");
        }

        // Time forward fft without data transfer
        double time_fwd_native = -curr_second();
        #pragma offload target(mic:micdev) in(fftsz, n_ffts) nocopy(source)
        {
            forward(source, fftsz, n_ffts);
        }
        time_fwd_native += curr_second();

        // Time inverse fft without data transfer
        double time_inv_native = -curr_second();
        #pragma offload target(mic:micdev) in(fftsz, n_ffts) nocopy(source)
        {
            inverse(source, fftsz, n_ffts);
        }
        time_inv_native += curr_second();

        // Calculate gflops
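        // 5*N*log2(N) is the conventional flop-count estimate for a length-N
        // complex FFT; e.g. for fftsz = 512 that is 5*512*9 = 23040 flops per
        // transform, multiplied by the n_ffts transforms in the batch.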
        double flop_count    = n_ffts*(5*fftsz*log2(fftsz));
        double GF_fwd_pcie   = flop_count / (time_fwd_pcie   * 1e9);
        double GF_fwd_native = flop_count / (time_fwd_native * 1e9);
        double GF_inv_pcie   = flop_count / (time_inv_pcie   * 1e9);
        double GF_inv_native = flop_count / (time_inv_native * 1e9);

        resultDB.AddResult(name, sizeStr, "GFLOPS", GF_fwd_native);
        resultDB.AddResult(name+"_PCIe", sizeStr, "GFLOPS", GF_fwd_pcie);
        resultDB.AddResult(name+"_Parity", sizeStr, "N", 
                (time_fwd_pcie - time_fwd_native) / time_fwd_native);

        resultDB.AddResult(name+"-INV", sizeStr, "GFLOPS", GF_inv_native);
        resultDB.AddResult(name+"-INV_PCIe", sizeStr, "GFLOPS", GF_inv_pcie);
        resultDB.AddResult(name+"-INV_Parity", sizeStr, "N", 
                (time_inv_pcie - time_inv_native) / time_inv_native);
    }

    // Cleanup FFT plans and buffers
    #pragma offload target(mic:micdev) nocopy(source:length(N) \
            alloc_if(0) free_if(1))
    {
        forward((T2*)NULL, 0, 0);
        inverse((T2*)NULL, 0, 0);
    }
    MKL_free(source);
}
Example #13
// ****************************************************************************
// Function: main
//
// Purpose:
//   The main function takes care of initialization (device and MPI),  then
//   performs the benchmark and prints results.
//
// Arguments:
//
//
// Programmer: Jeremy Meredith
// Creation:
//
// Modifications:
//   Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010
//   Split timing reports into detailed and summary.  For serial code, we
//   report all trial values, and for parallel, skip the per-process vals.
//   Also detect and print outliers from parallel runs.
//
// ****************************************************************************
int main(int argc, char *argv[])
{
    int ret = 0;
    bool noprompt = false;

    try
    {
#ifdef PARALLEL
        int rank, size;
        MPI_Init(&argc,&argv);
        MPI_Comm_size(MPI_COMM_WORLD, &size);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        cerr << "MPI Task " << rank << "/" << size - 1 << " starting....\n";
#endif

        // Get args
        OptionParser op;
       
        //Add shared options to the parser
        op.addOption("device", OPT_VECINT, "0",
                "specify device(s) to run on", 'd');
        op.addOption("verbose", OPT_BOOL, "", "enable verbose output", 'v');
        op.addOption("passes", OPT_INT, "10", "specify number of passes", 'n');
        op.addOption("size", OPT_INT, "1", "specify problem size", 's');
        op.addOption("infoDevices", OPT_BOOL, "",
                "show info for available platforms and devices", 'i');
        op.addOption("quiet", OPT_BOOL, "", "write minimum necessary to standard output", 'q');
#ifdef _WIN32
        op.addOption("noprompt", OPT_BOOL, "", "don't wait for prompt at program exit");
#endif

        addBenchmarkSpecOptions(op);

        if (!op.parse(argc, argv))
        {
#ifdef PARALLEL
            if (rank == 0)
                op.usage();
            MPI_Finalize();
#else
            op.usage();
#endif
            return (op.HelpRequested() ? 0 : 1);
        }
        
        bool verbose = op.getOptionBool("verbose");
        bool infoDev = op.getOptionBool("infoDevices");
#ifdef _WIN32
        noprompt = op.getOptionBool("noprompt");
#endif

        int device;
#ifdef PARALLEL
        NodeInfo ni;
        int myNodeRank = ni.nodeRank();
        vector<long long> deviceVec = op.getOptionVecInt("device");
        if (myNodeRank >= deviceVec.size()) {
            // Default is for task i to test device i
            device = myNodeRank;
        } else {
            device = deviceVec[myNodeRank];
        }
#else
        device = op.getOptionVecInt("device")[0];
#endif
        int deviceCount;
        cudaGetDeviceCount(&deviceCount);
        if (device >= deviceCount) {
            cerr << "Warning: device index: " << device <<
            " out of range, defaulting to device 0.\n";
            device = 0;
        }

        // Initialization
        EnumerateDevicesAndChoose(device, infoDev);
        if( infoDev )
        {
            return 0;
        }
        ResultDatabase resultDB;

        // Run the benchmark
        RunBenchmark(resultDB, op);

#ifndef PARALLEL
        resultDB.DumpDetailed(cout);
#else
        ParallelResultDatabase pardb;
        pardb.MergeSerialDatabases(resultDB,MPI_COMM_WORLD);
        if (rank==0)
        {
            pardb.DumpSummary(cout);
            pardb.DumpOutliers(cout);
        }
#endif

    }
    catch( InvalidArgValue& e )
    {
        std::cerr << e.what() << ": " << e.GetMessage() << std::endl;
        ret = 1;
    }
    catch( std::exception& e )
    {
        std::cerr << e.what() << std::endl;
        ret = 1;
    }
    catch( ... )
    {
        ret = 1;
    }


#ifdef PARALLEL
    MPI_Finalize();
#endif

#ifdef _WIN32
    if (!noprompt)
    {
        cout << "Press return to exit\n";
        cin.get();
    }
#endif

    return ret;
}
Example #14
void 
init(OptionParser& op, bool _do_dp)
{
    cl_int err;

    do_dp = _do_dp;
    
    if (!fftCtx) {
        // first get the device
        int device, platform = op.getOptionInt("platform");
        if (op.getOptionVecInt("device").size() > 0) {
            device = op.getOptionVecInt("device")[0];
        }
        else {
            device = 0;
        }
        fftDev = ListDevicesAndGetDevice(platform, device);

        // now get the context
        fftCtx = clCreateContext(NULL, 1, &fftDev, NULL, NULL, &err);
        CL_CHECK_ERROR(err);
    }

    if (!fftQueue) {
        // get a queue
        fftQueue = clCreateCommandQueue(fftCtx, fftDev, CL_QUEUE_PROFILING_ENABLE,
                                        &err);
        CL_CHECK_ERROR(err);
    }
    
    // create the program...
    fftProg = clCreateProgramWithSource(fftCtx, 1, &cl_source_fft, NULL, &err);
    CL_CHECK_ERROR(err);
    
    // ...and build it
    string args = " -cl-mad-enable ";
    if (op.getOptionBool("use-native")) {
        args += " -cl-fast-relaxed-math ";
    }
    if (!do_dp) {
        args += " -DSINGLE_PRECISION ";
    }
    else if (checkExtension(fftDev, "cl_khr_fp64")) {
        args += " -DK_DOUBLE_PRECISION ";
    }
    else if (checkExtension(fftDev, "cl_amd_fp64")) {
        args += " -DAMD_DOUBLE_PRECISION ";
    }
    err = clBuildProgram(fftProg, 0, NULL, args.c_str(), NULL, NULL);
    {
        // Dump the build log; use a separate status variable so the build
        // result stored in 'err' is not clobbered before the check below.
        char* log = NULL;
        size_t bytesRequired = 0;
        cl_int logErr = clGetProgramBuildInfo(fftProg,
                                              fftDev,
                                              CL_PROGRAM_BUILD_LOG,
                                              0,
                                              NULL,
                                              &bytesRequired );
        log = (char*)malloc( bytesRequired + 1 );
        logErr = clGetProgramBuildInfo(fftProg,
                                       fftDev,
                                       CL_PROGRAM_BUILD_LOG,
                                       bytesRequired,
                                       log,
                                       NULL );
        (void)logErr;  // log retrieval failures are not fatal here
        std::cout << log << std::endl;
        free( log );
    }
    if (err != CL_SUCCESS) {
        char log[50000];
        size_t retsize = 0;
        err = clGetProgramBuildInfo(fftProg, fftDev, CL_PROGRAM_BUILD_LOG,
                                    50000*sizeof(char),  log, &retsize);
        CL_CHECK_ERROR(err);
        cout << "Retsize: " << retsize << endl;
        cout << "Log: " << log << endl;
        dumpPTXCode(fftCtx, fftProg, "oclFFT");
        exit(-1);
    }
    else {
        // dumpPTXCode(fftCtx, fftProg, "oclFFT");
    }

    // Create kernel for forward FFT
    fftKrnl = clCreateKernel(fftProg, "fft1D_512", &err);
    CL_CHECK_ERROR(err);
    // Create kernel for inverse FFT
    ifftKrnl = clCreateKernel(fftProg, "ifft1D_512", &err);
    CL_CHECK_ERROR(err);
    // Create kernel for check
    chkKrnl = clCreateKernel(fftProg, "chk1D_512", &err);
    CL_CHECK_ERROR(err);
}
Example #15
//  Modifications:
//    Jeremy Meredith, Wed Dec  1 17:05:27 EST 2010
//    Added calculation of latency estimate.
//
void RunBenchmark(cl::Device& devcpp,
                  cl::Context& ctxcpp,
                  cl::CommandQueue& queuecpp,
                  ResultDatabase &resultDB,
                  OptionParser &op)
{
    // convert from C++ bindings to C bindings
    // TODO propagate use of C++ bindings
    cl_device_id id = devcpp();
    cl_context ctx = ctxcpp();
    cl_command_queue queue = queuecpp();

    bool verbose = op.getOptionBool("verbose");
    bool pinned = !op.getOptionBool("nopinned");
    int  npasses = op.getOptionInt("passes");
    const bool waitForEvents = true;

    // Sizes are in kb
    int nSizes  = 20;
    int sizes[20] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,
		     32768,65536,131072,262144,524288};

    // Make sure we don't surpass the OpenCL limit.
    cl_long maxAllocSizeBytes = 0;
    clGetDeviceInfo(id, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
                    sizeof(cl_long), &maxAllocSizeBytes, NULL);
    while (sizes[nSizes-1]*1024 > 0.90 * maxAllocSizeBytes)
    {
        --nSizes;
        if (verbose) cout << " - dropping allocation size to keep under reported limit.\n";
        if (nSizes < 1)
        {
            cerr << "Error: OpenCL reported a max allocation size less than 1kB.\b";
            return;
        }
    }

    // Create some host memory pattern
    if (verbose) cout << ">> creating host mem pattern\n";
    int err;
    float *hostMem;
    cl_mem hostMemObj;
    long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
    if (pinned)
    {
        hostMemObj = clCreateBuffer(ctx,
                                    CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                                    sizeof(float)*numMaxFloats, NULL, &err);
        if (err == CL_SUCCESS)
        {
            hostMem = (float*)clEnqueueMapBuffer(queue, hostMemObj, true,
                                                 CL_MAP_READ|CL_MAP_WRITE,
                                                 0,sizeof(float)*numMaxFloats,0,
                                                 NULL,NULL,&err);
        }
        while (err != CL_SUCCESS)
        {
            // drop the size and try again
            if (verbose) cout << " - dropping size allocating pinned mem\n";
            --nSizes;
            if (nSizes < 1)
            {
                cerr << "Error: Couldn't allocate any pinned buffer\n";
                return;
            }
            numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
            hostMemObj = clCreateBuffer(ctx,
                                        CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                                        sizeof(float)*numMaxFloats, NULL, &err);
            if (err == CL_SUCCESS)
            {
                hostMem = (float*)clEnqueueMapBuffer(queue, hostMemObj, true,
                                                     CL_MAP_READ|CL_MAP_WRITE,
                                                     0,sizeof(float)*numMaxFloats,0,
                                                     NULL,NULL,&err);
            }
        }
    }
    else
    {
        hostMem = new float[numMaxFloats];
    }
    for (int i=0; i<numMaxFloats; i++)
        hostMem[i] = i % 77;

    // Allocate some device memory
    if (verbose) cout << ">> allocating device mem\n";
    cl_mem mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
                                 sizeof(float)*numMaxFloats, NULL, &err);
    while (err != CL_SUCCESS)
    {
        // drop the size and try again
        if (verbose) cout << " - dropping size allocating device mem\n";
        --nSizes;
        if (nSizes < 1)
        {
            cerr << "Error: Couldn't allocate any device buffer\n";
            return;
        }
        numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
        mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
                              sizeof(float)*numMaxFloats, NULL, &err);
    }
    if (verbose) cout << ">> filling device mem to force allocation\n";
    Event evDownloadPrime("DownloadPrime");
    err = clEnqueueWriteBuffer(queue, mem1, false, 0,
                               numMaxFloats*sizeof(float), hostMem,
                               0, NULL, &evDownloadPrime.CLEvent());
    CL_CHECK_ERROR(err);
    if (verbose) cout << ">> waiting for download to finish\n";
    err = clWaitForEvents(1, &evDownloadPrime.CLEvent());
    CL_CHECK_ERROR(err);

    // Run the requested number of passes, alternating direction each pass
    for (int pass = 0; pass < npasses; pass++)
    {
        // store the times temporarily to estimate latency
        //float times[nSizes];
        // Step through sizes forward on even passes and backward on odd
        for (int i = 0; i < nSizes; i++)
        {
            int sizeIndex;
            if ((pass%2) == 0)
                sizeIndex = i;
            else
                sizeIndex = (nSizes-1) - i;

            // Copy input memory to the device
            if (verbose) cout << ">> copying to device "<<sizes[sizeIndex]<<"kB\n";
            Event evDownload("Download");
            err = clEnqueueWriteBuffer(queue, mem1, false, 0,
                                       sizes[sizeIndex]*1024, hostMem,
                                       0, NULL, &evDownload.CLEvent());
            CL_CHECK_ERROR(err);

            // Wait for event to finish
            if (verbose) cout << ">> waiting for download to finish\n";
            err = clWaitForEvents(1, &evDownload.CLEvent());
            CL_CHECK_ERROR(err);

            if (verbose) cout << ">> finish!";
            if (verbose) cout << endl;

            // Get timings
            err = clFlush(queue);
            CL_CHECK_ERROR(err);
            evDownload.FillTimingInfo();
            if (verbose) evDownload.Print(cerr);

            double t = evDownload.SubmitEndRuntime() / 1.e6; // in ms
            //times[sizeIndex] = t;

            // Add timings to database
            double speed = (double(sizes[sizeIndex] * 1024.) /  (1000.*1000.)) / t;
            char sizeStr[256];
            sprintf(sizeStr, "% 7dkB", sizes[sizeIndex]);
            resultDB.AddResult("DownloadSpeed", sizeStr, "GB/sec", speed);

            // Add timings to database
            double delay = evDownload.SubmitStartDelay() / 1.e6;
            resultDB.AddResult("DownloadDelay", sizeStr, "ms", delay);
            resultDB.AddResult("DownloadTime", sizeStr, "ms", t);
        }
	//resultDB.AddResult("DownloadLatencyEstimate", "1-2kb", "ms", times[0]-(times[1]-times[0])/1.);
	//resultDB.AddResult("DownloadLatencyEstimate", "1-4kb", "ms", times[0]-(times[2]-times[0])/3.);
	//resultDB.AddResult("DownloadLatencyEstimate", "2-4kb", "ms", times[1]-(times[2]-times[1])/1.);
    }

    // Cleanup
    err = clReleaseMemObject(mem1);
    CL_CHECK_ERROR(err);
    if (pinned)
    {
        err = clReleaseMemObject(hostMemObj);
        CL_CHECK_ERROR(err);
    }
    else
    {
        delete[] hostMem;
    }
}