Ejemplos de OptionParser::getOptionVecInt en C++ (Cpp)

Lenguaje de programación: C++ (Cpp)

Clase / Tipo: OptionParser

Método / Función: getOptionVecInt

Ejemplos en hotexamples.com: 11

C++ (Cpp) OptionParser::getOptionVecInt - 11 ejemplos encontrados. Estos son los ejemplos en C++ (Cpp) del mundo real mejor valorados de OptionParser::getOptionVecInt extraídos de proyectos de código abierto. Puedes valorar ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Métodos usados con frecuencia

Mostrar Ocultar

getOptionInt(30)

setRequired(22)

fail(22)

getString(22)

Parse(21)

countOptions(21)

AddOption(20)

addOption(19)

setAllowed(17)

parse(17)

getOptionBool(15)

setDefaultArguments(14)

HelpDisplayed(14)

getBool(13)

usage(12)

getOptionVecInt(11)

getInt(10)

autoExtendOptions(9)

getDouble(8)

readArgv(8)

readFile(8)

parse_args(7)

add_option(7)

error(6)

getOptionFloat(5)

executable(5)

value(5)

bad(5)

getErrors(3)

AppendOptions(3)

DisplayHelp(3)

setShortOptionEquivalent(3)

args(3)

checkOptions(3)

ParseOptions(3)

PrintOptionHelp(3)

help_requested(2)

option_missing(2)

option_missing_message(2)

print_help(2)

about_message(2)

about_requested(2)

HelpRequested(2)

GetErrMsg(2)

getStringVector(2)

help_message(2)

add_option_group(2)

getOptionString(2)

add_opt(2)

add(1)

Ejemplo n.º 1

Mostrar archivo

Archivo: OCLDriver.cpp Proyecto: Poojachitral/shoc

// ****************************************************************************
// Function: GPUSetup
//
// Purpose:
//  do the necessary OpenCL setup for GPU part of the test
//
// Arguments:
//   op: the options parser / parameter database
//   mympirank: for printing errors in case of failure
//   mynoderank: this is typically the device ID (the mapping done in main)
//
// Returns: success/failure
//
// Creation: 2009
//
// Modifications:
//
// ****************************************************************************
//
int GPUSetup(OptionParser &op, int mympirank, int mynoderank)
{
    addBenchmarkSpecOptions(op);
    
    if (op.getOptionBool("infoDevices"))
    {
        OpenCLNodePlatformContainer ndc1;
        ndc1.Print (cout);
        return (0);
    }
    
    // The device option supports specifying more than one device
    int platform = op.getOptionInt("platform");
    int deviceIdx = mynoderank;
    if( deviceIdx >= op.getOptionVecInt( "device" ).size() )
    {
        std::ostringstream estr;
        estr << "Warning: not enough devices specified with --device flag for task "
            << mympirank
            << " ( node rank " << mynoderank
            << ") to claim its own device; forcing to use first device ";
        std::cerr << estr.str() << std::endl;
        deviceIdx = 0;
    }
    int device = op.getOptionVecInt("device")[deviceIdx];

    // Initialization
    _mpicontention_ocldev = new cl::Device( ListDevicesAndGetDevice(platform, device) );
    std::vector<cl::Device> ctxDevices;
    ctxDevices.push_back( *_mpicontention_ocldev );
    _mpicontention_ocldriver_ctx   = new cl::Context( ctxDevices );
    _mpicontention_ocldriver_queue = new cl::CommandQueue( *_mpicontention_ocldriver_ctx, *_mpicontention_ocldev, CL_QUEUE_PROFILING_ENABLE );
    _mpicontention_gpuop = op;
    return 0;
}

Ejemplo n.º 2

Mostrar archivo

Archivo: Stencil2Dmain.cpp Proyecto: adityaatluri/shoc-1

// validate stencil-independent values
void
CheckOptions( const OptionParser& opts )
{
    // check matrix dimensions - must be 2d, must be positive
    std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" );
    if( arrayDims.size() != 2 )
    {
        throw InvalidArgValue( "overall size must have two dimensions" );
    }
    if( (arrayDims[0] < 0) || (arrayDims[1] < 0) )
    {
        throw InvalidArgValue( "each size dimension must be positive" );
    }

    // validation error threshold must be positive
    float valThreshold = opts.getOptionFloat( "val-threshold" );
    if( valThreshold <= 0.0f )
    {
        throw InvalidArgValue( "validation threshold must be positive" );
    }

    // number of validation errors to print must be non-negative
    int nErrsToPrint = opts.getOptionInt( "val-print-limit" );
    if( nErrsToPrint < 0 )
    {
        throw InvalidArgValue( "number of validation errors to print must be non-negative" );
    }

    int nWarmupPasses = opts.getOptionInt( "warmupPasses" );
    if( nWarmupPasses < 0 )
    {
        throw InvalidArgValue( "number of warmup passes must be non-negative" );
    }
}

Ejemplo n.º 3

Mostrar archivo

Archivo: CommonMICStencilFactory.cpp Proyecto: optimus-prime/shoc-mic

void 
CommonMICStencilFactory<T>::CheckOptions( const OptionParser& opts ) const
{
    // let base class check its options first
    StencilFactory<T>::CheckOptions( opts );

    // check our options
    std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" );
    assert( arrayDims.size() == 2 );

    // If both of these are zero, we're using a non-custom size, skip this test
    if (arrayDims[0] == 0 && arrayDims[0] == 0)
    {
        return;
    }

    size_t gRows = (size_t)arrayDims[0];
    size_t gCols = (size_t)arrayDims[1];
    size_t lRows = LROWS;
    size_t lCols = LCOLS;

    // verify that local dimensions evenly divide global dimensions
    if( ((gRows % lRows) != 0) || (lRows > gRows) )
    {
        throw InvalidArgValue( "number of rows must be even multiple of lsize rows" );
    }
    if( ((gCols % lCols) != 0) || (lCols > gCols) )
    {
        throw InvalidArgValue( "number of columns must be even multiple of lsize columns" );
    }

    // TODO ensure local dims are smaller than CUDA implementation limits
}

Ejemplo n.º 4

Mostrar archivo

Archivo: MPICUDAStencilFactory.cpp Proyecto: BenjaminCoquelle/shoc

void
MPICUDAStencilFactory<T>::CheckOptions( const OptionParser& opts ) const
{
    // let base class check its options first
    CommonCUDAStencilFactory<T>::CheckOptions( opts );

    // check our options
    std::vector<long long> shDims = opts.getOptionVecInt( "lsize" );
    std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" );
    if( arrayDims[0] == 0 )
    {
        // custom size was not specified - we are using a standard size
        int sizeClass = opts.getOptionInt("size");
        arrayDims = StencilFactory<T>::GetStandardProblemSize( sizeClass );
    }
    assert( shDims.size() == 2 );
    assert( arrayDims.size() == 2 );

    size_t gRows = (size_t)arrayDims[0];
    size_t gCols = (size_t)arrayDims[1];
    size_t lRows = shDims[0];
    size_t lCols = shDims[1];

    unsigned int haloWidth = (unsigned int)opts.getOptionInt( "iters-per-exchange" );

    // verify that MPI halo width will result in a matrix being passed
    // to the kernel that also has its global size as a multiple of
    // the local work size
    //
    // Because the MPI halo width is arbitrary, and the kernel halo width
    // is always 1, we have to ensure that:
    //   ((size + 2*halo) - 2) % lsize == 0
    if( (((gRows + 2*haloWidth) - 2) % lRows) != 0 )
    {
        throw InvalidArgValue( "rows including halo must be even multiple of lsize (e.g., lsize rows evenly divides ((rows + 2*halo) - 2) )" );
    }
    if( (((gCols + 2*haloWidth) - 2) % lCols) != 0 )
    {
        throw InvalidArgValue( "columns including halo must be even multiple of lsize (e.g., lsize cols evenly divides ((cols + 2*halo) - 2) )" );
    }
}

Ejemplo n.º 5

Mostrar archivo

Archivo: CommonMICStencilFactory.cpp Proyecto: optimus-prime/shoc-mic

void
CommonMICStencilFactory<T>::ExtractOptions( const OptionParser& options,
                                            T& wCenter,
                                            T& wCardinal,
                                            T& wDiagonal,
                                            std::vector<long long>& devices )
{
    // let base class extract its options
    StencilFactory<T>::ExtractOptions( options, wCenter, wCardinal, wDiagonal );

    // extract our options
    // with hardcoded lsize, we no longer have any to extract

    // determine which device to use
    // We would really prefer this to be done in main() but 
    // since BuildStencil is a virtual function, we cannot change its
    // signature, and OptionParser provides no way to override an
    // option's value after it is set during parsing.
    devices = options.getOptionVecInt("device");
}

Ejemplo n.º 6

Mostrar archivo

Archivo: Stencil2Dmain.cpp Proyecto: adityaatluri/shoc-1

void
DoTest( const char* timerDesc, ResultDatabase& resultDB, OptionParser& opts )
{
    StencilFactory<T>* stdStencilFactory = NULL;
    Stencil<T>* stdStencil = NULL;
    StencilFactory<T>* testStencilFactory = NULL;
    Stencil<T>* testStencil = NULL;

    try
    {
#if defined(PARALLEL)
        stdStencilFactory = new MPIHostStencilFactory<T>;
        testStencilFactory = new MPICUDAStencilFactory<T>;
#else
        stdStencilFactory = new HostStencilFactory<T>;
        testStencilFactory = new CUDAStencilFactory<T>;
#endif // defined(PARALLEL)
        assert( (stdStencilFactory != NULL) && (testStencilFactory != NULL) );

        // do a sanity check on option values
        CheckOptions( opts );
        stdStencilFactory->CheckOptions( opts );
        testStencilFactory->CheckOptions( opts );

        // extract and validate options
        std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" );
        if( arrayDims.size() != 2 )
        {
            cerr << "Dim size: " << arrayDims.size() << "\n";
            throw InvalidArgValue( "all overall dimensions must be positive" );
        }
        if (arrayDims[0] == 0) // User has not specified a custom size
        {
            int sizeClass = opts.getOptionInt("size");
            arrayDims = StencilFactory<T>::GetStandardProblemSize( sizeClass );
        }
        long int seed = (long)opts.getOptionInt( "seed" );
        bool beVerbose = opts.getOptionBool( "verbose" );
        unsigned int nIters = (unsigned int)opts.getOptionInt( "num-iters" );
        double valErrThreshold = (double)opts.getOptionFloat( "val-threshold" );
        unsigned int nValErrsToPrint = (unsigned int)opts.getOptionInt( "val-print-limit" );

#if defined(PARALLEL)
        unsigned int haloWidth = (unsigned int)opts.getOptionInt( "iters-per-exchange" );
#else
        unsigned int haloWidth = 1;
#endif // defined(PARALLEL)

        float haloVal = (float)opts.getOptionFloat( "haloVal" );

        // build a description of this experiment
        std::vector<long long> lDims = opts.getOptionVecInt( "lsize" );
        assert( lDims.size() == 2 );
        std::ostringstream experimentDescriptionStr;
        experimentDescriptionStr
            << nIters << ':'
            << arrayDims[0] << 'x' << arrayDims[1] << ':'
            << lDims[0] << 'x' << lDims[1];

        unsigned int nPasses = (unsigned int)opts.getOptionInt( "passes" );
        unsigned int nWarmupPasses = (unsigned int)opts.getOptionInt( "warmupPasses" );


        // compute the expected result on the host
        // or read it from a pre-existing file
        std::string matrixFilenameBase = (std::string)opts.getOptionString( "expMatrixFile" );
#if defined(PARALLEL)
        int cwrank;
        MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        if( !matrixFilenameBase.empty() )
        {
            std::cout << "\nReading expected stencil operation result from file for later comparison with CUDA output\n"
                << std::endl;
        }
        else
        {
            std::cout << "\nPerforming stencil operation on host for later comparison with CUDA output\n"
                << "Depending on host capabilities, this may take a while."
                << std::endl;
        }
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)
        Matrix2D<T> expected( arrayDims[0] + 2*haloWidth,
                            arrayDims[1] + 2*haloWidth );
        Initialize<T> init( seed, haloWidth, haloVal );

        bool haveExpectedData = false;
        if( ! matrixFilenameBase.empty() )
        {
            bool readOK = ReadMatrixFromFile( expected, GetMatrixFileName<T>( matrixFilenameBase ) );
            if( readOK )
            {

                if( (expected.GetNumRows() != arrayDims[0] + 2*haloWidth) ||
                    (expected.GetNumColumns() != arrayDims[1] + 2*haloWidth) )
                {
                    std::cerr << "The matrix read from file \'"
                        << GetMatrixFileName<T>( matrixFilenameBase )
                        << "\' does not match the matrix size specified on the command line.\n";
                    expected.Reset( arrayDims[0] + 2*haloWidth, arrayDims[1] + 2*haloWidth );
                }
                else
                {
                    haveExpectedData = true;
                }
            }

            if( !haveExpectedData )
            {
                std::cout << "\nSince we could not read the expected matrix values,\nperforming stencil operation on host for later comparison with CUDA output.\n"
                    << "Depending on host capabilities, this may take a while."
                    << std::endl;
            }
        }
        if( !haveExpectedData )
        {
            init( expected );
            haveExpectedData = true;
            if( beVerbose )
            {
                std::cout << "initial state:\n" << expected << std::endl;
            }
            stdStencil = stdStencilFactory->BuildStencil( opts );
            (*stdStencil)( expected, nIters );
        }
        if( beVerbose )
        {
            std::cout << "expected result:\n" << expected << std::endl;
        }

        // determine whether we are to save the expected matrix values to a file
        // to speed up future runs
        matrixFilenameBase = (std::string)opts.getOptionString( "saveExpMatrixFile" );
        if( !matrixFilenameBase.empty() )
        {
            SaveMatrixToFile( expected, GetMatrixFileName<T>( matrixFilenameBase ) );
        }
        assert( haveExpectedData );


        // compute the result on the CUDA device
        Matrix2D<T> data( arrayDims[0] + 2*haloWidth,
                            arrayDims[1] + 2*haloWidth );
        Stencil<T>* testStencil = testStencilFactory->BuildStencil( opts );

        // Compute the number of floating point operations we will perform.
        //
        // Note: in the truly-parallel case, we count flops for redundant
        // work due to the need for a halo.
        // But we do not add to the count for the local 1-wide halo since
        // we aren't computing new values for those items.
        unsigned long npts = (arrayDims[0] + 2*haloWidth - 2) *
                            (arrayDims[1] + 2*haloWidth - 2);
#if defined(PARALLEL)
        MPICUDAStencil<T>* mpiTestStencil = static_cast<MPICUDAStencil<T>*>( testStencil );
        assert( mpiTestStencil != NULL );
        int participating = mpiTestStencil->ParticipatingInProgram() ? 1 : 0;
        int numParticipating = 0;
        MPI_Allreduce( &participating,      // src
                        &numParticipating,  // dest
                        1,                  // count
                        MPI_INT,            // type
                        MPI_SUM,            // op
                        MPI_COMM_WORLD );   // communicator
        npts *= numParticipating;
#endif // defined(PARALLEL)

        // In our 9-point stencil, there are 11 floating point operations
        // per point (3 multiplies and 11 adds):
        //
        // newval = weight_center * centerval +
        //      weight_cardinal * (northval + southval + eastval + westval) +
        //      weight_diagnoal * (neval + nwval + seval + swval)
        //
        // we do this stencil operation 'nIters' times
        unsigned long nflops = npts * 11 * nIters;

#if defined(PARALLEL)
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        std::cout << "Performing " << nWarmupPasses << " warmup passes...";
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)

        for( unsigned int pass = 0; pass < nWarmupPasses; pass++ )
        {
            init(data);
            (*testStencil)( data, nIters );
        }
#if defined(PARALLEL)
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        std::cout << "done." << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)

#if defined(PARALLEL)
        MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        std::cout << "\nPerforming stencil operation on chosen device, "
            << nPasses << " passes.\n"
            << "Depending on chosen device, this may take a while."
            << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)

#if !defined(PARALLEL)
        std::cout << "At the end of each pass the number of validation\nerrors observed will be printed to the standard output."
            << std::endl;
#endif // !defined(PARALLEL)


        for( unsigned int pass = 0; pass < nPasses; pass++ )
        {
#if !defined(PARALLEL)
            std::cout << "pass " << pass << ": ";
#endif // !defined(PARALLEL)
            init( data );

            int timerHandle = Timer::Start();
            (*testStencil)( data, nIters );
            double elapsedTime = Timer::Stop( timerHandle, "CUDA stencil" );


            // find and report the computation rate
            double gflops = (nflops / elapsedTime) / 1e9;

            resultDB.AddResult( timerDesc,
                                    experimentDescriptionStr.str(),
                                    "GFLOPS",
                                    gflops );
            if( beVerbose )
            {
                std::cout << "observed result, pass " << pass << ":\n"
                    << data
                    << std::endl;
            }

            // validate the result
#if defined(PARALLEL)
            StencilValidater<T>* validater = new MPIStencilValidater<T>;
#else
            StencilValidater<T>* validater = new SerialStencilValidater<T>;
#endif // defined(PARALLEL)
            validater->ValidateResult( expected,
                            data,
                            valErrThreshold,
                            nValErrsToPrint );
        }
    }
    catch( ... )
    {
        // clean up - abnormal termination
        // wish we didn't have to do this, but C++ exceptions do not
        // support a try-catch-finally approach
        delete stdStencil;
        delete stdStencilFactory;
        delete testStencil;
        delete testStencilFactory;
        throw;
    }

    // clean up - normal termination
    delete stdStencil;
    delete stdStencilFactory;
    delete testStencil;
    delete testStencilFactory;
}

Ejemplo n.º 7

Mostrar archivo

Archivo: main.cpp Proyecto: ManavA/shoc

// ****************************************************************************
// Method:  main()
//
// Purpose:
//   serial and parallel main for OpenCL level0 benchmarks
//
// Arguments:
//   argc, argv
//
// Programmer:  SHOC Team
// Creation:    The Epoch
//
// Modifications:
//   Jeremy Meredith, Tue Jan 12 15:09:33 EST 2010
//   Changed the way device selection works.  It now defaults to the device
//   index corresponding to the process's rank within a node if no devices
//   are specified on the command command line, and otherwise, round-robins
//   the list of devices among the tasks.
//
//   Gabriel Marin, Tue Jun 01 15:38 EST 2010
//   Check that we have valid (not NULL) context and queue objects before
//   running the benchmarks. Errors inside CreateContextFromSingleDevice or
//   CreateCommandQueueForContextAndDevice were not propagated out to the main
//   program.
//
//   Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010
//   Split timing reports into detailed and summary.  For serial code, we
//   report all trial values, and for parallel, skip the per-process vals.
//   Also detect and print outliers from parallel runs.
//
// ****************************************************************************
int main(int argc, char *argv[])
{
    int ret = 0;

    try
    {
#ifdef PARALLEL
        int rank, size;
        MPI_Init(&argc,&argv);
        MPI_Comm_size(MPI_COMM_WORLD, &size);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        cout << "MPI Task "<< rank << "/" << size - 1 << " starting....\n";
#endif

        OptionParser op;

        //Add shared options to the parser
        op.addOption("platform", OPT_INT, "0", "specify OpenCL platform to use",
                'p');
        op.addOption("device", OPT_VECINT, "", "specify device(s) to run on", 'd');
        op.addOption("passes", OPT_INT, "10", "specify number of passes", 'n');
        op.addOption("size", OPT_VECINT, "1", "specify problem size", 's');
        op.addOption("infoDevices", OPT_BOOL, "",
                "show info for available platforms and devices", 'i');
        op.addOption("verbose", OPT_BOOL, "", "enable verbose output", 'v');
        op.addOption("quiet", OPT_BOOL, "", "write minimum necessary to standard output", 'q');

        addBenchmarkSpecOptions(op);

        if (!op.parse(argc, argv))
        {
#ifdef PARALLEL
            if (rank == 0)
                op.usage();
            MPI_Finalize();
#else
            op.usage();
#endif
            return (op.HelpRequested() ? 0 : 1 );
        }

        if (op.getOptionBool("infoDevices"))
        {
#define DEBUG_DEVICE_CONTAINER 0
#ifdef PARALLEL
            // execute following code only if I am the process of lowest
            // rank on this node
            NodeInfo NI;
            int mynoderank = NI.nodeRank();
            if (mynoderank==0)
            {
                int nlrrank, nlrsize;
                MPI_Comm nlrcomm = NI.getNLRComm();
                MPI_Comm_size(nlrcomm, &nlrsize);
                MPI_Comm_rank(nlrcomm, &nlrrank);

                OpenCLNodePlatformContainer ndc1;
                OpenCLMultiNodeContainer localMnc(ndc1);
                localMnc.doMerge (nlrrank, nlrsize, nlrcomm);
                if (rank==0)  // I am the global rank 0, print all configurations
                    localMnc.Print (cout);
            }
#else
            OpenCLNodePlatformContainer ndc1;
            ndc1.Print (cout);
#if DEBUG_DEVICE_CONTAINER
            OpenCLMultiNodeContainer mnc1(ndc1), mnc2;
            mnc1.Print (cout);
            ostringstream oss;
            mnc1.writeObject (oss);
            std::string temp(oss.str());
            cout << "Serialized MultiNodeContainer:\n" << temp;
            istringstream iss(temp);
            mnc2.readObject (iss);
            cout << "Unserialized object2:\n";
            mnc2.Print (cout);
            mnc1.merge (mnc2);
            cout << "==============\nObject1 after merging 1:\n";
            mnc1.Print (cout);
            mnc1.merge (mnc2);
            cout << "==============\nObject1 after merging 2:\n";
            mnc1.Print (cout);
#endif  // DEBUG
#endif  // PARALLEL
            return (0);
        }

        bool verbose = op.getOptionBool("verbose");

        // The device option supports specifying more than one device
        // for now, just choose the first one.
        int platform = op.getOptionInt("platform");

#ifdef PARALLEL
        NodeInfo ni;
        int myNodeRank = ni.nodeRank();
        if (verbose)
        cout << "Global rank "<<rank<<" is local rank "<<myNodeRank << endl;
#else
        int myNodeRank = 0;
#endif

        // If they haven't specified any devices, assume they
        // want the process with in-node rank N to use device N
        int deviceIdx = myNodeRank;

        // If they have, then round-robin the list of devices
        // among the processes on a node.
        vector<long long> deviceVec = op.getOptionVecInt("device");
        if (deviceVec.size() > 0)
        {
        int len = deviceVec.size();
            deviceIdx = deviceVec[myNodeRank % len];
        }

        // Check for an erroneous device
        if (deviceIdx >= GetNumOclDevices(platform)) {
            cerr << "Warning: device index: " << deviceIdx
                 << " out of range, defaulting to device 0.\n";
            deviceIdx = 0;
        }

        // Initialization
        if (verbose) cout << ">> initializing\n";
        cl_device_id devID = ListDevicesAndGetDevice(platform, deviceIdx);
        cl_int clErr;
        cl_context ctx = clCreateContext( NULL,     // properties
                                            1,      // number of devices
                                            &devID, // device
                                            NULL,   // notification function
                                            NULL,
                                            &clErr );
        CL_CHECK_ERROR(clErr);
        cl_command_queue queue = clCreateCommandQueue( ctx,
                                                        devID,
                                                        CL_QUEUE_PROFILING_ENABLE,
                                                        &clErr );
        CL_CHECK_ERROR(clErr);
        ResultDatabase resultDB;

        // Run the benchmark
        RunBenchmark(devID, ctx, queue, resultDB, op);

        clReleaseCommandQueue( queue );
        clReleaseContext( ctx );


#ifndef PARALLEL
        resultDB.DumpDetailed(cout);
#else
        ParallelResultDatabase pardb;
        pardb.MergeSerialDatabases(resultDB,MPI_COMM_WORLD);
        if (rank==0)
        {
            pardb.DumpSummary(cout);
            pardb.DumpOutliers(cout);
        }
#endif
    }
    catch( std::exception& e )
    {
        std::cerr << e.what() << std::endl;
        ret = 1;
    }
    catch( ... )
    {
        std::cerr << "unrecognized exception caught" << std::endl;
        ret = 1;
    }

#ifdef PARALLEL
    MPI_Finalize();
#endif

    return ret;
}

Ejemplo n.º 8

Mostrar archivo

Archivo: bcmain.cpp Proyecto: BenjaminCoquelle/shoc

int main(int argc, char *argv[])
{
    int numdev=0, totalnumdev=0, numtasks, mympirank, dest, source, rc,
        mypair=0, count, tag=2, mynoderank,myclusterrank,nodenprocs;
    int *grp1, *grp2;
    int mygrprank,grpnumtasks;
    MPI_Group orig_group,bmgrp;
    MPI_Comm bmcomm,nlrcomm;
    ResultDatabase resultDB,resultDBWU,resultDB1;
    OptionParser op;
    ParallelResultDatabase pardb, pardb1;
    bool amGPUTask = false;
    volatile unsigned long long *mpidone;
    int i,shmid;

    /* Allocate System V shared memory */

    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
    MPI_Comm_rank(MPI_COMM_WORLD, &mympirank);
    MPI_Comm_group(MPI_COMM_WORLD, &orig_group);


    //Add shared options to the parser
    op.addOption("device", OPT_VECINT, "0", "specify device(s) to run on",
		    'd');
    op.addOption("verbose", OPT_BOOL, "", "enable verbose output", 'v');
    op.addOption("quiet", OPT_BOOL, "",
		    "write minimum necessary to standard output", 'q');
    op.addOption("passes", OPT_INT, "10", "specify number of passes", 'z');
    op.addOption("size", OPT_VECINT, "1", "specify problem size", 's');
    op.addOption("time", OPT_INT, "5", "specify running time in miuntes", 't');
    op.addOption("outputFile", OPT_STRING, "output.txt", "specify output file",
       'o');
    op.addOption("infoDevices", OPT_BOOL, "", "show summary info for available devices",
       'i');
    op.addOption("fullInfoDevices", OPT_BOOL, "", "show full info for available devices");
    op.addOption("MPIminmsg", OPT_INT, "0", "specify minimum MPI message size");
    op.addOption("MPImaxmsg", OPT_INT, "16384",
                    "specify maximum MPI message size");
    op.addOption("MPIiter", OPT_INT, "1000",
                    "specify number of MPI benchmark iterations for each size");
    op.addOption("platform", OPT_INT, "0", "specify platform for device selection", 'y');

    if (!op.parse(argc, argv))
    {
        if (mympirank == 0)
            op.usage();
        MPI_Finalize();
        return 0;
    }

    int npasses = op.getOptionInt("passes");

    //our simple mapping
    NodeInfo NI;
    mynoderank = NI.nodeRank();         // rank of my process within the node
    myclusterrank = NI.clusterRank();   // cluster (essentially, node) id
    MPI_Comm smpcomm = NI.getSMPComm();

    if(mynoderank==0){
        shmid = shmget(IPC_PRIVATE,
                 sizeof(unsigned long long),
                 (IPC_CREAT | 0600));
    }

    MPI_Bcast(&shmid, 1, MPI_INT, 0, NI.getSMPComm());

    mpidone = ((volatile unsigned long long*) shmat(shmid, 0, 0));
    if (mynoderank == 0)
        shmctl(shmid, IPC_RMID, 0);
    *mpidone = 0;

    nlrcomm = NI.getNLRComm(); // communcator of all the lowest rank processes
                               // on all the nodes
    int numnodes = NI.numNodes();
    if ( numnodes%2!=0 )
    {
        if(mympirank==0)
            printf("\nThis test needs an even number of nodes\n");
        MPI_Finalize();
	exit(0);
    }
    int nodealr = NI.nodeALR();

    nodenprocs=NI.nodeNprocs();

    // determine how many GPU devices we are to use
    int devsPerNode = op.getOptionVecInt( "device" ).size();
    //cout<<mympirank<<":numgpus="<<devsPerNode<<endl;

    // if there are as many or more devices as the nprocs, only use half of
    // the nproc
    if ( devsPerNode >= nodenprocs ) devsPerNode = nodenprocs/2;

    numdev = (mynoderank == 0) ? devsPerNode : 0;
    MPI_Allreduce(&numdev, &totalnumdev, 1, MPI_INT, MPI_SUM,
                    MPI_COMM_WORLD);
    numdev = devsPerNode;

    // determine whether I am to be a GPU or a comm task
    if( mynoderank < numdev )
    {
        amGPUTask = true;
    }

    //Divide tasks into two distinct groups based upon noderank
    grp1=(int *)calloc(totalnumdev, sizeof(int));
    grp2=(int *)calloc((numtasks-totalnumdev),sizeof(int));
    if (grp1==NULL || grp2==NULL)
    {
        printf("\n%d:calloc failed in %s",mympirank,__FUNCTION__);
        exit(1);
    }


    /*compute the groups*/
    int beginoffset[2]={0,0};
    if(mynoderank == 0)
    {
        int tmp[2];
	tmp[0]=numdev;
	tmp[1]=nodenprocs-numdev;
        if (mympirank ==0)
            MPI_Send(tmp, 2*sizeof(int), MPI_CHAR, 1, 112, nlrcomm);
        else
        {
            MPI_Status reqstat;
	    MPI_Recv(beginoffset, 2*sizeof(int), MPI_CHAR, myclusterrank-1,
			    112, nlrcomm ,&reqstat);
            if (myclusterrank < numnodes-1)
            {
                beginoffset[0]+=numdev;
                beginoffset[1]+=(nodenprocs-numdev);
		MPI_Send(beginoffset,2*sizeof(int), MPI_CHAR, myclusterrank+1,
				112, nlrcomm);
		beginoffset[0]-=numdev;
		beginoffset[1]-=(nodenprocs-numdev);
            }
        }
    }
    MPI_Bcast(beginoffset,2,MPI_INT,0,smpcomm);

    if ( amGPUTask )
    {
        // I am to do GPU work
        grp1[beginoffset[0]+mynoderank]=mympirank;
        grpnumtasks=totalnumdev;
    }
    else
    {
        // I am to do MPI communication work
        grp2[beginoffset[1]+(mynoderank-numdev)]=mympirank;
        grpnumtasks=numtasks-totalnumdev;
    }

    MPI_Allreduce(MPI_IN_PLACE, grp1, totalnumdev, MPI_INT, MPI_SUM,
                    MPI_COMM_WORLD);
    MPI_Allreduce(MPI_IN_PLACE, grp2, (numtasks-totalnumdev), MPI_INT,
                            MPI_SUM, MPI_COMM_WORLD);

    if ( amGPUTask )
    {
        // I am to do GPU work, so will be part of GPU communicator
        MPI_Group_incl(orig_group, totalnumdev, grp1, &bmgrp);
    }
    else
    {
        // I am to do MPI communication work, so will be part of MPI
        // messaging traffic communicator
        MPI_Group_incl(orig_group, (numtasks-totalnumdev), grp2,
                        &bmgrp);
    }

    MPI_Comm_create(MPI_COMM_WORLD, bmgrp, &bmcomm);
    MPI_Comm_rank(bmcomm, &mygrprank);
    NodeInfo *GRPNI = new NodeInfo(bmcomm);
    int mygrpnoderank=GRPNI->nodeRank();
    int grpnodealr = GRPNI->nodeALR();
    int grpnodenprocs = GRPNI->nodeNprocs();
    MPI_Comm grpnlrcomm = GRPNI->getNLRComm();
    //note that clusterrank and number of nodes don't change for this child
    //group/comm


    //form node-random pairs (see README) among communication tasks
    if( amGPUTask )
    {
        //setup GPU in GPU tasks
        GPUSetup(op, mympirank, mynoderank);
    }
    else
    {
        int * pairlist = new int[numnodes];
        for (i=0;i<numnodes;i++) pairlist[i]=0;

        if ( mygrpnoderank==0 )
        {
            pairlist[myclusterrank]=grpnodealr;
            MPI_Allreduce(MPI_IN_PLACE,pairlist,numnodes,MPI_INT,MPI_SUM,
                          grpnlrcomm);
            mypair = RandomPairs(myclusterrank, numnodes, grpnlrcomm);
            mypair = pairlist[mypair];
        }
        for (i=0;i<numnodes;i++) pairlist[i]=0;
        if ( mygrpnoderank==0 )
            pairlist[myclusterrank]=mypair;
        MPI_Allreduce(MPI_IN_PLACE,pairlist,numnodes,MPI_INT,MPI_SUM,
                      bmcomm);
        mypair = pairlist[myclusterrank]+mygrpnoderank;
    }

    // ensure we are all synchronized before starting test
    MPI_Barrier(MPI_COMM_WORLD);

    //warmup run
    if ( amGPUTask )
    {
        GPUDriver(op, resultDBWU);
    }
    //first, individual runs for device benchmark
    for(i=0;i<npasses;i++){
        if ( amGPUTask )
        {
            GPUDriver(op, resultDB);
        }
    }
    MPI_Barrier(MPI_COMM_WORLD);

    //warmup run
    if ( !amGPUTask )
    {
        MPITest(op, resultDBWU, grpnumtasks, mygrprank, mypair, bmcomm);
    }
    //next, individual run for MPI Benchmark
    for(i=0;i<npasses;i++){
        if ( !amGPUTask )
        {
            MPITest(op, resultDB, grpnumtasks, mygrprank, mypair, bmcomm);
        }
    }
    MPI_Barrier(MPI_COMM_WORLD);

    //merge and print
    pardb.MergeSerialDatabases(resultDB, bmcomm);
    if (mympirank==0)
        cout<<endl<<"*****************************Sequential GPU and MPI runs****************************"<<endl;
    DumpInSequence(pardb, mygrprank, mympirank);

    // Simultaneous runs for observing impact of contention
    MPI_Barrier(MPI_COMM_WORLD);
    if ( amGPUTask )
    {
        do {
            if (mympirank == 0 ) cout<<".";
            GPUDriver(op, resultDB1);flush(cout);
        } while(*mpidone==0);
        if (mympirank == 0 ) cout<<"*"<<endl;
    }
    else
    {
        for ( i=0;i<npasses;i++ )
        {
            MPITest(op, resultDB1, grpnumtasks, mygrprank, mypair, bmcomm);
        }
        *mpidone=1;
    }
    MPI_Barrier(MPI_COMM_WORLD);

    //merge and print
    pardb1.MergeSerialDatabases(resultDB1,bmcomm);
    if (mympirank==0)
        cout<<endl<<"*****************************Simultaneous GPU and MPI runs****************************"<<endl;
    DumpInSequence(pardb1, mygrprank, mympirank);

    //print summary
    if ( !amGPUTask && mygrprank==0)
    {
        vector<ResultDatabase::Result> prelatency  = pardb.GetResultsForTest("MPI Latency(mean)");
        vector<ResultDatabase::Result> postlatency = pardb1.GetResultsForTest("MPI Latency(mean)");
        cout<<endl<<"Summarized Mean(Mean) MPI Baseline Latency vs. Latency with Contention";
        cout<<endl<<"MSG SIZE(B)\t";
        int msgsize=0;
        for (i=0; i<prelatency.size(); i++)
        {
            cout<<msgsize<<"\t";
            msgsize = (msgsize ? msgsize * 2 : msgsize + 1);
        }

        cout << endl <<"BASELATENCY\t";
        for (i=0; i<prelatency.size(); i++)
            cout<<setiosflags(ios::fixed) << setprecision(2)<<prelatency[i].GetMean() << "\t";

        cout << endl <<"CONTLATENCY\t";
        for (i=0; i<postlatency.size(); i++)
            cout<<setiosflags(ios::fixed) << setprecision(2)<<postlatency[i].GetMean() << "\t";
        flush(cout);
        cout<<endl;
    }
    MPI_Barrier(MPI_COMM_WORLD);

    if ( amGPUTask && mympirank==0)
    {
        vector<ResultDatabase::Result> prespeed  = pardb.GetResultsForTest("DownloadSpeed(mean)");
        vector<ResultDatabase::Result> postspeed = pardb1.GetResultsForTest("DownloadSpeed(mean)");
        cout<<endl<<"Summarized Mean(Mean) GPU Baseline Download Speed vs. Download Speed with Contention";
        cout<<endl<<"MSG SIZE(KB)\t";
        int msgsize=1;
        for (i=0; i<prespeed.size(); ++i)
        {
            cout<<msgsize<<"\t";
            msgsize = (msgsize ? msgsize * 2 : msgsize + 1);
        }
        cout << endl <<"BASESPEED\t";
        for (i=0; i<prespeed.size(); ++i)
            cout<<setiosflags(ios::fixed) << setprecision(4)<<prespeed[i].GetMean() << "\t";

        cout << endl <<"CONTSPEED\t";
        for (i=0; i<postspeed.size(); ++i)
            cout<<setiosflags(ios::fixed) << setprecision(4)<<postspeed[i].GetMean() << "\t";
         cout<<endl;
    }

    if ( amGPUTask && mympirank==0)
    {
        vector<ResultDatabase::Result> pregpulat  = pardb.GetResultsForTest("DownloadLatencyEstimate(mean)");
        vector<ResultDatabase::Result> postgpulat = pardb1.GetResultsForTest("DownloadLatencyEstimate(mean)");
        cout<<endl<<"Summarized Mean(Mean) GPU Baseline Download Latency vs. Download Latency with Contention";
        cout<<endl<<"MSG SIZE\t";
        for (i=0; i<pregpulat.size(); ++i)
        {
            cout<<pregpulat[i].atts<<"\t";
        }
        cout << endl <<"BASEGPULAT\t";
        for (i=0; i<pregpulat.size(); ++i)
            cout<<setiosflags(ios::fixed) << setprecision(7)<<pregpulat[i].GetMean() << "\t";

        cout << endl <<"CONTGPULAT\t";
        for (i=0; i<postgpulat.size(); ++i)
            cout<<setiosflags(ios::fixed) << setprecision(7)<<postgpulat[i].GetMean() << "\t";
         cout<<endl;
    }
    //cleanup GPU
    if( amGPUTask )
    {
        GPUCleanup(op);
    }

    MPI_Finalize();

}

Ejemplo n.º 9

Mostrar archivo

Archivo: Stencil2Dmain.cpp Proyecto: optimus-prime/shoc-mic

void
DoTest( const char* timerDesc, ResultDatabase& resultDB, OptionParser& opts )
{
    StencilFactory<T>* stdStencilFactory = NULL;
    Stencil<T>* stdStencil = NULL;
    StencilFactory<T>* testStencilFactory = NULL;
    Stencil<T>* testStencil = NULL;

    //try
    {
        stdStencilFactory = new HostStencilFactory<T>;
        testStencilFactory = new MICStencilFactory<T>;
        assert( (stdStencilFactory != NULL) && (testStencilFactory != NULL) );

        // do a sanity check on option values
        CheckOptions( opts );
        stdStencilFactory->CheckOptions( opts );
        testStencilFactory->CheckOptions( opts );

        // extract and validate options
        std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" );
        if( arrayDims.size() != 2 )
        {
            cerr << "Dim size: " << arrayDims.size() << "\n";
            //throw InvalidArgValue( "all overall dimensions must be positive" );
        }
        if (arrayDims[0] == 0) // User has not specified a custom size
        {
            const int probSizes[4] = { 768, 1408, 2048, 4096 };
            int sizeClass = opts.getOptionInt("size");
            if (!(sizeClass >= 0 && sizeClass < 5))
            {
                //throw InvalidArgValue( "Size class must be between 1-4" );
            }
            arrayDims[0] = arrayDims[1] =probSizes[sizeClass - 1];
        }

        long int seed = (long)opts.getOptionInt( "seed" );
        bool beVerbose = opts.getOptionBool( "verbose" );
        unsigned int nIters = (unsigned int)opts.getOptionInt( "num-iters" );
        double valErrThreshold = (double)opts.getOptionFloat( "val-threshold" );
        unsigned int nValErrsToPrint = (unsigned int)opts.getOptionInt( "val-print-limit" );

#if defined(PARALLEL)
        unsigned int haloWidth = (unsigned int)opts.getOptionInt( "iters-per-exchange" );
#else
        unsigned int haloWidth = 1;
#endif // defined(PARALLEL)

        float haloVal = (float)opts.getOptionFloat( "haloVal" );

        // build a description of this experiment
        std::ostringstream experimentDescriptionStr;
        experimentDescriptionStr 
            << nIters << ':'
            << arrayDims[0] << 'x' << arrayDims[1] << ':'
            << LROWS << 'x' << LCOLS;

        unsigned int nPasses =(unsigned int)opts.getOptionInt( "passes" );
         unsigned long npts = (arrayDims[0] + 2*haloWidth - 2) * 
                                     (arrayDims[1] + 2*haloWidth - 2); 

unsigned long nflops = npts * 11 * nIters;
cout<<"flops are = "<<nflops<<endl;

        // compute the expected result on the host
#if defined(PARALLEL)
        int cwrank;
        MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        std::cout << "\nPerforming stencil operation on host for later comparison with MIC output\n"
            << "Depending on host capabilities, this may take a while."
            << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)
        Matrix2D<T> exp( arrayDims[0] + 2*haloWidth, 
                            arrayDims[1] + 2*haloWidth );
        Initialize<T> init( seed,
                        haloWidth,
                        haloVal );	

        init( exp );
        if( beVerbose )
        {
            std::cout << "initial state:\n" << exp << std::endl;
        }
        Stencil<T>* stdStencil = stdStencilFactory->BuildStencil( opts );
        (*stdStencil)( exp, nIters );
        if( beVerbose )
        {
            std::cout << "expected result:\n" << exp << std::endl;
        }
	

        // compute the result on the MIC device
        Matrix2D<T> data( arrayDims[0] + 2*haloWidth, 
                            arrayDims[1] + 2*haloWidth );
        Stencil<T>* testStencil = testStencilFactory->BuildStencil( opts );

#if defined(PARALLEL)
        MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        std::cout << "\nPerforming stencil operation on chosen device, " 
            << nPasses << " passes.\n"
            << "Depending on chosen device, this may take a while."
            << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)

#if !defined(PARALLEL)
        std::cout << "At the end of each pass the number of validation\nerrors observed will be printed to the standard output." 
            << std::endl;
#endif // !defined(PARALLEL)
	std::cout<<"Passes:"<<nPasses<<endl;
        for( unsigned int pass = 0; pass < nPasses; pass++ )
        {
            init( data );

            double start = curr_second();
            (*testStencil)( data, nIters );
            double elapsedTime = curr_second()-start;
            double gflops = (nflops / elapsedTime) / 1e9;
            resultDB.AddResult( timerDesc,
                                    experimentDescriptionStr.str(),
                                    "GFLOPS",
                                    gflops );
            if( beVerbose )
            {
                std::cout << "observed result, pass " << pass << ":\n" 
                    << data 
                    << std::endl;
            }

            // validate the result
#if defined(PARALLEL)
            //StencilValidater<T>* validater = new MPIStencilValidater<T>;
#else
            //StencilValidater<T>* validater = new SerialStencilValidater<T>;            
#endif // defined(PARALLEL)
	   MICValidate(exp,data,valErrThreshold,nValErrsToPrint);
            /*validater->ValidateResult( exp,
                            data,
                            valErrThreshold,
                            nValErrsToPrint );*/
        }
    }
    /*
    catch( ... )
    {
        // clean up - abnormal termination
        // wish we didn't have to do this, but C++ exceptions do not 
        // support a try-catch-finally approach
        delete stdStencil;
        delete stdStencilFactory;
        delete testStencil;
        delete testStencilFactory;
        throw;
    }*/

    // clean up - normal termination
    delete stdStencil;
    delete stdStencilFactory;
    delete testStencil;
    delete testStencilFactory;
}

Ejemplo n.º 10

Mostrar archivo

Archivo: main.cpp Proyecto: Poojachitral/shoc

// ****************************************************************************
// Function: main
//
// Purpose:
//   The main function takes care of initialization (device and MPI),  then
//   performs the benchmark and prints results.
//
// Arguments:
//
//
// Programmer: Jeremy Meredith
// Creation:
//
// Modifications:
//   Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010
//   Split timing reports into detailed and summary.  For serial code, we
//   report all trial values, and for parallel, skip the per-process vals.
//   Also detect and print outliers from parallel runs.
//
// ****************************************************************************
int main(int argc, char *argv[])
{
    int ret = 0;
    bool noprompt = false;

    try
    {
#ifdef PARALLEL
        int rank, size;
        MPI_Init(&argc,&argv);
        MPI_Comm_size(MPI_COMM_WORLD, &size);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        cerr << "MPI Task " << rank << "/" << size - 1 << " starting....\n";
#endif

        // Get args
        OptionParser op;
       
        //Add shared options to the parser
        op.addOption("device", OPT_VECINT, "0",
                "specify device(s) to run on", 'd');
        op.addOption("verbose", OPT_BOOL, "", "enable verbose output", 'v');
        op.addOption("passes", OPT_INT, "10", "specify number of passes", 'n');
        op.addOption("size", OPT_INT, "1", "specify problem size", 's');
        op.addOption("infoDevices", OPT_BOOL, "",
                "show info for available platforms and devices", 'i');
        op.addOption("quiet", OPT_BOOL, "", "write minimum necessary to standard output", 'q');
#ifdef _WIN32
        op.addOption("noprompt", OPT_BOOL, "", "don't wait for prompt at program exit");
#endif

        addBenchmarkSpecOptions(op);

        if (!op.parse(argc, argv))
        {
#ifdef PARALLEL
            if (rank == 0)
                op.usage();
            MPI_Finalize();
#else
            op.usage();
#endif
            return (op.HelpRequested() ? 0 : 1);
        }
        
        bool verbose = op.getOptionBool("verbose");
        bool infoDev = op.getOptionBool("infoDevices");
#ifdef _WIN32
        noprompt = op.getOptionBool("noprompt");
#endif

        int device;
#ifdef PARALLEL
        NodeInfo ni;
        int myNodeRank = ni.nodeRank();
        vector<long long> deviceVec = op.getOptionVecInt("device");
        if (myNodeRank >= deviceVec.size()) {
            // Default is for task i to test device i
            device = myNodeRank;
        } else {
            device = deviceVec[myNodeRank];
        }
#else
        device = op.getOptionVecInt("device")[0];
#endif
        int deviceCount;
        cudaGetDeviceCount(&deviceCount);
        if (device >= deviceCount) {
            cerr << "Warning: device index: " << device <<
            " out of range, defaulting to device 0.\n";
            device = 0;
        }

        // Initialization
        EnumerateDevicesAndChoose(device, infoDev);
        if( infoDev )
        {
            return 0;
        }
        ResultDatabase resultDB;

        // Run the benchmark
        RunBenchmark(resultDB, op);

#ifndef PARALLEL
        resultDB.DumpDetailed(cout);
#else
        ParallelResultDatabase pardb;
        pardb.MergeSerialDatabases(resultDB,MPI_COMM_WORLD);
        if (rank==0)
        {
            pardb.DumpSummary(cout);
            pardb.DumpOutliers(cout);
        }
#endif

    }
    catch( InvalidArgValue& e )
    {
        std::cerr << e.what() << ": " << e.GetMessage() << std::endl;
        ret = 1;
    }
    catch( std::exception& e )
    {
        std::cerr << e.what() << std::endl;
        ret = 1;
    }
    catch( ... )
    {
        ret = 1;
    }


#ifdef PARALLEL
    MPI_Finalize();
#endif

#ifdef _WIN32
    if (!noprompt)
    {
        cout << "Press return to exit\n";
        cin.get();
    }
#endif

    return ret;
}

Ejemplo n.º 11

Mostrar archivo

Archivo: fftlib.cpp Proyecto: TakayukiSakai/shoc

void 
init(OptionParser& op, bool _do_dp)
{
    cl_int err;

    do_dp = _do_dp;
    
    if (!fftCtx) {
        // first get the device
        int device, platform = op.getOptionInt("platform");
        if (op.getOptionVecInt("device").size() > 0) {
            device = op.getOptionVecInt("device")[0];
        }
        else {
            device = 0;
        }
        fftDev = ListDevicesAndGetDevice(platform, device);

        // now get the context
        fftCtx = clCreateContext(NULL, 1, &fftDev, NULL, NULL, &err);
        CL_CHECK_ERROR(err);
    }

    if (!fftQueue) {
        // get a queue
        fftQueue = clCreateCommandQueue(fftCtx, fftDev, CL_QUEUE_PROFILING_ENABLE,
                                        &err);
        CL_CHECK_ERROR(err);
    }
    
    // create the program...
    fftProg = clCreateProgramWithSource(fftCtx, 1, &cl_source_fft, NULL, &err);
    CL_CHECK_ERROR(err);
    
    // ...and build it
    string args = " -cl-mad-enable ";
    if (op.getOptionBool("use-native")) {
        args += " -cl-fast-relaxed-math ";
    }
    if (!do_dp) {
        args += " -DSINGLE_PRECISION ";
    }
    else if (checkExtension(fftDev, "cl_khr_fp64")) {
        args += " -DK_DOUBLE_PRECISION ";
    }
    else if (checkExtension(fftDev, "cl_amd_fp64")) {
        args += " -DAMD_DOUBLE_PRECISION ";
    }
    err = clBuildProgram(fftProg, 0, NULL, args.c_str(), NULL, NULL);
    {
        char* log = NULL;
        size_t bytesRequired = 0;
        err = clGetProgramBuildInfo(fftProg, 
                                    fftDev, 
                                    CL_PROGRAM_BUILD_LOG,
                                    0,
                                    NULL,
                                    &bytesRequired );
        log = (char*)malloc( bytesRequired + 1 );
        err = clGetProgramBuildInfo(fftProg, 
                                    fftDev, 
                                    CL_PROGRAM_BUILD_LOG,
                                    bytesRequired,
                                    log,
                                    NULL );
        std::cout << log << std::endl;
        free( log );
    }
    if (err != CL_SUCCESS) {
        char log[50000];
        size_t retsize = 0;
        err = clGetProgramBuildInfo(fftProg, fftDev, CL_PROGRAM_BUILD_LOG,
                                    50000*sizeof(char),  log, &retsize);
        CL_CHECK_ERROR(err);
        cout << "Retsize: " << retsize << endl;
        cout << "Log: " << log << endl;
        dumpPTXCode(fftCtx, fftProg, "oclFFT");
        exit(-1);
    }
    else {
        // dumpPTXCode(fftCtx, fftProg, "oclFFT");
    }

    // Create kernel for forward FFT
    fftKrnl = clCreateKernel(fftProg, "fft1D_512", &err);
    CL_CHECK_ERROR(err);
    // Create kernel for inverse FFT
    ifftKrnl = clCreateKernel(fftProg, "ifft1D_512", &err);
    CL_CHECK_ERROR(err);
    // Create kernel for check
    chkKrnl = clCreateKernel(fftProg, "chk1D_512", &err);
    CL_CHECK_ERROR(err);
}