Example #1
// ****************************************************************************
// Function: RunBenchmark
//
// Purpose:
//   Measures the floating point capability of the device for a variety of 
//   combinations of arithmetic operations.
//
// Arguments:
//   op: the options parser / parameter database
//
// Returns:  nothing
//
// Programmer: Zhi Ying([email protected])
//             Jun Jin([email protected])
//
// Creation: May 23, 2011
//
// Modifications:
// 12/12/12 - Kyle Spafford - Code style and minor integration updates
//
// ****************************************************************************
void RunBenchmark(OptionParser &op, ResultDatabase &resultDB)
{
    const bool verbose = op.getOptionBool("verbose");
    // Quiet == no progress bar.
    const bool quiet   = op.getOptionBool("quiet");
    const unsigned int passes = op.getOptionInt("passes");
    const int micdev = op.getOptionInt("target");

    double repeatF = 3;
    cout << "Adjust repeat factor = " << repeatF << "\n";

    // Initialize progress bar
    int totalRuns = 16*passes*2;
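    // Presumably 16 kernel variants per precision (add, multiply,
    // multiply-add, and mul+multiply-add with 1, 2, 4, and 8 independent
    // streams, cf. the kernel list described in Example #11), run for
    // 'passes' passes, times 2 for single and double precision.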
    ProgressBar pb(totalRuns);
    if (!verbose && !quiet) 
    {
        pb.Show(stdout);
    }

    RunTest<float>(resultDB, passes, verbose, quiet,
                   repeatF, pb, "-SP", micdev);
    RunTest<double>(resultDB, passes, verbose, quiet,
                    repeatF, pb, "-DP", micdev);

    if (!verbose) cout << endl;
}
Example #2
File: FFT.cpp  Project: Poojachitral/shoc
void
RunBenchmark(cl::Device& devcpp,
                  cl::Context& ctxcpp,
                  cl::CommandQueue& queuecpp,
                  ResultDatabase &resultDB,
                  OptionParser &op)
{
    // convert from C++ bindings to C bindings
    // TODO propagate use of C++ bindings
    cl_device_id dev = devcpp();
    cl_context ctx = ctxcpp();
    cl_command_queue queue = queuecpp();

    if (getMaxWorkGroupSize(dev) < 64) {
        cout << "FFT requires MaxWorkGroupSize of at least 64" << endl;
        fillResultDB("SP-FFT", "MaxWorkGroupSize<64", op, resultDB);
        fillResultDB("DP-FFT", "MaxWorkGroupSize<64", op, resultDB);
        return;
    }
    
    bool has_dp = checkExtension(dev, "cl_khr_fp64") ||
        checkExtension(dev, "cl_amd_fp64");

    if (op.getOptionBool("dump-sp")) {
        dump<cplxflt>(op);
    }
    else if (op.getOptionBool("dump-dp")) {
        if (!has_dp) {
            cout << "dump-dp: no double precision support!\n";
            return;
        }
        dump<cplxdbl>(op);
    }
    else {
        // Always run single precision test
        runTest<cplxflt>("SP-FFT", dev, ctx, queue, resultDB, op);

        // If double precision is supported, run the DP test
        if (has_dp) {
            cout << "DP Supported\n";
            runTest<cplxdbl>("DP-FFT", dev, ctx, queue, resultDB, op);
        }
        else {
            cout << "DP Not Supported\n";
            fillResultDB("DP-FFT", "DP_Not_Supported", op, resultDB);
        }
    }
}
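
Examples #2 and #11 test for double-precision support through a checkExtension() helper that is not shown in these listings. A minimal sketch of what such a helper could look like (an assumption about its behavior, not the project's actual implementation) is to read the device's extension string and search it for the requested name:

#include <CL/cl.h>
#include <string>

// Hypothetical sketch of checkExtension(): query CL_DEVICE_EXTENSIONS and
// look for the requested extension substring.
static bool checkExtension(cl_device_id dev, const std::string& ext)
{
    size_t size = 0;
    if (clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, 0, NULL, &size) != CL_SUCCESS)
        return false;
    std::string extensions(size, '\0');
    if (clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, size, &extensions[0], NULL) != CL_SUCCESS)
        return false;
    return extensions.find(ext) != std::string::npos;
}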
Example #3
// ****************************************************************************
// Function: GPUSetup
//
// Purpose:
//  Do the necessary OpenCL setup for the GPU part of the test.
//
// Arguments:
//   op: the options parser / parameter database
//   mympirank: for printing errors in case of failure
//   mynoderank: this is typically the device ID (the mapping done in main)
//
// Returns: success/failure
//
// Creation: 2009
//
// Modifications:
//
// ****************************************************************************
//
int GPUSetup(OptionParser &op, int mympirank, int mynoderank)
{
    addBenchmarkSpecOptions(op);
    
    if (op.getOptionBool("infoDevices"))
    {
        OpenCLNodePlatformContainer ndc1;
        ndc1.Print (cout);
        return (0);
    }
    
    // The device option supports specifying more than one device
    int platform = op.getOptionInt("platform");
    int deviceIdx = mynoderank;
    if( deviceIdx >= op.getOptionVecInt( "device" ).size() )
    {
        std::ostringstream estr;
        estr << "Warning: not enough devices specified with --device flag for task "
            << mympirank
            << " ( node rank " << mynoderank
            << ") to claim its own device; forcing to use first device ";
        std::cerr << estr.str() << std::endl;
        deviceIdx = 0;
    }
    int device = op.getOptionVecInt("device")[deviceIdx];

    // Initialization
    _mpicontention_ocldev = new cl::Device( ListDevicesAndGetDevice(platform, device) );
    std::vector<cl::Device> ctxDevices;
    ctxDevices.push_back( *_mpicontention_ocldev );
    _mpicontention_ocldriver_ctx   = new cl::Context( ctxDevices );
    _mpicontention_ocldriver_queue = new cl::CommandQueue( *_mpicontention_ocldriver_ctx, *_mpicontention_ocldev, CL_QUEUE_PROFILING_ENABLE );
    _mpicontention_gpuop = op;
    return 0;
}
Example #4
void
RunBenchmark(ResultDatabase &resultDB, OptionParser &op)
{
    // Test to see if this device supports double precision
    cudaGetDevice(&fftDevice);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, fftDevice);
    bool has_dp = (deviceProp.major == 1 && deviceProp.minor >= 3) ||
        (deviceProp.major >= 2);

    if (op.getOptionBool("dump-sp")) {
        dump<float2>(op);
    }
    else if (op.getOptionBool("dump-dp")) {
        if (!has_dp) {
            cout << "dump-dp: no double precision support!\n";
            return;
        }
        dump<double2>(op);
    }
    else {
        cout << "Running single precision test" << endl;
        runTest<float2>("SP-FFT", resultDB, op);
        if (has_dp) {
            cout << "Running double precision test" << endl;
            runTest<double2>("DP-FFT", resultDB, op);
        } 
        else {
            cout << "Skipping double precision test" << endl;
            char atts[32] = "DP_Not_Supported";
            // resultDB requires a placeholder entry for every possible result
            int passes = op.getOptionInt("passes");
            for (int k=0; k<passes; k++) {
                resultDB.AddResult("DP-FFT" , atts, "GB/s", FLT_MAX);
                resultDB.AddResult("DP-FFT_PCIe" , atts, "GB/s", FLT_MAX);
                resultDB.AddResult("DP-FFT_Parity" , atts, "GB/s", FLT_MAX);
                resultDB.AddResult("DP-FFT-INV" , atts, "GB/s", FLT_MAX);
                resultDB.AddResult("DP-FFT-INV_PCIe" , atts, "GB/s", FLT_MAX);
                resultDB.AddResult("DP-FFT-INV_Parity" , atts, "GB/s", FLT_MAX);
            }
        }
    }
}
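
Example #2 delegates this "DP not supported" bookkeeping to a fillResultDB() helper, while Example #4 above inlines the same idea. A minimal sketch of such a helper follows; the function name and argument order match the call sites in Example #2, but the exact result names and units are assumptions modeled on the entries recorded in Examples #4 and #12, not the project's actual code:

#include <cfloat>
#include <string>

// Hypothetical sketch: record a FLT_MAX placeholder for every metric the
// benchmark would normally report, once per pass, so result databases from
// devices that cannot run the test stay mergeable with those that can.
static void fillResultDB(const std::string& name, const std::string& reason,
                         OptionParser& op, ResultDatabase& resultDB)
{
    const int passes = op.getOptionInt("passes");
    for (int k = 0; k < passes; k++)
    {
        resultDB.AddResult(name,                 reason, "GFLOPS", FLT_MAX);
        resultDB.AddResult(name + "_PCIe",       reason, "GFLOPS", FLT_MAX);
        resultDB.AddResult(name + "_Parity",     reason, "N",      FLT_MAX);
        resultDB.AddResult(name + "-INV",        reason, "GFLOPS", FLT_MAX);
        resultDB.AddResult(name + "-INV_PCIe",   reason, "GFLOPS", FLT_MAX);
        resultDB.AddResult(name + "-INV_Parity", reason, "N",      FLT_MAX);
    }
}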
Example #5
File: fft.cpp  Project: Sable/Ostrich
void
RunBenchmark(OptionParser &op)
{
	if (op.getOptionBool("2D"))
		dump2D<cplxflt>(op);
	else
		dump1D<cplxflt>(op);
}
Example #6
File: FFT.cpp  Project: ellen-hl/shoc-mic
void
RunBenchmark(OptionParser &op, ResultDatabase &resultDB)
{
    const bool verbose = op.getOptionBool("verbose");

    if (verbose) // print MKL version info
    {
        static char mklver[200];
        char *p;
        MKL_Get_Version_String(mklver,sizeof(mklver));
        mklver[sizeof(mklver)-1] = 0;
        p = strrchr(mklver,' ');
        if (p) while (p[0]==' ' && p[1]==0) *p-- = 0;
        printf("SHOC FFT benchmark using MKL verison %s\n",mklver);
    }
    RunTest<cplxflt>("SP-FFT", resultDB, op);
    RunTest<cplxdbl>("DP-FFT", resultDB, op);
}
Example #7
void
DoTest( const char* timerDesc, ResultDatabase& resultDB, OptionParser& opts )
{
    StencilFactory<T>* stdStencilFactory = NULL;
    Stencil<T>* stdStencil = NULL;
    StencilFactory<T>* testStencilFactory = NULL;
    Stencil<T>* testStencil = NULL;

    try
    {
#if defined(PARALLEL)
        stdStencilFactory = new MPIHostStencilFactory<T>;
        testStencilFactory = new MPICUDAStencilFactory<T>;
#else
        stdStencilFactory = new HostStencilFactory<T>;
        testStencilFactory = new CUDAStencilFactory<T>;
#endif // defined(PARALLEL)
        assert( (stdStencilFactory != NULL) && (testStencilFactory != NULL) );

        // do a sanity check on option values
        CheckOptions( opts );
        stdStencilFactory->CheckOptions( opts );
        testStencilFactory->CheckOptions( opts );

        // extract and validate options
        std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" );
        if( arrayDims.size() != 2 )
        {
            cerr << "Dim size: " << arrayDims.size() << "\n";
            throw InvalidArgValue( "all overall dimensions must be positive" );
        }
        if (arrayDims[0] == 0) // User has not specified a custom size
        {
            int sizeClass = opts.getOptionInt("size");
            arrayDims = StencilFactory<T>::GetStandardProblemSize( sizeClass );
        }
        long int seed = (long)opts.getOptionInt( "seed" );
        bool beVerbose = opts.getOptionBool( "verbose" );
        unsigned int nIters = (unsigned int)opts.getOptionInt( "num-iters" );
        double valErrThreshold = (double)opts.getOptionFloat( "val-threshold" );
        unsigned int nValErrsToPrint = (unsigned int)opts.getOptionInt( "val-print-limit" );

#if defined(PARALLEL)
        unsigned int haloWidth = (unsigned int)opts.getOptionInt( "iters-per-exchange" );
#else
        unsigned int haloWidth = 1;
#endif // defined(PARALLEL)

        float haloVal = (float)opts.getOptionFloat( "haloVal" );

        // build a description of this experiment
        std::vector<long long> lDims = opts.getOptionVecInt( "lsize" );
        assert( lDims.size() == 2 );
        std::ostringstream experimentDescriptionStr;
        experimentDescriptionStr
            << nIters << ':'
            << arrayDims[0] << 'x' << arrayDims[1] << ':'
            << lDims[0] << 'x' << lDims[1];

        unsigned int nPasses = (unsigned int)opts.getOptionInt( "passes" );
        unsigned int nWarmupPasses = (unsigned int)opts.getOptionInt( "warmupPasses" );


        // compute the expected result on the host
        // or read it from a pre-existing file
        std::string matrixFilenameBase = (std::string)opts.getOptionString( "expMatrixFile" );
#if defined(PARALLEL)
        int cwrank;
        MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        if( !matrixFilenameBase.empty() )
        {
            std::cout << "\nReading expected stencil operation result from file for later comparison with CUDA output\n"
                << std::endl;
        }
        else
        {
            std::cout << "\nPerforming stencil operation on host for later comparison with CUDA output\n"
                << "Depending on host capabilities, this may take a while."
                << std::endl;
        }
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)
        Matrix2D<T> expected( arrayDims[0] + 2*haloWidth,
                            arrayDims[1] + 2*haloWidth );
        Initialize<T> init( seed, haloWidth, haloVal );

        bool haveExpectedData = false;
        if( ! matrixFilenameBase.empty() )
        {
            bool readOK = ReadMatrixFromFile( expected, GetMatrixFileName<T>( matrixFilenameBase ) );
            if( readOK )
            {

                if( (expected.GetNumRows() != arrayDims[0] + 2*haloWidth) ||
                    (expected.GetNumColumns() != arrayDims[1] + 2*haloWidth) )
                {
                    std::cerr << "The matrix read from file \'"
                        << GetMatrixFileName<T>( matrixFilenameBase )
                        << "\' does not match the matrix size specified on the command line.\n";
                    expected.Reset( arrayDims[0] + 2*haloWidth, arrayDims[1] + 2*haloWidth );
                }
                else
                {
                    haveExpectedData = true;
                }
            }

            if( !haveExpectedData )
            {
                std::cout << "\nSince we could not read the expected matrix values,\nperforming stencil operation on host for later comparison with CUDA output.\n"
                    << "Depending on host capabilities, this may take a while."
                    << std::endl;
            }
        }
        if( !haveExpectedData )
        {
            init( expected );
            haveExpectedData = true;
            if( beVerbose )
            {
                std::cout << "initial state:\n" << expected << std::endl;
            }
            stdStencil = stdStencilFactory->BuildStencil( opts );
            (*stdStencil)( expected, nIters );
        }
        if( beVerbose )
        {
            std::cout << "expected result:\n" << expected << std::endl;
        }

        // determine whether we are to save the expected matrix values to a file
        // to speed up future runs
        matrixFilenameBase = (std::string)opts.getOptionString( "saveExpMatrixFile" );
        if( !matrixFilenameBase.empty() )
        {
            SaveMatrixToFile( expected, GetMatrixFileName<T>( matrixFilenameBase ) );
        }
        assert( haveExpectedData );


        // compute the result on the CUDA device
        Matrix2D<T> data( arrayDims[0] + 2*haloWidth,
                            arrayDims[1] + 2*haloWidth );
        // Assign to the outer-scope testStencil so the cleanup code can
        // delete it; a local declaration here would shadow it and leak.
        testStencil = testStencilFactory->BuildStencil( opts );

        // Compute the number of floating point operations we will perform.
        //
        // Note: in the truly-parallel case, we count flops for redundant
        // work due to the need for a halo.
        // But we do not add to the count for the local 1-wide halo since
        // we aren't computing new values for those items.
        unsigned long npts = (arrayDims[0] + 2*haloWidth - 2) *
                            (arrayDims[1] + 2*haloWidth - 2);
#if defined(PARALLEL)
        MPICUDAStencil<T>* mpiTestStencil = static_cast<MPICUDAStencil<T>*>( testStencil );
        assert( mpiTestStencil != NULL );
        int participating = mpiTestStencil->ParticipatingInProgram() ? 1 : 0;
        int numParticipating = 0;
        MPI_Allreduce( &participating,      // src
                        &numParticipating,  // dest
                        1,                  // count
                        MPI_INT,            // type
                        MPI_SUM,            // op
                        MPI_COMM_WORLD );   // communicator
        npts *= numParticipating;
#endif // defined(PARALLEL)

        // In our 9-point stencil, there are 11 floating point operations
        // per point (3 multiplies and 8 adds):
        //
        // newval = weight_center * centerval +
        //      weight_cardinal * (northval + southval + eastval + westval) +
        //      weight_diagonal * (neval + nwval + seval + swval)
        //
        // we do this stencil operation 'nIters' times
        unsigned long nflops = npts * 11 * nIters;
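        // Illustrative arithmetic (not from a measured run): a 4096x4096
        // problem with haloWidth == 1 gives npts = 4096*4096, so with
        // nIters == 1000 nflops is about 4096*4096*11*1000 ~= 1.85e11,
        // i.e. roughly 185 GFLOP per timed pass.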

#if defined(PARALLEL)
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        std::cout << "Performing " << nWarmupPasses << " warmup passes...";
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)

        for( unsigned int pass = 0; pass < nWarmupPasses; pass++ )
        {
            init(data);
            (*testStencil)( data, nIters );
        }
#if defined(PARALLEL)
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        std::cout << "done." << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)

#if defined(PARALLEL)
        MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        std::cout << "\nPerforming stencil operation on chosen device, "
            << nPasses << " passes.\n"
            << "Depending on chosen device, this may take a while."
            << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)

#if !defined(PARALLEL)
        std::cout << "At the end of each pass the number of validation\nerrors observed will be printed to the standard output."
            << std::endl;
#endif // !defined(PARALLEL)


        for( unsigned int pass = 0; pass < nPasses; pass++ )
        {
#if !defined(PARALLEL)
            std::cout << "pass " << pass << ": ";
#endif // !defined(PARALLEL)
            init( data );

            int timerHandle = Timer::Start();
            (*testStencil)( data, nIters );
            double elapsedTime = Timer::Stop( timerHandle, "CUDA stencil" );


            // find and report the computation rate
            double gflops = (nflops / elapsedTime) / 1e9;

            resultDB.AddResult( timerDesc,
                                    experimentDescriptionStr.str(),
                                    "GFLOPS",
                                    gflops );
            if( beVerbose )
            {
                std::cout << "observed result, pass " << pass << ":\n"
                    << data
                    << std::endl;
            }

            // validate the result
#if defined(PARALLEL)
            StencilValidater<T>* validater = new MPIStencilValidater<T>;
#else
            StencilValidater<T>* validater = new SerialStencilValidater<T>;
#endif // defined(PARALLEL)
            validater->ValidateResult( expected,
                            data,
                            valErrThreshold,
                            nValErrsToPrint );
        }
    }
    catch( ... )
    {
        // clean up - abnormal termination
        // wish we didn't have to do this, but C++ exceptions do not
        // support a try-catch-finally approach
        delete stdStencil;
        delete stdStencilFactory;
        delete testStencil;
        delete testStencilFactory;
        throw;
    }

    // clean up - normal termination
    delete stdStencil;
    delete stdStencilFactory;
    delete testStencil;
    delete testStencilFactory;
}
Example #8
File: main.cpp  Project: ManavA/shoc
// ****************************************************************************
// Method:  main()
//
// Purpose:
//   serial and parallel main for OpenCL level0 benchmarks
//
// Arguments:
//   argc, argv
//
// Programmer:  SHOC Team
// Creation:    The Epoch
//
// Modifications:
//   Jeremy Meredith, Tue Jan 12 15:09:33 EST 2010
//   Changed the way device selection works.  It now defaults to the device
//   index corresponding to the process's rank within a node if no devices
//   are specified on the command line, and otherwise, round-robins
//   the list of devices among the tasks.
//
//   Gabriel Marin, Tue Jun 01 15:38 EST 2010
//   Check that we have valid (not NULL) context and queue objects before
//   running the benchmarks. Errors inside CreateContextFromSingleDevice or
//   CreateCommandQueueForContextAndDevice were not propagated out to the main
//   program.
//
//   Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010
//   Split timing reports into detailed and summary.  For serial code, we
//   report all trial values, and for parallel, skip the per-process vals.
//   Also detect and print outliers from parallel runs.
//
// ****************************************************************************
int main(int argc, char *argv[])
{
    int ret = 0;

    try
    {
#ifdef PARALLEL
        int rank, size;
        MPI_Init(&argc,&argv);
        MPI_Comm_size(MPI_COMM_WORLD, &size);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        cout << "MPI Task "<< rank << "/" << size - 1 << " starting....\n";
#endif

        OptionParser op;

        //Add shared options to the parser
        op.addOption("platform", OPT_INT, "0", "specify OpenCL platform to use",
                'p');
        op.addOption("device", OPT_VECINT, "", "specify device(s) to run on", 'd');
        op.addOption("passes", OPT_INT, "10", "specify number of passes", 'n');
        op.addOption("size", OPT_VECINT, "1", "specify problem size", 's');
        op.addOption("infoDevices", OPT_BOOL, "",
                "show info for available platforms and devices", 'i');
        op.addOption("verbose", OPT_BOOL, "", "enable verbose output", 'v');
        op.addOption("quiet", OPT_BOOL, "", "write minimum necessary to standard output", 'q');

        addBenchmarkSpecOptions(op);

        if (!op.parse(argc, argv))
        {
#ifdef PARALLEL
            if (rank == 0)
                op.usage();
            MPI_Finalize();
#else
            op.usage();
#endif
            return (op.HelpRequested() ? 0 : 1 );
        }

        if (op.getOptionBool("infoDevices"))
        {
#define DEBUG_DEVICE_CONTAINER 0
#ifdef PARALLEL
            // execute following code only if I am the process of lowest
            // rank on this node
            NodeInfo NI;
            int mynoderank = NI.nodeRank();
            if (mynoderank==0)
            {
                int nlrrank, nlrsize;
                MPI_Comm nlrcomm = NI.getNLRComm();
                MPI_Comm_size(nlrcomm, &nlrsize);
                MPI_Comm_rank(nlrcomm, &nlrrank);

                OpenCLNodePlatformContainer ndc1;
                OpenCLMultiNodeContainer localMnc(ndc1);
                localMnc.doMerge (nlrrank, nlrsize, nlrcomm);
                if (rank==0)  // I am the global rank 0, print all configurations
                    localMnc.Print (cout);
            }
#else
            OpenCLNodePlatformContainer ndc1;
            ndc1.Print (cout);
#if DEBUG_DEVICE_CONTAINER
            OpenCLMultiNodeContainer mnc1(ndc1), mnc2;
            mnc1.Print (cout);
            ostringstream oss;
            mnc1.writeObject (oss);
            std::string temp(oss.str());
            cout << "Serialized MultiNodeContainer:\n" << temp;
            istringstream iss(temp);
            mnc2.readObject (iss);
            cout << "Unserialized object2:\n";
            mnc2.Print (cout);
            mnc1.merge (mnc2);
            cout << "==============\nObject1 after merging 1:\n";
            mnc1.Print (cout);
            mnc1.merge (mnc2);
            cout << "==============\nObject1 after merging 2:\n";
            mnc1.Print (cout);
#endif  // DEBUG
#endif  // PARALLEL
            return (0);
        }

        bool verbose = op.getOptionBool("verbose");

        // The device option supports specifying more than one device
        // for now, just choose the first one.
        int platform = op.getOptionInt("platform");

#ifdef PARALLEL
        NodeInfo ni;
        int myNodeRank = ni.nodeRank();
        if (verbose)
            cout << "Global rank " << rank << " is local rank " << myNodeRank << endl;
#else
        int myNodeRank = 0;
#endif

        // If they haven't specified any devices, assume they
        // want the process with in-node rank N to use device N
        int deviceIdx = myNodeRank;

        // If they have, then round-robin the list of devices
        // among the processes on a node.
        vector<long long> deviceVec = op.getOptionVecInt("device");
        if (deviceVec.size() > 0)
        {
            int len = deviceVec.size();
            deviceIdx = deviceVec[myNodeRank % len];
        }

        // Check for an erroneous device
        if (deviceIdx >= GetNumOclDevices(platform)) {
            cerr << "Warning: device index: " << deviceIdx
                 << " out of range, defaulting to device 0.\n";
            deviceIdx = 0;
        }

        // Initialization
        if (verbose) cout << ">> initializing\n";
        cl_device_id devID = ListDevicesAndGetDevice(platform, deviceIdx);
        cl_int clErr;
        cl_context ctx = clCreateContext( NULL,     // properties
                                            1,      // number of devices
                                            &devID, // device
                                            NULL,   // notification function
                                            NULL,
                                            &clErr );
        CL_CHECK_ERROR(clErr);
        cl_command_queue queue = clCreateCommandQueue( ctx,
                                                        devID,
                                                        CL_QUEUE_PROFILING_ENABLE,
                                                        &clErr );
        CL_CHECK_ERROR(clErr);
        ResultDatabase resultDB;

        // Run the benchmark
        RunBenchmark(devID, ctx, queue, resultDB, op);

        clReleaseCommandQueue( queue );
        clReleaseContext( ctx );


#ifndef PARALLEL
        resultDB.DumpDetailed(cout);
#else
        ParallelResultDatabase pardb;
        pardb.MergeSerialDatabases(resultDB,MPI_COMM_WORLD);
        if (rank==0)
        {
            pardb.DumpSummary(cout);
            pardb.DumpOutliers(cout);
        }
#endif
    }
    catch( std::exception& e )
    {
        std::cerr << e.what() << std::endl;
        ret = 1;
    }
    catch( ... )
    {
        std::cerr << "unrecognized exception caught" << std::endl;
        ret = 1;
    }

#ifdef PARALLEL
    MPI_Finalize();
#endif

    return ret;
}
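
Several of these listings lean on a CL_CHECK_ERROR macro whose definition is not shown. A minimal sketch of what such a macro typically does (an assumption; SHOC's actual macro may translate the error code to a symbolic name or clean up differently):

#include <CL/cl.h>
#include <cstdlib>
#include <iostream>

// Hypothetical sketch of an OpenCL error-check macro: abort with the error
// code and source location if the preceding call did not return CL_SUCCESS.
#define CL_CHECK_ERROR(err)                                                \
    do {                                                                   \
        if ((err) != CL_SUCCESS) {                                         \
            std::cerr << "OpenCL error " << (err) << " at "                \
                      << __FILE__ << ":" << __LINE__ << std::endl;         \
            std::exit(1);                                                  \
        }                                                                  \
    } while (0)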
Example #9
void RunBenchmark(OptionParser &op, ResultDatabase &resultDB)
{
    const bool verbose = op.getOptionBool("verbose");

    // Sizes are in kb
    const int nSizes  = 17;
    int sizes[nSizes] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,
        32768, 65536};
    
    long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
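    // sizes[] is in kB, so the largest size times 1024 gives bytes, and
    // dividing by sizeof(float) (4 bytes) gives the float count to allocate.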

    // Create host memory
    hostMem = (float*)_mm_malloc(numMaxFloats*sizeof(float),ALIGN);

    if(hostMem==NULL)
    {
        cerr << "Couldn't allocate CPU memory! \n";
        cerr << "Test failed." << endl;
        return;
    }
    // Initialize memory with some pattern.
    for (int i = 0; i < numMaxFloats; i++)
    {
        hostMem[i] = i % 77;
    }

    const unsigned int passes = op.getOptionInt("passes");
    int micdev = op.getOptionInt("target");

    // Allocate memory on the card
    #pragma offload target(mic:micdev) \
    nocopy(hostMem:length(numMaxFloats) alloc_if(1) free_if(0) align(ALIGN) ) 
    {
    }

    // Run 'passes' passes, stepping through the sizes forward and backward
    for (int pass = 0; pass < passes; pass++)
    {
        // Step through sizes forward on even passes and backward on odd
        for (int i = 0; i < nSizes; i++)
        {
            int sizeIndex;
            if ((pass % 2) == 0)
            {
                sizeIndex = i;
            }
            else
            {
                sizeIndex = (nSizes - 1) - i;
            }

            int nbytes = sizes[sizeIndex] * 1024;

            //  D->H test
            double start = curr_second();

            #pragma offload target(mic:micdev) \
            out(hostMem:length((1024*sizes[sizeIndex]/4)) \
            free_if(0) alloc_if(0)  )
            {
            }
            double t = curr_second()-start;

            if (verbose)
            {
                cerr << "Size " << sizes[sizeIndex] << "k took " << t <<
                    " sec\n";
            }

            double speed = (double(sizes[sizeIndex]) * 1024 / 
                    (1000. * 1000. * 1000.)) / t;
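            // Illustrative units check (not measured data): a 65536 kB
            // readback is 65536*1024 bytes ~= 0.067 GB; if it takes 0.01 s,
            // the reported speed is about 6.7 GB/sec.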
            char sizeStr[256];
            sprintf(sizeStr, "% 6dkB", sizes[sizeIndex]);
            resultDB.AddResult("ReadbackSpeed", sizeStr, "GB/sec", speed);
            resultDB.AddResult("ReadbackTime", sizeStr, "ms", t*1000);
        }
    }
    // Free memory allocated on the mic
    #pragma offload target(mic:micdev) \
    in(hostMem:length(numMaxFloats) alloc_if(0)  )
    {
    }

    // Cleanup
    _mm_free(hostMem);
}
Example #10
void
DoTest( const char* timerDesc, ResultDatabase& resultDB, OptionParser& opts )
{
    StencilFactory<T>* stdStencilFactory = NULL;
    Stencil<T>* stdStencil = NULL;
    StencilFactory<T>* testStencilFactory = NULL;
    Stencil<T>* testStencil = NULL;

    //try
    {
        stdStencilFactory = new HostStencilFactory<T>;
        testStencilFactory = new MICStencilFactory<T>;
        assert( (stdStencilFactory != NULL) && (testStencilFactory != NULL) );

        // do a sanity check on option values
        CheckOptions( opts );
        stdStencilFactory->CheckOptions( opts );
        testStencilFactory->CheckOptions( opts );

        // extract and validate options
        std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" );
        if( arrayDims.size() != 2 )
        {
            cerr << "Dim size: " << arrayDims.size() << "\n";
            //throw InvalidArgValue( "all overall dimensions must be positive" );
        }
        if (arrayDims[0] == 0) // User has not specified a custom size
        {
            const int probSizes[4] = { 768, 1408, 2048, 4096 };
            int sizeClass = opts.getOptionInt("size");
            if (!(sizeClass >= 1 && sizeClass <= 4))
            {
                //throw InvalidArgValue( "Size class must be between 1-4" );
            }
            arrayDims[0] = arrayDims[1] = probSizes[sizeClass - 1];
        }

        long int seed = (long)opts.getOptionInt( "seed" );
        bool beVerbose = opts.getOptionBool( "verbose" );
        unsigned int nIters = (unsigned int)opts.getOptionInt( "num-iters" );
        double valErrThreshold = (double)opts.getOptionFloat( "val-threshold" );
        unsigned int nValErrsToPrint = (unsigned int)opts.getOptionInt( "val-print-limit" );

#if defined(PARALLEL)
        unsigned int haloWidth = (unsigned int)opts.getOptionInt( "iters-per-exchange" );
#else
        unsigned int haloWidth = 1;
#endif // defined(PARALLEL)

        float haloVal = (float)opts.getOptionFloat( "haloVal" );

        // build a description of this experiment
        std::ostringstream experimentDescriptionStr;
        experimentDescriptionStr 
            << nIters << ':'
            << arrayDims[0] << 'x' << arrayDims[1] << ':'
            << LROWS << 'x' << LCOLS;

        unsigned int nPasses = (unsigned int)opts.getOptionInt( "passes" );
        unsigned long npts = (arrayDims[0] + 2*haloWidth - 2) *
                             (arrayDims[1] + 2*haloWidth - 2);

        unsigned long nflops = npts * 11 * nIters;
        cout << "flops are = " << nflops << endl;

        // compute the expected result on the host
#if defined(PARALLEL)
        int cwrank;
        MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        std::cout << "\nPerforming stencil operation on host for later comparison with MIC output\n"
            << "Depending on host capabilities, this may take a while."
            << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)
        Matrix2D<T> exp( arrayDims[0] + 2*haloWidth, 
                            arrayDims[1] + 2*haloWidth );
        Initialize<T> init( seed,
                        haloWidth,
                        haloVal );	

        init( exp );
        if( beVerbose )
        {
            std::cout << "initial state:\n" << exp << std::endl;
        }
        // Assign to the outer-scope stdStencil so the cleanup code can delete it.
        stdStencil = stdStencilFactory->BuildStencil( opts );
        (*stdStencil)( exp, nIters );
        if( beVerbose )
        {
            std::cout << "expected result:\n" << exp << std::endl;
        }
	

        // compute the result on the MIC device
        Matrix2D<T> data( arrayDims[0] + 2*haloWidth, 
                            arrayDims[1] + 2*haloWidth );
        // Assign to the outer-scope testStencil so the cleanup code can delete it.
        testStencil = testStencilFactory->BuildStencil( opts );

#if defined(PARALLEL)
        MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        std::cout << "\nPerforming stencil operation on chosen device, " 
            << nPasses << " passes.\n"
            << "Depending on chosen device, this may take a while."
            << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)

#if !defined(PARALLEL)
        std::cout << "At the end of each pass the number of validation\nerrors observed will be printed to the standard output." 
            << std::endl;
#endif // !defined(PARALLEL)
	std::cout<<"Passes:"<<nPasses<<endl;
        for( unsigned int pass = 0; pass < nPasses; pass++ )
        {
            init( data );

            double start = curr_second();
            (*testStencil)( data, nIters );
            double elapsedTime = curr_second()-start;
            double gflops = (nflops / elapsedTime) / 1e9;
            resultDB.AddResult( timerDesc,
                                    experimentDescriptionStr.str(),
                                    "GFLOPS",
                                    gflops );
            if( beVerbose )
            {
                std::cout << "observed result, pass " << pass << ":\n" 
                    << data 
                    << std::endl;
            }

            // validate the result
#if defined(PARALLEL)
            //StencilValidater<T>* validater = new MPIStencilValidater<T>;
#else
            //StencilValidater<T>* validater = new SerialStencilValidater<T>;            
#endif // defined(PARALLEL)
            MICValidate( exp, data, valErrThreshold, nValErrsToPrint );
            /*validater->ValidateResult( exp,
                            data,
                            valErrThreshold,
                            nValErrsToPrint );*/
        }
    }
    /*
    catch( ... )
    {
        // clean up - abnormal termination
        // wish we didn't have to do this, but C++ exceptions do not 
        // support a try-catch-finally approach
        delete stdStencil;
        delete stdStencilFactory;
        delete testStencil;
        delete testStencilFactory;
        throw;
    }*/

    // clean up - normal termination
    delete stdStencil;
    delete stdStencilFactory;
    delete testStencil;
    delete testStencilFactory;
}
Example #11
// ****************************************************************************
// Function: RunBenchmark
//
// Purpose:
//   Executes a series of arithmetic benchmarks for OpenCL devices. 
//   OpenCL kernels are auto-generated based on the values in the
//   _benchmark_type structures.
//   The benchmark tests throughput for add, multiply, multiply-add and 
//   multiply+multiply-add series of operations, for 1, 2, 4 and 8
//   independent streams.
//
// Arguments:
//   ctx: the opencl context to use for the benchmark
//   queue: the opencl command queue to issue commands to
//   resultDB: results from the benchmark are stored in this db
//   op: the options parser (contains input parameters)
//
// Returns:  nothing
//
// Programmer: Gabriel Marin
// Creation: June 26, 2009
//
// Modifications:
//
// ****************************************************************************
void RunBenchmark(cl::Device& devcpp,
                  cl::Context& ctxcpp,
                  cl::CommandQueue& queuecpp,
                  ResultDatabase &resultDB,
                  OptionParser &op)
{
    // convert from C++ bindings to C bindings
    // TODO propagate use of C++ bindings
    cl_device_id id = devcpp();
    cl_context ctx = ctxcpp();
    cl_command_queue queue = queuecpp();

    int npasses  = op.getOptionInt("passes");
    bool verbose = op.getOptionBool("verbose");
    bool quiet = op.getOptionBool("quiet");


    int err;
    cl_mem mem1;
    float *hostMem, *hostMem2;
    size_t maxGroupSize = 1;
    size_t localWorkSize = 1;

    // Seed the random number generator
    srand48(8650341L);
    
    // To prevent this benchmark from taking too long to run, we 
    // calibrate how many repetitions of each test to execute. To do this we
    // run one pass through a multiply-add benchmark and then adjust
    // the repeat factor based on runtime. Use MAdd4 for this.
    int aIdx = 0;
    float repeatF = 1.0f;
    // Find the index of the MAdd4 benchmark
    while ((aTests!=0) && (aTests[aIdx].name!=0) && 
        strcmp(aTests[aIdx].name,"MAdd4"))
    {   
       aIdx += 1;
    }
    if (aTests && aTests[aIdx].name)  // we found a benchmark with that name
    {
       struct _benchmark_type temp = aTests[aIdx];
       // Use a small, fixed repetition count for this calibration run
       temp.numRepeats = 10;

       // Kernel will be generated into this stream
       ostringstream oss;
       generateKernel (oss, temp, "float", "");
       std::string kernelCode(oss.str());
       
       // Allocate host memory
       int halfNumFloatsMax = temp.halfBufSizeMax*1024/4;
       int numFloatsMax = 2*halfNumFloatsMax;
       hostMem = new float[numFloatsMax];
       hostMem2 = new float[numFloatsMax];
       
       // Allocate device memory
       mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
                                 sizeof(float)*numFloatsMax, NULL, &err);
       CL_CHECK_ERROR(err);

       err = clEnqueueWriteBuffer(queue, mem1, true, 0,
                               numFloatsMax*sizeof(float), hostMem,
                               0, NULL, NULL);
       CL_CHECK_ERROR(err);
       
       // Create the kernel program
       const char* progSource[] = {kernelCode.c_str()};
       cl_program prog = clCreateProgramWithSource(ctx, 1, progSource, 
                                                   NULL, &err);
       CL_CHECK_ERROR(err);
       
       // Compile the kernel
       err = clBuildProgram(prog, 0, NULL, opts, NULL, NULL);
       CL_CHECK_ERROR(err);
       
       // Extract out madd kernel 
       cl_kernel kernel_madd = clCreateKernel(prog, temp.name, &err);
       CL_CHECK_ERROR(err);
       
       // Set kernel arguments
       err = clSetKernelArg (kernel_madd, 0, sizeof(cl_mem), (void*)&mem1);
       CL_CHECK_ERROR (err);
       err = clSetKernelArg (kernel_madd, 1, sizeof(cl_int), 
                             (void*)&temp.numRepeats);
       CL_CHECK_ERROR (err);
       
       // Determine the maximum work group size for this kernel
       maxGroupSize = getMaxWorkGroupSize(id);
       // use min(maxGroupSize, 128)
       localWorkSize = maxGroupSize<128?maxGroupSize:128;
       
       // Initialize host data, with the first half the same as the second
       for (int j=0; j<halfNumFloatsMax; ++j) 
       {
           hostMem[j] = hostMem[numFloatsMax-j-1] = (float)(drand48()*5.0);
       }
       // Set global work size
       size_t globalWorkSize = numFloatsMax;
       
       Event evCopyMem("CopyMem");
       err = clEnqueueWriteBuffer (queue, mem1, true, 0,
                                   numFloatsMax*sizeof(float), hostMem,
                                   0, NULL, &evCopyMem.CLEvent());
       CL_CHECK_ERROR (err);
       // Wait for transfer to finish      
       err = clWaitForEvents (1, &evCopyMem.CLEvent());
       CL_CHECK_ERROR (err);

       Event evKernel(temp.name);
       err = clEnqueueNDRangeKernel (queue, kernel_madd, 1, NULL,
                                 &globalWorkSize, &localWorkSize,
                                 0, NULL, &evKernel.CLEvent());
       CL_CHECK_ERROR (err);
       // Wait for kernel to finish      
       err = clWaitForEvents (1, &evKernel.CLEvent());
       CL_CHECK_ERROR (err);
             
       evKernel.FillTimingInfo();
       // Calculate repeat factor based on kernel runtime
       double tt = double(evKernel.SubmitEndRuntime());
       repeatF = 1.1e07 / tt;
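       // OpenCL event profiling reports times in nanoseconds, so this scales
       // the repeat factor toward a kernel runtime of roughly 1.1e7 ns
       // (11 ms): faster kernels get proportionally more repetitions.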
       cout << "Adjust repeat factor = " << repeatF << endl;
       
       // Clean up
       err = clReleaseKernel (kernel_madd);
       CL_CHECK_ERROR(err);
       err = clReleaseProgram (prog);
       CL_CHECK_ERROR(err);
       err = clReleaseMemObject(mem1);
       CL_CHECK_ERROR(err);

       delete[] hostMem;
       delete[] hostMem2;
    }
    
    // Compute total number of kernel runs
    int totalRuns = 0;
    aIdx = 0;
    while ((aTests!=0) && (aTests[aIdx].name!=0))
    {
        for (int halfNumFloats=aTests[aIdx].halfBufSizeMin*1024 ; 
             halfNumFloats<=aTests[aIdx].halfBufSizeMax*1024 ;
             halfNumFloats*=aTests[aIdx].halfBufSizeStride)
        {
            totalRuns += npasses;
        }
        aIdx += 1;
    }
    // Account for custom kernels
    totalRuns += 2 * npasses; 
    
    // check for double precision support
    int hasDoubleFp = 0;
    string doublePragma = "";
    if (checkExtension(id, "cl_khr_fp64")) {
        hasDoubleFp = 1;
        doublePragma = "#pragma OPENCL EXTENSION cl_khr_fp64: enable";
    } else
    if (checkExtension(id, "cl_amd_fp64")) {
        hasDoubleFp = 1;
        doublePragma = "#pragma OPENCL EXTENSION cl_amd_fp64: enable";
    }
    
    // Double the number of passes if double precision support found
    if (hasDoubleFp) {
        cout << "DP Supported" << endl;
        totalRuns <<= 1;
    } else
        cout << "DP Not Supported" << endl;

    ProgressBar pb(totalRuns);
    if (!verbose && !quiet)
        pb.Show(stdout);

    RunTest<float> (id, ctx, queue, resultDB, npasses, verbose, quiet,
             repeatF, localWorkSize, pb, "float", "-SP", "");

    if (hasDoubleFp)
        RunTest<double> (id, ctx, queue, resultDB, npasses, verbose, quiet,
             repeatF, localWorkSize, pb, "double", "-DP", doublePragma.c_str());
    else
    {
        aIdx = 0;
        const char atts[] = "DP_Not_Supported";
        while ((aTests!=0) && (aTests[aIdx].name!=0))
        {   
            for (int pas=0 ; pas<npasses ; ++pas) 
            {
                resultDB.AddResult(string(aTests[aIdx].name)+"-DP" , atts, "GFLOPS", FLT_MAX);
            }
            aIdx += 1;
        }
        for (int pas=0 ; pas<npasses ; ++pas) 
        {
            resultDB.AddResult("MulMAddU-DP", atts, "GFLOPS", FLT_MAX);
            resultDB.AddResult("MAddU-DP", atts, "GFLOPS", FLT_MAX);
        }
    }
    if (!verbose)
        fprintf (stdout, "\n\n");
}
Example #12
File: FFT.cpp  Project: ellen-hl/shoc-mic
void RunTest(const string& name, ResultDatabase &resultDB, OptionParser &op)
{
    static __declspec(target(mic)) T2 *source;
    int chk;
    unsigned long bytes = 0;
    const int micdev = op.getOptionInt("target");
    const bool verbose = op.getOptionBool("verbose");
    
    // Get problem size
    if (op.getOptionInt("MB") == 0) {
        int probSizes[4] = { 1, 8, 96, 256 };
        int sizeIndex = op.getOptionInt("size")-1;
        if (sizeIndex < 0 || sizeIndex >= 4) {
            cerr << "Invalid size index specified\n";
            exit(-1);
        }
        bytes = probSizes[sizeIndex];
    } else {
        bytes = op.getOptionInt("MB");
    }
    // Convert the MB count to bytes
    bytes *= 1024 * 1024;

    int passes = op.getOptionInt("passes");

    // The size of the transform computed is fixed at 512 complex elements
    int fftsz = 512;        
    int N = (bytes)/sizeof(T2);
    int n_ffts = N/fftsz;

    // Allocate space (aligned)
    source = (T2*) MKL_malloc(bytes,  4096);

    //allocate buffers and create FFT plans
    #pragma offload target(mic:micdev) in(fftsz, n_ffts)       \
                                       nocopy(source:length(N) \
                                       align(4096) alloc_if(1) free_if(0))
    {
        forward((T2*)NULL, fftsz, n_ffts);
        inverse((T2*)NULL, fftsz, n_ffts);
    }

    const char *sizeStr;
    stringstream ss;
    ss << "N=" << (long)N;
    sizeStr = strdup(ss.str().c_str());

    for(int k = 0; k < passes; k++)
    {
        init<T2>( source, fftsz, n_ffts );
        // Warmup
        if (k==0)
        {
            #pragma offload target(mic:micdev) in(fftsz, n_ffts)   \
                                               in(source:length(N) \
                                               alloc_if(0)  free_if(0))
            {
                forward(source, fftsz, n_ffts);
            }
        }

        // Time forward fft with data transfer over PCIe
        double time_fwd_pcie = -curr_second();
        
        // Using in rather than inout to be consistent with CUDA version.
        #pragma offload target(mic:micdev) in(fftsz, n_ffts)               \
                                           in(source:length(N) alloc_if(0) \
                                           free_if(0))
        {
            forward(source, fftsz, n_ffts);
        }
        time_fwd_pcie += curr_second();
        #pragma offload target(mic:micdev) out(source:length(N) alloc_if(0) \
                free_if(0))
        {
        }

        // Time inverse fft with data transfer over PCIe
        double time_inv_pcie = -curr_second();
        #pragma offload target(mic:micdev) in(fftsz, n_ffts)   \
                                           in(source:length(N) \
                                           alloc_if(0)  free_if(0))
        {
            inverse(source, fftsz, n_ffts);
        }
        time_inv_pcie += curr_second();

        #pragma offload target(mic:micdev) out(source:length(N) alloc_if(0) \
            free_if(0))
        {}

        // Check result
        #pragma offload target(mic:micdev) in(fftsz,n_ffts) nocopy(source) \
            out(chk)
        {
            chk = checkDiff(source, fftsz, n_ffts);
        }
        if (verbose || chk)
        {
            cout << "Test " << k << ((chk) ? ": Failed\n" : ": Passed\n");
        }

        // Time forward fft without data transfer
        double time_fwd_native = -curr_second();
        #pragma offload target(mic:micdev) in(fftsz, n_ffts) nocopy(source)
        {
            forward(source, fftsz, n_ffts);
        }
        time_fwd_native += curr_second();

        // Time inverse fft without data transfer
        double time_inv_native = -curr_second();
        #pragma offload target(mic:micdev) in(fftsz, n_ffts) nocopy(source)
        {
            inverse(source, fftsz, n_ffts);
        }
        time_inv_native += curr_second();

        // Calculate gflops
        double flop_count    = n_ffts*(5*fftsz*log2(fftsz));
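        // 5*N*log2(N) is the usual flop estimate for a complex FFT of length
        // N; with fftsz = 512, log2(512) = 9, so each transform is counted
        // as 5*512*9 = 23040 flops (illustrative arithmetic).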
        double GF_fwd_pcie   = flop_count / (time_fwd_pcie   * 1e9);
        double GF_fwd_native = flop_count / (time_fwd_native * 1e9);
        double GF_inv_pcie   = flop_count / (time_inv_pcie   * 1e9);
        double GF_inv_native = flop_count / (time_inv_native * 1e9);

        resultDB.AddResult(name, sizeStr, "GFLOPS", GF_fwd_native);
        resultDB.AddResult(name+"_PCIe", sizeStr, "GFLOPS", GF_fwd_pcie);
        resultDB.AddResult(name+"_Parity", sizeStr, "N", 
                (time_fwd_pcie - time_fwd_native) / time_fwd_native);

        resultDB.AddResult(name+"-INV", sizeStr, "GFLOPS", GF_inv_native);
        resultDB.AddResult(name+"-INV_PCIe", sizeStr, "GFLOPS", GF_inv_pcie);
        resultDB.AddResult(name+"-INV_Parity", sizeStr, "N", 
                (time_inv_pcie - time_inv_native) / time_inv_native);
    }

    // Cleanup FFT plans and buffers
    #pragma offload target(mic:micdev) nocopy(source:length(N) \
            alloc_if(0) free_if(1))
    {
        forward((T2*)NULL, 0, 0);
        inverse((T2*)NULL, 0, 0);
    }
    MKL_free(source);
}
Example #13
File: main.cpp  Project: Poojachitral/shoc
// ****************************************************************************
// Function: main
//
// Purpose:
//   The main function takes care of initialization (device and MPI),  then
//   performs the benchmark and prints results.
//
// Arguments:
//
//
// Programmer: Jeremy Meredith
// Creation:
//
// Modifications:
//   Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010
//   Split timing reports into detailed and summary.  For serial code, we
//   report all trial values, and for parallel, skip the per-process vals.
//   Also detect and print outliers from parallel runs.
//
// ****************************************************************************
int main(int argc, char *argv[])
{
    int ret = 0;
    bool noprompt = false;

    try
    {
#ifdef PARALLEL
        int rank, size;
        MPI_Init(&argc,&argv);
        MPI_Comm_size(MPI_COMM_WORLD, &size);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        cerr << "MPI Task " << rank << "/" << size - 1 << " starting....\n";
#endif

        // Get args
        OptionParser op;
       
        //Add shared options to the parser
        op.addOption("device", OPT_VECINT, "0",
                "specify device(s) to run on", 'd');
        op.addOption("verbose", OPT_BOOL, "", "enable verbose output", 'v');
        op.addOption("passes", OPT_INT, "10", "specify number of passes", 'n');
        op.addOption("size", OPT_INT, "1", "specify problem size", 's');
        op.addOption("infoDevices", OPT_BOOL, "",
                "show info for available platforms and devices", 'i');
        op.addOption("quiet", OPT_BOOL, "", "write minimum necessary to standard output", 'q');
#ifdef _WIN32
        op.addOption("noprompt", OPT_BOOL, "", "don't wait for prompt at program exit");
#endif

        addBenchmarkSpecOptions(op);

        if (!op.parse(argc, argv))
        {
#ifdef PARALLEL
            if (rank == 0)
                op.usage();
            MPI_Finalize();
#else
            op.usage();
#endif
            return (op.HelpRequested() ? 0 : 1);
        }
        
        bool verbose = op.getOptionBool("verbose");
        bool infoDev = op.getOptionBool("infoDevices");
#ifdef _WIN32
        noprompt = op.getOptionBool("noprompt");
#endif

        int device;
#ifdef PARALLEL
        NodeInfo ni;
        int myNodeRank = ni.nodeRank();
        vector<long long> deviceVec = op.getOptionVecInt("device");
        if (myNodeRank >= deviceVec.size()) {
            // Default is for task i to test device i
            device = myNodeRank;
        } else {
            device = deviceVec[myNodeRank];
        }
#else
        device = op.getOptionVecInt("device")[0];
#endif
        int deviceCount;
        cudaGetDeviceCount(&deviceCount);
        if (device >= deviceCount) {
            cerr << "Warning: device index: " << device <<
            " out of range, defaulting to device 0.\n";
            device = 0;
        }

        // Initialization
        EnumerateDevicesAndChoose(device, infoDev);
        if( infoDev )
        {
            return 0;
        }
        ResultDatabase resultDB;

        // Run the benchmark
        RunBenchmark(resultDB, op);

#ifndef PARALLEL
        resultDB.DumpDetailed(cout);
#else
        ParallelResultDatabase pardb;
        pardb.MergeSerialDatabases(resultDB,MPI_COMM_WORLD);
        if (rank==0)
        {
            pardb.DumpSummary(cout);
            pardb.DumpOutliers(cout);
        }
#endif

    }
    catch( InvalidArgValue& e )
    {
        std::cerr << e.what() << ": " << e.GetMessage() << std::endl;
        ret = 1;
    }
    catch( std::exception& e )
    {
        std::cerr << e.what() << std::endl;
        ret = 1;
    }
    catch( ... )
    {
        ret = 1;
    }


#ifdef PARALLEL
    MPI_Finalize();
#endif

#ifdef _WIN32
    if (!noprompt)
    {
        cout << "Press return to exit\n";
        cin.get();
    }
#endif

    return ret;
}
Example #14
void 
init(OptionParser& op, bool _do_dp)
{
    cl_int err;

    do_dp = _do_dp;
    
    if (!fftCtx) {
        // first get the device
        int device, platform = op.getOptionInt("platform");
        if (op.getOptionVecInt("device").size() > 0) {
            device = op.getOptionVecInt("device")[0];
        }
        else {
            device = 0;
        }
        fftDev = ListDevicesAndGetDevice(platform, device);

        // now get the context
        fftCtx = clCreateContext(NULL, 1, &fftDev, NULL, NULL, &err);
        CL_CHECK_ERROR(err);
    }

    if (!fftQueue) {
        // get a queue
        fftQueue = clCreateCommandQueue(fftCtx, fftDev, CL_QUEUE_PROFILING_ENABLE,
                                        &err);
        CL_CHECK_ERROR(err);
    }
    
    // create the program...
    fftProg = clCreateProgramWithSource(fftCtx, 1, &cl_source_fft, NULL, &err);
    CL_CHECK_ERROR(err);
    
    // ...and build it
    string args = " -cl-mad-enable ";
    if (op.getOptionBool("use-native")) {
        args += " -cl-fast-relaxed-math ";
    }
    if (!do_dp) {
        args += " -DSINGLE_PRECISION ";
    }
    else if (checkExtension(fftDev, "cl_khr_fp64")) {
        args += " -DK_DOUBLE_PRECISION ";
    }
    else if (checkExtension(fftDev, "cl_amd_fp64")) {
        args += " -DAMD_DOUBLE_PRECISION ";
    }
    // Keep the build status separately; the build-log queries below overwrite 'err'.
    cl_int buildErr = clBuildProgram(fftProg, 0, NULL, args.c_str(), NULL, NULL);
    {
        char* log = NULL;
        size_t bytesRequired = 0;
        err = clGetProgramBuildInfo(fftProg, 
                                    fftDev, 
                                    CL_PROGRAM_BUILD_LOG,
                                    0,
                                    NULL,
                                    &bytesRequired );
        log = (char*)malloc( bytesRequired + 1 );
        err = clGetProgramBuildInfo(fftProg, 
                                    fftDev, 
                                    CL_PROGRAM_BUILD_LOG,
                                    bytesRequired,
                                    log,
                                    NULL );
        std::cout << log << std::endl;
        free( log );
    }
    if (buildErr != CL_SUCCESS) {
        char log[50000];
        size_t retsize = 0;
        err = clGetProgramBuildInfo(fftProg, fftDev, CL_PROGRAM_BUILD_LOG,
                                    50000*sizeof(char),  log, &retsize);
        CL_CHECK_ERROR(err);
        cout << "Retsize: " << retsize << endl;
        cout << "Log: " << log << endl;
        dumpPTXCode(fftCtx, fftProg, "oclFFT");
        exit(-1);
    }
    else {
        // dumpPTXCode(fftCtx, fftProg, "oclFFT");
    }

    // Create kernel for forward FFT
    fftKrnl = clCreateKernel(fftProg, "fft1D_512", &err);
    CL_CHECK_ERROR(err);
    // Create kernel for inverse FFT
    ifftKrnl = clCreateKernel(fftProg, "ifft1D_512", &err);
    CL_CHECK_ERROR(err);
    // Create kernel for check
    chkKrnl = clCreateKernel(fftProg, "chk1D_512", &err);
    CL_CHECK_ERROR(err);
}
Example #15
//  Modifications:
//    Jeremy Meredith, Wed Dec  1 17:05:27 EST 2010
//    Added calculation of latency estimate.
//
void RunBenchmark(cl::Device& devcpp,
                  cl::Context& ctxcpp,
                  cl::CommandQueue& queuecpp,
                  ResultDatabase &resultDB,
                  OptionParser &op)
{
    // convert from C++ bindings to C bindings
    // TODO propagate use of C++ bindings
    cl_device_id id = devcpp();
    cl_context ctx = ctxcpp();
    cl_command_queue queue = queuecpp();

    bool verbose = op.getOptionBool("verbose");
    bool pinned = !op.getOptionBool("nopinned");
    int  npasses = op.getOptionInt("passes");
    const bool waitForEvents = true;

    // Sizes are in kb
    int nSizes  = 20;
    int sizes[20] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,
		     32768,65536,131072,262144,524288};

    // Make sure we don't exceed the OpenCL max allocation limit.
    cl_long maxAllocSizeBytes = 0;
    clGetDeviceInfo(id, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
                    sizeof(cl_long), &maxAllocSizeBytes, NULL);
    while (sizes[nSizes-1]*1024 > 0.90 * maxAllocSizeBytes)
    {
        --nSizes;
        if (verbose) cout << " - dropping allocation size to keep under reported limit.\n";
        if (nSizes < 1)
        {
            cerr << "Error: OpenCL reported a max allocation size less than 1kB.\b";
            return;
        }
    }

    // Create some host memory pattern
    if (verbose) cout << ">> creating host mem pattern\n";
    int err;
    float *hostMem;
    cl_mem hostMemObj;
    long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
    if (pinned)
    {
	hostMemObj = clCreateBuffer(ctx,
				    CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
				    sizeof(float)*numMaxFloats, NULL, &err);
        if (err == CL_SUCCESS)
        {
            hostMem = (float*)clEnqueueMapBuffer(queue, hostMemObj, true,
                                                 CL_MAP_READ|CL_MAP_WRITE,
                                                 0,sizeof(float)*numMaxFloats,0,
                                                 NULL,NULL,&err);
        }
	while (err != CL_SUCCESS)
	{
	    // drop the size and try again
	    if (verbose) cout << " - dropping size allocating pinned mem\n";
	    --nSizes;
	    if (nSizes < 1)
	    {
		cerr << "Error: Couldn't allocated any pinned buffer\n";
		return;
	    }
	    numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
	    hostMemObj = clCreateBuffer(ctx,
					CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
					sizeof(float)*numMaxFloats, NULL, &err);
            if (err == CL_SUCCESS)
            {
                hostMem = (float*)clEnqueueMapBuffer(queue, hostMemObj, true,
                                                     CL_MAP_READ|CL_MAP_WRITE,
                                                     0,sizeof(float)*numMaxFloats,0,
                                                     NULL,NULL,&err);
            }
	}
    }
    else
    {
        hostMem = new float[numMaxFloats];
    }
    for (int i=0; i<numMaxFloats; i++)
        hostMem[i] = i % 77;

    // Allocate some device memory
    if (verbose) cout << ">> allocating device mem\n";
    cl_mem mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
                                 sizeof(float)*numMaxFloats, NULL, &err);
    while (err != CL_SUCCESS)
    {
	// drop the size and try again
	if (verbose) cout << " - dropping size allocating device mem\n";
	--nSizes;
	if (nSizes < 1)
	{
	    cerr << "Error: Couldn't allocated any device buffer\n";
	    return;
	}
	numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
	mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
			      sizeof(float)*numMaxFloats, NULL, &err);
    }
    if (verbose) cout << ">> filling device mem to force allocation\n";
    Event evDownloadPrime("DownloadPrime");
    err = clEnqueueWriteBuffer(queue, mem1, false, 0,
                               numMaxFloats*sizeof(float), hostMem,
                               0, NULL, &evDownloadPrime.CLEvent());
    CL_CHECK_ERROR(err);
    if (verbose) cout << ">> waiting for download to finish\n";
    err = clWaitForEvents(1, &evDownloadPrime.CLEvent());
    CL_CHECK_ERROR(err);

    // Run 'npasses' passes, stepping through the sizes forward and backward
    for (int pass = 0; pass < npasses; pass++)
    {
        // store the times temporarily to estimate latency
        //float times[nSizes];
        // Step through sizes forward on even passes and backward on odd
        for (int i = 0; i < nSizes; i++)
        {
            int sizeIndex;
            if ((pass%2) == 0)
                sizeIndex = i;
            else
                sizeIndex = (nSizes-1) - i;

            // Copy input memory to the device
            if (verbose) cout << ">> copying to device "<<sizes[sizeIndex]<<"kB\n";
            Event evDownload("Download");
            err = clEnqueueWriteBuffer(queue, mem1, false, 0,
                                       sizes[sizeIndex]*1024, hostMem,
                                       0, NULL, &evDownload.CLEvent());
            CL_CHECK_ERROR(err);

            // Wait for event to finish
            if (verbose) cout << ">> waiting for download to finish\n";
            err = clWaitForEvents(1, &evDownload.CLEvent());
            CL_CHECK_ERROR(err);

            if (verbose) cout << ">> finish!";
            if (verbose) cout << endl;

            // Get timings
            err = clFlush(queue);
            CL_CHECK_ERROR(err);
            evDownload.FillTimingInfo();
            if (verbose) evDownload.Print(cerr);

            double t = evDownload.SubmitEndRuntime() / 1.e6; // in ms
            //times[sizeIndex] = t;

            // Add timings to database
            double speed = (double(sizes[sizeIndex] * 1024.) /  (1000.*1000.)) / t;
            char sizeStr[256];
            sprintf(sizeStr, "% 7dkB", sizes[sizeIndex]);
            resultDB.AddResult("DownloadSpeed", sizeStr, "GB/sec", speed);

            // Add timings to database
            double delay = evDownload.SubmitStartDelay() / 1.e6;
            resultDB.AddResult("DownloadDelay", sizeStr, "ms", delay);
            resultDB.AddResult("DownloadTime", sizeStr, "ms", t);
        }
	//resultDB.AddResult("DownloadLatencyEstimate", "1-2kb", "ms", times[0]-(times[1]-times[0])/1.);
	//resultDB.AddResult("DownloadLatencyEstimate", "1-4kb", "ms", times[0]-(times[2]-times[0])/3.);
	//resultDB.AddResult("DownloadLatencyEstimate", "2-4kb", "ms", times[1]-(times[2]-times[1])/1.);
    }

    // Cleanup
    err = clReleaseMemObject(mem1);
    CL_CHECK_ERROR(err);
    if (pinned)
    {
        err = clReleaseMemObject(hostMemObj);
        CL_CHECK_ERROR(err);
    }
    else
    {
        delete[] hostMem;
    }
}
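
One detail worth noting about the pinned path in Example #15: the pointer obtained from clEnqueueMapBuffer is never unmapped before hostMemObj is released. A more defensive cleanup could look like the following sketch (a suggested variant, not part of the original listing):

    if (pinned)
    {
        // Unmap the host pointer before releasing the pinned buffer.
        err = clEnqueueUnmapMemObject(queue, hostMemObj, hostMem, 0, NULL, NULL);
        CL_CHECK_ERROR(err);
        err = clFinish(queue);
        CL_CHECK_ERROR(err);
        err = clReleaseMemObject(hostMemObj);
        CL_CHECK_ERROR(err);
    }
    else
    {
        delete[] hostMem;
    }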