Exemplo n.º 1
0
void RunTest(cl_device_id dev, cl_context ctx, cl_command_queue queue,
             ResultDatabase &resultDB, OptionParser &op, string compileFlags,
             int nRows=0)
{
    // Determine if the device is capable of using images in general
    cl_device_id device_id;
    cl_bool deviceSupportsImages;
    int err = 0;
    err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(device_id),
                                &device_id, NULL);
    CL_CHECK_ERROR(err);

    err = clGetDeviceInfo(device_id, CL_DEVICE_IMAGE_SUPPORT,
            sizeof(deviceSupportsImages), &deviceSupportsImages, NULL);
    CL_CHECK_ERROR(err);

    size_t maxImgWidth = 0;
    err = clGetDeviceInfo(device_id, CL_DEVICE_IMAGE2D_MAX_WIDTH,
            sizeof(size_t), &maxImgWidth, NULL);
    CL_CHECK_ERROR(err);

    // Make sure our sampler type is supported
    cl_sampler sampler;
    sampler = clCreateSampler(ctx, CL_FALSE, CL_ADDRESS_NONE,
            CL_FILTER_NEAREST, &err);
    if (err != CL_SUCCESS)
    {
        cout << "Warning: Device does not support required sampler type";
        cout << " falling back to global memory\n";
        deviceSupportsImages = false;
    } else
    {
        clReleaseSampler(sampler);
    }




    // Host data structures
    // array of values in the sparse matrix
    floatType *h_val, *h_valPad;
    // array of column indices for each value in h_val
    int *h_cols, *h_colsPad;
    // array of indices to the start of each row in h_val/valPad
    int *h_rowDelimiters, *h_rowDelimitersPad;
    // Dense vector of values
    floatType *h_vec;
    // Output vector
    floatType *h_out;
    // Reference solution computed by cpu
    floatType *refOut;

    int nItems;            // number of non-zero elements in the matrix
    int nItemsPadded;
    int numRows;           // number of rows in the matrix

    // This benchmark either reads in a matrix market input file or
    // generates a random matrix
    string inFileName = op.getOptionString("mm_filename");
    if (inFileName == "random")
    {
        // If we're not opening a file, the dimension of the matrix
        // has been passed in as an argument
        numRows = nRows;
        nItems = numRows * numRows / 100; // 1% of entries will be non-zero
        float maxval = op.getOptionFloat("maxval");
        h_val = new floatType[nItems];
        h_cols = new int[nItems];
        h_rowDelimiters = new int[nRows+1];
        fill(h_val, nItems, maxval);
        initRandomMatrix(h_cols, h_rowDelimiters, nItems, numRows);
    }
    else
    {   char filename[FIELD_LENGTH];
        strcpy(filename, inFileName.c_str());
        readMatrix(filename, &h_val, &h_cols, &h_rowDelimiters,
                &nItems, &numRows);
    }

    // Final Image Check -- Make sure the image format is supported.
    int imgHeight = (numRows+maxImgWidth-1)/maxImgWidth;
    cl_image_format fmt;
    fmt.image_channel_data_type = CL_FLOAT;
    if(sizeof(floatType)==4)
    {
        fmt.image_channel_order=CL_R;
    }
    else
    {
        fmt.image_channel_order=CL_RG;
    }
    cl_mem d_vec = clCreateImage2D(ctx, CL_MEM_READ_ONLY, &fmt, maxImgWidth,
            imgHeight, 0, NULL, &err);
    if (err != CL_SUCCESS)
    {
        deviceSupportsImages = false;
    } else {
        clReleaseMemObject(d_vec);
    }

    // Set up remaining host data
    h_vec = new floatType[numRows];
    refOut = new floatType[numRows];
    h_rowDelimitersPad = new int[numRows+1];
    fill(h_vec, numRows, op.getOptionFloat("maxval"));

    // Set up the padded data structures
    int paddedSize = numRows + (PAD_FACTOR - numRows % PAD_FACTOR);
    h_out = new floatType[paddedSize];
    convertToPadded(h_val, h_cols, numRows, h_rowDelimiters, &h_valPad,
            &h_colsPad, h_rowDelimitersPad, &nItemsPadded);

    // Compute reference solution
    spmvCpu(h_val, h_cols, h_rowDelimiters, h_vec, numRows, refOut);

    // Dispatch based on whether or not device supports OpenCL images
    if (deviceSupportsImages)
    {
        cout << "CSR Test\n";
        csrTest<floatType, clFloatType, true>
            (dev, ctx, compileFlags, queue, resultDB, op, h_val, h_cols,
             h_rowDelimiters, h_vec, h_out, numRows, nItems, refOut,
             false, maxImgWidth);

        // Test CSR kernels on padded data
        cout << "CSR Test -- Padded Data\n";
        csrTest<floatType,clFloatType, true>
            (dev, ctx, compileFlags, queue, resultDB, op, h_valPad, h_colsPad,
             h_rowDelimitersPad, h_vec, h_out, numRows, nItemsPadded, refOut,
             true, maxImgWidth);

        // Test ELLPACKR kernel
        cout << "ELLPACKR Test\n";
        ellPackTest<floatType, clFloatType, true>
            (dev, ctx, compileFlags, queue, resultDB, op, h_val, h_cols,
             h_rowDelimiters, h_vec, h_out, numRows, nItems,
             refOut, false, paddedSize, maxImgWidth);
    } else {
        cout << "CSR Test\n";
        csrTest<floatType, clFloatType, false>
            (dev, ctx, compileFlags, queue, resultDB, op, h_val, h_cols,
             h_rowDelimiters, h_vec, h_out, numRows, nItems, refOut,
             false, 0);

        // Test CSR kernels on padded data
        cout << "CSR Test -- Padded Data\n";
        csrTest<floatType,clFloatType, false>
            (dev, ctx, compileFlags, queue, resultDB, op, h_valPad, h_colsPad,
             h_rowDelimitersPad, h_vec, h_out, numRows, nItemsPadded, refOut,
             true, 0);

        // Test ELLPACKR kernel
        cout << "ELLPACKR Test\n";
        ellPackTest<floatType, clFloatType, false>
            (dev, ctx, compileFlags, queue, resultDB, op, h_val, h_cols,
             h_rowDelimiters, h_vec, h_out, numRows, nItems,
             refOut, false, paddedSize, 0);
    }

    delete[] h_val;
    delete[] h_cols;
    delete[] h_rowDelimiters;
    delete[] h_vec;
    delete[] h_out;
    delete[] h_valPad;
    delete[] h_colsPad;
    delete[] h_rowDelimitersPad;
}
Exemplo n.º 2
0
void
DoTest( const char* timerDesc, ResultDatabase& resultDB, OptionParser& opts )
{
    StencilFactory<T>* stdStencilFactory = NULL;
    Stencil<T>* stdStencil = NULL;
    StencilFactory<T>* testStencilFactory = NULL;
    Stencil<T>* testStencil = NULL;

    try
    {
#if defined(PARALLEL)
        stdStencilFactory = new MPIHostStencilFactory<T>;
        testStencilFactory = new MPICUDAStencilFactory<T>;
#else
        stdStencilFactory = new HostStencilFactory<T>;
        testStencilFactory = new CUDAStencilFactory<T>;
#endif // defined(PARALLEL)
        assert( (stdStencilFactory != NULL) && (testStencilFactory != NULL) );

        // do a sanity check on option values
        CheckOptions( opts );
        stdStencilFactory->CheckOptions( opts );
        testStencilFactory->CheckOptions( opts );

        // extract and validate options
        std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" );
        if( arrayDims.size() != 2 )
        {
            cerr << "Dim size: " << arrayDims.size() << "\n";
            throw InvalidArgValue( "all overall dimensions must be positive" );
        }
        if (arrayDims[0] == 0) // User has not specified a custom size
        {
            int sizeClass = opts.getOptionInt("size");
            arrayDims = StencilFactory<T>::GetStandardProblemSize( sizeClass );
        }
        long int seed = (long)opts.getOptionInt( "seed" );
        bool beVerbose = opts.getOptionBool( "verbose" );
        unsigned int nIters = (unsigned int)opts.getOptionInt( "num-iters" );
        double valErrThreshold = (double)opts.getOptionFloat( "val-threshold" );
        unsigned int nValErrsToPrint = (unsigned int)opts.getOptionInt( "val-print-limit" );

#if defined(PARALLEL)
        unsigned int haloWidth = (unsigned int)opts.getOptionInt( "iters-per-exchange" );
#else
        unsigned int haloWidth = 1;
#endif // defined(PARALLEL)

        float haloVal = (float)opts.getOptionFloat( "haloVal" );

        // build a description of this experiment
        std::vector<long long> lDims = opts.getOptionVecInt( "lsize" );
        assert( lDims.size() == 2 );
        std::ostringstream experimentDescriptionStr;
        experimentDescriptionStr
            << nIters << ':'
            << arrayDims[0] << 'x' << arrayDims[1] << ':'
            << lDims[0] << 'x' << lDims[1];

        unsigned int nPasses = (unsigned int)opts.getOptionInt( "passes" );
        unsigned int nWarmupPasses = (unsigned int)opts.getOptionInt( "warmupPasses" );


        // compute the expected result on the host
        // or read it from a pre-existing file
        std::string matrixFilenameBase = (std::string)opts.getOptionString( "expMatrixFile" );
#if defined(PARALLEL)
        int cwrank;
        MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        if( !matrixFilenameBase.empty() )
        {
            std::cout << "\nReading expected stencil operation result from file for later comparison with CUDA output\n"
                << std::endl;
        }
        else
        {
            std::cout << "\nPerforming stencil operation on host for later comparison with CUDA output\n"
                << "Depending on host capabilities, this may take a while."
                << std::endl;
        }
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)
        Matrix2D<T> expected( arrayDims[0] + 2*haloWidth,
                            arrayDims[1] + 2*haloWidth );
        Initialize<T> init( seed, haloWidth, haloVal );

        bool haveExpectedData = false;
        if( ! matrixFilenameBase.empty() )
        {
            bool readOK = ReadMatrixFromFile( expected, GetMatrixFileName<T>( matrixFilenameBase ) );
            if( readOK )
            {

                if( (expected.GetNumRows() != arrayDims[0] + 2*haloWidth) ||
                    (expected.GetNumColumns() != arrayDims[1] + 2*haloWidth) )
                {
                    std::cerr << "The matrix read from file \'"
                        << GetMatrixFileName<T>( matrixFilenameBase )
                        << "\' does not match the matrix size specified on the command line.\n";
                    expected.Reset( arrayDims[0] + 2*haloWidth, arrayDims[1] + 2*haloWidth );
                }
                else
                {
                    haveExpectedData = true;
                }
            }

            if( !haveExpectedData )
            {
                std::cout << "\nSince we could not read the expected matrix values,\nperforming stencil operation on host for later comparison with CUDA output.\n"
                    << "Depending on host capabilities, this may take a while."
                    << std::endl;
            }
        }
        if( !haveExpectedData )
        {
            init( expected );
            haveExpectedData = true;
            if( beVerbose )
            {
                std::cout << "initial state:\n" << expected << std::endl;
            }
            stdStencil = stdStencilFactory->BuildStencil( opts );
            (*stdStencil)( expected, nIters );
        }
        if( beVerbose )
        {
            std::cout << "expected result:\n" << expected << std::endl;
        }

        // determine whether we are to save the expected matrix values to a file
        // to speed up future runs
        matrixFilenameBase = (std::string)opts.getOptionString( "saveExpMatrixFile" );
        if( !matrixFilenameBase.empty() )
        {
            SaveMatrixToFile( expected, GetMatrixFileName<T>( matrixFilenameBase ) );
        }
        assert( haveExpectedData );


        // compute the result on the CUDA device
        Matrix2D<T> data( arrayDims[0] + 2*haloWidth,
                            arrayDims[1] + 2*haloWidth );
        Stencil<T>* testStencil = testStencilFactory->BuildStencil( opts );

        // Compute the number of floating point operations we will perform.
        //
        // Note: in the truly-parallel case, we count flops for redundant
        // work due to the need for a halo.
        // But we do not add to the count for the local 1-wide halo since
        // we aren't computing new values for those items.
        unsigned long npts = (arrayDims[0] + 2*haloWidth - 2) *
                            (arrayDims[1] + 2*haloWidth - 2);
#if defined(PARALLEL)
        MPICUDAStencil<T>* mpiTestStencil = static_cast<MPICUDAStencil<T>*>( testStencil );
        assert( mpiTestStencil != NULL );
        int participating = mpiTestStencil->ParticipatingInProgram() ? 1 : 0;
        int numParticipating = 0;
        MPI_Allreduce( &participating,      // src
                        &numParticipating,  // dest
                        1,                  // count
                        MPI_INT,            // type
                        MPI_SUM,            // op
                        MPI_COMM_WORLD );   // communicator
        npts *= numParticipating;
#endif // defined(PARALLEL)

        // In our 9-point stencil, there are 11 floating point operations
        // per point (3 multiplies and 11 adds):
        //
        // newval = weight_center * centerval +
        //      weight_cardinal * (northval + southval + eastval + westval) +
        //      weight_diagnoal * (neval + nwval + seval + swval)
        //
        // we do this stencil operation 'nIters' times
        unsigned long nflops = npts * 11 * nIters;

#if defined(PARALLEL)
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        std::cout << "Performing " << nWarmupPasses << " warmup passes...";
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)

        for( unsigned int pass = 0; pass < nWarmupPasses; pass++ )
        {
            init(data);
            (*testStencil)( data, nIters );
        }
#if defined(PARALLEL)
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        std::cout << "done." << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)

#if defined(PARALLEL)
        MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        std::cout << "\nPerforming stencil operation on chosen device, "
            << nPasses << " passes.\n"
            << "Depending on chosen device, this may take a while."
            << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)

#if !defined(PARALLEL)
        std::cout << "At the end of each pass the number of validation\nerrors observed will be printed to the standard output."
            << std::endl;
#endif // !defined(PARALLEL)


        for( unsigned int pass = 0; pass < nPasses; pass++ )
        {
#if !defined(PARALLEL)
            std::cout << "pass " << pass << ": ";
#endif // !defined(PARALLEL)
            init( data );

            int timerHandle = Timer::Start();
            (*testStencil)( data, nIters );
            double elapsedTime = Timer::Stop( timerHandle, "CUDA stencil" );


            // find and report the computation rate
            double gflops = (nflops / elapsedTime) / 1e9;

            resultDB.AddResult( timerDesc,
                                    experimentDescriptionStr.str(),
                                    "GFLOPS",
                                    gflops );
            if( beVerbose )
            {
                std::cout << "observed result, pass " << pass << ":\n"
                    << data
                    << std::endl;
            }

            // validate the result
#if defined(PARALLEL)
            StencilValidater<T>* validater = new MPIStencilValidater<T>;
#else
            StencilValidater<T>* validater = new SerialStencilValidater<T>;
#endif // defined(PARALLEL)
            validater->ValidateResult( expected,
                            data,
                            valErrThreshold,
                            nValErrsToPrint );
        }
    }
    catch( ... )
    {
        // clean up - abnormal termination
        // wish we didn't have to do this, but C++ exceptions do not
        // support a try-catch-finally approach
        delete stdStencil;
        delete stdStencilFactory;
        delete testStencil;
        delete testStencilFactory;
        throw;
    }

    // clean up - normal termination
    delete stdStencil;
    delete stdStencilFactory;
    delete testStencil;
    delete testStencilFactory;
}