Example #1
int main(int argc, char **argv)
{

  OptionParser op;
  op.addOption("verbose", OPT_BOOL, "", "enable verbose output", 'v');
  op.addOption("passes", OPT_INT, "10", "specify number of passes", 'n');
  op.addOption("size", OPT_INT, "1", "specify problem size", 's');
  op.addOption("target", OPT_INT, "0", "specify MIC target device number", 't');
  
  // If benchmark has any specific options, add those
  addBenchmarkSpecOptions(op);
  
  if (!op.parse(argc, argv))
  {
     op.usage();
     return -1;
  }

  ResultDatabase resultDB;
  // Run the test
  RunBenchmark(op, resultDB);

  // Print out results to stdout
  resultDB.DumpDetailed(cout);

  return 0;
}
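
The addBenchmarkSpecOptions hook called in main is supplied by each individual benchmark. A minimal sketch of such a hook, assuming the same OptionParser::addOption signature used above (the "iterations" option mirrors the one consumed in Example #11; real benchmarks register their own options, or none):

// Hedged sketch: one benchmark's extra options. The short letter 'i'
// and the default value are illustrative assumptions.
void addBenchmarkSpecOptions(OptionParser &op)
{
    op.addOption("iterations", OPT_INT, "256", "specify scan iterations per pass", 'i');
}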
Example #2
void
RunBenchmark(cl::Device& devcpp,
             cl::Context& ctxcpp,
             cl::CommandQueue& queuecpp,
             ResultDatabase &resultDB,
             OptionParser &op)
{
    // Convert from C++ bindings to C bindings
    // TODO propagate use of C++ bindings
    cl_device_id dev = devcpp();
    cl_context ctx = ctxcpp();
    cl_command_queue queue = queuecpp();

    // Collect basic MPI information
    int size, rank;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // Always run single precision test
    // OpenCL doesn't support templated kernels, so we have to use macros
    string spMacros = "-DSINGLE_PRECISION";
    runTest<float>
        ("TPScan-SP", dev, ctx, queue, resultDB, op, spMacros);

    // If double precision is supported, run the DP test
    if (checkExtension(dev, "cl_khr_fp64"))
    {
        cout << "DP Supported\n";
        string dpMacros = "-DK_DOUBLE_PRECISION ";
        runTest<double>
        ("TPScan-DP", dev, ctx, queue, resultDB, op, dpMacros);
    }
    else if (checkExtension(dev, "cl_amd_fp64"))
    {
        cout << "DP Supported\n";
        string dpMacros = "-DAMD_DOUBLE_PRECISION ";
        runTest<double>
        ("TPScan-DP", dev, ctx, queue, resultDB, op, dpMacros);
    }
    else
    {
        char atts[1024] = "DP_Not_Supported";
        cout << "Warning, rank " << rank << "'s device does not support DP\n";
        // ResultDB requires every rank to report something. If this rank
        // doesn't support DP, submit FLT_MAX (this is handled as no result by
        // ResultDB).
        int passes = op.getOptionInt("passes");
        for (int k = 0; k < passes; k++)
        {
            resultDB.AddResult("TPScan-DP-Kernel" , atts, "GB/s", FLT_MAX);
            resultDB.AddResult("TPScan-DP-Kernel+PCIe" , atts, "GB/s",
                FLT_MAX);
            resultDB.AddResult("TPScan-DP-MPI_ExScan" , atts, "GB/s",
                FLT_MAX);
            resultDB.AddResult("TPScan-DP-Overall" , atts, "GB/s", FLT_MAX);
        }
    }
}
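
checkExtension gates the double-precision path here and in several later examples. A minimal sketch of such a helper, assuming it simply searches the device's extension string (SHOC's actual implementation may differ):

// Hedged sketch: report whether an OpenCL device advertises an extension.
// Queries CL_DEVICE_EXTENSIONS and substring-searches the space-separated
// list; a production version should match whole tokens.
static bool checkExtension(cl_device_id dev, const std::string& ext)
{
    size_t sz = 0;
    clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, 0, NULL, &sz);
    std::string exts(sz, '\0');
    clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, sz, &exts[0], NULL);
    return exts.find(ext) != std::string::npos;
}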
Example #3
void
RunBenchmark(cl::Device& devcpp,
                  cl::Context& ctxcpp,
                  cl::CommandQueue& queuecpp,
                  ResultDatabase &resultDB,
                  OptionParser &op)
{
    // convert from C++ bindings to C bindings
    // TODO propagate use of C++ bindings
    cl_device_id dev = devcpp();
    cl_context ctx = ctxcpp();
    cl_command_queue queue = queuecpp();

    // Always run single precision test
    // OpenCL doesn't support templated kernels, so we have to use macros
    runTest<float>("SGEMM", dev, ctx, queue, resultDB, op,
            "-DSINGLE_PRECISION");

    // If double precision is supported, run the DP test
    if (checkExtension(dev, "cl_khr_fp64"))
    {
        cout << "DP Supported\n";
        runTest<double>("DGEMM", dev, ctx, queue, resultDB, op,
                "-DK_DOUBLE_PRECISION ");
    }
    else if (checkExtension(dev, "cl_amd_fp64"))
    {
        cout << "DP Supported\n";
        runTest<double>("DGEMM", dev, ctx, queue, resultDB, op,
                "-DAMD_DOUBLE_PRECISION ");
    }
    else
    {
        cout << "DP Not Supported\n";
        char atts[1024] = "DP_Not_Supported";
        // resultDB requires neg entry for every possible result
        int passes = op.getOptionInt("passes");
        for (; passes > 0; --passes) {
            for (int i = 0; i < 2; i++) {
                const char transb = i ? 'T' : 'N';
                string testName="DGEMM";
                resultDB.AddResult(testName+"-"+transb, atts, "GFlops", FLT_MAX);
                resultDB.AddResult(testName+"-"+transb+"_PCIe", atts, "GFlops", FLT_MAX);
                resultDB.AddResult(testName+"-"+transb+"_Parity", atts, "N", FLT_MAX);
            }
        }
    }
}
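
The -D macros handed to the OpenCL compiler above do the job of C++ templates on the device side. A plausible kernel-source preamble driven by those macros (an assumption; the actual SHOC kernel headers may differ):

// Hedged sketch: selecting the kernel element type from the build macros.
#ifdef SINGLE_PRECISION
typedef float real;
#elif defined(K_DOUBLE_PRECISION)
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
typedef double real;
#elif defined(AMD_DOUBLE_PRECISION)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
typedef double real;
#endif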
Example #4
void
RunBenchmark(cl::Device& devcpp, cl::Context& ctxcpp,
             cl::CommandQueue& queuecpp,
             ResultDatabase &resultDB, OptionParser &op)
{
    // convert from C++ bindings to C bindings
    // TODO propagate use of C++ bindings
    cl_device_id dev = devcpp();
    cl_context ctx = ctxcpp();
    cl_command_queue queue = queuecpp();

    // Always run single precision test
    // OpenCL doesn't support templated kernels, so we have to use macros
    string spMacros = "-DSINGLE_PRECISION ";
    RunTest<float>("S3D-SP", dev, ctx, queue, resultDB, op, spMacros);

    // If double precision is supported, run the DP test
    if (checkExtension(dev, "cl_khr_fp64"))
    {
        cout << "DP Supported\n";
        string dpMacros = "-DK_DOUBLE_PRECISION ";
        RunTest<double>
        ("S3D-DP", dev, ctx, queue, resultDB, op, dpMacros);
    }
    else if (checkExtension(dev, "cl_amd_fp64"))
    {
        cout << "DP Supported\n";
        string dpMacros = "-DAMD_DOUBLE_PRECISION ";
        RunTest<double>
        ("S3D-DP", dev, ctx, queue, resultDB, op, dpMacros);
    }
    else
    {
        cout << "DP Not Supported\n";
        char atts[1024] = "DP_Not_Supported";
        // resultDB requires neg entry for every possible result
        int passes = op.getOptionInt("passes");
        for (int k = 0; k < passes; k++) {
            resultDB.AddResult("S3D-DP" , atts, "GB/s", FLT_MAX);
            resultDB.AddResult("S3D-DP_PCIe" , atts, "GB/s", FLT_MAX);
            resultDB.AddResult("S3D-DP_Parity" , atts, "GB/s", FLT_MAX);
        }
    }
}
Example #5
void
RunBenchmark(cl_device_id dev,
                  cl_context ctx,
                  cl_command_queue queue,
                  ResultDatabase &resultDB,
                  OptionParser &op)
{
    // Always run single precision test
    // OpenCL doesn't support templated kernels, so we have to use macros
    string spMacros = "-DSINGLE_PRECISION";
    runTest<float, float4, float4>
        ("MD-LJ", dev, ctx, queue, resultDB, op, spMacros);

    // If double precision is supported, run the DP test
    if (checkExtension(dev, "cl_khr_fp64"))
    {
        cout << "DP Supported\n";
        string dpMacros = "-DK_DOUBLE_PRECISION ";
        runTest<double, double4, double4>
                ("MD-LJ-DP", dev, ctx, queue, resultDB, op, dpMacros);
    }
    else if (checkExtension(dev, "cl_amd_fp64"))
    {
        cout << "DP Supported\n";
        string dpMacros = "-DAMD_DOUBLE_PRECISION ";
        runTest<double, double4, double4>
        ("MD-LJ-DP", dev, ctx, queue, resultDB, op, dpMacros);
    }
    else
    {
        cout << "DP Not Supported\n";
        char atts[32] = "DP_Not_Supported";
        // resultDB requires neg entry for every possible result
        int passes = op.getOptionInt("passes");
        for (int i = 0; i < passes; i++) {
            resultDB.AddResult("MD-LJ-DP" , atts, "GB/s", FLT_MAX);
            resultDB.AddResult("MD-LJ-DP_PCIe" , atts, "GB/s", FLT_MAX);
            resultDB.AddResult("MD-LJ-DP-Bandwidth", atts, "GB/s", FLT_MAX);
            resultDB.AddResult("MD-LJ-DP-Bandwidth_PCIe", atts, "GB/s", FLT_MAX);
            resultDB.AddResult("MD-LJ-DP_Parity" , atts, "GB/s", FLT_MAX);
        }
    }
}
Example #6
void
RunBenchmark(OptionParser& opts, ResultDatabase& resultDB )
{
#if defined(PARALLEL)
    int cwrank;
    MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
#endif // defined(PARALLEL)
#if defined(PARALLEL)
    if( cwrank == 0 )
    {
#endif // defined(PARALLEL)
        std::cout << "Running single precision test" << std::endl;
#if defined(PARALLEL)
    }
#endif // defined(PARALLEL)
    //omp_set_num_threads(124);
    DoTest<float>( "SP_Sten2D", resultDB, opts );

    // check if we can run double precision tests
    if( //((deviceProps.major == 1) && (deviceProps.minor >= 3)) ||
        //(deviceProps.major >= 2)
        1 ) // capability check disabled here; always attempt the DP test
    {
#if defined(PARALLEL)
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
            std::cout << "DP supported\n" << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)
	//omp_set_num_threads(93);
        DoTest<double>( "DP_Sten2D", resultDB, opts );
    }
    else
    {
#if defined(PARALLEL)
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
            std::cout << "Double precision not supported - skipping" << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)
        // resultDB requires neg entry for every possible result
        int nPasses = (int)opts.getOptionInt( "passes" );
        for( int p = 0; p < nPasses; p++ )
        {
            resultDB.AddResult( (const char*)"DP_Sten2D", "N/A", "s", FLT_MAX);
        }
    }
	
}
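
The #if defined(PARALLEL) guards above repeat a print-only-on-rank-0 idiom several times (again in Example #12). A hypothetical helper (not part of SHOC) that would condense the pattern:

// Hypothetical wrapper: print once, from rank 0 in PARALLEL builds,
// unconditionally otherwise.
static void rootMsg(const std::string& msg)
{
#if defined(PARALLEL)
    int cwrank;
    MPI_Comm_rank(MPI_COMM_WORLD, &cwrank);
    if (cwrank != 0)
        return;
#endif
    std::cout << msg << std::endl;
}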
Example #7
File: FFT.cpp Project: vetter/shoc
void
RunBenchmark(ResultDatabase &resultDB, OptionParser &op)
{
    // Test to see if this device supports double precision
    cudaGetDevice(&fftDevice);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, fftDevice);
    bool has_dp = (deviceProp.major == 1 && deviceProp.minor >= 3) ||
        (deviceProp.major >= 2);

    cout << "Running single precision test" << endl;
    runTest<float2>("SP-FFT", resultDB, op);
    if (has_dp) {
        cout << "Running double precision test" << endl;
        runTest<double2>("DP-FFT", resultDB, op);
    }
    else
    {
        cout << "Skipping double precision test" << endl;
        char atts[32] = "DP_Not_Supported";
        // resultDB requires neg entry for every possible result
        int passes = op.getOptionInt("passes");
        for (int k=0; k<passes; k++)
        {
            resultDB.AddResult("DP-FFT" , atts, "GB/s", FLT_MAX);
            resultDB.AddResult("DP-FFT_PCIe" , atts, "GB/s", FLT_MAX);
            resultDB.AddResult("DP-FFT_Parity" , atts, "GB/s", FLT_MAX);
            resultDB.AddResult("DP-FFT-INV" , atts, "GB/s", FLT_MAX);
            resultDB.AddResult("DP-FFT-INV_PCIe" , atts, "GB/s", FLT_MAX);
            resultDB.AddResult("DP-FFT-INV_Parity" , atts, "GB/s", FLT_MAX);
        }
    }
}
Example #8
static void
fillResultDB(const string& name, const string& reason, OptionParser &op, 
             ResultDatabase& resultDB)
{
    // resultDB requires neg entry for every possible result
    int passes = op.getOptionInt("passes");
    for (int k=0; k<passes; k++) {
        resultDB.AddResult(name , reason, "GB/s", FLT_MAX);
        resultDB.AddResult(name+"_PCIe" , reason, "GB/s", FLT_MAX);
        resultDB.AddResult(name+"_Parity" , reason, "GB/s", FLT_MAX);
        resultDB.AddResult(name+"-INV" , reason, "GB/s", FLT_MAX);
        resultDB.AddResult(name+"-INV_PCIe" , reason, "GB/s", FLT_MAX);
        resultDB.AddResult(name+"-INV_Parity" , reason, "GB/s", FLT_MAX);
    }
}
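
fillResultDB factors out the FLT_MAX placeholder loop that Example #7 writes inline; its six entries match Example #7's unsupported-DP branch exactly. A call-site sketch (hypothetical; Example #7 as shown does not use the helper):

// Hedged sketch: the unsupported-DP branch of Example #7, rewritten
// to use fillResultDB.
if (!has_dp)
{
    cout << "Skipping double precision test" << endl;
    fillResultDB("DP-FFT", "DP_Not_Supported", op, resultDB);
}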
Example #9
template <class T>
void RunTest(const string& testName, cl_device_id dev, cl_context ctx,
             cl_command_queue queue, ResultDatabase &resultDB,
             OptionParser &op, string& compileFlags)
{
    int n_species = 22;
    int i, j, err;

    int probSizes_SP[4] = { 24, 32, 40, 48};
    int probSizes_DP[4] = { 16, 24, 32, 40};
    int *probSizes = (sizeof(T) == sizeof(double)) ? probSizes_DP : probSizes_SP;
    int size = probSizes[op.getOptionInt("size")-1];

    // The number of grid points
    int n = size * size * size;

    // For now these conversion factors are just 1
    T pconv = 1.0;    // 1418365.88544;
    T tconv = 1.0;    //120.0;
    T rateconv = 1.0; //11.0393507649917;

    // Host copies of data
    T* h_t     = new T[n];
    T* h_p     = new T[n];
    T* h_y     = new T[n*n_species];
    T* h_wdot  = new T[n*n_species];
    T* h_molwt = new T[n_species];

    // Device data
    cl_mem d_t;    // Temperatures array
    cl_mem d_p;    // Pressures array
    cl_mem d_y;    // Input variables
    cl_mem d_wdot; // Output variables

    // intermediate variables
    cl_mem d_rf, d_rb, d_rklow, d_c, d_a, d_eg, d_molwt;

    // Initialize host memory
    for (i=0; i<n; i++)
    {
        h_p[i] = 1.0132e6;
        h_t[i] = 1000.0;
    }

    for (j=0; j<22; j++)
    {
        for (i=0; i<n; i++)
        {
            h_y[(j*n)+i]= 0.0;
            if (j==14)
                h_y[(j*n)+i] = 0.064;
            if (j==3)
                h_y[(j*n)+i] = 0.218;
            if (j==21)
                h_y[(j*n)+i] = 0.718;
        }
    }

    for (int i=0; i<n_species; i++)
    {
        h_molwt[i] = 1.0f;
    }
//    // Initialize molecular weights
//    h_molwt[0]= 2.01594E-03;
//    h_molwt[1]= 1.00797E-03;
//    h_molwt[2]= 1.59994E-02;
//    h_molwt[3]= 3.19988E-02;
//    h_molwt[4]= 1.700737E-02;
//    h_molwt[5]= 1.801534E-02;
//    h_molwt[6]= 3.300677E-02;
//    h_molwt[7]= 3.401473999999999E-02;
//    h_molwt[8]= 1.503506E-02;
//    h_molwt[9] = 1.604303E-02;
//    h_molwt[10] = 2.801055E-02;
//    h_molwt[11] = 4.400995E-02;
//    h_molwt[12] = 3.002649E-02;
//    h_molwt[13] = 2.603824E-02;
//    h_molwt[14] = 2.805418E-02;
//    h_molwt[15] = 3.007012E-02;
//    h_molwt[16] = 4.102967E-02;
//    h_molwt[17] = 4.203764E-02;
//    h_molwt[18] = 4.405358E-02;
//    h_molwt[19] = 4.10733E-02;
//    h_molwt[20] = 4.208127E-02;
//    h_molwt[21] = 2.80134E-02;

    // Allocate device memory
    size_t base = n * sizeof(T);
    clMalloc(d_t, base);
    clMalloc(d_p, base);
    clMalloc(d_y, n_species*base);
    clMalloc(d_wdot, n_species*base);
    clMalloc(d_rf, 206*base);
    clMalloc(d_rb, 206*base);
    clMalloc(d_rklow, 21*base);
    clMalloc(d_c, C_SIZE*base);
    clMalloc(d_a, A_SIZE*base);
    clMalloc(d_eg, EG_SIZE*base);
    clMalloc(d_molwt, n_species*sizeof(T));

    // Copy over input params
    long inputTransferTime = 0;
    Event evTransfer("PCIe Transfer");

    clMemtoDevice(d_t, h_t, base);
    evTransfer.FillTimingInfo();
    inputTransferTime += evTransfer.StartEndRuntime();

    clMemtoDevice(d_p, h_p, base);
    evTransfer.FillTimingInfo();
    inputTransferTime += evTransfer.StartEndRuntime();

    clMemtoDevice(d_y, h_y, n_species*base);
    evTransfer.FillTimingInfo();
    inputTransferTime += evTransfer.StartEndRuntime();

    clMemtoDevice(d_molwt, h_molwt, n_species*sizeof(T));
    evTransfer.FillTimingInfo();
    inputTransferTime += evTransfer.StartEndRuntime();

    // Set up macros
    compileFlags += "-DDIM="  + toString(size) + " " +
                    "-DN_GP=" + toString(n)    + " ";

    unsigned int passes = op.getOptionInt("passes");
    for (unsigned int i = 0; i < passes; i++)
    {
        size_t globalWorkSize = n;
        size_t localWorkSize = 128;

        // -------------------- phase 1 -----------------

        // Setup Program Objects (phase 1)
        clProg(gr_prog, cl_source_gr_base);
        clProg(rdsmh_prog, cl_source_rdsmh);
        clProg(ratt_prog, cl_source_ratt);
        clProg(ratt2_prog, cl_source_ratt2);
        clProg(ratt3_prog, cl_source_ratt3);
        clProg(ratt4_prog, cl_source_ratt4);
        clProg(ratt5_prog, cl_source_ratt5);
        clProg(ratt6_prog, cl_source_ratt6);
        clProg(ratt7_prog, cl_source_ratt7);
        clProg(ratt8_prog, cl_source_ratt8);
        clProg(ratt9_prog, cl_source_ratt9);
        clProg(ratt10_prog, cl_source_ratt10);
        clProg(ratx_prog, cl_source_ratx);
        clProg(ratxb_prog, cl_source_ratxb);
        clProg(ratx2_prog, cl_source_ratx2);
        clProg(ratx4_prog, cl_source_ratx4);

        // Build the kernels (phase 1)
        cout << "Compiling kernels (phase 1)...";
        cout.flush();

        clBuild(gr_prog);
        clBuild(rdsmh_prog);
        clBuild(ratt_prog);
        clBuild(ratt2_prog);
        clBuild(ratt3_prog);
        clBuild(ratt4_prog);
        clBuild(ratt5_prog);
        clBuild(ratt6_prog);
        clBuild(ratt7_prog);
        clBuild(ratt8_prog);
        clBuild(ratt9_prog);
        clBuild(ratt10_prog);
        clBuild(ratx_prog);
        clBuild(ratxb_prog);
        clBuild(ratx2_prog);
        clBuild(ratx4_prog);

        cout << "done." << endl;

        // Extract out kernel objects (phase 1)
        cout << "Generating OpenCL Kernel Objects (phase 1)...";
        cout.flush();

        // GR Base Kernels
        cl_kernel grBase_kernel = clCreateKernel(gr_prog, "gr_base", &err);
        CL_CHECK_ERROR(err);

        // RDSMH Kernels
        cl_kernel rdsmh_kernel = clCreateKernel(rdsmh_prog, "rdsmh_kernel",
                &err);
        CL_CHECK_ERROR(err);

        // RATT Kernels
        cl_kernel ratt_kernel = clCreateKernel(ratt_prog, "ratt_kernel", &err);
        CL_CHECK_ERROR(err);
        cl_kernel ratt2_kernel = clCreateKernel(ratt2_prog, "ratt2_kernel",
                &err);
        CL_CHECK_ERROR(err);
        cl_kernel ratt3_kernel = clCreateKernel(ratt3_prog, "ratt3_kernel",
                &err);
        CL_CHECK_ERROR(err);
        cl_kernel ratt4_kernel = clCreateKernel(ratt4_prog, "ratt4_kernel",
                &err);
        CL_CHECK_ERROR(err);
        cl_kernel ratt5_kernel = clCreateKernel(ratt5_prog, "ratt5_kernel",
                &err);
        CL_CHECK_ERROR(err);
        cl_kernel ratt6_kernel = clCreateKernel(ratt6_prog, "ratt6_kernel",
                &err);
        CL_CHECK_ERROR(err);
        cl_kernel ratt7_kernel = clCreateKernel(ratt7_prog, "ratt7_kernel",
                &err);
        CL_CHECK_ERROR(err);
        cl_kernel ratt8_kernel = clCreateKernel(ratt8_prog, "ratt8_kernel",
                &err);
        CL_CHECK_ERROR(err);
        cl_kernel ratt9_kernel = clCreateKernel(ratt9_prog, "ratt9_kernel",
                &err);
        CL_CHECK_ERROR(err);
        cl_kernel ratt10_kernel = clCreateKernel(ratt10_prog, "ratt10_kernel",
                &err);
        CL_CHECK_ERROR(err);

        // RATX Kernels
        cl_kernel ratx_kernel = clCreateKernel(ratx_prog, "ratx_kernel", &err);
        CL_CHECK_ERROR(err);
        cl_kernel ratxb_kernel = clCreateKernel(ratxb_prog, "ratxb_kernel",
                &err);
        CL_CHECK_ERROR(err);
        cl_kernel ratx2_kernel = clCreateKernel(ratx2_prog, "ratx2_kernel",
                &err);
        CL_CHECK_ERROR(err);
        cl_kernel ratx4_kernel = clCreateKernel(ratx4_prog, "ratx4_kernel",
                &err);
        CL_CHECK_ERROR(err);
        cout << "done." << endl;

        //Set kernel arguments (phase 1)
        err = clSetKernelArg(grBase_kernel, 0, sizeof(cl_mem), (void*)&d_p);
        CL_CHECK_ERROR(err);
        err = clSetKernelArg(grBase_kernel, 1, sizeof(cl_mem), (void*)&d_t);
        CL_CHECK_ERROR(err);
        err = clSetKernelArg(grBase_kernel, 2, sizeof(cl_mem), (void*)&d_y);
        CL_CHECK_ERROR(err);
        err = clSetKernelArg(grBase_kernel, 3, sizeof(cl_mem), (void*)&d_c);
        CL_CHECK_ERROR(err);
        err = clSetKernelArg(grBase_kernel, 4, sizeof(T), (void*)&tconv);
        CL_CHECK_ERROR(err);
        err = clSetKernelArg(grBase_kernel, 5, sizeof(T), (void*)&pconv);
        CL_CHECK_ERROR(err);

        err = clSetKernelArg(rdsmh_kernel, 0, sizeof(cl_mem), (void*)&d_t);
        CL_CHECK_ERROR(err);
        err = clSetKernelArg(rdsmh_kernel, 1, sizeof(cl_mem), (void*)&d_eg);
        CL_CHECK_ERROR(err);
        err = clSetKernelArg(rdsmh_kernel, 2, sizeof(T), (void*)&tconv);
        CL_CHECK_ERROR(err);

        err = clSetKernelArg(ratt_kernel, 0, sizeof(cl_mem), (void*)&d_t);
        CL_CHECK_ERROR(err);
        err = clSetKernelArg(ratt_kernel, 1, sizeof(cl_mem), (void*)&d_rf);
        CL_CHECK_ERROR(err);
        err = clSetKernelArg(ratt_kernel, 2, sizeof(T), (void*)&tconv);
        CL_CHECK_ERROR(err);
        clSetRattArg(ratt2_kernel);
        clSetRattArg(ratt3_kernel);
        clSetRattArg(ratt4_kernel);
        clSetRattArg(ratt5_kernel);
        clSetRattArg(ratt6_kernel);
        clSetRattArg(ratt7_kernel);
        clSetRattArg(ratt8_kernel);
        clSetRattArg(ratt9_kernel);
        err = clSetKernelArg(ratt10_kernel, 0, sizeof(cl_mem), (void*)&d_t);
        CL_CHECK_ERROR(err);
        err = clSetKernelArg(ratt10_kernel, 1, sizeof(cl_mem), (void*)&d_rklow);
        CL_CHECK_ERROR(err);
        err = clSetKernelArg(ratt10_kernel, 2, sizeof(T), (void*)&tconv);
        CL_CHECK_ERROR(err);

        clSetRatxArg(ratx_kernel);
        clSetRatxArg(ratxb_kernel);

        err = clSetKernelArg(ratx2_kernel, 0, sizeof(cl_mem), (void*)&d_c);
        CL_CHECK_ERROR(err);
        err = clSetKernelArg(ratx2_kernel, 1, sizeof(cl_mem), (void*)&d_rf);
        CL_CHECK_ERROR(err);

        err = clSetKernelArg(ratx4_kernel, 0, sizeof(cl_mem), (void*)&d_c);
        CL_CHECK_ERROR(err);
        err = clSetKernelArg(ratx4_kernel, 1, sizeof(cl_mem), (void*)&d_rb);
        CL_CHECK_ERROR(err);

        // Execute kernels (phase 1)
        cout << "Executing kernels (phase 1)...";
        cout.flush();

        Event evFirst_1("first kernel phase 1");
        Event evLast_1("last kernel phase 1");

        clLaunchKernelEv(grBase_kernel, evFirst_1.CLEvent());
        clLaunchKernel(ratt_kernel);
        clLaunchKernel(rdsmh_kernel);

        clLaunchKernel(ratt2_kernel);
        clLaunchKernel(ratt3_kernel);
        clLaunchKernel(ratt4_kernel);
        clLaunchKernel(ratt5_kernel);
        clLaunchKernel(ratt6_kernel);
        clLaunchKernel(ratt7_kernel);
        clLaunchKernel(ratt8_kernel);
        clLaunchKernel(ratt9_kernel);
        clLaunchKernel(ratt10_kernel);

        clLaunchKernel(ratx_kernel);
        clLaunchKernel(ratxb_kernel);
        clLaunchKernel(ratx2_kernel);
        clLaunchKernelEv(ratx4_kernel, evLast_1.CLEvent());

        err = clFinish(queue);
        CL_CHECK_ERROR(err);
        cout << "done. " << endl;

        evFirst_1.FillTimingInfo();
        evLast_1.FillTimingInfo();
        double total_phase1 = evLast_1.EndTime() - evFirst_1.StartTime();

        // Release Kernels (phase 1)
        clReleaseKernel(grBase_kernel);
        clReleaseKernel(rdsmh_kernel);
        clReleaseKernel(ratt_kernel);
        clReleaseKernel(ratt2_kernel);
        clReleaseKernel(ratt3_kernel);
        clReleaseKernel(ratt4_kernel);
        clReleaseKernel(ratt5_kernel);
        clReleaseKernel(ratt6_kernel);
        clReleaseKernel(ratt7_kernel);
        clReleaseKernel(ratt8_kernel);
        clReleaseKernel(ratt9_kernel);
        clReleaseKernel(ratt10_kernel);
        clReleaseKernel(ratx_kernel);
        clReleaseKernel(ratxb_kernel);
        clReleaseKernel(ratx2_kernel);
        clReleaseKernel(ratx4_kernel);

        // Release Programs (phase 1)
        clReleaseProgram(gr_prog);
        clReleaseProgram(rdsmh_prog);
        clReleaseProgram(ratt_prog);
        clReleaseProgram(ratt2_prog);
        clReleaseProgram(ratt3_prog);
        clReleaseProgram(ratt4_prog);
        clReleaseProgram(ratt5_prog);
        clReleaseProgram(ratt6_prog);
        clReleaseProgram(ratt7_prog);
        clReleaseProgram(ratt8_prog);
        clReleaseProgram(ratt9_prog);
        clReleaseProgram(ratt10_prog);
        clReleaseProgram(ratx_prog);
        clReleaseProgram(ratxb_prog);
        clReleaseProgram(ratx2_prog);
        clReleaseProgram(ratx4_prog);

        // -------------------- phase 2 -----------------
        // Setup Program Objects (phase 2)
        clProg(qssa_prog, cl_source_qssa);
        clProg(qssab_prog, cl_source_qssab);
        clProg(qssa2_prog, cl_source_qssa2);
        clProg(rdwdot_prog, cl_source_rdwdot);
        clProg(rdwdot2_prog, cl_source_rdwdot2);
        clProg(rdwdot3_prog, cl_source_rdwdot3);
        clProg(rdwdot6_prog, cl_source_rdwdot6);
        clProg(rdwdot7_prog, cl_source_rdwdot7);
        clProg(rdwdot8_prog, cl_source_rdwdot8);
        clProg(rdwdot9_prog, cl_source_rdwdot9);
        clProg(rdwdot10_prog, cl_source_rdwdot10);

        // Build the kernels (phase 2)
        cout << "Compiling kernels (phase 2)...";
        cout.flush();

        clBuild(qssa_prog);
        clBuild(qssab_prog);
        clBuild(qssa2_prog);
        clBuild(rdwdot_prog);
        clBuild(rdwdot2_prog);
        clBuild(rdwdot3_prog);
        clBuild(rdwdot6_prog);
        clBuild(rdwdot7_prog);
        clBuild(rdwdot8_prog);
        clBuild(rdwdot9_prog);
        clBuild(rdwdot10_prog);

        cout << "done." << endl;

        // Extract out kernel objects (phase 2)
        cout << "Generating OpenCL Kernel Objects (phase 2)...";
        cout.flush();

        // QSSA Kernels
        cl_kernel qssa_kernel = clCreateKernel(qssa_prog, "qssa_kernel", &err);
        CL_CHECK_ERROR(err);
        cl_kernel qssab_kernel = clCreateKernel(qssab_prog, "qssab_kernel",
                &err);
        CL_CHECK_ERROR(err);
        cl_kernel qssa2_kernel = clCreateKernel(qssa2_prog, "qssa2_kernel",
                &err);
        CL_CHECK_ERROR(err);

        // RDWDOT Kernels
        cl_kernel rdwdot_kernel = clCreateKernel(rdwdot_prog, "rdwdot_kernel",
                &err);
        CL_CHECK_ERROR(err);
        cl_kernel rdwdot2_kernel = clCreateKernel(rdwdot2_prog,
                "rdwdot2_kernel", &err);
        CL_CHECK_ERROR(err);
        cl_kernel rdwdot3_kernel = clCreateKernel(rdwdot3_prog,
                "rdwdot3_kernel", &err);
        CL_CHECK_ERROR(err);
        cl_kernel rdwdot6_kernel = clCreateKernel(rdwdot6_prog,
                "rdwdot6_kernel", &err);
        CL_CHECK_ERROR(err);
        cl_kernel rdwdot7_kernel = clCreateKernel(rdwdot7_prog,
                "rdwdot7_kernel", &err);
        CL_CHECK_ERROR(err);
        cl_kernel rdwdot8_kernel = clCreateKernel(rdwdot8_prog,
                "rdwdot8_kernel", &err);
        CL_CHECK_ERROR(err);
        cl_kernel rdwdot9_kernel = clCreateKernel(rdwdot9_prog,
                "rdwdot9_kernel", &err);
        CL_CHECK_ERROR(err);
        cl_kernel rdwdot10_kernel = clCreateKernel(rdwdot10_prog,
                "rdwdot10_kernel", &err);
        CL_CHECK_ERROR(err);
        cout << "done." << endl;

        //Set kernel arguments (phase 2)
        clSetQssaArg(qssa_kernel);
        clSetQssaArg(qssab_kernel);
        clSetQssaArg(qssa2_kernel);

        clSetRdwdotArg(rdwdot_kernel);
        clSetRdwdotArg(rdwdot2_kernel);
        clSetRdwdotArg(rdwdot3_kernel);
        clSetRdwdotArg(rdwdot6_kernel);
        clSetRdwdotArg(rdwdot7_kernel);
        clSetRdwdotArg(rdwdot8_kernel);
        clSetRdwdotArg(rdwdot9_kernel);
        clSetRdwdotArg(rdwdot10_kernel);

        // Execute kernels (phase 2)
        cout << "Executing kernels (phase 2)...";
        cout.flush();

        Event evFirst_2("first kernel phase 2");
        Event evLast_2("last kernel phase 2");

        clLaunchKernelEv(qssa_kernel, evFirst_2.CLEvent());
        clLaunchKernel(qssab_kernel);
        clLaunchKernel(qssa2_kernel);

        clLaunchKernel(rdwdot_kernel);
        clLaunchKernel(rdwdot2_kernel);
        clLaunchKernel(rdwdot3_kernel);
        clLaunchKernel(rdwdot6_kernel);
        clLaunchKernel(rdwdot7_kernel);
        clLaunchKernel(rdwdot8_kernel);
        clLaunchKernel(rdwdot9_kernel);
        clLaunchKernelEv(rdwdot10_kernel, evLast_2.CLEvent());

        err = clFinish(queue);
        CL_CHECK_ERROR(err);
        cout << "done. " << endl;

        evFirst_2.FillTimingInfo();
        evLast_2.FillTimingInfo();
        double total_phase2 = evLast_2.EndTime() - evFirst_2.StartTime();

        // Release Kernels (phase 2)
        clReleaseKernel(qssa_kernel);
        clReleaseKernel(qssab_kernel);
        clReleaseKernel(qssa2_kernel);
        clReleaseKernel(rdwdot_kernel);
        clReleaseKernel(rdwdot2_kernel);
        clReleaseKernel(rdwdot3_kernel);
        clReleaseKernel(rdwdot6_kernel);
        clReleaseKernel(rdwdot7_kernel);
        clReleaseKernel(rdwdot8_kernel);
        clReleaseKernel(rdwdot9_kernel);
        clReleaseKernel(rdwdot10_kernel);

        // Release Programs (phase 2)
        clReleaseProgram(qssa_prog);
        clReleaseProgram(qssab_prog);
        clReleaseProgram(qssa2_prog);
        clReleaseProgram(rdwdot_prog);
        clReleaseProgram(rdwdot2_prog);
        clReleaseProgram(rdwdot3_prog);
        clReleaseProgram(rdwdot6_prog);
        clReleaseProgram(rdwdot7_prog);
        clReleaseProgram(rdwdot8_prog);
        clReleaseProgram(rdwdot9_prog);
        clReleaseProgram(rdwdot10_prog);

        // -------------------- timings -----------------

        double total = total_phase1 + total_phase2;
        // Estimate GFLOPs (roughly 10k flops / point)
        double gflops = (n*10000.) / total;
        // Copy results back
        err = clEnqueueReadBuffer(queue, d_wdot, true, 0,
                n*n_species*sizeof(T), h_wdot,
                0, NULL, &evTransfer.CLEvent());
        CL_CHECK_ERROR(err);
        err = clFinish(queue);
        CL_CHECK_ERROR(err);
        evTransfer.FillTimingInfo();
        double totalTransferTime = inputTransferTime +
            evTransfer.StartEndRuntime();
        double gflops_pcie = (n*10000.) / (total + totalTransferTime);

        resultDB.AddResult(testName, "cubic", "GFLOPS", gflops);
        resultDB.AddResult(testName+"_PCIe", "cubic", "GFLOPS", gflops_pcie);
        resultDB.AddResult(testName+"_Parity", "cubic", "n", totalTransferTime / total );
    }


    // Release Memory
    err = clReleaseMemObject(d_t);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(d_p);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(d_y);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(d_wdot);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(d_rf);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(d_rb);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(d_c);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(d_eg);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(d_rklow);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(d_a);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(d_molwt);
    CL_CHECK_ERROR(err);

    // Cleanup Host Memory Objects
    delete[] h_t;
    delete[] h_p;
    delete[] h_y;
    delete[] h_wdot;
    delete[] h_molwt;

}
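
Example #9 leans on convenience macros (clMalloc, clMemtoDevice, clLaunchKernel, clLaunchKernelEv, and the clProg/clBuild pair). Plausible expansions for a few of them, assuming each wraps one OpenCL call plus CL_CHECK_ERROR and picks up err, ctx, queue, evTransfer, globalWorkSize, and localWorkSize from the enclosing scope (the real SHOC macros may differ):

// Hedged sketches of the helper macros used in Example #9.
#define clMalloc(buf, size)                                          \
    buf = clCreateBuffer(ctx, CL_MEM_READ_WRITE, size, NULL, &err);  \
    CL_CHECK_ERROR(err)

#define clMemtoDevice(dst, src, size)                                \
    err = clEnqueueWriteBuffer(queue, dst, CL_TRUE, 0, size, src,    \
                               0, NULL, &evTransfer.CLEvent());      \
    CL_CHECK_ERROR(err)

#define clLaunchKernelEv(kernel, ev)                                 \
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,             \
                                 &globalWorkSize, &localWorkSize,    \
                                 0, NULL, &(ev));                    \
    CL_CHECK_ERROR(err)

#define clLaunchKernel(kernel)                                       \
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,             \
                                 &globalWorkSize, &localWorkSize,    \
                                 0, NULL, NULL);                     \
    CL_CHECK_ERROR(err)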
Example #10
// ****************************************************************************
// Function: RunBenchmark
//
// Purpose: 
//   Runs the stability test. The algorithm for the parallel
//   version of the test, which enables testing of an entire GPU
//   cluster at the same time, is as follows. Each participating node
//   first allocates its data, while node zero additionally determines
//   start and finish times based on a user input parameter. All nodes
//   then enter the outermost loop, copying fresh data from the CPU
//   before entering the core of the test. In the core, each node
//   performs a loop consisting of the forward kernel, a potential
//   check, and then the inverse kernel. After performing a configurable
//   number of forward/inverse iterations, along with a configurable
//   number of checks, each node sends the number of failures it
//   encountered to node zero. Node zero collects and reports the error
//   counts, determines whether the test has run its course, and
//   broadcasts the decision. If the decision is to proceed, each node
//   begins the next iteration of the outer loop, copying fresh data and
//   then performing the kernels and checks of the core loop.
//
// Arguments:
//   resultDB: the benchmark stores its results in this ResultDatabase
//   op: the options parser / parameter database
//
// Returns:  nothing
//
// Programmer: Collin McCurdy
// Creation: September 08, 2009
//
// Modifications:
//
// ****************************************************************************
void RunBenchmark(ResultDatabase &resultDB, OptionParser& op)
{	
    int mpi_rank, mpi_size, node_rank;
    int i, j;
    float2* source, * result;
    void* work, * chk;

#ifdef PARALLEL
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);

    NodeInfo NI;
    node_rank = NI.nodeRank();

    cout << "MPI Task " << mpi_rank << " of " << mpi_size 
         << " (noderank=" << node_rank << ") starting....\n";
#else
    mpi_rank = 0;
    mpi_size = 1;
    node_rank = 0;
#endif
   
    // ensure chk buffer alloc succeeds before grabbing the
    // rest of available memory.
    allocDeviceBuffer(&chk, 1);
    unsigned long avail_bytes = findAvailBytes();
    // unsigned long avail_bytes = 1024*1024*1024-1;
    
    // now determine how much available memory will be used (subject
    // to CUDA's constraint on the maximum block dimension size)
    int blocks = avail_bytes / (512*sizeof(float2));
    int slices = 1;
    while (blocks/slices > 65535) {
        slices *= 2;
    }
    int half_n_ffts = ((blocks/slices)*slices)/2;
    int n_ffts = half_n_ffts * 2;
    fprintf(stderr, "avail_bytes=%ld, blocks=%d, n_ffts=%d\n", 
            avail_bytes, blocks, n_ffts);

    int half_n_cmplx = half_n_ffts * 512;
    unsigned long used_bytes = half_n_cmplx * 2 * sizeof(float2);

    cout << mpi_rank << ": testing " 
         << used_bytes/((double)1024*1024) << " MBs\n";

    // allocate host memory
    source = (float2*)malloc(used_bytes);
    result = (float2*)malloc(used_bytes);

    // alloc device memory
    allocDeviceBuffer(&work, used_bytes);

    // alloc gather buffer
    int* recvbuf = (int*)malloc(mpi_size*sizeof(int));
    
    // compute start and finish times
    time_t start = time(NULL);
    time_t finish = start + (time_t)(op.getOptionInt("time")*60);
    struct tm start_tm, finish_tm;
    localtime_r(&start, &start_tm);
    localtime_r(&finish, &finish_tm);
    if (mpi_rank == 0) {
        printf("start = %s", asctime(&start_tm));
        printf("finish = %s", asctime(&finish_tm));
    }
    
    for (int iter = 0; ; iter++) {
        bool failed = false;
        int errorCount = 0, stop = 0;

        // (re-)init host memory...
        for (i = 0; i < half_n_cmplx; i++) {
            source[i].x = (rand()/(float)RAND_MAX)*2-1;
            source[i].y = (rand()/(float)RAND_MAX)*2-1;
            source[i+half_n_cmplx].x = source[i].x;
            source[i+half_n_cmplx].y = source[i].y;
        }

        // copy to device
        copyToDevice(work, source, used_bytes);
        copyToDevice(chk, &errorCount, 1);

        forward(work, n_ffts);
        if (check(work, chk, half_n_ffts, half_n_cmplx)) {
            fprintf(stderr, "First check failed...");
            failed = true;
        }

        if (!failed) {
            for (i = 1; i <= CHECKS; i++) {
                for (j = 1; j <= ITERS_PER_CHECK; j++) {
                    inverse(work, n_ffts);
                    forward(work, n_ffts);
                }

                if (check(work, chk, half_n_ffts, half_n_cmplx)) {
                    failed = true;
                    break;
                }
            }
        }

        // failing node is responsible for verifying failure, counting
        // errors and reporting count to node 0.
        if (failed) {
            fprintf(stderr, "Failure on node %d, iter %d:", mpi_rank, iter);
            
            // repeat check on CPU
            copyFromDevice(result, work, used_bytes);
            float2* result2 = result + half_n_cmplx;
            for (j = 0; j < half_n_cmplx; j++) {
                if (result[j].x != result2[j].x || 
                    result[j].y != result2[j].y) 
                {
                    errorCount++;
                }
            }
            if (!errorCount) {
                fprintf(stderr, "verification failed!\n");
            }
            else {
                fprintf(stderr, "%d errors\n", errorCount);
            }
        }

#ifdef PARALLEL
        MPI_Gather(&errorCount, 1, MPI_INT, recvbuf, 1, MPI_INT, 
                   0, MPI_COMM_WORLD);
#else
        recvbuf[0] = errorCount;
#endif

        // node 0 collects and reports error counts, determines
        // whether test has run its course, and broadcasts decision
        if (mpi_rank == 0) {
            time_t curtime = time(NULL);
            struct tm curtm;
            localtime_r(&curtime, &curtm);
            fprintf(stderr, "iter=%d: %s", iter, asctime(&curtm));
            
            for (i = 0; i < mpi_size; i++) {
                if (recvbuf[i]) {
                    fprintf(stderr, "--> %d failures on node %d\n", recvbuf[i], i);
                }
            }

            if (curtime > finish) {
                stop = 1;
            }
        }
        
#ifdef PARALLEL
        MPI_Bcast(&stop, 1, MPI_INT, 0, MPI_COMM_WORLD);
#endif

        resultDB.AddResult("Check", "", "Failures", errorCount);
        if (stop) break;
    }
    
    freeDeviceBuffer(work);
    freeDeviceBuffer(chk);

    free(source);
    free(result);
    free(recvbuf);
}
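
forward(), inverse(), and check() are device-side helpers not shown here. The buffer sizing above (multiples of 512*sizeof(float2)) points at batched 512-point FFTs; a minimal CUFFT-based sketch of forward(), under that assumption (the real helper likely caches the plan across calls and differs in detail):

#include <cuda_runtime.h>
#include <cufft.h>

// Hedged sketch: in-place forward C2C FFTs over n_ffts 512-point signals.
static void forward(void* work, int n_ffts)
{
    cufftHandle plan;
    cufftPlan1d(&plan, 512, CUFFT_C2C, n_ffts);
    cufftExecC2C(plan, (cufftComplex*)work, (cufftComplex*)work,
                 CUFFT_FORWARD);
    cudaDeviceSynchronize();
    cufftDestroy(plan);
}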
Example #11
template <class T>
void RunTest(string testName, ResultDatabase &resultDB, OptionParser &op)
{
    int pbIndex = op.getOptionInt("size") - 1;
    int passes  = op.getOptionInt("passes");
    int iters   = op.getOptionInt("iterations");
    int micdev  = op.getOptionInt("target");

    int nThreads = 240; // Default

    #pragma offload target(mic) inout(nThreads)
    {
        nThreads = sysconf(_SC_NPROCESSORS_ONLN) - 4; // Leave something for the OS
    }

    printf("Using %d available threads for MIC run.\n", nThreads);

    size_t szOptimum = L1B * nThreads;

    int     pbSizesMB[]    = { 1, 8, 32, 64, 128, 256, 512, 768 };
    size_t  pbSizeBytes    = szOptimum * pbSizesMB[pbIndex] / 8;
    int     pbSizeElements = pbSizeBytes / sizeof(T);

    // Allocate Host Memory
    __declspec(target(MIC)) static T* h_idata;
    __declspec(target(MIC)) static T* reference;
    __declspec(target(MIC)) static T* h_odata;

    h_idata     = (T*)_mm_malloc(pbSizeBytes + ALIGN * sizeof(T), ALIGN);
    reference   = (T*)_mm_malloc(pbSizeBytes + ALIGN * sizeof(T), ALIGN);
    h_odata     = (T*)_mm_malloc(pbSizeBytes + ALIGN * sizeof(T), ALIGN);

    //Manually align memory
    h_idata += ALIGN - 1;
    h_odata += ALIGN - 1;

    srand(time(NULL));
    // Initialize host memory
    for (int i = 0; i < pbSizeElements; i++)
    {
        h_idata[i]    = rand() % 21 - 10; // Fill with some pattern
        h_odata[i]    = 0.0;
        reference[i]  = 0.0;
    }

    // Allocate data to mic
    #pragma offload target(mic:micdev) in(h_idata:length(pbSizeElements + 1) free_if(0)) \
        out(h_odata:length(pbSizeElements + 1) free_if(0))
    {
    }

    double start = curr_second();
    // Get data transfer time
    #pragma offload target(mic:micdev) in(h_idata:length(pbSizeElements + 1) alloc_if(0) \
            free_if(0)) out(h_odata:length(pbSizeElements + 1) alloc_if(0) free_if(0))
    {
    }

    float transferTime = curr_second()-start;

    cout << "Running benchmark with size " << pbSizeElements << endl;
    for (int k = 0; k < passes; k++)
    {

        double totalScanTime = 0.0f;
        start = curr_second();
        #pragma offload target(mic:micdev) nocopy(h_idata:length(pbSizeElements + 1) \
                alloc_if(0) free_if(0)) nocopy(h_odata:length(pbSizeElements + 1)    \
                alloc_if(0) free_if(0))
        {
            if (pbIndex > 0)
            {
                size_t elementsOptimum = szOptimum / sizeof(T);
                int  nChunks = pbSizesMB[pbIndex] / 8;
                T fOffset = 0;

                for (int iChunk = 0; iChunk < nChunks; iChunk++)
                {
                    SCAN_KNC<T>(h_idata + iChunk * elementsOptimum,
                                h_odata + iChunk * elementsOptimum,
                                elementsOptimum,
                                iters,
                                fOffset);
                    fOffset = (h_odata + iChunk * elementsOptimum)[elementsOptimum - 1];
                }
            }
            else
                SCAN_KNC<T>(h_idata, h_odata, szOptimum / (8 * sizeof(T)), iters, 0.0);
        }

        double stop = curr_second();
        totalScanTime = (stop-start);

        #pragma offload target(mic:micdev) out(h_odata:length(pbSizeElements + 1) \
                alloc_if(0) free_if(0))
        {
        }

        // If results aren't correct, don't report perf numbers
        if (! scanCPU<T>(h_idata, reference, h_odata, pbSizeElements))
        {
            return;
        }

        char atts[1024];
        double avgTime = (totalScanTime / (double) iters);
        sprintf(atts, "%d items", pbSizeElements);
        double gb = (double)(pbSizeElements * sizeof(T)) / (1000. * 1000. * 1000.);
        resultDB.AddResult(testName, atts, "GB/s", gb / avgTime);
        resultDB.AddResult(testName+"_PCIe", atts, "GB/s", gb / (avgTime + transferTime));
        resultDB.AddResult(testName+"_Parity", atts, "N", transferTime / avgTime);
    }

    // Clean up
    #pragma offload target(mic:micdev) in(h_idata:length(pbSizeElements + 1) alloc_if(0) ) \
                                out(h_odata:length(pbSizeElements + 1) alloc_if(0))
    {
    }
    _mm_free(h_idata - ALIGN + 1);
    _mm_free(h_odata - ALIGN + 1);
    _mm_free(reference);
}
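
scanCPU is the host-side verifier used above. A minimal sketch, assuming the reference is an inclusive scan compared element-by-element (the real SHOC routine may compute the scan differently or use an epsilon comparison):

// Hedged sketch: host reference scan plus comparison with device output.
template <class T>
static bool scanCPU(const T* idata, T* reference, const T* odata, int n)
{
    if (n > 0)
        reference[0] = idata[0];
    for (int i = 1; i < n; i++)
        reference[i] = reference[i - 1] + idata[i];
    for (int i = 0; i < n; i++)
    {
        if (reference[i] != odata[i])
        {
            cout << "Mismatch at element " << i << endl;
            return false;
        }
    }
    return true;
}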
Example #12
void
RunBenchmark( ResultDatabase& resultDB, OptionParser& opts )
{
    int device;

#if defined(PARALLEL)
    int cwrank;
    MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
#endif // defined(PARALLEL)

    cudaGetDevice( &device );
    cudaDeviceProp deviceProps;
    cudaGetDeviceProperties( &deviceProps, device );

    // Configure to allocate performance-critical memory in
    // a programming model-specific way.
    Matrix2D<float>::SetAllocator( new CUDAPMSMemMgr<float> );

#if defined(PARALLEL)
    if( cwrank == 0 )
    {
#endif // defined(PARALLEL)
        std::cout << "Running single precision test" << std::endl;
#if defined(PARALLEL)
    }
#endif // defined(PARALLEL)
    DoTest<float>( "SP_Sten2D", resultDB, opts );

    // check if we can run double precision tests
    if( ((deviceProps.major == 1) && (deviceProps.minor >= 3)) ||
        (deviceProps.major >= 2) )
    {
        // Configure to allocate performance-critical memory in
        // a programming model-specific way.
        Matrix2D<double>::SetAllocator( new CUDAPMSMemMgr<double> );

#if defined(PARALLEL)
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
            std::cout << "\n\nDP supported" << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)
        DoTest<double>( "DP_Sten2D", resultDB, opts );
    }
    else
    {
#if defined(PARALLEL)
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
            std::cout << "Double precision not supported - skipping" << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)
        // resultDB requires neg entry for every possible result
        int nPasses = (int)opts.getOptionInt( "passes" );
        for( int p = 0; p < nPasses; p++ )
        {
            resultDB.AddResult( (const char*)"DP_Sten2D", "N/A", "GFLOPS", FLT_MAX );
        }
    }

    std::cout << "\n" << std::endl;
}
Example #13
template <class T>
void
DoTest( const char* timerDesc, ResultDatabase& resultDB, OptionParser& opts )
{
    StencilFactory<T>* stdStencilFactory = NULL;
    Stencil<T>* stdStencil = NULL;
    StencilFactory<T>* testStencilFactory = NULL;
    Stencil<T>* testStencil = NULL;

    try
    {
#if defined(PARALLEL)
        stdStencilFactory = new MPIHostStencilFactory<T>;
        testStencilFactory = new MPICUDAStencilFactory<T>;
#else
        stdStencilFactory = new HostStencilFactory<T>;
        testStencilFactory = new CUDAStencilFactory<T>;
#endif // defined(PARALLEL)
        assert( (stdStencilFactory != NULL) && (testStencilFactory != NULL) );

        // do a sanity check on option values
        CheckOptions( opts );
        stdStencilFactory->CheckOptions( opts );
        testStencilFactory->CheckOptions( opts );

        // extract and validate options
        std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" );
        if( arrayDims.size() != 2 )
        {
            cerr << "Dim size: " << arrayDims.size() << "\n";
            throw InvalidArgValue( "all overall dimensions must be positive" );
        }
        if (arrayDims[0] == 0) // User has not specified a custom size
        {
            int sizeClass = opts.getOptionInt("size");
            arrayDims = StencilFactory<T>::GetStandardProblemSize( sizeClass );
        }
        long int seed = (long)opts.getOptionInt( "seed" );
        bool beVerbose = opts.getOptionBool( "verbose" );
        unsigned int nIters = (unsigned int)opts.getOptionInt( "num-iters" );
        double valErrThreshold = (double)opts.getOptionFloat( "val-threshold" );
        unsigned int nValErrsToPrint = (unsigned int)opts.getOptionInt( "val-print-limit" );

#if defined(PARALLEL)
        unsigned int haloWidth = (unsigned int)opts.getOptionInt( "iters-per-exchange" );
#else
        unsigned int haloWidth = 1;
#endif // defined(PARALLEL)

        float haloVal = (float)opts.getOptionFloat( "haloVal" );

        // build a description of this experiment
        std::vector<long long> lDims = opts.getOptionVecInt( "lsize" );
        assert( lDims.size() == 2 );
        std::ostringstream experimentDescriptionStr;
        experimentDescriptionStr
            << nIters << ':'
            << arrayDims[0] << 'x' << arrayDims[1] << ':'
            << lDims[0] << 'x' << lDims[1];

        unsigned int nPasses = (unsigned int)opts.getOptionInt( "passes" );
        unsigned int nWarmupPasses = (unsigned int)opts.getOptionInt( "warmupPasses" );


        // compute the expected result on the host
        // or read it from a pre-existing file
        std::string matrixFilenameBase = (std::string)opts.getOptionString( "expMatrixFile" );
#if defined(PARALLEL)
        int cwrank;
        MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        if( !matrixFilenameBase.empty() )
        {
            std::cout << "\nReading expected stencil operation result from file for later comparison with CUDA output\n"
                << std::endl;
        }
        else
        {
            std::cout << "\nPerforming stencil operation on host for later comparison with CUDA output\n"
                << "Depending on host capabilities, this may take a while."
                << std::endl;
        }
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)
        Matrix2D<T> expected( arrayDims[0] + 2*haloWidth,
                            arrayDims[1] + 2*haloWidth );
        Initialize<T> init( seed, haloWidth, haloVal );

        bool haveExpectedData = false;
        if( ! matrixFilenameBase.empty() )
        {
            bool readOK = ReadMatrixFromFile( expected, GetMatrixFileName<T>( matrixFilenameBase ) );
            if( readOK )
            {

                if( (expected.GetNumRows() != arrayDims[0] + 2*haloWidth) ||
                    (expected.GetNumColumns() != arrayDims[1] + 2*haloWidth) )
                {
                    std::cerr << "The matrix read from file \'"
                        << GetMatrixFileName<T>( matrixFilenameBase )
                        << "\' does not match the matrix size specified on the command line.\n";
                    expected.Reset( arrayDims[0] + 2*haloWidth, arrayDims[1] + 2*haloWidth );
                }
                else
                {
                    haveExpectedData = true;
                }
            }

            if( !haveExpectedData )
            {
                std::cout << "\nSince we could not read the expected matrix values,\nperforming stencil operation on host for later comparison with CUDA output.\n"
                    << "Depending on host capabilities, this may take a while."
                    << std::endl;
            }
        }
        if( !haveExpectedData )
        {
            init( expected );
            haveExpectedData = true;
            if( beVerbose )
            {
                std::cout << "initial state:\n" << expected << std::endl;
            }
            stdStencil = stdStencilFactory->BuildStencil( opts );
            (*stdStencil)( expected, nIters );
        }
        if( beVerbose )
        {
            std::cout << "expected result:\n" << expected << std::endl;
        }

        // determine whether we are to save the expected matrix values to a file
        // to speed up future runs
        matrixFilenameBase = (std::string)opts.getOptionString( "saveExpMatrixFile" );
        if( !matrixFilenameBase.empty() )
        {
            SaveMatrixToFile( expected, GetMatrixFileName<T>( matrixFilenameBase ) );
        }
        assert( haveExpectedData );


        // compute the result on the CUDA device
        Matrix2D<T> data( arrayDims[0] + 2*haloWidth,
                            arrayDims[1] + 2*haloWidth );
        // Assign to the function-scope pointer (not a shadowing local)
        // so both cleanup paths below can delete it.
        testStencil = testStencilFactory->BuildStencil( opts );

        // Compute the number of floating point operations we will perform.
        //
        // Note: in the truly-parallel case, we count flops for redundant
        // work due to the need for a halo.
        // But we do not add to the count for the local 1-wide halo since
        // we aren't computing new values for those items.
        unsigned long npts = (arrayDims[0] + 2*haloWidth - 2) *
                            (arrayDims[1] + 2*haloWidth - 2);
#if defined(PARALLEL)
        MPICUDAStencil<T>* mpiTestStencil = static_cast<MPICUDAStencil<T>*>( testStencil );
        assert( mpiTestStencil != NULL );
        int participating = mpiTestStencil->ParticipatingInProgram() ? 1 : 0;
        int numParticipating = 0;
        MPI_Allreduce( &participating,      // src
                        &numParticipating,  // dest
                        1,                  // count
                        MPI_INT,            // type
                        MPI_SUM,            // op
                        MPI_COMM_WORLD );   // communicator
        npts *= numParticipating;
#endif // defined(PARALLEL)

        // In our 9-point stencil, there are 11 floating point operations
        // per point (3 multiplies and 8 adds):
        //
        // newval = weight_center * centerval +
        //      weight_cardinal * (northval + southval + eastval + westval) +
        //      weight_diagonal * (neval + nwval + seval + swval)
        //
        // we do this stencil operation 'nIters' times
        unsigned long nflops = npts * 11 * nIters;

#if defined(PARALLEL)
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        std::cout << "Performing " << nWarmupPasses << " warmup passes...";
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)

        for( unsigned int pass = 0; pass < nWarmupPasses; pass++ )
        {
            init(data);
            (*testStencil)( data, nIters );
        }
#if defined(PARALLEL)
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        std::cout << "done." << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)

#if defined(PARALLEL)
        MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        std::cout << "\nPerforming stencil operation on chosen device, "
            << nPasses << " passes.\n"
            << "Depending on chosen device, this may take a while."
            << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)

#if !defined(PARALLEL)
        std::cout << "At the end of each pass the number of validation\nerrors observed will be printed to the standard output."
            << std::endl;
#endif // !defined(PARALLEL)


        for( unsigned int pass = 0; pass < nPasses; pass++ )
        {
#if !defined(PARALLEL)
            std::cout << "pass " << pass << ": ";
#endif // !defined(PARALLEL)
            init( data );

            int timerHandle = Timer::Start();
            (*testStencil)( data, nIters );
            double elapsedTime = Timer::Stop( timerHandle, "CUDA stencil" );


            // find and report the computation rate
            double gflops = (nflops / elapsedTime) / 1e9;

            resultDB.AddResult( timerDesc,
                                    experimentDescriptionStr.str(),
                                    "GFLOPS",
                                    gflops );
            if( beVerbose )
            {
                std::cout << "observed result, pass " << pass << ":\n"
                    << data
                    << std::endl;
            }

            // validate the result
#if defined(PARALLEL)
            StencilValidater<T>* validater = new MPIStencilValidater<T>;
#else
            StencilValidater<T>* validater = new SerialStencilValidater<T>;
#endif // defined(PARALLEL)
            validater->ValidateResult( expected,
                            data,
                            valErrThreshold,
                            nValErrsToPrint );
            delete validater;
        }
    }
    catch( ... )
    {
        // clean up - abnormal termination
        // wish we didn't have to do this, but C++ exceptions do not
        // support a try-catch-finally approach
        delete stdStencil;
        delete stdStencilFactory;
        delete testStencil;
        delete testStencilFactory;
        throw;
    }

    // clean up - normal termination
    delete stdStencil;
    delete stdStencilFactory;
    delete testStencil;
    delete testStencilFactory;
}
Example #14
template <class T>
void runTest(const string& testName, cl_device_id dev, cl_context ctx,
        cl_command_queue queue, ResultDatabase& resultDB, OptionParser& op,
        const string& compileFlags)
{
    int err = 0;

    // Collect basic MPI information
    int mpi_size, mpi_rank;
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);

    // Program Setup
    cl_program prog = clCreateProgramWithSource(ctx,
                                                1,
                                                &cl_source_scan,
                                                NULL,
                                                &err);
    CL_CHECK_ERROR(err);

    // Before proceeding, make sure the kernel code compiles and
    // all kernels are valid.
    if (mpi_rank == 0)
    {
        cout << "Compiling scan kernels." << endl;
    }
    err = clBuildProgram(prog, 1, &dev, compileFlags.c_str(), NULL, NULL);
    // Check the build status explicitly before any CL_CHECK_ERROR abort,
    // so the build log below can actually be printed on failure.

    if (err != CL_SUCCESS)
    {
        char log[5000];
        size_t retsize = 0;
        err = clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 5000
                * sizeof(char), log, &retsize);

        CL_CHECK_ERROR(err);
        cout << "Build error." << endl;
        cout << "Retsize: " << retsize << endl;
        cout << "Log: " << log << endl;
        return;
    }

    // Extract out the 3 kernels
    cl_kernel reduce = clCreateKernel(prog, "reduce", &err);
    CL_CHECK_ERROR(err);

    cl_kernel top_scan = clCreateKernel(prog, "top_scan", &err);
    CL_CHECK_ERROR(err);

    cl_kernel bottom_scan = clCreateKernel(prog, "bottom_scan", &err);
    CL_CHECK_ERROR(err);

    // If the device doesn't support at least 256 work items in a
    // group, use a different kernel (TODO)
    if (getMaxWorkGroupSize(dev) < 256)
    {
        cout << "Scan requires work group size of at least 256" << endl;
        char atts[1024] = "GSize_Not_Supported";
        // resultDB requires neg entry for every possible result
        int passes = op.getOptionInt("passes");
        for (int k = 0; k < passes; k++)
        {
            resultDB.AddResult(testName+"-Kernel" , atts, "GB/s", FLT_MAX);
            resultDB.AddResult(testName+"-Kernel+PCIe" , atts, "GB/s",
                FLT_MAX);
            resultDB.AddResult(testName+"-MPI_ExScan" , atts, "GB/s",
                FLT_MAX);
            resultDB.AddResult(testName+"-Overall" , atts, "GB/s", FLT_MAX);
        }
        return;
    }

    // Problem Sizes
    int probSizes[4] = { 1, 8, 32, 64 };
    int size = probSizes[op.getOptionInt("size")-1];

    // Convert from MB to number of elements
    size = (size * 1024 * 1024) / sizeof(T);

    // Create input data on CPU
    unsigned int bytes = size * sizeof(T);

    // Allocate pinned host memory for input data (h_idata)
    cl_mem h_i = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
            bytes, NULL, &err);
    CL_CHECK_ERROR(err);
    T* h_idata = (T*)clEnqueueMapBuffer(queue, h_i, true,
            CL_MAP_READ|CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, &err);
    CL_CHECK_ERROR(err);

    // Allocate pinned host memory for output data (h_odata)
    cl_mem h_o = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
            bytes, NULL, &err);
    CL_CHECK_ERROR(err);
    T* h_odata = (T*)clEnqueueMapBuffer(queue, h_o, true,
            CL_MAP_READ|CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, &err);
    CL_CHECK_ERROR(err);


    // Initialize host memory
    if (mpi_rank == 0)
    {
        cout << "Initializing host memory." << endl;
    }
    for (int i = 0; i < size; i++)
    {
        h_idata[i] = i % 2; //Fill with some pattern
        h_odata[i] = -1;
    }

    // Allocate device memory for input array
    cl_mem d_idata = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bytes, NULL, &err);
    CL_CHECK_ERROR(err);

    // Allocate device memory for output array
    cl_mem d_odata = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bytes, NULL, &err);
    CL_CHECK_ERROR(err);

    // Number of local work items per group
    const size_t local_wsize  = 256;

    // Number of local work groups and total work items
    const size_t num_work_groups = 64;
    const size_t global_wsize = local_wsize * num_work_groups;

    // Allocate device memory for local work group intermediate sums
    cl_mem d_isums = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
            num_work_groups * sizeof(T), NULL, &err);
    CL_CHECK_ERROR(err);

    // Allocate pinned host memory for intermediate block sums (h_isums)
    cl_mem h_b = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
        num_work_groups * sizeof(T), NULL, &err);
    CL_CHECK_ERROR(err);
    T* h_isums = (T*)clEnqueueMapBuffer(queue, h_b, true,
        CL_MAP_READ|CL_MAP_WRITE, 0, num_work_groups * sizeof(T),
        0, NULL, NULL, &err);
    CL_CHECK_ERROR(err);

    // Set the kernel arguments for the reduction kernel
    err = clSetKernelArg(reduce, 0, sizeof(cl_mem), (void*)&d_idata);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(reduce, 1, sizeof(cl_mem), (void*)&d_isums);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(reduce, 2, sizeof(cl_int), (void*)&size);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(reduce, 3, local_wsize * sizeof(T), NULL);
    CL_CHECK_ERROR(err);

    // Set the kernel arguments for the top-level scan
    err = clSetKernelArg(top_scan, 0, sizeof(cl_mem), (void*)&d_isums);
    CL_CHECK_ERROR(err);
    cl_int n_work_groups = (cl_int)num_work_groups;  // kernel expects cl_int
    err = clSetKernelArg(top_scan, 1, sizeof(cl_int), (void*)&n_work_groups);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(top_scan, 2, local_wsize * 2 * sizeof(T), NULL);
    CL_CHECK_ERROR(err);

    // Set the kernel arguments for the bottom-level scan
    err = clSetKernelArg(bottom_scan, 0, sizeof(cl_mem), (void*)&d_idata);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(bottom_scan, 1, sizeof(cl_mem), (void*)&d_isums);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(bottom_scan, 2, sizeof(cl_mem), (void*)&d_odata);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(bottom_scan, 3, sizeof(cl_int), (void*)&size);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(bottom_scan, 4, local_wsize * 2 * sizeof(T), NULL);
    CL_CHECK_ERROR(err);

    // Repeat the test multiple times to get a good measurement
    int passes = op.getOptionInt("passes");

    if (mpi_rank == 0)
    {
        cout << "Running benchmark with size " << size << endl;
    }
    for (int k = 0; k < passes; k++)
    {
        // Timing variables
        double pcie_time=0., kernel_time=0., mpi_time=0.;

        // Copy data to GPU
        Event evTransfer("PCIe transfer");
        err = clEnqueueWriteBuffer(queue, d_idata, true, 0, bytes, h_idata, 0,
                NULL, &evTransfer.CLEvent());
        CL_CHECK_ERROR(err);
        evTransfer.FillTimingInfo();
        pcie_time += (double)evTransfer.StartEndRuntime() / 1e9;

        // This code uses a reduce-then-scan strategy.
        // The major steps of the algorithm are:
        // 1. Local reduction on a node
        // 2. Global exclusive scan of the reduction values
        // 3. Local inclusive scan, seeded with the node's result
        //    from the global exclusive scan
        Event ev_reduce("Reduction Kernel");
        err = clEnqueueNDRangeKernel(queue, reduce, 1, NULL,
                    &global_wsize, &local_wsize, 0, NULL,
                    &ev_reduce.CLEvent());
        CL_CHECK_ERROR(err);
        err = clFinish(queue);
        CL_CHECK_ERROR(err);
        ev_reduce.FillTimingInfo();
        kernel_time += (double)ev_reduce.StartEndRuntime() * 1e-9;

        // Next step is to copy the reduced blocks back to the host,
        // sum them, and perform the MPI exclusive (top level) scan.
        err = clEnqueueReadBuffer(queue, d_isums, true, 0,
                num_work_groups*sizeof(T), h_isums, 0,
                NULL, &evTransfer.CLEvent());
        CL_CHECK_ERROR(err);
        evTransfer.FillTimingInfo();
        pcie_time += (double)evTransfer.StartEndRuntime() * 1e-9;

        // Start the timer for MPI Scan
        int globscan_th = Timer::Start();
        T reduced=0., scanned=0.;

        // To get the true sum for this node, we have to add up
        // the block sums before MPI scanning.
        for (int i = 0; i < num_work_groups; i++)
        {
            reduced += h_isums[i];
        }

        // Next step is an exclusive scan across MPI ranks.
        // Then a local scan seeded with the result from MPI.
        globalExscan(&reduced, &scanned);
        mpi_time += Timer::Stop(globscan_th, "Global Scan");

        // Now, scanned contains all the information we need from other nodes.
        // The next step is the local top-level (i.e., across blocks) scan,
        // seeded with "scanned", the sum of elements on all lower ranks.
        h_isums[0] += scanned;

        err = clEnqueueWriteBuffer(queue, d_isums, true, 0, sizeof(T), h_isums, 0,
                NULL, &evTransfer.CLEvent());
        CL_CHECK_ERROR(err);
        evTransfer.FillTimingInfo();
        pcie_time += (double)evTransfer.StartEndRuntime() * 1e-9;

        Event ev_scan("Scan Kernel");
        err = clEnqueueNDRangeKernel(queue, top_scan, 1, NULL,
                &local_wsize, &local_wsize, 0, NULL, &ev_scan.CLEvent());
        CL_CHECK_ERROR(err);
        err = clFinish(queue);
        CL_CHECK_ERROR(err);
        ev_scan.FillTimingInfo();
        kernel_time += ((double)ev_scan.StartEndRuntime() * 1.e-9);

        // Finally, a bottom-level scan is performed by each block
        // that is seeded with the scanned value in block sums
        err = clEnqueueNDRangeKernel(queue, bottom_scan, 1, NULL,
                    &global_wsize, &local_wsize, 0, NULL,
                    &ev_scan.CLEvent());
        CL_CHECK_ERROR(err);
        err = clFinish(queue);
        CL_CHECK_ERROR(err);
        ev_scan.FillTimingInfo();
        kernel_time += ((double)ev_scan.StartEndRuntime() * 1.e-9);

        // Read data back for correctness check
        err = clEnqueueReadBuffer(queue, d_odata, true, 0, bytes, h_odata,
                0, NULL, &evTransfer.CLEvent());
        CL_CHECK_ERROR(err);

        // Lightweight correctness check -- won't apply
        // if data is not initialized to i%2 above
        if (mpi_rank == mpi_size-1)
        {
            if (h_odata[size-1] != (mpi_size * size) / 2)
            {
                cout << "Test Failed\n";
            }
            else
            {
                cout << "Test Passed\n";
            }
        }

        char atts[1024];
        sprintf(atts, "%d items", size);
        double global_gb = (double)(mpi_size * size * sizeof(T)) /
            (1000. * 1000. * 1000.);

        resultDB.AddResult(testName+"-Kernel" , atts, "GB/s",
                global_gb / kernel_time);
        resultDB.AddResult(testName+"-Kernel+PCIe" , atts, "GB/s",
                global_gb / (kernel_time + pcie_time));
        resultDB.AddResult(testName+"-MPI_ExScan" , atts, "GB/s",
                (mpi_size * sizeof(T) *1e-9) / mpi_time);
        resultDB.AddResult(testName+"-Overall" , atts, "GB/s",
                global_gb / (kernel_time + pcie_time + mpi_time));
    }

    // Clean up device memory
    err = clReleaseMemObject(d_idata);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(d_odata);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(d_isums);
    CL_CHECK_ERROR(err);

    // Clean up pinned host memory
    err = clEnqueueUnmapMemObject(queue, h_i, h_idata, 0, NULL, NULL);
    CL_CHECK_ERROR(err);
    err = clEnqueueUnmapMemObject(queue, h_o, h_odata, 0, NULL, NULL);
    CL_CHECK_ERROR(err);
    err = clEnqueueUnmapMemObject(queue, h_b, h_isums, 0, NULL, NULL);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(h_i);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(h_o);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(h_b);
    CL_CHECK_ERROR(err);

    err = clReleaseProgram(prog);
    CL_CHECK_ERROR(err);
    err = clReleaseKernel(reduce);
    CL_CHECK_ERROR(err);
    err = clReleaseKernel(top_scan);
    CL_CHECK_ERROR(err);
    err = clReleaseKernel(bottom_scan);
    CL_CHECK_ERROR(err);
}
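
The globalExscan helper called above is not part of this snippet. A minimal sketch of what it might look like, assuming it simply wraps MPI_Exscan (the name, the float-only datatype, and the rank-0 handling here are assumptions, not the benchmark's actual code):

// Hypothetical sketch; assumes T is float. A real version would select
// the MPI datatype to match T (e.g., MPI_DOUBLE for double).
void globalExscan(float* reduced, float* scanned)
{
    MPI_Exscan(reduced, scanned, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);

    // MPI_Exscan leaves rank 0's receive buffer undefined; an exclusive
    // scan contributes nothing on rank 0, so zero it explicitly.
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (rank == 0)
        *scanned = 0.0f;
}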
Example #15
void runTest(const string& testName, cl_device_id dev, cl_context ctx,
        cl_command_queue queue, ResultDatabase& resultDB, OptionParser& op,
        const string& compileFlags)
{

    int N;
    if (op.getOptionInt("KiB") == 0)
    {
        int probSizes[4] = { 1, 4, 8, 16 };
        N = probSizes[op.getOptionInt("size")-1] * 1024 / sizeof(T);
    } else {
        N = op.getOptionInt("KiB") * 1024 / sizeof(T);
    }

    cl_int err;
    size_t m = N, n = N, k = N;
    size_t lda, ldb, ldc;
    const T alpha = 1;
    const T beta = -1;
    int i, j;

    lda = ldb = ldc = N;
    
    cl_uint numDimensions = 0;
    clGetDeviceInfo (dev, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
                             sizeof(cl_uint), &numDimensions, NULL);
    size_t *maxWorkSizes = new size_t[numDimensions];
    clGetDeviceInfo (dev, CL_DEVICE_MAX_WORK_ITEM_SIZES,
                       sizeof(size_t)*numDimensions, maxWorkSizes, NULL);   
    
    if (numDimensions<2 || maxWorkSizes[0]<16 || maxWorkSizes[1] < 4)
    {
        cout << "SGEMM needs a 2-dimensional work group size of at least {16,4}." << endl;
        int passes = op.getOptionInt("passes");
        char atts[1024] = "GSize_Not_Supported";
        for (; passes > 0; --passes) {
            for (i = 0; i < 2; i++) {
                const char transb = i ? 'T' : 'N';
                resultDB.AddResult(testName+"-"+transb, atts, "GFlops", FLT_MAX);
                resultDB.AddResult(testName+"-"+transb+"_PCIe", atts, "GFlops", FLT_MAX);
                resultDB.AddResult(testName+"-"+transb+"_Parity", atts, "N", FLT_MAX);
            }
        }
        return;
    }

    size_t localWorkSize[2] = {16,4};
    

    // Create program object
    cl_program prog = clCreateProgramWithSource(ctx, 1,
                                 &cl_source_sgemmN, NULL, &err);
    CL_CHECK_ERROR(err);

    // Note the leading space: without it the flag would be glued onto the
    // preceding macro definition (e.g. "-DSINGLE_PRECISION-cl-mad-enable").
    string flags = compileFlags + " -cl-mad-enable";
    err = clBuildProgram(prog, 0, NULL, flags.c_str(), NULL, NULL);
    // No CL_CHECK_ERROR here; a failed build should fall through to the
    // log dump below.

    // If compilation fails, print error messages and return
    if (err != CL_SUCCESS) {
        char log[5000];
        size_t retsize = 0;
        err =  clGetProgramBuildInfo (prog, dev, CL_PROGRAM_BUILD_LOG,
                5000*sizeof(char),  log, &retsize);

        CL_CHECK_ERROR(err);
        cout << "Retsize: " << retsize << endl;
        cout << "Log: " << log << endl;
        exit(-1);
    }

    // Generate the kernel objects
    cl_kernel sgemmNN = clCreateKernel(prog, "sgemmNN", &err);
    CL_CHECK_ERROR(err);

    cl_kernel sgemmNT = clCreateKernel(prog, "sgemmNT", &err);
    CL_CHECK_ERROR(err);

    // Allocate memory for the matrices
    T *A, *B, *C;
    cl_mem Aobj, Bobj, Cobj;
    if (true) // pinned
    {
        Aobj = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                sizeof(T)*N*N, NULL, &err);
        CL_CHECK_ERROR(err);
        A = (T*)clEnqueueMapBuffer(queue, Aobj, true, CL_MAP_READ|CL_MAP_WRITE,
                0, sizeof(T)*N*N, 0, NULL, NULL, &err);
        CL_CHECK_ERROR(err);

        Bobj = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                sizeof(T)*N*N, NULL, &err);
        CL_CHECK_ERROR(err);
        B = (T*)clEnqueueMapBuffer(queue, Bobj, true, CL_MAP_READ|CL_MAP_WRITE,
                0, sizeof(T)*N*N, 0, NULL, NULL, &err);
        CL_CHECK_ERROR(err);

        Cobj = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                sizeof(T)*N*N, NULL, &err);
        CL_CHECK_ERROR(err);
        C = (T*)clEnqueueMapBuffer(queue, Cobj, true, CL_MAP_READ|CL_MAP_WRITE,
                0, sizeof(T)*N*N, 0, NULL, NULL, &err);
        CL_CHECK_ERROR(err);
    }
    else
    {
        A = (T*)malloc(N*N*sizeof(T));
        B = (T*)malloc(N*N*sizeof(T));
        C = (T*)malloc(N*N*sizeof(T));
    }

    // Initialize inputs
    srand48(13579862);
    for(i=0; i<m; ++i){
        for(j=0; j<k; ++j){
            A[i*k+j] = (T)(0.5 + drand48()*1.5);
        }
    }

    for(i=0; i<k; ++i){
        for(j=0; j<n; ++j){
            B[i*n+j] = (T)(0.5 + drand48()*1.5);
        }
    }

    for(i=0; i<m; ++i){
        for(j=0; j<n; ++j){
            C[i*n+j] = 0.0;
        }
    }

    // Pass A and B to the GPU and create a GPU buffer for C
    cl_mem Agpu = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
                                 m*k * sizeof(T), NULL, &err);
    CL_BAIL_ON_ERROR(err);
    cl_mem Bgpu = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
                                 k*n * sizeof(T), NULL, &err);
    CL_BAIL_ON_ERROR(err);
    cl_mem Cgpu = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
                                 m*n * sizeof(T), NULL, &err);
    CL_BAIL_ON_ERROR(err);


    // Set arguments to the sgemmNN kernel
    err = clSetKernelArg(sgemmNN, 0, sizeof(cl_mem), (void*)&Agpu);
    CL_BAIL_ON_ERROR(err);
    err = clSetKernelArg(sgemmNN, 1, sizeof(int), (void*)&lda);
    CL_BAIL_ON_ERROR(err);
    err = clSetKernelArg(sgemmNN, 2, sizeof(cl_mem), (void*)&Bgpu);
    CL_BAIL_ON_ERROR(err);
    err = clSetKernelArg(sgemmNN, 3, sizeof(int), (void*)&ldb);
    CL_BAIL_ON_ERROR(err);
    err = clSetKernelArg(sgemmNN, 4, sizeof(cl_mem), (void*)&Cgpu);
    CL_BAIL_ON_ERROR(err);
    err = clSetKernelArg(sgemmNN, 5, sizeof(int), (void*)&ldc);
    CL_BAIL_ON_ERROR(err);
    err = clSetKernelArg(sgemmNN, 6, sizeof(int), (void*)&k);
    CL_BAIL_ON_ERROR(err);
    err = clSetKernelArg(sgemmNN, 7, sizeof(T), (void*)&alpha);
    CL_BAIL_ON_ERROR(err);
    err = clSetKernelArg(sgemmNN, 8, sizeof(T), (void*)&beta);
    CL_BAIL_ON_ERROR(err);

    // Pass arguments to the sgemmNT kernel
    err = clSetKernelArg(sgemmNT, 0, sizeof(cl_mem), (void*)&Agpu);
    CL_BAIL_ON_ERROR(err);
    err = clSetKernelArg(sgemmNT, 1, sizeof(int), (void*)&lda);
    CL_BAIL_ON_ERROR(err);
    err = clSetKernelArg(sgemmNT, 2, sizeof(cl_mem), (void*)&Bgpu);
    CL_BAIL_ON_ERROR(err);
    err = clSetKernelArg(sgemmNT, 3, sizeof(int), (void*)&ldb);
    CL_BAIL_ON_ERROR(err);
    err = clSetKernelArg(sgemmNT, 4, sizeof(cl_mem), (void*)&Cgpu);
    CL_BAIL_ON_ERROR(err);
    err = clSetKernelArg(sgemmNT, 5, sizeof(int), (void*)&ldc);
    CL_BAIL_ON_ERROR(err);
    err = clSetKernelArg(sgemmNT, 6, sizeof(int), (void*)&k);
    CL_BAIL_ON_ERROR(err);
    err = clSetKernelArg(sgemmNT, 7, sizeof(T), (void*)&alpha);
    CL_BAIL_ON_ERROR(err);
    err = clSetKernelArg(sgemmNT, 8, sizeof(T), (void*)&beta);
    CL_BAIL_ON_ERROR(err);

    const size_t globalWorkSize[2] = {m/4,n/4};
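    // Launch geometry note: with local size {16,4} and global size
    // {m/4, n/4}, each work item presumably computes a 4x4 tile of C.
    // This is an inference from the launch shape; the kernel source is
    // not shown in this snippet.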

    int passes = op.getOptionInt("passes");

    // Run NN
    for (int i = 0; i < passes; i++) {
        Event evDownload1("Download A");
        Event evUpload("Upload");
        Event evNN("sgemmNN");

        err = clEnqueueWriteBuffer(queue, Agpu, CL_TRUE, 0, m*n*sizeof(T),
                A, 0, NULL, &evDownload1.CLEvent());
        err = clEnqueueWriteBuffer(queue, Bgpu, CL_TRUE, 0, m*n*sizeof(T),
                B, 0, NULL, NULL);
        err = clEnqueueWriteBuffer(queue, Cgpu, CL_TRUE, 0, m*n*sizeof(T),
                C, 0, NULL, NULL);

        // Wait until data transfers finish
        clFinish(queue);
        CL_BAIL_ON_ERROR(err);

        //Launch Kernels
        err = clEnqueueNDRangeKernel(queue, sgemmNN, 2, NULL, globalWorkSize,
                                     localWorkSize, 0, NULL, &evNN.CLEvent());
        clFinish(queue);
        CL_BAIL_ON_ERROR(err);

        err = clEnqueueReadBuffer(queue, Cgpu, CL_TRUE, 0, m*n*sizeof(T),
                C, 0, NULL, &evUpload.CLEvent());
        clFinish(queue);
        CL_BAIL_ON_ERROR(err);

        evNN.FillTimingInfo();
        evDownload1.FillTimingInfo();
        evUpload.FillTimingInfo();

        double user_wait_time = 0.0;
        double gemm_pure_time = 0.0;

        gemm_pure_time = evNN.SubmitEndRuntime();
        user_wait_time = evUpload.EndTime() - evDownload1.QueuedTime();
        double transfer_time = user_wait_time - gemm_pure_time;
        double flops = 2.0*(double)N*N*N;
        resultDB.AddResult(testName+"-N", toString(N), "GFLOPS",
                flops / gemm_pure_time);
        resultDB.AddResult(testName+"-N_PCIe", toString(N), "GFLOPS",
                flops / user_wait_time);
        resultDB.AddResult(testName+"-N_Parity", toString(N), "N",
                transfer_time / gemm_pure_time);
    }

    // Run NT
    for (int i = 0; i < passes; i++) {
        Event evDownload1("Download A");
        Event evUpload("Upload");
        Event evNT("sgemmNT");

        err = clEnqueueWriteBuffer(queue, Agpu, CL_TRUE, 0, m*n*sizeof(T),
                A, 0, NULL, &evDownload1.CLEvent());
        err = clEnqueueWriteBuffer(queue, Bgpu, CL_TRUE, 0, m*n*sizeof(T),
                B, 0, NULL, NULL);
        err = clEnqueueWriteBuffer(queue, Cgpu, CL_TRUE, 0, m*n*sizeof(T),
                C, 0, NULL, NULL);
        clFinish(queue);
        CL_BAIL_ON_ERROR(err);

        //Launch Kernels
        err = clEnqueueNDRangeKernel(queue, sgemmNT, 2, NULL, globalWorkSize,
                                     localWorkSize, 0, NULL, &evNT.CLEvent());
        clFinish(queue);
        CL_BAIL_ON_ERROR(err);

        err = clEnqueueReadBuffer(queue, Cgpu, CL_TRUE, 0, m*n*sizeof(T),
                C, 0, NULL, &evUpload.CLEvent());
        clFinish(queue);
        CL_BAIL_ON_ERROR(err);

        evNT.FillTimingInfo();
        evDownload1.FillTimingInfo();
        evUpload.FillTimingInfo();

        double user_wait_time = 0.0;
        double gemm_pure_time = 0.0;

        gemm_pure_time = evNT.SubmitEndRuntime();
        user_wait_time = evUpload.EndTime() - evDownload1.QueuedTime();
        double transfer_time = user_wait_time - gemm_pure_time;
        double flops = 2.0*(double)N*N*N;
        resultDB.AddResult(testName+"-T", toString(N), "GFLOPS",
                flops / gemm_pure_time);
        resultDB.AddResult(testName+"-T_PCIe", toString(N), "GFLOPS",
                flops / user_wait_time);
        resultDB.AddResult(testName+"-T_Parity", toString(N), "N",
                transfer_time / gemm_pure_time);
    }

    if (true) // pinned
    {
        err = clReleaseMemObject(Aobj);
        CL_CHECK_ERROR(err);
        err = clReleaseMemObject(Bobj);
        CL_CHECK_ERROR(err);
        err = clReleaseMemObject(Cobj);
        CL_CHECK_ERROR(err);
    }
    else
    {
        free(A);
        free(B);
        free(C);
    }

    err = clReleaseProgram(prog);
    CL_CHECK_ERROR(err);
    err = clReleaseKernel(sgemmNN);
    CL_CHECK_ERROR(err);
    err = clReleaseKernel(sgemmNT);
    CL_CHECK_ERROR(err);

    err = clReleaseMemObject(Agpu);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(Bgpu);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(Cgpu);
    CL_CHECK_ERROR(err);

}
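
The benchmark above never verifies the device result. A hypothetical host-side spot check (not from the original code), valid for the first pass where C starts at zero so the beta term vanishes (alpha = 1, beta = -1):

// Hypothetical helper: naive reference check of a few entries of C
// against A*B for row-major N x N matrices. The sampling constants are
// arbitrary; inputs are drawn from (0.5, 2.0) above, so ref > 0 and a
// relative tolerance is safe.
template <class T>
bool spotCheckGemmNN(const T* A, const T* B, const T* C, int N,
                     int samples = 8)
{
    for (int s = 0; s < samples; s++)
    {
        int i = (s * 7919) % N;       // deterministic pseudo-random row
        int j = (s * 104729) % N;     // and column
        T ref = 0;
        for (int kk = 0; kk < N; kk++)
            ref += A[i*N + kk] * B[kk*N + j];
        T diff = C[i*N + j] - ref;
        if (diff < 0) diff = -diff;
        if (diff > (T)1e-3 * ref)
            return false;
    }
    return true;
}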
Example #16
template <class T> void
RunTest(cl_device_id id,
        cl_context ctx,
        cl_command_queue queue,
        ResultDatabase &resultDB,
        int npasses,
        int verbose,
        int quiet,
        float repeatF,
        size_t localWorkSize,
        ProgressBar &pb,
        const char* typeName,
        const char* precision,
        const char* pragmaText)
{
    int err;
    cl_mem mem1;
    char sizeStr[128];
    T *hostMem, *hostMem2;
    
    int aIdx = 0;
    while ((aTests!=0) && (aTests[aIdx].name!=0))
    {
       ostringstream oss;
       struct _benchmark_type temp = aTests[aIdx];

       // Calculate adjusted repeat factor
       int tentativeRepeats = (int)round(repeatF*temp.numRepeats);
       if (tentativeRepeats < 2) {
          tentativeRepeats = 2;
          double realRepeatF = ((double)tentativeRepeats) 
            / temp.numRepeats;
          if (realRepeatF>8.0*repeatF)  // do not cut the number of unrolls
                                        // by more than a factor of 8
             realRepeatF = 8.0*repeatF;
          temp.numUnrolls = 
                  (int)round(repeatF*temp.numUnrolls/realRepeatF);
       }
       temp.numRepeats = tentativeRepeats;
       
       // Generate kernel source code
       generateKernel(oss, temp, typeName, pragmaText);
       std::string kernelCode(oss.str());

       // If in verbose mode, print the kernel
       if (verbose)
       {
           cout << "Code for kernel " << temp.name 
                << ":\n" + kernelCode << endl;
       }

       // Alloc host memory
       int halfNumFloatsMax = temp.halfBufSizeMax*1024;
       int numFloatsMax = 2*halfNumFloatsMax;
       hostMem = new T[numFloatsMax];
       hostMem2 = new T[numFloatsMax];
       
       // Allocate device memory
       mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
                                 sizeof(T)*numFloatsMax, NULL, &err);
       CL_CHECK_ERROR(err);
       
       // Issue a copy to force device allocation
       err = clEnqueueWriteBuffer(queue, mem1, true, 0,
                               numFloatsMax*sizeof(T), hostMem,
                               0, NULL, NULL);
       CL_CHECK_ERROR(err);

       // Create kernel program object
       const char* progSource[] = {kernelCode.c_str()};
       cl_program prog = clCreateProgramWithSource(ctx, 1, progSource, 
           NULL, &err);
       CL_CHECK_ERROR(err);
       
       // Compile the program
       err = clBuildProgram(prog, 1, &id, opts, NULL, NULL);
       // No CL_CHECK_ERROR here; if the build failed we still want to
       // retrieve and print the build log below.
       
       if (err != 0)
       {
           char log[5000];
           size_t retsize = 0;
           err =  clGetProgramBuildInfo(prog, id, CL_PROGRAM_BUILD_LOG,
                    5000*sizeof(char), log, &retsize);
           CL_CHECK_ERROR(err);
           
           cout << "Build error." << endl;
           cout << "Retsize: " << retsize << endl;
           cout << "Log: " << log << endl;
           return;
       }

       // Check if we have to dump the PTX (NVIDIA only)
       // Disabled by default
       // Set environment variable DUMP_PTX to enable
       char* dumpPtx = getenv("DUMP_PTX");
       if (dumpPtx && !strcmp(dumpPtx, "1")) {  // must dump the PTX
          dumpPTXCode(ctx, prog, temp.name);
       }
       
       // Extract out kernel
       cl_kernel kernel_madd = clCreateKernel(prog, temp.name, &err);
       CL_CHECK_ERROR(err);
       
       err = clSetKernelArg (kernel_madd, 0, sizeof(cl_mem), (void*)&mem1);
       CL_CHECK_ERROR (err);
       err = clSetKernelArg (kernel_madd, 1, sizeof(cl_int), 
           (void*)&temp.numRepeats);
       CL_CHECK_ERROR (err);

       if (verbose)
       {
          cout << "Running kernel " << temp.name << endl;
       }
       
       for (int halfNumFloats=temp.halfBufSizeMin*1024 ; 
            halfNumFloats<=temp.halfBufSizeMax*1024 ;
            halfNumFloats*=temp.halfBufSizeStride)
       {
          // Set up input memory, first half = second half
          int numFloats = 2*halfNumFloats;
          for (int j=0; j<halfNumFloats; ++j)
          {
             hostMem[j] = hostMem[numFloats-j-1] = (T)(drand48()*5.0);
          }

          size_t globalWorkSize = numFloats;

          for (int pas=0 ; pas<npasses ; ++pas) 
          {
             err = clEnqueueWriteBuffer (queue, mem1, true, 0,
                                   numFloats*sizeof(T), hostMem,
                                   0, NULL, NULL);
             CL_CHECK_ERROR(err);

             Event evKernel(temp.name);
             
             err = clEnqueueNDRangeKernel(queue, kernel_madd, 1, NULL,
                                 &globalWorkSize, &localWorkSize,
                                 0, NULL, &evKernel.CLEvent());
             CL_CHECK_ERROR(err);
             
             err = clWaitForEvents(1, &evKernel.CLEvent());
             CL_CHECK_ERROR(err);
             
             evKernel.FillTimingInfo();
             double flopCount = (double)numFloats * 
                                temp.flopCount *
                                temp.numRepeats * 
                                temp.numUnrolls * 
                                temp.numStreams;

             double gflop = flopCount / (double)(evKernel.SubmitEndRuntime());

             sprintf (sizeStr, "Size:%07d", numFloats);
             resultDB.AddResult(string(temp.name)+precision, sizeStr, "GFLOPS", gflop);

             // Zero out the test host memory
             for (int j=0 ; j<numFloats ; ++j)
             {
                 hostMem2[j] = 0.0;
             }

             // Read the result device memory back to the host
             err = clEnqueueReadBuffer(queue, mem1, true, 0,
                                 numFloats*sizeof(T), hostMem2,
                                 0, NULL, NULL);
             CL_CHECK_ERROR(err);
               
             // Check the result -- At a minimum the first half of memory
             // should match the second half exactly
             for (int j=0 ; j<halfNumFloats ; ++j)
             {
                if (hostMem2[j] != hostMem2[numFloats-j-1])
                {
                   cout << "Error; hostMem2[" << j << "]=" << hostMem2[j]
                        << " is different from its twin element hostMem2["
                        << (numFloats-j-1) << "]=" << hostMem2[numFloats-j-1]
                        <<"; stopping check\n";
                   break;
                }
             }
             
             // update progress bar
             pb.addItersDone();
             if (!verbose && !quiet)
                 pb.Show(stdout);
          }
       }

       err = clReleaseKernel (kernel_madd);
       CL_CHECK_ERROR(err);
       err = clReleaseProgram (prog);
       CL_CHECK_ERROR(err);
       err = clReleaseMemObject(mem1);
       CL_CHECK_ERROR(err);

       aIdx += 1;
       
       delete[] hostMem;
       delete[] hostMem2;
    }
 
    // Now, test hand-tuned custom kernels
    
    // 2D - width and height of input
    const int w = 2048, h = 2048;
    const int bytes = w * h * sizeof(T);
    
    // Allocate some device memory
    mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bytes, NULL, &err);
    CL_CHECK_ERROR(err);
    
    // Get a couple non-zero random numbers
    T val1 = 0, val2 = 0;
    while (val1 == 0 || val2 == 0)
    {
        // Use T rather than float: the values are passed to the kernel
        // with sizeof(T) below, so float storage would break for double.
        val1 = (T)drand48();
        val2 = (T)drand48();
    }
    // For each custom kernel
    for (int kCounter = 0; kCounter < 2; kCounter++)
    {
        // Calculate adjusted repeat factor
        int tentativeRepeats = (int)round(repeatF*5);
        int nUnrolls = 100;
        if (tentativeRepeats < 2) {
           tentativeRepeats = 2;
           double realRepeatF = ((double)tentativeRepeats) / 5;
           if (realRepeatF>8.0*repeatF)  // do not cut the number of unrolls
                                         // by more than a factor of 8
              realRepeatF = 8.0*repeatF;
           nUnrolls = (int)round(repeatF*100/realRepeatF);
        }
        
        // Double precision not currently supported
        string kSource = generateUKernel(kCounter, false, tentativeRepeats, nUnrolls, 
                       typeName, pragmaText);
        
        const char* progSource[] = {kSource.c_str()};
        cl_program prog = clCreateProgramWithSource(ctx,
                                 1, progSource, NULL, &err);
        CL_CHECK_ERROR(err);
    
        // Compile kernel
        err = clBuildProgram(prog, 1, &id, opts, NULL, NULL);
        CL_CHECK_ERROR(err);
        
        // Extract out kernel
        cl_kernel kernel_madd = clCreateKernel(prog, "peak", &err);
        CL_CHECK_ERROR(err);

        // Calculate kernel launch parameters
        //size_t localWorkSize = maxGroupSize<128?maxGroupSize:128;
        size_t globalWorkSize = w * h;
        
        // Set the arguments
        err = clSetKernelArg(kernel_madd, 0, sizeof(cl_mem), (void*)&mem1);
        CL_CHECK_ERROR(err);
        err = clSetKernelArg(kernel_madd, 1, sizeof(T), (void*)&val1);
        CL_CHECK_ERROR(err);
        err = clSetKernelArg(kernel_madd, 2, sizeof(T), (void*)&val2);
        CL_CHECK_ERROR(err);


        // Event object for timing
        Event evKernel_madd("madd");
        for (int passCounter=0; passCounter < npasses; passCounter++)
        {
            err = clEnqueueNDRangeKernel(queue, kernel_madd, 1, NULL,
                      &globalWorkSize, &localWorkSize,
                      0, NULL, &evKernel_madd.CLEvent());
            CL_CHECK_ERROR(err);
    
            // Wait for the kernel to finish
            err = clWaitForEvents(1, &evKernel_madd.CLEvent());
            CL_CHECK_ERROR(err);  
            evKernel_madd.FillTimingInfo();
            
            // Calculate result and add to DB
            char atts[1024];
            double nflopsPerItem = getUFlopCount(kCounter, false, tentativeRepeats, nUnrolls);
            sprintf(atts, "Size:%d", w*h);
            double gflops = (double) (nflopsPerItem*w*h) /
                            (double) evKernel_madd.SubmitEndRuntime();
            
            if (kCounter) {
                resultDB.AddResult(string("MulMAddU")+precision, atts, "GFLOPS", gflops);
            } else {
                resultDB.AddResult(string("MAddU")+precision, atts, "GFLOPS", gflops);
            }
            // update progress bar
            pb.addItersDone();
            if (!verbose && !quiet)
            {
                pb.Show(stdout);
            }
        }
        err = clReleaseKernel(kernel_madd);
        CL_CHECK_ERROR(err);
        err = clReleaseProgram(prog);
        CL_CHECK_ERROR(err);
    }
    err = clReleaseMemObject(mem1);
    CL_CHECK_ERROR(err);
}
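
The generateUKernel helper is not shown in this snippet. A hypothetical sketch of the kind of OpenCL C source it might emit for the MAddU case (the loop shape and names here are assumptions based on the repeat/unroll parameters above, not the real generator's output):

// Each work item repeatedly applies dependent multiply-adds to a
// register value so the kernel stays compute-bound; the result is
// written back so the compiler cannot eliminate the work.
__kernel void peak(__global float* data, float val1, float val2)
{
    int gid = get_global_id(0);
    float s = data[gid];
    for (int r = 0; r < 5; r++)        // "tentativeRepeats"
    {
        // one "unroll": two dependent multiply-adds (4 flops)
        s = s * val1 + val2;
        s = s * val2 + val1;
    }
    data[gid] = s;
}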
Example #17
void runTest(const string& name, ResultDatabase &resultDB, OptionParser& op)
{
    int i, j;
    void* work, * chk;
    T2* source, * result;
    unsigned long bytes = 0;

    if (op.getOptionInt("MB") == 0) {
        int probSizes[4] = { 1, 8, 96, 256 };
	int sizeIndex = op.getOptionInt("size")-1;
	if (sizeIndex < 0 || sizeIndex >= 4) {
	    cerr << "Invalid size index specified\n";
	    exit(-1);
	}
        bytes = probSizes[sizeIndex];
    } 
    else {
        bytes = op.getOptionInt("MB");
    }

    // Convert MB to bytes
    bytes *= 1024 * 1024;

    bool do_dp = dp<T2>();
    init(op, do_dp);

    int passes = op.getOptionInt("passes");

    // now determine how much available memory will be used
    int half_n_ffts = bytes / (512*sizeof(T2)*2);
    int n_ffts = half_n_ffts * 2;
    int half_n_cmplx = half_n_ffts * 512;
    unsigned long used_bytes = half_n_cmplx * 2 * sizeof(T2);
    double N = half_n_cmplx*2;
    
    // allocate host and device memory
    allocHostBuffer((void**)&source, used_bytes);
    allocHostBuffer((void**)&result, used_bytes);

    // init host memory...
    for (i = 0; i < half_n_cmplx; i++) {
        source[i].x = (rand()/(float)RAND_MAX)*2-1;
        source[i].y = (rand()/(float)RAND_MAX)*2-1;
        source[i+half_n_cmplx].x = source[i].x;
        source[i+half_n_cmplx].y = source[i].y;
    }

    // alloc device memory
    allocDeviceBuffer(&work, used_bytes);
    allocDeviceBuffer(&chk, 1);

    // Copy to device, and record transfer time
    fprintf(stderr, "used_bytes=%lu, N=%g\n", used_bytes, N);

    int pcie_TH = Timer::Start();
    copyToDevice(work, source, used_bytes);
    double transfer_time = Timer::Stop(pcie_TH, "PCIe Transfer Time");

    char chk_init = 0;
    copyToDevice(chk, &chk_init, 1);

    stringstream ss;
    ss << "N=" << (long)N;
    string sizeStr = ss.str();  // avoids the strdup() leak

    for (int k=0; k<passes; k++) {
        // time fft kernel
        int TH = Timer::Start();
        forward(work, n_ffts);
        double t = Timer::Stop(TH, "fft");
        double fftsz = 512;
        double Gflops = n_ffts*(5*fftsz*log2(fftsz))/(t*1e9f);
        double gflopsPCIe = n_ffts*(5*fftsz*log2(fftsz)) /
                ((transfer_time+t)*1e9f);
        resultDB.AddResult(name, sizeStr, "GFLOPS", Gflops);
        resultDB.AddResult(name+"_PCIe", sizeStr, "GFLOPS", gflopsPCIe);
        resultDB.AddResult(name+"_Parity", sizeStr, "N", transfer_time / t);

        // time ifft kernel
        TH = Timer::Start();
        inverse(work, n_ffts);
        t = Timer::Stop(TH, "ifft");
        Gflops = n_ffts*(5*fftsz*log2(fftsz))/(t*1e9f);
        gflopsPCIe = n_ffts*(5*fftsz*log2(fftsz)) /
                ((transfer_time+t)*1e9f);
        resultDB.AddResult(name+"-INV", sizeStr, "GFLOPS", Gflops);
        resultDB.AddResult(name+"-INV_PCIe", sizeStr, "GFLOPS", gflopsPCIe);
        resultDB.AddResult(name+"-INV_Parity", sizeStr, "N", transfer_time / t);

        // time check kernel
        int failed = check(work, chk, half_n_ffts, half_n_cmplx);
        cout << "pass " << k << ((failed) ? ": failed\n" : ": passed\n");
    }
    
    freeDeviceBuffer(work);
    freeDeviceBuffer(chk);
    freeHostBuffer(source);
    freeHostBuffer(result);
}
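
For reference, the flop model behind these numbers: a radix-2 FFT of length N is conventionally costed at 5*N*log2(N) flops, so each 512-point transform counts as 5 * 512 * 9 = 23040 flops. An illustrative sketch of the rate computation (mirrors the Gflops expression above; not part of the benchmark):

// Illustrative only. For fftsz = 512: 5 * 512 * log2(512) = 23040 flops
// per transform; dividing total flops by seconds*1e9 yields GFLOPS.
double fftGflops(int n_ffts, double t_seconds)
{
    const double fftsz = 512.0;
    const double flopsPerFFT = 5.0 * fftsz * 9.0;   // log2(512) == 9
    return n_ffts * flopsPerFFT / (t_seconds * 1e9);
}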
Example #18
//  Modifications:
//    Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010
//    Split timing reports into detailed and summary.  For
//    serial code, we report all trial values.
//
void
SerialStencilTimingReporter::ReportTimings( ResultDatabase& resultDB ) const
{
    resultDB.DumpDetailed( std::cout );
}
Example #19
void csrTest(cl_device_id dev, cl_context ctx, string compileFlags,
             cl_command_queue queue, ResultDatabase& resultDB, OptionParser& op,
             floatType* h_val, int* h_cols, int* h_rowDelimiters,
             floatType* h_vec, floatType* h_out, int numRows, int numNonZeroes,
             floatType* refOut, bool padded, const size_t maxImgWidth)
{
    if (devSupportsImages)
    {
        char texflags[64];
        sprintf(texflags, " -DUSE_TEXTURE -DMAX_IMG_WIDTH=%zu", maxImgWidth);
        compileFlags+=string(texflags);
    }
    // Set up OpenCL Program Object
    int err = 0;

    cl_program prog = clCreateProgramWithSource(ctx, 1, &cl_source_spmv, NULL,
            &err);
    CL_CHECK_ERROR(err);

    // Build the openCL kernels
    err = clBuildProgram(prog, 1, &dev, compileFlags.c_str(), NULL, NULL);
    // CL_CHECK_ERROR(err);  // if we check and fail here, we never get to see
                            // the OpenCL compiler's build log

    // If there is a build error, print the output and return
    if (err != CL_SUCCESS)
    {
        char log[5000];
        size_t retsize = 0;
        err = clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 5000
                * sizeof(char), log, &retsize);
        CL_CHECK_ERROR(err);
        cout << "Retsize: " << retsize << endl;
        cout << "Log: " << log << endl;
        return;
    }

      // Device data structures
      cl_mem d_val, d_vec, d_out;
      cl_mem d_cols, d_rowDelimiters;

      // Allocate device memory
      d_val = clCreateBuffer(ctx, CL_MEM_READ_WRITE, numNonZeroes *
          sizeof(clFloatType), NULL, &err);
      CL_CHECK_ERROR(err);
      d_cols = clCreateBuffer(ctx, CL_MEM_READ_WRITE, numNonZeroes *
          sizeof(cl_int), NULL, &err);
      CL_CHECK_ERROR(err);
      int imgHeight = 0;
      if (devSupportsImages)
      {
          imgHeight=(numRows+maxImgWidth-1)/maxImgWidth;
          cl_image_format fmt; fmt.image_channel_data_type=CL_FLOAT;
          if(sizeof(floatType)==4)
              fmt.image_channel_order=CL_R;
          else
              fmt.image_channel_order=CL_RG;
          d_vec = clCreateImage2D( ctx, CL_MEM_READ_ONLY, &fmt, maxImgWidth,
              imgHeight, 0, NULL, &err);
          CL_CHECK_ERROR(err);
      } else {
          d_vec = clCreateBuffer(ctx, CL_MEM_READ_WRITE, numRows *
              sizeof(clFloatType), NULL, &err);
          CL_CHECK_ERROR(err);
      }
      d_out = clCreateBuffer(ctx, CL_MEM_READ_WRITE, numRows *
          sizeof(clFloatType), NULL, &err);
      CL_CHECK_ERROR(err);
      d_rowDelimiters = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (numRows+1) *
          sizeof(cl_int), NULL, &err);
      CL_CHECK_ERROR(err);

      // Setup events for timing
      Event valTransfer("transfer Val data over PCIe bus");
      Event colsTransfer("transfer cols data over PCIe bus");
      Event vecTransfer("transfer vec data over PCIe bus");
      Event rowDelimitersTransfer("transfer rowDelimiters data over PCIe bus");

      // Transfer data to device
      err = clEnqueueWriteBuffer(queue, d_val, true, 0, numNonZeroes *
          sizeof(floatType), h_val, 0, NULL, &valTransfer.CLEvent());
      CL_CHECK_ERROR(err);
      err = clEnqueueWriteBuffer(queue, d_cols, true, 0, numNonZeroes *
          sizeof(int), h_cols, 0, NULL, &colsTransfer.CLEvent());
      CL_CHECK_ERROR(err);

      if (devSupportsImages)
      {
          size_t offset[3]={0};
          size_t size[3]={maxImgWidth,(size_t)imgHeight,1};
          err = clEnqueueWriteImage(queue,d_vec, true, offset, size,
              0, 0, h_vec, 0, NULL, &vecTransfer.CLEvent());
          CL_CHECK_ERROR(err);
      } else
      {
          err = clEnqueueWriteBuffer(queue, d_vec, true, 0, numRows *
              sizeof(floatType), h_vec, 0, NULL, &vecTransfer.CLEvent());
          CL_CHECK_ERROR(err);
      }

      err = clEnqueueWriteBuffer(queue, d_rowDelimiters, true, 0, (numRows+1) *
          sizeof(int), h_rowDelimiters, 0, NULL,
          &rowDelimitersTransfer.CLEvent());
      CL_CHECK_ERROR(err);
      err = clFinish(queue);
      CL_CHECK_ERROR(err);

      valTransfer.FillTimingInfo();
      colsTransfer.FillTimingInfo();
      vecTransfer.FillTimingInfo();
      rowDelimitersTransfer.FillTimingInfo();

      double iTransferTime = valTransfer.StartEndRuntime() +
                            colsTransfer.StartEndRuntime() +
                             vecTransfer.StartEndRuntime() +
                   rowDelimitersTransfer.StartEndRuntime();

      int passes = op.getOptionInt("passes");
      int iters  = op.getOptionInt("iterations");

      // Results description info
      char atts[TEMP_BUFFER_SIZE];
      sprintf(atts, "%d_elements_%d_rows", numNonZeroes, numRows);
      string prefix = "";
      prefix += (padded) ? "Padded_" : "";
      double gflop = 2 * (double) numNonZeroes;
      cout << "CSR Scalar Kernel\n";
      Event kernelExec("kernel Execution");

      // Set up CSR Kernels
      cl_kernel csrScalar, csrVector;
      csrScalar  = clCreateKernel(prog, "spmv_csr_scalar_kernel", &err);
      CL_CHECK_ERROR(err);
      err = clSetKernelArg(csrScalar, 0, sizeof(cl_mem), (void*) &d_val);
      CL_CHECK_ERROR(err);
      err = clSetKernelArg(csrScalar, 1, sizeof(cl_mem), (void*) &d_vec);
      CL_CHECK_ERROR(err);
      err = clSetKernelArg(csrScalar, 2, sizeof(cl_mem), (void*) &d_cols);
      CL_CHECK_ERROR(err);
      err = clSetKernelArg(csrScalar, 3, sizeof(cl_mem),
          (void*) &d_rowDelimiters);
      CL_CHECK_ERROR(err);
      err = clSetKernelArg(csrScalar, 4, sizeof(cl_int), (void*) &numRows);
      CL_CHECK_ERROR(err);
      err = clSetKernelArg(csrScalar, 5, sizeof(cl_mem), (void*) &d_out);
      CL_CHECK_ERROR(err);

      csrVector = clCreateKernel(prog, "spmv_csr_vector_kernel", &err);

      CL_CHECK_ERROR(err);
      err = clSetKernelArg(csrVector, 0, sizeof(cl_mem), (void*) &d_val);
      CL_CHECK_ERROR(err);
      err = clSetKernelArg(csrVector, 1, sizeof(cl_mem), (void*) &d_vec);
      CL_CHECK_ERROR(err);
      err = clSetKernelArg(csrVector, 2, sizeof(cl_mem), (void*) &d_cols);
      CL_CHECK_ERROR(err);
      err = clSetKernelArg(csrVector, 3, sizeof(cl_mem),
          (void*) &d_rowDelimiters);
      CL_CHECK_ERROR(err);
      err = clSetKernelArg(csrVector, 4, sizeof(cl_int), (void*) &numRows);
      CL_CHECK_ERROR(err);
      err = clSetKernelArg(csrVector, 5, sizeof(cl_mem), (void*) &d_out);
      CL_CHECK_ERROR(err);

      // Append correct suffix to resultsDB entry
      string suffix;
      if (sizeof(floatType) == sizeof(float))
      {
          suffix = "-SP";
      }
      else
      {
          suffix = "-DP";
      }

      const size_t scalarGlobalWSize = numRows;
      size_t localWorkSize = BLOCK_SIZE;

      for (int k = 0; k < passes; k++)
      {
          double scalarKernelTime = 0.0;
          // Run Scalar Kernel
          for (int j = 0; j < iters; j++)
          {
              err = clEnqueueNDRangeKernel(queue, csrScalar, 1, NULL,
                   &scalarGlobalWSize, &localWorkSize, 0, NULL,
                   &kernelExec.CLEvent());
              CL_CHECK_ERROR(err);
              err = clFinish(queue);
              CL_CHECK_ERROR(err);
              kernelExec.FillTimingInfo();
              scalarKernelTime += kernelExec.StartEndRuntime();
          }

          // Transfer data back to host
          Event outTransfer("d->h data transfer");
          err = clEnqueueReadBuffer(queue, d_out, true, 0, numRows *
              sizeof(floatType), h_out, 0, NULL, &outTransfer.CLEvent());
          CL_CHECK_ERROR(err);
          err = clFinish(queue);
          CL_CHECK_ERROR(err);
          outTransfer.FillTimingInfo();
          double oTransferTime = outTransfer.StartEndRuntime();

          // Compare reference solution to GPU result
          if (! verifyResults(refOut, h_out, numRows, k))
          {
               return;  // If results don't match, don't report performance
          }
          scalarKernelTime = scalarKernelTime / (double)iters;
          string testName = prefix+"CSR-Scalar"+suffix;
          double totalTransfer = iTransferTime + oTransferTime;
          resultDB.AddResult(testName, atts, "Gflop/s",
              gflop/(scalarKernelTime));
          resultDB.AddResult(testName+"_PCIe", atts, "Gflop/s",
              gflop / (scalarKernelTime+totalTransfer));
      }

      // Clobber correct answer, so we can be sure the vector kernel is correct
      err = clEnqueueWriteBuffer(queue, d_out, true, 0, numRows *
          sizeof(floatType), h_vec, 0, NULL, NULL);
      CL_CHECK_ERROR(err);

      cout << "CSR Vector Kernel\n";
      // Verify Local work group size
      size_t maxLocal = getMaxWorkGroupSize(ctx, csrVector);
      if (maxLocal < 32)
      {
         cout << "Warning: CSRVector requires a work group size >= 32" << endl;
         cout << "Skipping this kernel." << endl;
         err = clReleaseMemObject(d_rowDelimiters);
         CL_CHECK_ERROR(err);
         err = clReleaseMemObject(d_vec);
         CL_CHECK_ERROR(err);
         err = clReleaseMemObject(d_out);
         CL_CHECK_ERROR(err);
         err = clReleaseMemObject(d_val);
         CL_CHECK_ERROR(err);
         err = clReleaseMemObject(d_cols);
         CL_CHECK_ERROR(err);
         err = clReleaseKernel(csrScalar);
         CL_CHECK_ERROR(err);
         err = clReleaseKernel(csrVector);
         CL_CHECK_ERROR(err);
         err = clReleaseProgram(prog);
         CL_CHECK_ERROR(err);
         return;
      }
      localWorkSize = VECTOR_SIZE;
      while (localWorkSize+VECTOR_SIZE <= maxLocal &&
          localWorkSize+VECTOR_SIZE <= BLOCK_SIZE)
      {
         localWorkSize += VECTOR_SIZE;
      }
      const size_t vectorGlobalWSize = numRows * VECTOR_SIZE; // 1 warp per row

      for (int k = 0; k < passes; k++)
      {
          // Run Vector Kernel
          double vectorKernelTime = 0.0;
          for (int j = 0; j < iters; j++)
          {
             err = clEnqueueNDRangeKernel(queue, csrVector, 1, NULL,
                  &vectorGlobalWSize, &localWorkSize, 0, NULL,
                  &kernelExec.CLEvent());
             CL_CHECK_ERROR(err);
             err = clFinish(queue);
             CL_CHECK_ERROR(err);
             kernelExec.FillTimingInfo();
             vectorKernelTime += kernelExec.StartEndRuntime();
          }

         Event outTransfer("d->h data transfer");
         err = clEnqueueReadBuffer(queue, d_out, true, 0, numRows *
             sizeof(floatType), h_out, 0, NULL, &outTransfer.CLEvent());
         CL_CHECK_ERROR(err);
         err = clFinish(queue);
         CL_CHECK_ERROR(err);
         outTransfer.FillTimingInfo();
         double oTransferTime = outTransfer.StartEndRuntime();

          // Compare reference solution to GPU result
          if (! verifyResults(refOut, h_out, numRows, k))
          {
              return;  // If results don't match, don't report performance
          }
          vectorKernelTime = vectorKernelTime / (double)iters;
          string testName = prefix+"CSR-Vector"+suffix;
          double totalTransfer = iTransferTime + oTransferTime;
          resultDB.AddResult(testName, atts, "Gflop/s", gflop/vectorKernelTime);
          resultDB.AddResult(testName+"_PCIe", atts, "Gflop/s",
                            gflop/(vectorKernelTime+totalTransfer));
      }

      // Free device memory
      err = clReleaseMemObject(d_rowDelimiters);
      CL_CHECK_ERROR(err);
      err = clReleaseMemObject(d_vec);
      CL_CHECK_ERROR(err);
      err = clReleaseMemObject(d_out);
      CL_CHECK_ERROR(err);
      err = clReleaseMemObject(d_val);
      CL_CHECK_ERROR(err);
      err = clReleaseMemObject(d_cols);
      CL_CHECK_ERROR(err);
      err = clReleaseKernel(csrScalar);
      CL_CHECK_ERROR(err);
      err = clReleaseKernel(csrVector);
      CL_CHECK_ERROR(err);
      err = clReleaseProgram(prog);
      CL_CHECK_ERROR(err);
}
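
The kernels above consume a matrix in compressed sparse row (CSR) form via h_val, h_cols, and h_rowDelimiters. An illustration of the layout (not from the benchmark itself), for the 3x3 matrix with rows [10 0 20], [0 30 0], [40 0 50]:

float val[]           = { 10, 20, 30, 40, 50 };  // nonzeros, row by row
int   cols[]          = {  0,  2,  1,  0,  2 };  // column of each nonzero
int   rowDelimiters[] = {  0,  2,  3,  5 };      // row i spans [rd[i], rd[i+1])
// y = A*x for row i is then:
//   for (int j = rowDelimiters[i]; j < rowDelimiters[i+1]; j++)
//       y[i] += val[j] * x[cols[j]];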
Example #20
void runTest(const string& testName, cl_device_id dev, cl_context ctx,
        cl_command_queue queue, ResultDatabase& resultDB, OptionParser& op,
        const string& compileFlags)
{
    int err = 0;

    // Program Setup
    cl_program prog = clCreateProgramWithSource(ctx, 
                                                1, 
                                                &cl_source_sort,
                                                NULL,
                                                &err);
    CL_CHECK_ERROR(err);

    // Before proceeding, make sure the kernel code compiles and
    // all kernels are valid.
    cout << "Compiling sort kernels." << endl;
    err = clBuildProgram(prog, 1, &dev, compileFlags.c_str(), NULL, NULL);
    // No CL_CHECK_ERROR here; if the build failed we still want to see
    // the compiler's build log below.

    if (err != CL_SUCCESS)
    {
        char log[5000];
        size_t retsize = 0;
        err = clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 5000
                * sizeof(char), log, &retsize);

        CL_CHECK_ERROR(err);
        cout << "Build error." << endl;
        cout << "Retsize: " << retsize << endl;
        cout << "Log: " << log << endl;
        return;
    }

    // Extract out the 3 kernels
    // Note that these kernels are analogs of those in use for
    // scan, but have had "visiting" logic added to them
    // as described by Merrill et al. See
    // http://www.cs.virginia.edu/~dgm4d/
    cl_kernel reduce = clCreateKernel(prog, "reduce", &err);
    CL_CHECK_ERROR(err);

    cl_kernel top_scan = clCreateKernel(prog, "top_scan", &err);
    CL_CHECK_ERROR(err);
    
    cl_kernel bottom_scan = clCreateKernel(prog, "bottom_scan", &err);
    CL_CHECK_ERROR(err);

    // If the device doesn't support at least 256 work items in a
    // group, use a different kernel (TODO)
    if (getMaxWorkGroupSize(dev) < 256)
    {
        cout << "Scan requires work group size of at least 256" << endl;
        char atts[1024] = "GSize_Not_Supported";
        // resultDB requires neg entry for every possible result
        int passes = op.getOptionInt("passes");
        for (int k = 0; k < passes; k++) {
            resultDB.AddResult(testName , atts, "GB/s", FLT_MAX);
            resultDB.AddResult(testName+"_PCIe" , atts, "GB/s", FLT_MAX);
            resultDB.AddResult(testName+"_Parity" , atts, "GB/s", FLT_MAX);
        }
        return;
    }

    // Problem Sizes
    int probSizes[4] = { 1, 8, 32, 64 };
    int size = probSizes[op.getOptionInt("size")-1];

    // Convert MiB to number of elements
    size = (size * 1024 * 1024) / sizeof(T);

    // Create input data on CPU
    unsigned int bytes = size * sizeof(T);

    // Allocate pinned host memory for input data (h_idata)
    cl_mem h_i = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
            bytes, NULL, &err);
    CL_CHECK_ERROR(err);
    T* h_idata = (T*)clEnqueueMapBuffer(queue, h_i, true,
            CL_MAP_READ|CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, &err);
    CL_CHECK_ERROR(err);
    
    // Allocate pinned host memory for output data (h_odata)
    cl_mem h_o = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
            bytes, NULL, &err);
    CL_CHECK_ERROR(err);
    T* h_odata = (T*)clEnqueueMapBuffer(queue, h_o, true,
            CL_MAP_READ|CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, &err);
    CL_CHECK_ERROR(err);

    // Initialize host memory
    cout << "Initializing host memory." << endl;
    for (int i = 0; i < size; i++)
    {
        h_idata[i] = i % 16; // Fill with some pattern
        h_odata[i] = -1;
    }

    // The radix width in bits
    const int radix_width = 4; // Changing this requires major kernel updates
    const int num_digits = 1 << radix_width; // 2^4 = 16 possible digits

    // Allocate device memory for input array
    cl_mem d_idata = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bytes, NULL, &err);
    CL_CHECK_ERROR(err);
    
    // Allocate device memory for output array
    cl_mem d_odata = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bytes, NULL, &err);
    CL_CHECK_ERROR(err);

    // Number of local work items per group
    const size_t local_wsize  = 256;
    
    // Number of global work items
    const size_t global_wsize = 16384; // i.e. 64 work groups
    const size_t num_work_groups = global_wsize / local_wsize;

    // Allocate device memory for local work group intermediate sums
    cl_mem d_isums = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
            num_work_groups * num_digits * sizeof(T), NULL, &err);
    CL_CHECK_ERROR(err);

    // Set the kernel arguments for the reduction kernel
    err = clSetKernelArg(reduce, 0, sizeof(cl_mem), (void*)&d_idata);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(reduce, 1, sizeof(cl_mem), (void*)&d_isums);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(reduce, 2, sizeof(cl_int), (void*)&size);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(reduce, 3, local_wsize * sizeof(T), NULL);
    CL_CHECK_ERROR(err);
                        
    // Set the kernel arguments for the top-level scan
    err = clSetKernelArg(top_scan, 0, sizeof(cl_mem), (void*)&d_isums);
    CL_CHECK_ERROR(err);
    cl_int n_work_groups = (cl_int)num_work_groups;  // kernel expects cl_int
    err = clSetKernelArg(top_scan, 1, sizeof(cl_int), (void*)&n_work_groups);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(top_scan, 2, local_wsize * 2 * sizeof(T), NULL);
    CL_CHECK_ERROR(err);

    // Set the kernel arguments for the bottom-level scan
    err = clSetKernelArg(bottom_scan, 0, sizeof(cl_mem), (void*)&d_idata);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(bottom_scan, 1, sizeof(cl_mem), (void*)&d_isums);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(bottom_scan, 2, sizeof(cl_mem), (void*)&d_odata);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(bottom_scan, 3, sizeof(cl_int), (void*)&size);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(bottom_scan, 4, local_wsize * 2 * sizeof(T), NULL);
    CL_CHECK_ERROR(err);
    
    // Copy data to GPU
    cout << "Copying input data to device." << endl;
    Event evTransfer("PCIe transfer");
    err = clEnqueueWriteBuffer(queue, d_idata, true, 0, bytes, h_idata, 0,
            NULL, &evTransfer.CLEvent());
    CL_CHECK_ERROR(err);
    err = clFinish(queue);
    CL_CHECK_ERROR(err);
    evTransfer.FillTimingInfo();
    double inTransferTime = evTransfer.StartEndRuntime();

    // Repeat the test multiple times to get a good measurement
    int passes = op.getOptionInt("passes");

    cout << "Running benchmark with size " << size << endl;
    for (int k = 0; k < passes; k++)
    {
        int th = Timer::Start();
        // Assuming an 8 bit byte.
        for (int shift = 0; shift < (int)(sizeof(T)*8); shift += radix_width)
        {
            // Like scan, we use a reduce-then-scan approach
            
            // But before proceeding, update the shift appropriately
            // for each kernel. This is how many bits to shift to the
            // right used in binning.
            err = clSetKernelArg(reduce, 4, sizeof(cl_int), (void*)&shift);
            CL_CHECK_ERROR(err);

            err = clSetKernelArg(bottom_scan, 5, sizeof(cl_int), (void*)&shift);
            CL_CHECK_ERROR(err);
            
            // Also, the sort is not in place, so swap the input and output
            // buffers on each pass.
            bool even = ((shift / radix_width) % 2 == 0);
            
            if (even)
            {
                // Set the kernel arguments for the reduction kernel
                err = clSetKernelArg(reduce, 0, sizeof(cl_mem), 
                        (void*)&d_idata);
                CL_CHECK_ERROR(err);
                // Set the kernel arguments for the bottom-level scan
                err = clSetKernelArg(bottom_scan, 0, sizeof(cl_mem), 
                        (void*)&d_idata);
                CL_CHECK_ERROR(err);
                err = clSetKernelArg(bottom_scan, 2, sizeof(cl_mem), 
                        (void*)&d_odata);
                CL_CHECK_ERROR(err);
            } 
            else // i.e. odd pass
            {
                // Set the kernel arguments for the reduction kernel
                err = clSetKernelArg(reduce, 0, sizeof(cl_mem), 
                        (void*)&d_odata);
                CL_CHECK_ERROR(err);
                // Set the kernel arguments for the bottom-level scan
                err = clSetKernelArg(bottom_scan, 0, sizeof(cl_mem), 
                        (void*)&d_odata);
                CL_CHECK_ERROR(err);
                err = clSetKernelArg(bottom_scan, 2, sizeof(cl_mem), 
                        (void*)&d_idata);
                CL_CHECK_ERROR(err);
            }

            // Each thread block gets an equal portion of the
            // input array, and computes occurrences of each digit.
            err = clEnqueueNDRangeKernel(queue, reduce, 1, NULL,
                        &global_wsize, &local_wsize, 0, NULL, NULL);
            CL_CHECK_ERROR(err);

            // Next, a top-level exclusive scan is performed on the
            // per block histograms.  This is done by a single
            // work group (note global size here is the same as local).
            err = clEnqueueNDRangeKernel(queue, top_scan, 1, NULL,
                        &local_wsize, &local_wsize, 0, NULL, NULL);
            CL_CHECK_ERROR(err);

            // Finally, a bottom-level scan is performed by each block
            // that is seeded with the scanned histograms which rebins,
            // locally scans, then scatters keys to global memory
            err = clEnqueueNDRangeKernel(queue, bottom_scan, 1, NULL,
                        &global_wsize, &local_wsize, 0, NULL, NULL);
            CL_CHECK_ERROR(err);
        }
        err = clFinish(queue);
        CL_CHECK_ERROR(err);
        double total_sort = Timer::Stop(th, "total sort time");

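        // Note: sizeof(T)*8 / radix_width passes were made; for a 4-bit
        // radix on whole-byte types that count is even, so the final
        // sorted keys end up back in d_idata (not d_odata).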
        err = clEnqueueReadBuffer(queue, d_idata, true, 0, bytes, h_odata,
                0, NULL, &evTransfer.CLEvent());
        CL_CHECK_ERROR(err);
        err = clFinish(queue);
        CL_CHECK_ERROR(err);
        evTransfer.FillTimingInfo();
        double totalTransfer = inTransferTime + evTransfer.StartEndRuntime();
        totalTransfer /= 1.e9; // Convert to seconds

        // If answer is incorrect, stop test and do not report performance
        if (! verifySort(h_odata, size))
        {
            return;
        }
        
        char atts[1024];
        double avgTime = total_sort;
        double gbs = (double) (size * sizeof(T)) / (1000. * 1000. * 1000.);
        sprintf(atts, "%d_items", size);
        resultDB.AddResult(testName, atts, "GB/s", gbs / (avgTime));
        resultDB.AddResult(testName+"_PCIe", atts, "GB/s",
                gbs / (avgTime + totalTransfer));
        resultDB.AddResult(testName+"_Parity", atts, "N",
                totalTransfer / avgTime);
    }

    // Clean up device memory
    err = clReleaseMemObject(d_idata);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(d_odata);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(d_isums);
    CL_CHECK_ERROR(err);

    // Clean up pinned host memory
    err = clEnqueueUnmapMemObject(queue, h_i, h_idata, 0, NULL, NULL);
    CL_CHECK_ERROR(err);
    err = clEnqueueUnmapMemObject(queue, h_o, h_odata, 0, NULL, NULL);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(h_i);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(h_o);
    CL_CHECK_ERROR(err);
    
    // Clean up program and kernel objects
    err = clReleaseProgram(prog);
    CL_CHECK_ERROR(err);
    err = clReleaseKernel(reduce);
    CL_CHECK_ERROR(err);
    err = clReleaseKernel(top_scan);
    CL_CHECK_ERROR(err);
    err = clReleaseKernel(bottom_scan);
    CL_CHECK_ERROR(err);
}
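
// A minimal sketch of the kind of check verifySort() above performs -- an
// assumption for illustration, not the SHOC implementation, which may also
// report where the first out-of-order element occurs:
template <class T>
bool verifySortSketch(const T* data, const size_t size)
{
    for (size_t i = 1; i < size; i++)
    {
        if (data[i - 1] > data[i])
        {
            return false; // found an out-of-order pair
        }
    }
    return true; // keys are in non-decreasing order
}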
Example No. 21
0
void runTest(const string& testName, cl_device_id dev, cl_context ctx,
        cl_command_queue queue, ResultDatabase& resultDB, OptionParser& op,
        string compileFlags)
{
    // Problem Parameters
    const int probSizes[4] = { 12288, 24576, 36864, 73728 };
    int sizeClass = op.getOptionInt("size");
    assert(sizeClass >= 1 && sizeClass <= 4);
    int nAtom = probSizes[sizeClass - 1];

    // Allocate problem data on host
    cl_mem h_pos, h_force, h_neigh;
    posVecType*   position;
    forceVecType* force;
    int* neighborList;
    int passes = op.getOptionInt("passes");
    int iter   = op.getOptionInt("iterations");

    // Allocate and map pinned host memory
    int err = 0;
    // Position
    h_pos = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
            sizeof(posVecType)*nAtom, NULL, &err);
    CL_CHECK_ERROR(err);
    position = (posVecType*)clEnqueueMapBuffer(queue, h_pos, true,
            CL_MAP_READ|CL_MAP_WRITE, 0, sizeof(posVecType)*nAtom , 0,
            NULL, NULL, &err);
    CL_CHECK_ERROR(err);
    // Force
    h_force = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
            sizeof(forceVecType)*nAtom, NULL, &err);
    CL_CHECK_ERROR(err);
    force = (forceVecType*)clEnqueueMapBuffer(queue, h_force, true,
            CL_MAP_READ|CL_MAP_WRITE, 0, sizeof(forceVecType)*nAtom , 0,
            NULL, NULL, &err);
    CL_CHECK_ERROR(err);
    // Neighbor List
    h_neigh = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
            sizeof(int) * nAtom * maxNeighbors, NULL, &err);
    CL_CHECK_ERROR(err);
    neighborList = (int*)clEnqueueMapBuffer(queue, h_neigh, true,
            CL_MAP_READ|CL_MAP_WRITE, 0, sizeof(int) * nAtom * maxNeighbors, 0,
            NULL, NULL, &err);
    CL_CHECK_ERROR(err);

    // Allocate device memory
    cl_mem d_force = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
            nAtom * sizeof(forceVecType), NULL, &err);
    CL_CHECK_ERROR(err);
    cl_mem d_position = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
            nAtom * sizeof(posVecType), NULL, &err);
    CL_CHECK_ERROR(err);
    // Allocate device memory neighbor list
    cl_mem d_neighborList = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
            maxNeighbors * nAtom * sizeof(int), NULL, &err);
    CL_CHECK_ERROR(err);

    size_t maxGroupSize = getMaxWorkGroupSize(dev);
    if (maxGroupSize < 128)
    {
        cout << "MD requires a work group size of at least 128" << endl;
        // Add special values to the results database
        char atts[1024];
        sprintf(atts, "GSize_Not_Supported");
        for (int i=0 ; i<passes ; ++i) {
            resultDB.AddResult(testName, atts, "GFLOPS", FLT_MAX);
            resultDB.AddResult(testName + "_PCIe", atts, "GFLOPS", FLT_MAX);
            resultDB.AddResult(testName+"-Bandwidth", atts, "GB/s", FLT_MAX);
            resultDB.AddResult(testName+"-Bandwidth_PCIe", atts, "GB/s", FLT_MAX);
            resultDB.AddResult(testName+"_Parity", atts, "N", FLT_MAX);
        }
        return;
    }
    size_t localSize  = 128;
    size_t globalSize = nAtom;

    cout << "Initializing test problem (this can take several "
            "minutes for large problems).\n                   ";

    // Seed random number generator
    srand48(8650341L);

    // Initialize positions -- random distribution in cubic domain
    for (int i = 0; i < nAtom; i++)
    {
        position[i].x = (drand48() * domainEdge);
        position[i].y = (drand48() * domainEdge);
        position[i].z = (drand48() * domainEdge);
    }

    // Copy position to GPU
    Event evTransfer("h->d transfer");
    err = clEnqueueWriteBuffer(queue, d_position, true, 0,
            nAtom * sizeof(posVecType), position, 0, NULL,
            &evTransfer.CLEvent());
    CL_CHECK_ERROR(err);
    err = clFinish(queue);
    CL_CHECK_ERROR(err);

    evTransfer.FillTimingInfo();
    long transferTime = evTransfer.StartEndRuntime();

    // Keep track of how many atoms are within the cutoff distance to
    // accurately calculate FLOPS later
    int totalPairs = buildNeighborList<T, posVecType>(nAtom, position,
            neighborList);

    cout << "Finished.\n";
    cout << totalPairs << " of " << nAtom*maxNeighbors <<
            " pairs within cutoff distance = " <<
            100.0 * ((double)totalPairs / (nAtom*maxNeighbors)) << " %" << endl;

    // Copy data to GPU
    err = clEnqueueWriteBuffer(queue, d_neighborList, true, 0,
            maxNeighbors * nAtom * sizeof(int), neighborList, 0, NULL,
            &evTransfer.CLEvent());
    CL_CHECK_ERROR(err);
    clFinish(queue);
    evTransfer.FillTimingInfo();
    transferTime += evTransfer.StartEndRuntime();

    // Build the OpenCL kernel
    cl_program prog = clCreateProgramWithSource(ctx, 1, &cl_source_md, NULL,
            &err);
    CL_CHECK_ERROR(err);

    err = clBuildProgram(prog, 1, &dev, compileFlags.c_str(), NULL, NULL);
    CL_CHECK_ERROR(err);

    // If there is a build error, print the output and return
    if (err != CL_SUCCESS)
    {
        char log[5000];
        size_t retsize = 0;
        err = clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG,
                sizeof(log), log, &retsize);
        CL_CHECK_ERROR(err);
        cout << "Retsize: " << retsize << endl;
        cout << "Log: " << log << endl;
        return;
    }

    // Extract out the kernels
    cl_kernel lj_kernel = clCreateKernel(prog, "compute_lj_force",
            &err);
    CL_CHECK_ERROR(err);

    T lj1_t = (T) lj1;
    T lj2_t = (T) lj2;
    T cutsq_t = (T) cutsq;

    // Set kernel arguments
    err = clSetKernelArg(lj_kernel, 0, sizeof(cl_mem),
            (void*) &d_force);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(lj_kernel, 1, sizeof(cl_mem),
            (void*) &d_position);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(lj_kernel, 2, sizeof(cl_int),
            (void*) &maxNeighbors);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(lj_kernel, 3, sizeof(cl_mem),
            (void*) &d_neighborList);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(lj_kernel, 4, sizeof(T),
            (void*) &cutsq_t);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(lj_kernel, 5, sizeof(T),
            (void*) &lj1_t);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(lj_kernel, 6, sizeof(T),
            (void*) &lj2_t);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(lj_kernel, 7, sizeof(cl_int),
            (void*) &nAtom);
    CL_CHECK_ERROR(err);

    Event evLJ("computeLJ");

    // Warm up the kernel and check correctness
    err = clEnqueueNDRangeKernel(queue, lj_kernel, 1, NULL, &globalSize,
            &localSize, 0, NULL, &evLJ.CLEvent());
    CL_CHECK_ERROR(err);
    err = clFinish(queue);
    CL_CHECK_ERROR(err);

    err = clEnqueueReadBuffer(queue, d_force, true, 0,
            nAtom * sizeof(forceVecType), force, 0, NULL,
            &evTransfer.CLEvent());
    CL_CHECK_ERROR(err);
    err = clFinish(queue);
    CL_CHECK_ERROR(err);
    evTransfer.FillTimingInfo();
    transferTime += evTransfer.StartEndRuntime();


    cout << "Performing Correctness Check (can take several minutes)\n";

    // If results are incorrect, skip the performance tests
    if (!checkResults<T, forceVecType, posVecType>(force, position,
            neighborList, nAtom))
    {
        return;
    }


    for (int i = 0; i < passes; i++)
    {
        double total_time = 0.0;
        for (int j = 0; j < iter; j++)
        {
            //Launch Kernels
            err = clEnqueueNDRangeKernel(queue, lj_kernel, 1, NULL,
                    &globalSize, &localSize, 0, NULL,
                    &evLJ.CLEvent());
            CL_CHECK_ERROR(err);
            err = clFinish(queue);
            CL_CHECK_ERROR(err);

            // Collect timing info from events
            evLJ.FillTimingInfo();
            total_time += evLJ.SubmitEndRuntime();
        }

        char atts[1024];
        long int nflops = (8 * nAtom * maxNeighbors) + (totalPairs * 13);
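        // The estimate above charges roughly 8 flops per neighbor-list
        // entry for the distance test (done for every entry) plus roughly
        // 13 flops for the force update on each pair that actually falls
        // within the cutoff (totalPairs of them).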
        sprintf(atts, "%d_atoms", nAtom);
        total_time /= (double) iter;
        resultDB.AddResult(testName, atts, "GFLOPS",
                ((double) nflops) / total_time);
        resultDB.AddResult(testName + "_PCIe", atts, "GFLOPS",
                        ((double) nflops) / (total_time + transferTime));

        long int numPairs = nAtom * maxNeighbors;
        long int nbytes = (3 * sizeof(T) * (1+numPairs)) + // position data
                          (3 * sizeof(T) * nAtom) + // force for each atom
                          (sizeof(int) * numPairs); // neighbor list
        double gbytes = (double)nbytes / (1000. * 1000. * 1000.);
        double seconds = total_time / 1.e9;
        resultDB.AddResult(testName+"-Bandwidth", atts, "GB/s", gbytes /
                        seconds);
        resultDB.AddResult(testName+"-Bandwidth_PCIe", atts, "GB/s", gbytes /
                           (seconds + (transferTime / 1.e9)));
        resultDB.AddResult(testName+"_Parity", atts, "N",
                                    (transferTime / 1.e9) / seconds);
    }

    // Clean up
    // Host memory
    err = clEnqueueUnmapMemObject(queue, h_pos,   position, 0, NULL, NULL);
    CL_CHECK_ERROR(err);
    err = clEnqueueUnmapMemObject(queue, h_force, force, 0, NULL, NULL);
    CL_CHECK_ERROR(err);
    err = clEnqueueUnmapMemObject(queue, h_neigh, neighborList, 0, NULL, NULL);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(h_pos);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(h_force);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(h_neigh);
    CL_CHECK_ERROR(err);

    // Program Objects
    err = clReleaseProgram(prog);
    CL_CHECK_ERROR(err);
    err = clReleaseKernel(lj_kernel);
    CL_CHECK_ERROR(err);

    // Device Memory
    err = clReleaseMemObject(d_force);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(d_position);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(d_neighborList);
    CL_CHECK_ERROR(err);

}
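
// A rough scalar sketch of the per-pair work the compute_lj_force kernel
// performs (an assumption for illustration, not the SHOC kernel source;
// the real kernel walks each work-item's neighbor list and accumulates
// the result into d_force):
template <class T, class posVec, class forceVec>
void ljPairSketch(const posVec& pi, const posVec& pj, forceVec& fi,
                  T cutsq, T lj1, T lj2)
{
    T delx = pi.x - pj.x;
    T dely = pi.y - pj.y;
    T delz = pi.z - pj.z;
    T r2 = delx * delx + dely * dely + delz * delz;
    if (r2 < cutsq) // only pairs inside the cutoff contribute flops
    {
        T r2inv = (T)1.0 / r2;
        T r6inv = r2inv * r2inv * r2inv;
        T f = r2inv * r6inv * (lj1 * r6inv - lj2); // LJ 12-6 force term
        fi.x += delx * f;
        fi.y += dely * f;
        fi.z += delz * f;
    }
}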
Example No. 22
0
void
DoTest( const char* timerDesc, ResultDatabase& resultDB, OptionParser& opts )
{
    StencilFactory<T>* stdStencilFactory = NULL;
    Stencil<T>* stdStencil = NULL;
    StencilFactory<T>* testStencilFactory = NULL;
    Stencil<T>* testStencil = NULL;

    //try
    {
        stdStencilFactory = new HostStencilFactory<T>;
        testStencilFactory = new MICStencilFactory<T>;
        assert( (stdStencilFactory != NULL) && (testStencilFactory != NULL) );

        // do a sanity check on option values
        CheckOptions( opts );
        stdStencilFactory->CheckOptions( opts );
        testStencilFactory->CheckOptions( opts );

        // extract and validate options
        std::vector<long long> arrayDims = opts.getOptionVecInt( "customSize" );
        if( arrayDims.size() != 2 )
        {
            cerr << "Dim size: " << arrayDims.size() << "\n";
            //throw InvalidArgValue( "all overall dimensions must be positive" );
        }
        if (arrayDims[0] == 0) // User has not specified a custom size
        {
            const int probSizes[4] = { 768, 1408, 2048, 4096 };
            int sizeClass = opts.getOptionInt("size");
            if (!(sizeClass >= 1 && sizeClass <= 4))
            {
                //throw InvalidArgValue( "Size class must be between 1-4" );
            }
            arrayDims[0] = arrayDims[1] = probSizes[sizeClass - 1];
        }

        long int seed = (long)opts.getOptionInt( "seed" );
        bool beVerbose = opts.getOptionBool( "verbose" );
        unsigned int nIters = (unsigned int)opts.getOptionInt( "num-iters" );
        double valErrThreshold = (double)opts.getOptionFloat( "val-threshold" );
        unsigned int nValErrsToPrint = (unsigned int)opts.getOptionInt( "val-print-limit" );

#if defined(PARALLEL)
        unsigned int haloWidth = (unsigned int)opts.getOptionInt( "iters-per-exchange" );
#else
        unsigned int haloWidth = 1;
#endif // defined(PARALLEL)

        float haloVal = (float)opts.getOptionFloat( "haloVal" );

        // build a description of this experiment
        std::ostringstream experimentDescriptionStr;
        experimentDescriptionStr 
            << nIters << ':'
            << arrayDims[0] << 'x' << arrayDims[1] << ':'
            << LROWS << 'x' << LCOLS;

        unsigned int nPasses = (unsigned int)opts.getOptionInt( "passes" );
        unsigned long npts = (arrayDims[0] + 2*haloWidth - 2) *
                             (arrayDims[1] + 2*haloWidth - 2);

        unsigned long nflops = npts * 11 * nIters;
        cout << "flops are = " << nflops << endl;

        // compute the expected result on the host
#if defined(PARALLEL)
        int cwrank;
        MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        std::cout << "\nPerforming stencil operation on host for later comparison with MIC output\n"
            << "Depending on host capabilities, this may take a while."
            << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)
        Matrix2D<T> exp( arrayDims[0] + 2*haloWidth, 
                            arrayDims[1] + 2*haloWidth );
        Initialize<T> init( seed,
                        haloWidth,
                        haloVal );	

        init( exp );
        if( beVerbose )
        {
            std::cout << "initial state:\n" << exp << std::endl;
        }
        stdStencil = stdStencilFactory->BuildStencil( opts );
        (*stdStencil)( exp, nIters );
        if( beVerbose )
        {
            std::cout << "expected result:\n" << exp << std::endl;
        }
	

        // compute the result on the MIC device
        Matrix2D<T> data( arrayDims[0] + 2*haloWidth, 
                            arrayDims[1] + 2*haloWidth );
        testStencil = testStencilFactory->BuildStencil( opts );

#if defined(PARALLEL)
        MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
        std::cout << "\nPerforming stencil operation on chosen device, " 
            << nPasses << " passes.\n"
            << "Depending on chosen device, this may take a while."
            << std::endl;
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)

#if !defined(PARALLEL)
        std::cout << "At the end of each pass the number of validation\nerrors observed will be printed to the standard output." 
            << std::endl;
#endif // !defined(PARALLEL)
	std::cout<<"Passes:"<<nPasses<<endl;
        for( unsigned int pass = 0; pass < nPasses; pass++ )
        {
            init( data );

            double start = curr_second();
            (*testStencil)( data, nIters );
            double elapsedTime = curr_second()-start;
            double gflops = (nflops / elapsedTime) / 1e9;
            resultDB.AddResult( timerDesc,
                                    experimentDescriptionStr.str(),
                                    "GFLOPS",
                                    gflops );
            if( beVerbose )
            {
                std::cout << "observed result, pass " << pass << ":\n" 
                    << data 
                    << std::endl;
            }

            // validate the result
#if defined(PARALLEL)
            //StencilValidater<T>* validater = new MPIStencilValidater<T>;
#else
            //StencilValidater<T>* validater = new SerialStencilValidater<T>;            
#endif // defined(PARALLEL)
            MICValidate(exp, data, valErrThreshold, nValErrsToPrint);
            /*validater->ValidateResult( exp,
                            data,
                            valErrThreshold,
                            nValErrsToPrint );*/
        }
    }
    /*
    catch( ... )
    {
        // clean up - abnormal termination
        // wish we didn't have to do this, but C++ exceptions do not 
        // support a try-catch-finally approach
        delete stdStencil;
        delete stdStencilFactory;
        delete testStencil;
        delete testStencilFactory;
        throw;
    }*/

    // clean up - normal termination
    delete stdStencil;
    delete stdStencilFactory;
    delete testStencil;
    delete testStencilFactory;
}
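
// A minimal host-side sketch of one iteration of the kind of 9-point
// stencil being validated above (an assumption for illustration; SHOC's
// Stencil<T> classes apply configurable center/cardinal/diagonal weights
// over a Matrix2D with a halo):
template <class T>
void stencilIterSketch(T* out, const T* in, int rows, int cols,
                       T wCenter, T wCardinal, T wDiagonal)
{
    for (int i = 1; i < rows - 1; i++)
    {
        for (int j = 1; j < cols - 1; j++)
        {
            const T* c = in + i * cols + j; // center point
            T card = c[-cols] + c[cols] + c[-1] + c[1];
            T diag = c[-cols - 1] + c[-cols + 1]
                   + c[cols - 1] + c[cols + 1];
            out[i * cols + j] = wCenter * c[0]
                              + wCardinal * card
                              + wDiagonal * diag;
        }
    }
}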
Example No. 23
0
void RunTest(ResultDatabase &resultDB, const int npasses, const int verbose,
        const int noPB, const float repeatF, ProgressBar &pb, 
        const char* precision, const int micdev)
{
    char sizeStr[128];
    static __declspec(target(mic)) T *hostMem;

    int realRepeats = (int)round(repeatF*20);
    if (realRepeats < 2) realRepeats = 2;

    // Allocate host memory
    int halfNumFloats = 1024*1024;
    int numFloats = 2*halfNumFloats;
    hostMem = (T*)_mm_malloc(sizeof(T)*numFloats,64);

    sprintf (sizeStr, "Size:%07d", numFloats);
    float t = 0.0f;
    double TH;
    double flopCount;
    double gflop;
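
    // Every sub-test below follows the same offload pattern: upload the
    // freshly initialized buffer once (free_if(0) keeps it resident on the
    // card), time only the nocopy offload that runs the compute kernel,
    // then copy the data back (alloc_if(0) reuses the resident buffer) so
    // CheckResults() can verify it on the host.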

    for (int pass=0 ; pass<npasses ; ++pass)
    {
        ////////// Add1 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            Add1_MIC<T>(numFloats,hostMem, realRepeats, 10.0);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * realRepeats * omp_get_num_threads();
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("Add1")+precision, sizeStr, "GFLOPS", gflop);
        
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// Add2 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            Add2_MIC<T>(numFloats,hostMem, realRepeats, 10.0);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * realRepeats * 120 * 2;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("Add2")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// Add4 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            Add4_MIC<T>(numFloats,hostMem, realRepeats, 10.0);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * realRepeats * 60 * 4;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("Add4")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// Add8 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            Add8_MIC<T>(numFloats,hostMem, realRepeats, 10.0);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * realRepeats * 80 * 3;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("Add8")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// Mul1 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            Mul1_MIC<T>(numFloats,hostMem, realRepeats, 1.01);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 2 * realRepeats * 200;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("Mul1")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// Mul2 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            Mul2_MIC<T>(numFloats,hostMem, realRepeats, 1.01);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 2 * realRepeats * 100 * 2;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("Mul2")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// Mul4 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            Mul4_MIC<T>(numFloats,hostMem, realRepeats, 1.01);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 2 * realRepeats * 50 * 4;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("Mul4")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// Mul8 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            Mul8_MIC<T>(numFloats,hostMem, realRepeats, 1.01);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 2 * realRepeats * 25 * 8;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("Mul8")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// MAdd1 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            MAdd1_MIC<T>(numFloats,hostMem, realRepeats, 10.0, 0.9899);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 2 * realRepeats * omp_get_num_threads() * 1;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("MAdd1")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// MAdd2 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            MAdd2_MIC<T>(numFloats,hostMem, realRepeats, 10.0, 0.9899);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 2 * realRepeats * 120 * 2;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("MAdd2")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// MAdd4 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            MAdd4_MIC<T>(numFloats,hostMem, realRepeats, 10.0, 0.9899);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 2 * realRepeats * 60 * 4;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("MAdd4")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// MAdd8 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            MAdd8_MIC<T>(numFloats,hostMem, realRepeats, 10.0, 0.9899);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 2 * realRepeats * 30 * 8;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("MAdd8")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// MulMAdd1 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            MulMAdd1_MIC<T>(numFloats,hostMem, realRepeats, 3.75, 0.355);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 3 * realRepeats * 160 * 1;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("MulMAdd1")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// MulMAdd2 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            MulMAdd2_MIC<T>(numFloats,hostMem, realRepeats, 3.75, 0.355);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 3 * realRepeats * 80 * 2;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("MulMAdd2")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// MulMAdd4 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            MulMAdd4_MIC<T>(numFloats,hostMem, realRepeats, 3.75, 0.355);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 3 * realRepeats * 40 * 4;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("MulMAdd4")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);

        ////////// MulMAdd8 //////////
        InitData<T>(hostMem,numFloats);
        #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0))
        {}
        TH = curr_second();
        #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem)
        {
            MulMAdd8_MIC<T>(numFloats,hostMem, realRepeats, 3.75, 0.355);
        }
        t = curr_second()-TH;
        flopCount = (double)numFloats * 3 * realRepeats * 20 * 8;
        gflop = flopCount / (double)(t*1e9);
        resultDB.AddResult(string("MulMAdd8")+precision, sizeStr, "GFLOPS", gflop);
        #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0))
        {}
        CheckResults<T>(hostMem,numFloats);
        pb.addItersDone();
        if (!verbose && !noPB)pb.Show(stdout);
    }
    _mm_free(hostMem);
}
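
// A rough sketch of the shape of the Add*/Mul*/MAdd*_MIC kernels timed
// above (an assumption for illustration, not the SHOC source): each
// element carries a long chain of arithmetic, repeated realRepeats times,
// spread across threads with OpenMP. The unroll depth (240 here) is
// hypothetical; the flopCount formulas above encode the real ones.
template <class T>
void Add1_Sketch(int n, T* data, int repeats, T v)
{
    #pragma omp parallel for
    for (int i = 0; i < n; i++)
    {
        T s = data[i];
        for (int r = 0; r < repeats; r++)
        {
            for (int k = 0; k < 240; k++) // dependent adds exercise the FPU
            {
                s = s + v;
            }
        }
        data[i] = s;
    }
}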
Example No. 24
0
void RunBenchmark(OptionParser &op, ResultDatabase &resultDB)
{
    const bool verbose = op.getOptionBool("verbose");

    // Sizes are in kb
    const int nSizes  = 17;
    int sizes[nSizes] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,
        32768, 65536};
    
    long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;

    // Create host memory
    hostMem = (float*)_mm_malloc(numMaxFloats*sizeof(float),ALIGN);

    if(hostMem==NULL)
    {
        cerr << "Couldn't allocate CPU memory! \n";
        cerr << "Test failed." << endl;
        return;
    }
    // Initialize memory with some pattern.
    for (int i = 0; i < numMaxFloats; i++)
    {
        hostMem[i] = i % 77;
    }

    const unsigned int passes = op.getOptionInt("passes");
    int micdev = op.getOptionInt("target");

    // Allocate memory on the card
    #pragma offload target(mic:micdev) \
    nocopy(hostMem:length(numMaxFloats) alloc_if(1) free_if(0) align(ALIGN) ) 
    {
    }
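
    // alloc_if(1) free_if(0) allocates the buffer on the card once and
    // keeps it resident across the offloads below, so each timed readback
    // measures only the PCIe transfer, not allocation.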

    // Run the requested number of passes over all sizes
    for (int pass = 0; pass < passes; pass++)
    {
        // Step through sizes forward on even passes and backward on odd
        for (int i = 0; i < nSizes; i++)
        {
            int sizeIndex;
            if ((pass % 2) == 0)
            {
                sizeIndex = i;
            }
            else
            {
                sizeIndex = (nSizes - 1) - i;
            }

            int nbytes = sizes[sizeIndex] * 1024;

            //  D->H test
            double start = curr_second();

            #pragma offload target(mic:micdev) \
            out(hostMem:length((1024*sizes[sizeIndex]/4)) \
            free_if(0) alloc_if(0)  )
            {
            }
            double t = curr_second()-start;

            if (verbose)
            {
                cerr << "Size " << sizes[sizeIndex] << "k took " << t <<
                    " sec\n";
            }

            double speed = (double(sizes[sizeIndex]) * 1024 / 
                    (1000. * 1000. * 1000.)) / t;
            char sizeStr[256];
            sprintf(sizeStr, "% 6dkB", sizes[sizeIndex]);
            resultDB.AddResult("ReadbackSpeed", sizeStr, "GB/sec", speed);
            resultDB.AddResult("ReadbackTime", sizeStr, "ms", t*1000);
        }
    }
    // Free memory allocated on the mic
    #pragma offload target(mic:micdev) \
    in(hostMem:length(numMaxFloats) alloc_if(0)  )
    {
    }

    // Cleanup
    _mm_free(hostMem);
}
Example No. 25
0
void runTest(const string& testName, cl_device_id dev, cl_context ctx,
        cl_command_queue queue, ResultDatabase& resultDB, OptionParser& op,
        const string& compileFlags)
{
    // Collect basic MPI information
    int mpi_size, mpi_rank;
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);

    int err;
    int waitForEvents = 1;

    // Program Setup
    cl_program prog = clCreateProgramWithSource(ctx, 1,
                            &cl_source_reduction, NULL, &err);
    CL_CHECK_ERROR(err);
    if (mpi_rank == 0)
    {
        cout << "Compiling reduction kernel." << endl;
    }

    err = clBuildProgram(prog, 1, &dev, compileFlags.c_str(), NULL, NULL);
    CL_CHECK_ERROR(err);

    if (err != 0)
    {
        char log[5000];
        size_t retsize = 0;
        err =  clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG,
                5000*sizeof(char), log, &retsize);

        CL_CHECK_ERROR(err);
        cout << "Build error." << endl;
        cout << "Retsize: " << retsize << endl;
        cout << "Log: " << log << endl;
        return;
    }

    // Extract out the kernels
    cl_kernel reduce = clCreateKernel(prog, "reduce", &err);
    CL_CHECK_ERROR(err);

    cl_kernel cpureduce = clCreateKernel(prog, "reduceNoLocal", &err);
    CL_CHECK_ERROR(err);

    size_t localWorkSize = 256;
    bool nolocal = false;
    if (getMaxWorkGroupSize(ctx, reduce) == 1) 
    {
        nolocal = true;
        localWorkSize = 1;
    }

    int probSizes[4] = { 1, 8, 64, 128 };

    int size = probSizes[op.getOptionInt("size")-1];
    size = (size * 1024 * 1024) / sizeof(T);

    unsigned int bytes = size * sizeof(T);

    // Allocate pinned host memory for input data
    cl_mem h_i = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
            bytes, NULL, &err);
    CL_CHECK_ERROR(err);
    T* h_idata = (T*)clEnqueueMapBuffer(queue, h_i, true,
            CL_MAP_READ|CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, &err);
    CL_CHECK_ERROR(err);

    // Initialize host memory
    if (mpi_rank == 0)
    {
        cout << "Initializing host memory." << endl;
    }

    for(int i=0; i<size; i++)
    {
        h_idata[i] = i % 2; //Fill with some pattern
    }

    // Allocate device memory for input data
    cl_mem d_idata = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bytes,
            NULL, &err);
    CL_CHECK_ERROR(err);

    int num_blocks;
    if (!nolocal)
    {
        num_blocks = 64;
    }
    else
    {
        num_blocks = 1; // NB: This should only be the case on Apple's CPU
                        // implementation, which is quite restrictive on
                        // work group sizes.
    }

    // Allocate host memory for output
    cl_mem h_o = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
            sizeof(T)*num_blocks, NULL, &err);
    CL_CHECK_ERROR(err);
    T* h_odata = (T*)clEnqueueMapBuffer(queue, h_o, true,
            CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(T) * num_blocks , 0, NULL, NULL,
            &err);
    CL_CHECK_ERROR(err);

    // Allocate device memory for output
    cl_mem d_odata = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
            num_blocks * sizeof(T), NULL, &err);
    CL_CHECK_ERROR(err);

    // Copy data to GPU
    Event evTransfer("PCIe Transfer");
    err = clEnqueueWriteBuffer(queue, d_idata, true, 0, bytes, h_idata,
            0, NULL, &evTransfer.CLEvent());
    CL_CHECK_ERROR(err);
    evTransfer.FillTimingInfo();

    double inputTransfer = evTransfer.StartEndRuntime();

    err = clSetKernelArg(reduce, 0, sizeof(cl_mem), (void*)&d_idata);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(reduce, 1, sizeof(cl_mem), (void*)&d_odata);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(reduce, 2,
            localWorkSize * sizeof(T), NULL);
    CL_CHECK_ERROR(err);
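    // (Passing a NULL pointer with a nonzero size for arg 2 reserves
    // __local scratch space on the device for the work-group's
    // reduction tree.)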
    err = clSetKernelArg(reduce, 3, sizeof(cl_int), (void*)&size);
    CL_CHECK_ERROR(err);

    err = clSetKernelArg(cpureduce, 0, sizeof(cl_mem), (void*)&d_idata);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(cpureduce, 1, sizeof(cl_mem), (void*)&d_odata);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(cpureduce, 2, sizeof(cl_int), (void*)&size);
    CL_CHECK_ERROR(err);

    size_t globalWorkSize;
    if (!nolocal)
    {
        globalWorkSize = localWorkSize * 64; // Use 64 work groups
    }
    else
    {
        globalWorkSize = 1;
    }

    int passes = op.getOptionInt("passes");
    int iters  = op.getOptionInt("iterations");

    if (mpi_rank == 0)
    {
        cout << "Running benchmark." << endl;
    }
    for (int k = 0; k < passes; k++)
    {
        // Synch processes at the start of each test.
        MPI_Barrier(MPI_COMM_WORLD); 
        
        double totalReduceTime = 0.0;
        Event evKernel("reduce kernel");
        for (int m = 0; m < iters; m++)
        {
            if (nolocal) 
            {
                err = clEnqueueNDRangeKernel(queue, cpureduce, 1, NULL,
                        &globalWorkSize, &localWorkSize, 0,
                        NULL, &evKernel.CLEvent());
            }
            else 
            {
                err = clEnqueueNDRangeKernel(queue, reduce, 1, NULL,
                        &globalWorkSize, &localWorkSize, 0,
                        NULL, &evKernel.CLEvent());
            }
            CL_CHECK_ERROR(err);
            err = clFinish(queue);
            CL_CHECK_ERROR (err);
            evKernel.FillTimingInfo();
            totalReduceTime += (evKernel.SubmitEndRuntime() / 1.e9);
        }

        err = clEnqueueReadBuffer(queue, d_odata, true, 0,
                num_blocks*sizeof(T), h_odata, 0, NULL, &evTransfer.CLEvent());
        CL_CHECK_ERROR(err);
        evTransfer.FillTimingInfo();
        double totalTransfer = (inputTransfer + evTransfer.StartEndRuntime()) /
                1.e9;
        
        T local_result = 0.0f, global_result = 0.0f;
        
        // Start a wallclock timer for MPI
        int TH_global = Timer::Start();
        
        // Perform reduction of block sums and MPI allreduce call
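        // globalReduction() presumably wraps an MPI all-reduce of the
        // per-rank sums, e.g. (sketch, assuming T maps to MPI_FLOAT):
        //     MPI_Allreduce(&local_result, &global_result, 1,
        //                   MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);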
        for (int m = 0; m < iters; m++)
        {        
            local_result = 0.0f;

            for (int i=0; i<num_blocks; i++)
            {
                local_result += h_odata[i];
            }
            global_result = 0.0f;
            globalReduction(&local_result, &global_result);
        }
        double mpi_time = Timer::Stop(TH_global,"global all reduce") / iters;

        // Compute local reference solution
        T cpu_result = reduceCPU<T>(h_idata, size);
        // Use some error threshold for floating point rounding
        double threshold = 1.0e-6;
        T diff = fabs(local_result - cpu_result);

        if (diff > threshold)
        {
            cout << "Error in local reduction detected in rank " 
                 << mpi_rank << "\n";
            cout << "Diff: " << diff << endl;
        }
        
        if (global_result != (mpi_size * local_result))
        {
            cout << "Test Failed, error in global all reduce detected in rank "
                 << mpi_rank << endl;
        } 
        else 
        {
            if (mpi_rank == 0)
            {
                cout << "Test Passed.\n";
            }
        }
        // Calculate results
        char atts[1024];
        sprintf(atts, "%d_itemsPerRank",size);
        double local_gbytes = (double)(size*sizeof(T))/(1000.*1000.*1000.);
        double global_gbytes = local_gbytes * mpi_size;
        totalReduceTime /= iters; // use average time over the iterations
        resultDB.AddResult(testName+"-Kernel", atts, "GB/s", 
            global_gbytes / totalReduceTime);
        resultDB.AddResult(testName+"-Kernel+PCIe", atts, "GB/s",
            global_gbytes / (totalReduceTime + totalTransfer));
        resultDB.AddResult(testName+"-MPI_Allreduce",  atts, "GB/s",
            (sizeof(T)*mpi_size*1.e-9) / (mpi_time));
        resultDB.AddResult(testName+"-Overall", atts, "GB/s", 
            global_gbytes / (totalReduceTime + totalTransfer + mpi_time));
    }

    err = clEnqueueUnmapMemObject(queue, h_i, h_idata, 0, NULL, NULL);
    CL_CHECK_ERROR(err);
    err = clEnqueueUnmapMemObject(queue, h_o, h_odata, 0, NULL, NULL);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(h_i);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(h_o);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(d_idata);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(d_odata);
    CL_CHECK_ERROR(err);
    err = clReleaseProgram(prog);
    CL_CHECK_ERROR(err);
    err = clReleaseKernel(reduce);
    CL_CHECK_ERROR(err);
}
Example No. 26
0
void
RunBenchmark( cl::Device& dev,
                cl::Context& ctx,
                cl::CommandQueue& queue,
                ResultDatabase& resultDB,
                OptionParser& op )
{
#if defined(PARALLEL)
    int cwrank;
#endif // defined(PARALLEL)

    // single precision
    DoTest<float>( "SP_Sten2D", dev, ctx, queue, resultDB, op, "-DSINGLE_PRECISION" );

    // double precision - might not be supported
    if( checkExtension( dev, "cl_khr_fp64" ))
    {
#if defined(PARALLEL)
        MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
            std::cout << "\nDP supported\n";
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)
        DoTest<double>( "DP_Sten2D", dev, ctx, queue, resultDB, op, "-DK_DOUBLE_PRECISION" );
    }
    else if( checkExtension( dev, "cl_amd_fp64" ))
    {
#if defined(PARALLEL)
        MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
            std::cout << "\nDP supported\n";
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)
        DoTest<double>( "DP_Sten2D", dev, ctx, queue, resultDB, op, "-DAMD_DOUBLE_PRECISION" );
    }
    else
    {
#if defined(PARALLEL)
        MPI_Comm_rank( MPI_COMM_WORLD, &cwrank );
        if( cwrank == 0 )
        {
#endif // defined(PARALLEL)
            std::cout << "\nDP not supported\n";
#if defined(PARALLEL)
        }
#endif // defined(PARALLEL)

        // resultDB requires an entry for every possible result
        int nPasses = (int)op.getOptionInt( "passes" );
        for( unsigned int p = 0; p < nPasses; p++ )
        {
            resultDB.AddResult( (const char*)"DP_Sten2D",
                                "N/A",
                                "GFLOPS",
                                FLT_MAX );
        }
    }

    std::cout << '\n' << std::endl;
}
Example No. 27
0
void ellPackTest(cl_device_id dev, cl_context ctx, string compileFlags,
                 cl_command_queue queue, ResultDatabase& resultDB,
                 OptionParser& op, floatType* h_val, int* h_cols,
                 int* h_rowDelimiters, floatType* h_vec, floatType* h_out,
                 int numRows, int numNonZeroes, floatType* refOut, bool padded,
                 int paddedSize, const size_t maxImgWidth)
{
    if (devSupportsImages)
    {
        char texflags[64];
        sprintf(texflags," -DUSE_TEXTURE -DMAX_IMG_WIDTH=%ld", maxImgWidth);
        compileFlags+=string(texflags);
    }

    // Set up OpenCL Program Object
    int err = 0;
    cl_program prog = clCreateProgramWithSource(ctx, 1, &cl_source_spmv, NULL,
            &err);
    CL_CHECK_ERROR(err);

    // Build the OpenCL kernels
    err = clBuildProgram(prog, 1, &dev, compileFlags.c_str(), NULL, NULL);
    CL_CHECK_ERROR(err);

    // If there is a build error, print the output and return
    if (err != CL_SUCCESS)
    {
        char log[5000];
        size_t retsize = 0;
        err = clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG,
                sizeof(log), log, &retsize);
        CL_CHECK_ERROR(err);
        cout << "Retsize: " << retsize << endl;
        cout << "Log: " << log << endl;
        return;
    }

    int *h_rowLengths = new int[paddedSize];
    int maxrl = 0;
    for (int k=0; k<numRows; k++)
    {
        h_rowLengths[k] = h_rowDelimiters[k+1] - h_rowDelimiters[k];
        if (h_rowLengths[k] > maxrl)
        {
            maxrl = h_rowLengths[k];
        }
    }
    for (int p=numRows; p < paddedSize; p++)
    {
        h_rowLengths[p] = 0;
    }

    // Column major format host data structures
    int cmSize = padded ? paddedSize : numRows;
    floatType *h_valcm = new floatType[maxrl * cmSize];
    int *h_colscm = new int[maxrl * cmSize];
    convertToColMajor(h_val, h_cols, numRows, h_rowDelimiters, h_valcm,
                              h_colscm, h_rowLengths, maxrl, padded);
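
    // In the column-major ELLPACK-R layout, row r's i-th nonzero lives at
    // index i*cmSize + r, so consecutive work-items touch consecutive
    // addresses (coalesced access); h_rowLengths records each row's true
    // length so short rows can stop early instead of reading padding.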

    // Device data structures
    cl_mem d_val, d_vec, d_out; // floating point
    cl_mem d_cols, d_rowLengths; // integer

    // Allocate device memory
    d_val = clCreateBuffer(ctx, CL_MEM_READ_WRITE, maxrl * cmSize *
        sizeof(clFloatType), NULL, &err);
    CL_CHECK_ERROR(err);
    d_cols = clCreateBuffer(ctx, CL_MEM_READ_WRITE, maxrl * cmSize *
        sizeof(int), NULL, &err);
    CL_CHECK_ERROR(err);
    int imgHeight = 0;
    if (devSupportsImages)
    {
        imgHeight = (numRows + maxImgWidth - 1) / maxImgWidth;
        cl_image_format fmt;
        fmt.image_channel_data_type = CL_FLOAT;
        if (sizeof(floatType) == 4)
        {
            fmt.image_channel_order = CL_R;
        }
        else
        {
            fmt.image_channel_order = CL_RG;
        }
        d_vec = clCreateImage2D( ctx, CL_MEM_READ_ONLY, &fmt, maxImgWidth,
            imgHeight, 0, NULL, &err);
        CL_CHECK_ERROR(err);
    } else {
        d_vec = clCreateBuffer(ctx, CL_MEM_READ_WRITE, numRows *
            sizeof(clFloatType), NULL, &err);
        CL_CHECK_ERROR(err);
    }
    d_out = clCreateBuffer(ctx, CL_MEM_READ_WRITE, paddedSize *
        sizeof(clFloatType), NULL, &err);
    CL_CHECK_ERROR(err);
    d_rowLengths = clCreateBuffer(ctx, CL_MEM_READ_WRITE, cmSize *
        sizeof(int), NULL, &err);
    CL_CHECK_ERROR(err);

    // Setup events for timing
    Event valTransfer("transfer Val data over PCIe bus");
    Event colsTransfer("transfer cols data over PCIe bus");
    Event vecTransfer("transfer vec data over PCIe bus");
    Event rowLengthsTransfer("transfer rowLengths data over PCIe bus");

    // Transfer data to device
    err = clEnqueueWriteBuffer(queue, d_val, true, 0, maxrl * cmSize *
        sizeof(clFloatType), h_valcm, 0, NULL, &valTransfer.CLEvent());
    CL_CHECK_ERROR(err);
    err = clEnqueueWriteBuffer(queue, d_cols, true, 0, maxrl * cmSize *
        sizeof(cl_int), h_colscm, 0, NULL, &colsTransfer.CLEvent());
    CL_CHECK_ERROR(err);

    if (devSupportsImages)
    {
        size_t offset[3]={0};
        size_t size[3]={maxImgWidth,(size_t)imgHeight,1};
        err = clEnqueueWriteImage(queue,d_vec, true, offset, size,
            0, 0, h_vec, 0, NULL, &vecTransfer.CLEvent());
        CL_CHECK_ERROR(err);
    } else {
        err = clEnqueueWriteBuffer(queue, d_vec, true, 0, numRows *
            sizeof(clFloatType), h_vec, 0, NULL, &vecTransfer.CLEvent());
        CL_CHECK_ERROR(err);
    }

    err = clEnqueueWriteBuffer(queue, d_rowLengths, true, 0, cmSize *
        sizeof(int), h_rowLengths, 0, NULL, &rowLengthsTransfer.CLEvent());
    CL_CHECK_ERROR(err);

    err = clFinish(queue);
    CL_CHECK_ERROR(err);

    valTransfer.FillTimingInfo();
    colsTransfer.FillTimingInfo();
    vecTransfer.FillTimingInfo();
    rowLengthsTransfer.FillTimingInfo();

    double iTransferTime =  valTransfer.StartEndRuntime() +
                           colsTransfer.StartEndRuntime() +
                            vecTransfer.StartEndRuntime() +
                     rowLengthsTransfer.StartEndRuntime();

    // Set up kernel arguments
    cl_kernel ellpackr = clCreateKernel(prog, "spmv_ellpackr_kernel", &err);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(ellpackr, 0, sizeof(cl_mem), (void*) &d_val);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(ellpackr, 1, sizeof(cl_mem), (void*) &d_vec);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(ellpackr, 2, sizeof(cl_mem), (void*) &d_cols);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(ellpackr, 3, sizeof(cl_mem), (void*) &d_rowLengths);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(ellpackr, 4, sizeof(cl_int), (void*) &cmSize);
    CL_CHECK_ERROR(err);
    err = clSetKernelArg(ellpackr, 5, sizeof(cl_mem), (void*) &d_out);
    CL_CHECK_ERROR(err);

    const size_t globalWorkSize = cmSize;
    const size_t localWorkSize = BLOCK_SIZE;
    Event kernelExec("ELLPACKR Kernel Execution");

    int passes = op.getOptionInt("passes");
    int iters  = op.getOptionInt("iterations");

    for (int k = 0; k < passes; k++)
    {
        double totalKernelTime = 0.0;
        for (int j = 0; j < iters; j++)
        {
            err = clEnqueueNDRangeKernel(queue, ellpackr, 1, NULL,
                &globalWorkSize, &localWorkSize, 0, NULL,
                &kernelExec.CLEvent());
            CL_CHECK_ERROR(err);
            err = clFinish(queue);
            CL_CHECK_ERROR(err);
            kernelExec.FillTimingInfo();
            totalKernelTime += kernelExec.StartEndRuntime();
        }

         Event outTransfer("d->h data transfer");
         err = clEnqueueReadBuffer(queue, d_out, true, 0, numRows *
             sizeof(clFloatType), h_out, 0, NULL, &outTransfer.CLEvent());
         CL_CHECK_ERROR(err);
         err = clFinish(queue);
         CL_CHECK_ERROR(err);
         outTransfer.FillTimingInfo();
         double oTransferTime = outTransfer.StartEndRuntime();

        // Compare reference solution to GPU result
        if (! verifyResults(refOut, h_out, numRows, k)) {
            return;
        }
        char atts[TEMP_BUFFER_SIZE];
        char benchName[TEMP_BUFFER_SIZE];
        double avgTime = totalKernelTime / (double)iters;
        sprintf(atts, "%d_elements_%d_rows", numNonZeroes, cmSize);
        double gflop = 2 * (double) numNonZeroes;
        bool dpTest = (sizeof(floatType) == sizeof(double));
        sprintf(benchName, "%sELLPACKR-%s", padded ? "Padded_":"",
                dpTest ? "DP":"SP");
        resultDB.AddResult(benchName, atts, "Gflop/s", gflop/avgTime);
        sprintf(benchName, "%s_PCIe", benchName);
        resultDB.AddResult(benchName, atts, "Gflop/s", gflop /
            (avgTime + iTransferTime + oTransferTime));
    }

    err = clReleaseProgram(prog);
    CL_CHECK_ERROR(err);
    err = clReleaseKernel(ellpackr);
    CL_CHECK_ERROR(err);

    // Free device memory
    err = clReleaseMemObject(d_rowLengths);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(d_vec);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(d_out);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(d_val);
    CL_CHECK_ERROR(err);
    err = clReleaseMemObject(d_cols);
    CL_CHECK_ERROR(err);

    // Free host memory
    delete[] h_rowLengths;
    delete[] h_valcm;
    delete[] h_colscm;
}
Example No. 28
0
void RunTest(string testName, ResultDatabase &resultDB, OptionParser &op)
{
    int probSizes[4] = { 1, 8, 48, 96 };

    int size = probSizes[op.getOptionInt("size")-1];
    
    // Convert size from MiB to a number of elements
    size = (size * 1024 * 1024) / sizeof(T);
    
    // Create input data on CPU
    unsigned int bytes = size * sizeof(T);

    // Allocate Host Memory
    __declspec(target(MIC)) static T *hkey, *outkey;
    __declspec(target(MIC)) static T *hvalue, *outvalue;

    hkey   = (T*)_mm_malloc(bytes,ALIGN);
    hvalue = (T*)_mm_malloc(bytes,ALIGN);

    outkey   = (T*)_mm_malloc(bytes,ALIGN);
    outvalue = (T*)_mm_malloc(bytes,ALIGN);


    // Initialize host memory
    cout << "Initializing host memory." << endl;

    srand(time(NULL));
    for (int i = 0; i < size; i++)
    {
        hkey[i] = hvalue[i]= (i+255) % 1089; // Fill with some pattern
    }

    int micdev = op.getOptionInt("target");
    int iters = op.getOptionInt("passes");
    int numThreads = op.getOptionInt("nthreads");

    cout << "nthreads   = " <<numThreads<< endl;

    cout << "Running benchmark" << endl;
    for(int it=0;it<iters;it++)
    {

        // Allocating buffer on card
        #pragma offload target(mic:micdev) in(hkey:length(size)  free_if(0)) \
                in(hvalue:length(size) free_if(0))\
                out(outkey:length(size) free_if(0))\
                out(outvalue:length(size) free_if(0))
        {
        }

        double start = curr_second();
        // Get data transfer time
        #pragma offload target(mic:micdev) in(hkey:length(size) alloc_if(0) \
                free_if(0)) in(hvalue:length(size) alloc_if(0) free_if(0))
        {
        }

        double transferTime = curr_second() - start;
        double totalRunTime = 0.0f;
        start = curr_second();
        #pragma offload target(mic:micdev) nocopy(hkey:length(size) \
                alloc_if(0) free_if(0))                             \
                nocopy(hvalue:length(size) alloc_if(0) free_if(0))  \
                nocopy(outkey:length(size) alloc_if(0) free_if(0))  \
                nocopy(outvalue:length(size) alloc_if(0) free_if(0))\
                in(numThreads)
        {
            sortKernel<T>(hkey, hvalue, outkey, outvalue, size, numThreads);
        }
        totalRunTime = curr_second()-start;

        #pragma offload target(mic:micdev) nocopy(hkey:length(size) \
                alloc_if(0) free_if(1))                             \
                nocopy(hvalue:length(size) alloc_if(0) free_if(1))  \
                out(outkey:length(size) alloc_if(0))                \
                out(outvalue:length(size) alloc_if(0))
        {
        }

        // If results aren't correct, don't report perf numbers
        if (!verifyResult<T>(outkey, outvalue, size))
        {
            return;
        }

        char atts[1024];
        double avgTime = totalRunTime; // each pass times a single sort run
        sprintf(atts, "%d items", size);
        double gb = (double)(size * sizeof(T)) / (1000. * 1000. * 1000.);
        resultDB.AddResult(testName, atts, "GB/s", gb / avgTime);
        resultDB.AddResult(testName+"_PCIe", atts, "GB/s",
                gb / (avgTime + transferTime));
        resultDB.AddResult(testName+"_Parity", atts, "N",
                transferTime / avgTime);

    }
    // Clean up
    _mm_free(hkey);
    _mm_free(hvalue);
    _mm_free(outkey);
    _mm_free(outvalue);

}
Example No. 29
0
// ****************************************************************************
// Function: RunBenchmark
//
// Purpose:
//   Executes the sparse matrix - vector multiplication benchmark
//
// Arguments:
//   dev: the opencl device id to use for the benchmark
//   ctx: the opencl context to use for the benchmark
//   queue: the opencl command queue to issue commands to
//   resultDB: stores results from the benchmark
//   op: the options parser / parameter database
//
// Returns:  nothing
// Programmer: Lukasz Wesolowski
// Creation: August 13, 2010
//
// Modifications:
//
// ****************************************************************************
void
RunBenchmark(cl_device_id dev,
                  cl_context ctx,
                  cl_command_queue queue,
                  ResultDatabase &resultDB,
                  OptionParser &op)
{
    // Create list of problem sizes
    int probSizes[4] = {1024, 8192, 12288, 16384};
    int sizeClass = op.getOptionInt("size") - 1;

    // Always run single precision test
    // OpenCL doesn't support templated kernels, so we have to use macros
    cout << "Single precision tests:\n";
    string spMacros = "-DSINGLE_PRECISION ";
    RunTest<float, cl_float>
        (dev, ctx, queue, resultDB, op, spMacros, probSizes[sizeClass]);

    // If double precision is supported, run the DP test
    if (checkExtension(dev, "cl_khr_fp64"))
    {
        cout << "Double precision tests\n";
        string dpMacros = "-DK_DOUBLE_PRECISION ";
        RunTest<double, cl_double>
            (dev, ctx, queue, resultDB, op, dpMacros, probSizes[sizeClass]);
    }
    else if (checkExtension(dev, "cl_amd_fp64"))
    {
        cout << "Double precision tests:\n";
        string dpMacros = "-DAMD_DOUBLE_PRECISION ";
        RunTest<double, cl_double>
            (dev, ctx, queue, resultDB, op, dpMacros, probSizes[sizeClass]);
    }
    else
    {
        cout << "Double precision not supported by chosen device, skipping\n";
        // The driver script still needs entries for all tests, even if they
        // were not run, so report an FLT_MAX placeholder for each one
        static const char *dpTests[] = {
            "CSR-Scalar-DP",        "CSR-Scalar-DP_PCIe",
            "CSR-Vector-DP",        "CSR-Vector-DP_PCIe",
            "ELLPACKR-DP",          "ELLPACKR-DP_PCIe",
            "Padded_CSR-Scalar-DP", "Padded_CSR-Scalar-DP_PCIe",
            "Padded_CSR-Vector-DP", "Padded_CSR-Vector-DP_PCIe"
        };
        const int nTests = (int)(sizeof(dpTests) / sizeof(dpTests[0]));
        int nPasses = (int)op.getOptionInt("passes");
        for (int p = 0; p < nPasses; p++)
        {
            for (int t = 0; t < nTests; t++)
            {
                resultDB.AddResult(dpTests[t], "N/A", "Gflop/s", FLT_MAX);
            }
        }
    }
}
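Every example so far gates its double-precision path on checkExtension(), whose definition is not part of this excerpt. Below is a plausible sketch that queries the device's extension string through the standard clGetDeviceInfo() call; the exact signature and error handling are assumptions, not SHOC's verbatim code.

#include <CL/cl.h>
#include <string>

// Sketch only: returns true if `ext` appears in the device's
// space-separated extension list.
static bool checkExtension(cl_device_id dev, const std::string &ext)
{
    size_t len = 0;
    if (clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, 0, NULL, &len) != CL_SUCCESS)
        return false;
    std::string exts(len, '\0');
    if (clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, len, &exts[0], NULL) != CL_SUCCESS)
        return false;
    return exts.find(ext) != std::string::npos;
}

A production version might tokenize the extension list rather than use a raw substring search, but for names like cl_khr_fp64 a substring match is sufficient.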
Example No. 30
File: main.cpp  Project: ManavA/shoc
// ****************************************************************************
// Method:  main()
//
// Purpose:
//   serial and parallel main for OpenCL level0 benchmarks
//
// Arguments:
//   argc, argv
//
// Programmer:  SHOC Team
// Creation:    The Epoch
//
// Modifications:
//   Jeremy Meredith, Tue Jan 12 15:09:33 EST 2010
//   Changed the way device selection works.  It now defaults to the device
//   index corresponding to the process's rank within a node if no devices
//   are specified on the command line, and otherwise, round-robins
//   the list of devices among the tasks.
//
//   Gabriel Marin, Tue Jun 01 15:38 EST 2010
//   Check that we have valid (not NULL) context and queue objects before
//   running the benchmarks. Errors inside CreateContextFromSingleDevice or
//   CreateCommandQueueForContextAndDevice were not propagated out to the main
//   program.
//
//   Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010
//   Split timing reports into detailed and summary.  For serial code, we
//   report all trial values, and for parallel, skip the per-process vals.
//   Also detect and print outliers from parallel runs.
//
// ****************************************************************************
int main(int argc, char *argv[])
{
    int ret = 0;

    try
    {
#ifdef PARALLEL
        int rank, size;
        MPI_Init(&argc,&argv);
        MPI_Comm_size(MPI_COMM_WORLD, &size);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        cout << "MPI Task " << rank << "/" << size - 1 << " starting...\n";
#endif

        OptionParser op;

        //Add shared options to the parser
        op.addOption("platform", OPT_INT, "0", "specify OpenCL platform to use",
                'p');
        op.addOption("device", OPT_VECINT, "", "specify device(s) to run on", 'd');
        op.addOption("passes", OPT_INT, "10", "specify number of passes", 'n');
        op.addOption("size", OPT_VECINT, "1", "specify problem size", 's');
        op.addOption("infoDevices", OPT_BOOL, "",
                "show info for available platforms and devices", 'i');
        op.addOption("verbose", OPT_BOOL, "", "enable verbose output", 'v');
        op.addOption("quiet", OPT_BOOL, "", "write minimum necessary to standard output", 'q');

        addBenchmarkSpecOptions(op);

        if (!op.parse(argc, argv))
        {
#ifdef PARALLEL
            if (rank == 0)
                op.usage();
            MPI_Finalize();
#else
            op.usage();
#endif
            return (op.HelpRequested() ? 0 : 1 );
        }

        if (op.getOptionBool("infoDevices"))
        {
#define DEBUG_DEVICE_CONTAINER 0
#ifdef PARALLEL
            // execute following code only if I am the process of lowest
            // rank on this node
            NodeInfo NI;
            int mynoderank = NI.nodeRank();
            if (mynoderank==0)
            {
                int nlrrank, nlrsize;
                MPI_Comm nlrcomm = NI.getNLRComm();
                MPI_Comm_size(nlrcomm, &nlrsize);
                MPI_Comm_rank(nlrcomm, &nlrrank);

                OpenCLNodePlatformContainer ndc1;
                OpenCLMultiNodeContainer localMnc(ndc1);
                localMnc.doMerge (nlrrank, nlrsize, nlrcomm);
                if (rank==0)  // I am the global rank 0, print all configurations
                    localMnc.Print (cout);
            }
#else
            OpenCLNodePlatformContainer ndc1;
            ndc1.Print (cout);
#if DEBUG_DEVICE_CONTAINER
            OpenCLMultiNodeContainer mnc1(ndc1), mnc2;
            mnc1.Print (cout);
            ostringstream oss;
            mnc1.writeObject (oss);
            std::string temp(oss.str());
            cout << "Serialized MultiNodeContainer:\n" << temp;
            istringstream iss(temp);
            mnc2.readObject (iss);
            cout << "Unserialized object2:\n";
            mnc2.Print (cout);
            mnc1.merge (mnc2);
            cout << "==============\nObject1 after merging 1:\n";
            mnc1.Print (cout);
            mnc1.merge (mnc2);
            cout << "==============\nObject1 after merging 2:\n";
            mnc1.Print (cout);
#endif  // DEBUG
#endif  // PARALLEL
            return (0);
        }

        bool verbose = op.getOptionBool("verbose");

        // The device option supports specifying more than one device
        // for now, just choose the first one.
        int platform = op.getOptionInt("platform");

#ifdef PARALLEL
        NodeInfo ni;
        int myNodeRank = ni.nodeRank();
        if (verbose)
            cout << "Global rank " << rank << " is local rank " << myNodeRank << endl;
#else
        int myNodeRank = 0;
#endif

        // If they haven't specified any devices, assume they
        // want the process with in-node rank N to use device N
        int deviceIdx = myNodeRank;

        // If they have, then round-robin the list of devices
        // among the processes on a node.
        vector<long long> deviceVec = op.getOptionVecInt("device");
        if (deviceVec.size() > 0)
        {
            int len = (int)deviceVec.size();
            deviceIdx = (int)deviceVec[myNodeRank % len];
        }

        // Check for an erroneous device
        if (deviceIdx >= GetNumOclDevices(platform)) {
            cerr << "Warning: device index: " << deviceIdx
                 << " out of range, defaulting to device 0.\n";
            deviceIdx = 0;
        }

        // Initialization
        if (verbose) cout << ">> initializing\n";
        cl_device_id devID = ListDevicesAndGetDevice(platform, deviceIdx);
        cl_int clErr;
        cl_context ctx = clCreateContext( NULL,     // properties
                                            1,      // number of devices
                                            &devID, // device
                                            NULL,   // notification function
                                            NULL,
                                            &clErr );
        CL_CHECK_ERROR(clErr);
        cl_command_queue queue = clCreateCommandQueue( ctx,
                                                        devID,
                                                        CL_QUEUE_PROFILING_ENABLE,
                                                        &clErr );
        CL_CHECK_ERROR(clErr);
        ResultDatabase resultDB;

        // Run the benchmark
        RunBenchmark(devID, ctx, queue, resultDB, op);

        clReleaseCommandQueue( queue );
        clReleaseContext( ctx );


#ifndef PARALLEL
        resultDB.DumpDetailed(cout);
#else
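        // Parallel runs merge each rank's database and let rank 0 print a
        // summary plus any outlier trials detected across ranks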
        ParallelResultDatabase pardb;
        pardb.MergeSerialDatabases(resultDB,MPI_COMM_WORLD);
        if (rank==0)
        {
            pardb.DumpSummary(cout);
            pardb.DumpOutliers(cout);
        }
#endif
    }
    catch( std::exception& e )
    {
        std::cerr << e.what() << std::endl;
        ret = 1;
    }
    catch( ... )
    {
        std::cerr << "unrecognized exception caught" << std::endl;
        ret = 1;
    }

#ifdef PARALLEL
    MPI_Finalize();
#endif

    return ret;
}
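The CL_CHECK_ERROR macro used after each OpenCL call above also falls outside this excerpt. A typical formulation is sketched below under the assumption that failing fast is acceptable; SHOC's actual macro may translate the code to a symbolic name or throw an exception instead of exiting.

#include <CL/cl.h>
#include <cstdio>
#include <cstdlib>

// Sketch: abort with file/line context when an OpenCL call fails.
#define CL_CHECK_ERROR(err)                                     \
    do {                                                        \
        if ((err) != CL_SUCCESS)                                \
        {                                                       \
            std::fprintf(stderr, "OpenCL error %d at %s:%d\n",  \
                         (int)(err), __FILE__, __LINE__);       \
            std::exit(EXIT_FAILURE);                            \
        }                                                       \
    } while (0)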