Example #1
0
// Matrix-multiplication benchmark driver.
//
// Multiplies two ORDER x ORDER matrices COUNT times with the sequential host
// routine, then COUNT times with each of five OpenCL kernels ("mmul" loaded
// from C_elem.cl, C_row.cl, C_row_priv.cl, C_row_priv_bloc.cl and
// C_block_form.cl).  Every run is timed with wtime() and handed to results().
//
// Returns EXIT_SUCCESS when every stage ran, EXIT_FAILURE when a cl::Error
// was caught.
int main(void)
{
    // A[N][P], B[P][M], C[N][M]
    int Ndim = ORDER;
    int Pdim = ORDER;
    int Mdim = ORDER;

    // Number of elements in each matrix
    int szA = Ndim * Pdim;
    int szB = Pdim * Mdim;
    int szC = Ndim * Mdim;

    double start_time = 0.0;    // Starting time of the current run
    double run_time   = 0.0;    // Elapsed time of the current run

    std::vector<float> h_A(szA); // Host memory for Matrix A
    std::vector<float> h_B(szB); // Host memory for Matrix B
    std::vector<float> h_C(szC); // Host memory for Matrix C

    cl::Buffer d_a, d_b, d_c;   // Matrices in device memory

    initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);

//--------------------------------------------------------------------------------
// Sequential reference run on the host
//--------------------------------------------------------------------------------

    printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER);
    for (int i = 0; i < COUNT; i++)
    {
        zero_mat(Ndim, Mdim, h_C);
        start_time = wtime();

        seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C);

        run_time = wtime() - start_time;
        results(Mdim, Ndim, Pdim, h_C, run_time);
    }

    try
    {

//--------------------------------------------------------------------------------
// Create a context and queue for DEVICE
//--------------------------------------------------------------------------------

        cl::Context context(DEVICE);
        cl::CommandQueue queue(context);

//--------------------------------------------------------------------------------
// Setup the buffers, initialize matrices, and write them into global memory
//--------------------------------------------------------------------------------

        //  Reset A, B and C matrices (just to play it safe)
        initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);

        d_a = cl::Buffer(context, begin(h_A), end(h_A), true);

        d_b = cl::Buffer(context, begin(h_B), end(h_B), true);

        d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC);

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... Naive
//--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        cl::Program program(context, util::loadProgram("../C_elem.cl"), true);

        // Create the compute kernel from the program
        auto naive_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul");

        printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++)
        {
            zero_mat(Ndim, Mdim, h_C);

            start_time = wtime();

            // Execute the kernel over the entire range of C matrix elements ... computing
            // a dot product for each element of the product matrix.  No local work
            // group size is passed to EnqueueArgs, which leaves the choice of a
            // local work group size to the OpenCL runtime.
            cl::NDRange global(Ndim, Mdim);
            naive_mmul(cl::EnqueueArgs(queue, global),
                    Mdim, Ndim, Pdim, d_a, d_b, d_c);

            // Wait for the kernel so the measured time covers its execution
            queue.finish();

            run_time = wtime() - start_time;

            // The copy back to the host happens after the timer is read
            cl::copy(queue, d_c, begin(h_C), end(h_C));

            results(Mdim, Ndim, Pdim, h_C, run_time);

        } // end for loop

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... C row per work item
//--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        program = cl::Program(context, util::loadProgram("../C_row.cl"), true);

        // Create the compute kernel from the program
        auto crow_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul");

        printf("\n===== OpenCL, matrix mult, C row per work item, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++)
        {
            zero_mat(Ndim, Mdim, h_C);

            start_time = wtime();

            // One-dimensional range: one work item per row of C
            cl::NDRange global(Ndim);
            crow_mmul(cl::EnqueueArgs(queue, global),
                    Mdim, Ndim, Pdim, d_a, d_b, d_c);

            queue.finish();

            run_time = wtime() - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));

            results(Mdim, Ndim, Pdim, h_C, run_time);

        } // end for loop

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... C row per work item, A row in private memory
//--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        program = cl::Program(context, util::loadProgram("../C_row_priv.cl"), true);

        // Create the compute kernel from the program
        auto arowpriv_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul");

        printf("\n===== OpenCL, matrix mult, C row, A row in priv mem, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++)
        {
            zero_mat(Ndim, Mdim, h_C);

            start_time = wtime();

            cl::NDRange global(Ndim);
            // NOTE(review): explicit local size of ORDER / 16 work items; this
            // presumably relies on ORDER being a multiple of 16 -- confirm.
            cl::NDRange local(ORDER / 16);
            arowpriv_mmul(cl::EnqueueArgs(queue, global, local),
                    Mdim, Ndim, Pdim, d_a, d_b, d_c);

            queue.finish();

            run_time = wtime() - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));

            results(Mdim, Ndim, Pdim, h_C, run_time);

        } // end for loop

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... C row per work item, A row private, B col local
//--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        program = cl::Program(context, util::loadProgram("../C_row_priv_bloc.cl"), true);

        // Create the compute kernel from the program
        auto browloc_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg>(program, "mmul");

        printf("\n===== OpenCL, mat mult, C row, priv A, B cols loc, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++)
        {
            zero_mat(Ndim, Mdim, h_C);

            start_time = wtime();

            cl::NDRange global(Ndim);
            cl::NDRange local(ORDER / 16);

            // Local-memory argument sized for Pdim floats
            cl::LocalSpaceArg localmem = cl::Local(sizeof(float) * Pdim);

            browloc_mmul(cl::EnqueueArgs(queue, global, local),
                    Mdim, Ndim, Pdim, d_a, d_b, d_c, localmem);

            queue.finish();

            run_time = wtime() - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));

            results(Mdim, Ndim, Pdim, h_C, run_time);

        } // end for loop

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ...  A and B in block form in local memory
//--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        program = cl::Program(context, util::loadProgram("../C_block_form.cl"), true);

        // Create the compute kernel from the program
        auto block_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg, cl::LocalSpaceArg>(program, "mmul");

        printf("\n===== OpenCL, A and B in block form in local memory, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++)
        {
            zero_mat(Ndim, Mdim, h_C);

            start_time = wtime();

            int blocksize = 16;
            cl::NDRange global(Ndim, Mdim);
            cl::NDRange local(blocksize, blocksize);

            // Two local-memory arguments, each sized for blocksize x blocksize floats
            cl::LocalSpaceArg localmem1 = cl::Local(sizeof(float) * blocksize * blocksize);
            cl::LocalSpaceArg localmem2 = cl::Local(sizeof(float) * blocksize * blocksize);

            block_mmul(cl::EnqueueArgs(queue, global, local),
                    Mdim, Ndim, Pdim, d_a, d_b, d_c, localmem1, localmem2);

            queue.finish();

            run_time = wtime() - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));

            results(Mdim, Ndim, Pdim, h_C, run_time);

        } // end for loop

    } catch (const cl::Error& err)  // by const reference: no copy of the exception
    {
        std::cout << "Exception\n";
        std::cerr << "ERROR: "
                  << err.what()
                  << "("
                  << err_code(err.err())
                  << ")"
                  << std::endl;

        // An OpenCL failure must not be reported as a successful run.
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}
Example #2
0
int main(int argc, char *argv[])
{

    int N;                  // A[N][N], B[N][N], C[N][N]
    int size;               // Number of elements in each matrix


    double start_time;      // Starting time
    double run_time;        // Timing
    util::Timer timer;      // Timing

    N    = ORDER;
    size = N * N;

    std::vector<float> h_A(size); // Host memory for Matrix A
    std::vector<float> h_B(size); // Host memory for Matrix B
    std::vector<float> h_C(size); // Host memory for Matrix C

    cl::Buffer d_a, d_b, d_c;   // Matrices in device memory

//--------------------------------------------------------------------------------
// Create a context and queue
//--------------------------------------------------------------------------------

    try
    {

        cl_uint deviceIndex = 0;
        parseArguments(argc, argv, &deviceIndex);

        // Get list of devices
        std::vector<cl::Device> devices;
        unsigned numDevices = getDeviceList(devices);

        // Check device index in range
        if (deviceIndex >= numDevices)
        {
          std::cout << "Invalid device index (try '--list')\n";
          return EXIT_FAILURE;
        }

        cl::Device device = devices[deviceIndex];

        std::string name;
        getDeviceName(device, name);
        std::cout << "\nUsing OpenCL device: " << name << "\n";

        std::vector<cl::Device> chosen_device;
        chosen_device.push_back(device);
        cl::Context context(chosen_device);
        cl::CommandQueue queue(context, device);

//--------------------------------------------------------------------------------
// Run sequential matmul
//--------------------------------------------------------------------------------


        initmat(N, h_A, h_B, h_C);

        timer.reset();

        printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",N);
        for(int i = 0; i < COUNT; i++)
        {
            zero_mat(N, h_C);

            start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

            seq_mat_mul_sdot(N, h_A, h_B, h_C);

            run_time  = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time;
            results(N, h_C, run_time);
        }

//--------------------------------------------------------------------------------
// Setup the buffers, initialize matrices, and write them into global memory
//--------------------------------------------------------------------------------

        //  Reset A, B and C matrices (just to play it safe)
        initmat(N, h_A, h_B, h_C);

        d_a = cl::Buffer(context, h_A.begin(), h_A.end(), true);

        d_b = cl::Buffer(context, h_B.begin(), h_B.end(), true);

        d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size);

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... Naive
//--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        cl::Program program(context, kernelsource, true);

        // Create the compute kernel from the program
        cl::make_kernel<int, cl::Buffer, cl::Buffer, cl::Buffer> naive_mmul(program, "mmul");

        printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",N);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++)
        {
            zero_mat(N, h_C);

            start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

            // Execute the kernel over the entire range of C matrix elements ... computing
            // a dot product for each element of the product matrix.  The local work
            // group size is set to NULL ... so I'm telling the OpenCL runtime to
            // figure out a local work group size for me.
            cl::NDRange global(N, N);
            naive_mmul(cl::EnqueueArgs(queue, global),
                    N, d_a, d_b, d_c);

            queue.finish();

            run_time  = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time;

            cl::copy(queue, d_c, h_C.begin(), h_C.end());

            results(N, h_C, run_time);

        } // end for loop

    } catch (cl::Error err)
    {
        std::cout << "Exception\n";
        std::cerr << "ERROR: "
                  << err.what()
                  << "("
                  << err_code(err.err())
                  << ")"
                  << std::endl;
    }

    return EXIT_SUCCESS;
}
int main(void)
{

    int N;   // A[N][N], B[N][N], C[N][N]
    int sz;  // number of elements in each matrix
    float tmp;

    N = ORDER;

    sz = N * N;

    std::vector<float> h_A(sz); // Matrix A on the host
    std::vector<float> h_B(sz); // Matrix B on the host
    std::vector<float> h_C(sz); // Matrix C on the host

    cl::Buffer d_A;    // matrix A on the device
    cl::Buffer d_B;    // matrix B on the device
    cl::Buffer d_C;    // matrix C on the device

    initmat(N, N, N, h_A, h_B, h_C);

    printf("\n===== Sequential, matrix mult (dot prod), order %d on CPU ======\n",ORDER);
 
    zero_mat(N, N, h_C);

    util::Timer timer;

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            tmp = 0.0f;
            for (int k = 0; k < N; k++) {
                tmp += h_A[i*N+k] * h_B[k*N+j];
            }
            h_C[i*N+j] = tmp;
        }
    }
              
    double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

    results(N, N, N, h_C, rtime);

    printf("\n===== Parallel matrix mult (dot prod), order %d on device ======\n",ORDER);

    switch (DEVICE) {
      case CL_DEVICE_TYPE_DEFAULT: printf("DEVICE=DEFAULT\n"); break;
      case CL_DEVICE_TYPE_CPU:     printf("DEVICE=CPU\n"); break;
      case CL_DEVICE_TYPE_GPU:     printf("DEVICE=GPU\n"); break;
      default:                     printf("DEVICE=%d\n", DEVICE); break;
    }
 
    zero_mat(N, N, h_C);
    try
    {
   
       cl::Context context(DEVICE);

       // Load in kernel source, creating a program object for the context.
       // Build program explicitly so I can catch errors and display
       // compiler error messages (should any be generated)

       cl::Program program(context, util::loadProgram("matmul_kernel.cl"));
       try
       {
           program.build();
       }
       catch (cl::Error error)
       {
          // If it was a build error then show the error
          if (error.err() == CL_BUILD_PROGRAM_FAILURE)
           {
               std::vector<cl::Device> devices;
               devices = context.getInfo<CL_CONTEXT_DEVICES>();
               std::string built = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);
               std::cerr << built << "\n";
           }
           throw error;
       }


        // Get the command queue
        cl::CommandQueue queue(context);


        // Create the kernel functor
 
        auto mmul = cl::make_kernel<int, cl::Buffer, cl::Buffer, cl::Buffer, 
                                    cl::LocalSpaceArg, cl::LocalSpaceArg>   
                                    (program, "mmul");

        util::Timer timer;


        d_A   = cl::Buffer(context, begin(h_A), end(h_A), true);
        d_B   = cl::Buffer(context, begin(h_B), end(h_B), true);
        d_C   = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * sz);

        // Work-group computes a block of C.  This size is also set
        // in a #define inside the kernel function.  Note this blocksize
        // must evenly divide the matrix order
        int blocksize = 16;  

        cl::LocalSpaceArg A_block = cl::Local(sizeof(float) * blocksize*blocksize);
        cl::LocalSpaceArg B_block = cl::Local(sizeof(float) * blocksize*blocksize);
 
        mmul(
            cl::EnqueueArgs(
            queue,
            cl::NDRange(N,N),
            cl::NDRange(blocksize,blocksize)),
            N, 
            d_A,
            d_B,
            d_C,
            A_block,
            B_block);

        cl::copy(queue, d_C, begin(h_C), end(h_C));

        double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

        results(N, N, N, h_C, rtime);
          
    }
    catch (cl::Error err) {
        std::cout << "Exception\n";
        std::cerr 
            << "ERROR: "
            << err.what()
            << std::endl;
 
    }

}
int main(void)
{

    int Mdim, Ndim, Pdim;   // A[N][P], B[P][M], C[N][M]
    int szA, szB, szC;      // Number of elements in each matrix


    double start_time;      // Starting time
    double run_time;        // Timing 
    util::Timer timer;      // Timing

    Ndim = ORDER;
    Pdim = ORDER;
    Mdim = ORDER;

    szA = Ndim * Pdim;
    szB = Pdim * Mdim;
    szC = Ndim * Mdim;

    std::vector<float> h_A(szA); // Host memory for Matrix A
    std::vector<float> h_B(szB); // Host memory for Matrix B
    std::vector<float> h_C(szC); // Host memory for Matrix C

    cl::Buffer d_a, d_b, d_c;   // Matrices in device memory

    initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);

    timer.reset();

    printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER);
    for(int i = 0; i < COUNT; i++)
    {
        zero_mat(Ndim, Mdim, h_C);

        start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

        seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C);

        run_time  = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time;
        results(Mdim, Ndim, Pdim, h_C, run_time);
    }

    try
    {

//--------------------------------------------------------------------------------
// Create a context and queue for DEVICE
//--------------------------------------------------------------------------------

        cl::Context context(DEVICE);
        cl::CommandQueue queue(context);

//--------------------------------------------------------------------------------
// Setup the buffers, initialize matrices, and write them into global memory
//--------------------------------------------------------------------------------

        //  Reset A, B and C matrices (just to play it safe)
        initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);

        d_a = cl::Buffer(context, begin(h_A), end(h_A), true);

        d_b = cl::Buffer(context, begin(h_B), end(h_B), true);

        d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC);

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... Naive
//--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        cl::Program program(context, kernelsource, true);

        // Create the compute kernel from the program
        auto naive_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul");

        printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++)
        {
            zero_mat(Ndim, Mdim, h_C);

            start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

            // Execute the kernel over the entire range of C matrix elements ... computing
            // a dot product for each element of the product matrix.  The local work
            // group size is set to NULL ... so I'm telling the OpenCL runtime to
            // figure out a local work group size for me.
            cl::NDRange global(Ndim, Mdim);
            naive_mmul(cl::EnqueueArgs(queue, global),
                    Mdim, Ndim, Pdim, d_a, d_b, d_c);

            queue.finish();

            run_time  = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));

            results(Mdim, Ndim, Pdim, h_C, run_time);

        } // end for loop

    } catch (cl::Error err)
    {
        std::cout << "Exception\n";
        std::cerr << "ERROR: "
                  << err.what()
                  << "("
                  << err_code(err.err())
                  << ")"
                  << std::endl;
    }

    return EXIT_SUCCESS;
}