コード例 #1
0
ファイル: matmul.c プロジェクト: Aestey/Exercises-Solutions
int main(void)
{
    float *h_A;             // A matrix
    float *h_B;             // B matrix
    float *h_C;             // C = A*B matrix
    int Mdim, Ndim, Pdim;   // A[N][P], B[P][M], C[N][M]
    int szA, szB, szC;      // number of elements in each matrix

    cl_mem d_a, d_b, d_c;   // Matrices in device memory

    double start_time;      // Starting time
    double run_time;        // timing data

    char * kernelsource;    // kernel source string

    cl_int err;             // error code returned from OpenCL calls
    cl_device_id     device_id;     // compute device id 
    cl_context       context;       // compute context
    cl_command_queue commands;      // compute command queue
    cl_program       program;       // compute program
    cl_kernel        kernel;        // compute kernel

    Ndim = ORDER;
    Pdim = ORDER;
    Mdim = ORDER;

    szA = Ndim * Pdim;
    szB = Pdim * Mdim;
    szC = Ndim * Mdim;

    h_A = (float *)malloc(szA * sizeof(float));
    h_B = (float *)malloc(szB * sizeof(float));
    h_C = (float *)malloc(szC * sizeof(float));

    initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);

    printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER);
    for(int i = 0; i < COUNT; i++)
    {
        zero_mat(Ndim, Mdim, h_C);
        start_time = wtime();

        seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C);

        run_time  = wtime() - start_time;
        results(Mdim, Ndim, Pdim, h_C, run_time);
    }

//--------------------------------------------------------------------------------
// Create a context, queue and device.
//--------------------------------------------------------------------------------

    // Set up OpenCL context. queue, kernel, etc.
    cl_uint numPlatforms;
    // Find number of platforms
    err = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (err != CL_SUCCESS || numPlatforms <= 0)
    {
        printf("Error: Failed to find a platform!\n%s\n",err_code(err));
        return EXIT_FAILURE;
    }
    // Get all platforms
    cl_platform_id Platform[numPlatforms];
    err = clGetPlatformIDs(numPlatforms, Platform, NULL);
    if (err != CL_SUCCESS || numPlatforms <= 0)
    {
        printf("Error: Failed to get the platform!\n%s\n",err_code(err));
        return EXIT_FAILURE;
    }
    // Secure a device
    for (int i = 0; i < numPlatforms; i++)
    {
        err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL);
        if (err == CL_SUCCESS)
            break;
    }
    if (device_id == NULL)
    {
        printf("Error: Failed to create a device group!\n%s\n",err_code(err));
        return EXIT_FAILURE;
    }

    // Create a compute context
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n%s\n",err_code(err));
        return EXIT_FAILURE;
    }
    // Create a command queue
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }

//--------------------------------------------------------------------------------
// Setup the buffers, initialize matrices, and write them into global memory
//--------------------------------------------------------------------------------

    //  Reset A, B and C matrices (just to play it safe)
    initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);

    d_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                            sizeof(float) * szA, h_A, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error: failed to create buffer\n%s\n", err_code(err));
        return EXIT_FAILURE;
    } 
    d_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                            sizeof(float) * szB, h_B, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error: failed to create buffer\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }
    d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                            sizeof(float) * szC, NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error: failed to create buffer\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }


//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... Naive
//--------------------------------------------------------------------------------

    kernelsource = getKernelSource("../C_elem.cl");
    // Create the comput program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error: could not create program\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }
    free(kernelsource);
    // Build the program
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n%s\n", err_code(err));
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        return EXIT_FAILURE;
    }

    // Create the compute kernel from the program
    kernel = clCreateKernel(program, "mmul", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }

    printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim);

    // Do the multiplication COUNT times
    for (int i = 0; i < COUNT; i++)
    {
        zero_mat(Ndim, Mdim, h_C);

        err =  clSetKernelArg(kernel, 0, sizeof(int),    &Mdim);
        err |= clSetKernelArg(kernel, 1, sizeof(int),    &Ndim);
        err |= clSetKernelArg(kernel, 2, sizeof(int),    &Pdim);
        err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_a);
        err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_b);
        err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &d_c);

        if (err != CL_SUCCESS)
        {
            printf("Error: Could not set kernel arguments\n");
            return EXIT_FAILURE;
        }

        start_time = wtime();

        // Execute the kernel over the entire range of C matrix elements ... computing
        // a dot product for each element of the product matrix.  The local work
        // group size is set to NULL ... so I'm telling the OpenCL runtime to
        // figure out a local work group size for me.
        const size_t global[2] = {Ndim, Mdim};
        err = clEnqueueNDRangeKernel(
            commands,
            kernel,
            2, NULL,
            global, NULL,
            0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to execute kernel\n%s\n", err_code(err));
            return EXIT_FAILURE;
        }

        err = clFinish(commands);
        if (err != CL_SUCCESS)
        {
            printf("Error: waiting for queue to finish failed\n%s\n", err_code(err));
            return EXIT_FAILURE;
        }

        run_time = wtime() - start_time;

        err = clEnqueueReadBuffer(
            commands, d_c, CL_TRUE, 0,
            sizeof(float) * szC, h_C,
            0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to read buffer\n%s\n", err_code(err));
            return EXIT_FAILURE;
        }

        results(Mdim, Ndim, Pdim, h_C, run_time);

    } // end for loop

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... C row per work item
//--------------------------------------------------------------------------------
    kernelsource = getKernelSource("../C_row.cl");
    // Create the comput program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error: could not create program\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }
    free(kernelsource);
    // Build the program
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n%s\n", err_code(err));
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        return EXIT_FAILURE;
    }

    // Create the compute kernel from the program
    kernel = clCreateKernel(program, "mmul", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }

    printf("\n===== OpenCL, matrix mult, C row per work item, order %d ======\n",Ndim);

    // Do the multiplication COUNT times
    for (int i = 0; i < COUNT; i++)
    {
        zero_mat(Ndim, Mdim, h_C);

        err =  clSetKernelArg(kernel, 0, sizeof(int),    &Mdim);
        err |= clSetKernelArg(kernel, 1, sizeof(int),    &Ndim);
        err |= clSetKernelArg(kernel, 2, sizeof(int),    &Pdim);
        err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_a);
        err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_b);
        err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &d_c);

        if (err != CL_SUCCESS)
        {
            printf("Error: Could not set kernel arguments\n");
            return EXIT_FAILURE;
        }

        start_time = wtime();

        // Execute the kernel over the rows of the C matrix ... computing
        // a dot product for each element of the product matrix.
        const size_t global = Ndim;
        err = clEnqueueNDRangeKernel(
            commands,
            kernel,
            1, NULL,
            &global, NULL,
            0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to execute kernel\n%s\n", err_code(err));
            return EXIT_FAILURE;
        }

        err = clFinish(commands);
        if (err != CL_SUCCESS)
        {
            printf("Error: waiting for queue to finish failed\n%s\n", err_code(err));
            return EXIT_FAILURE;
        }

        run_time = wtime() - start_time;

        err = clEnqueueReadBuffer(
            commands, d_c, CL_TRUE, 0,
            sizeof(float) * szC, h_C,
            0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to read buffer\n%s\n", err_code(err));
            return EXIT_FAILURE;
        }

        results(Mdim, Ndim, Pdim, h_C, run_time);

    } // end for loop


//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... C row per work item, A row in pivate memory
//--------------------------------------------------------------------------------
    kernelsource = getKernelSource("../C_row_priv.cl");
    // Create the comput program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error: could not create program\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }
    free(kernelsource);
    // Build the program
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n%s\n", err_code(err));
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        return EXIT_FAILURE;
    }

    // Create the compute kernel from the program
    kernel = clCreateKernel(program, "mmul", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }

    printf("\n===== OpenCL, matrix mult, C row, A row in priv mem, order %d ======\n",Ndim);

    // Do the multiplication COUNT times
    for (int i = 0; i < COUNT; i++)
    {
        zero_mat(Ndim, Mdim, h_C);

        err =  clSetKernelArg(kernel, 0, sizeof(int),    &Mdim);
        err |= clSetKernelArg(kernel, 1, sizeof(int),    &Ndim);
        err |= clSetKernelArg(kernel, 2, sizeof(int),    &Pdim);
        err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_a);
        err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_b);
        err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &d_c);

        if (err != CL_SUCCESS)
        {
            printf("Error: Could not set kernel arguments\n");
            return EXIT_FAILURE;
        }

        start_time = wtime();

        // Execute the kernel over the rows of the C matrix ... computing
        // a dot product for each element of the product matrix.
        const size_t global = Ndim;
        const size_t local = ORDER / 16;
        err = clEnqueueNDRangeKernel(
            commands,
            kernel,
            1, NULL,
            &global, &local,
            0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to execute kernel\n%s\n", err_code(err));
            return EXIT_FAILURE;
        }

        err = clFinish(commands);
        if (err != CL_SUCCESS)
        {
            printf("Error: waiting for queue to finish failed\n%s\n", err_code(err));
            return EXIT_FAILURE;
        }

        run_time = wtime() - start_time;

        err = clEnqueueReadBuffer(
            commands, d_c, CL_TRUE, 0,
            sizeof(float) * szC, h_C,
            0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to read buffer\n%s\n", err_code(err));
            return EXIT_FAILURE;
        }

        results(Mdim, Ndim, Pdim, h_C, run_time);

    } // end for loop


//--------------------------------------------------------------------------------
// Clean up!
//--------------------------------------------------------------------------------

    free(h_A);
    free(h_B);
    free(h_C);
    clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    return EXIT_SUCCESS;
}
コード例 #2
0
ファイル: matmul.cpp プロジェクト: Aestey/Exercises-Solutions
int main(void)
{

    int Mdim, Ndim, Pdim;   // A[N][P], B[P][M], C[N][M]
    int szA, szB, szC;      // number of elements in each matrix


    double start_time;      // Starting time
    double run_time;        // timing data

    Ndim = ORDER;
    Pdim = ORDER;
    Mdim = ORDER;

    szA = Ndim * Pdim;
    szB = Pdim * Mdim;
    szC = Ndim * Mdim;

    std::vector<float> h_A(szA); // Host memory for Matrix A
    std::vector<float> h_B(szB); // Host memory for Matrix B
    std::vector<float> h_C(szC); // Host memory for Matrix C

    cl::Buffer d_a, d_b, d_c;   // Matrices in device memory

    initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);

    printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER);
    for(int i = 0; i < COUNT; i++)
    {
        zero_mat(Ndim, Mdim, h_C);
        start_time = wtime();

        seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C);

        run_time  = wtime() - start_time;
        results(Mdim, Ndim, Pdim, h_C, run_time);
    }

    try
    {

//--------------------------------------------------------------------------------
// Create a context and queue for DEVICE
//--------------------------------------------------------------------------------

        cl::Context context(DEVICE);
        cl::CommandQueue queue(context);

//--------------------------------------------------------------------------------
// Setup the buffers, initialize matrices, and write them into global memory
//--------------------------------------------------------------------------------

        //  Reset A, B and C matrices (just to play it safe)
        initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);

        d_a = cl::Buffer(context, begin(h_A), end(h_A), true);

        d_b = cl::Buffer(context, begin(h_B), end(h_B), true);

        d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC);

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... Naive
//--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        cl::Program program(context, util::loadProgram("../C_elem.cl"), true);

        // Create the compute kernel from the program
        auto naive_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul");

        printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++)
        {
            zero_mat(Ndim, Mdim, h_C);

            start_time = wtime();

            // Execute the kernel over the entire range of C matrix elements ... computing
            // a dot product for each element of the product matrix.  The local work
            // group size is set to NULL ... so I'm telling the OpenCL runtime to
            // figure out a local work group size for me.
            cl::NDRange global(Ndim, Mdim);
            naive_mmul(cl::EnqueueArgs(queue, global),
                    Mdim, Ndim, Pdim, d_a, d_b, d_c);

            queue.finish();

            run_time = wtime() - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));

            results(Mdim, Ndim, Pdim, h_C, run_time);

        } // end for loop

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... C row per work item
//--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        program = cl::Program(context, util::loadProgram("../C_row.cl"), true);

        // Create the compute kernel from the program
        auto crow_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul");

         printf("\n===== OpenCL, matrix mult, C row per work item, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++)
        {
            zero_mat(Ndim, Mdim, h_C);

            start_time = wtime();

            cl::NDRange global(Ndim);
            crow_mmul(cl::EnqueueArgs(queue, global),
                    Mdim, Ndim, Pdim, d_a, d_b, d_c);

            queue.finish();

            run_time = wtime() - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));

            results(Mdim, Ndim, Pdim, h_C, run_time);

        } // end for loop

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... C row per work item, A row in pivate memory
//--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        program = cl::Program(context, util::loadProgram("../C_row_priv.cl"), true);

        // Create the compute kernel from the program
        auto arowpriv_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul");

        printf("\n===== OpenCL, matrix mult, C row, A row in priv mem, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++)
        {
            zero_mat(Ndim, Mdim, h_C);

            start_time = wtime();

            cl::NDRange global(Ndim);
            cl::NDRange local(ORDER / 16);
            arowpriv_mmul(cl::EnqueueArgs(queue, global, local),
                    Mdim, Ndim, Pdim, d_a, d_b, d_c);

            queue.finish();

            run_time = wtime() - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));

            results(Mdim, Ndim, Pdim, h_C, run_time);

        } // end for loop

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... C row per work item, A row pivate, B col local
//--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        program = cl::Program(context, util::loadProgram("../C_row_priv_bloc.cl"), true);

        // Create the compute kernel from the program
        auto browloc_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg>(program, "mmul");

        printf("\n===== OpenCL, mat mult, C row, priv A, B cols loc, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++)
        {
            zero_mat(Ndim, Mdim, h_C);

            start_time = wtime();

            cl::NDRange global(Ndim);
            cl::NDRange local(ORDER / 16);

            cl::LocalSpaceArg localmem = cl::Local(sizeof(float) * Pdim);

            browloc_mmul(cl::EnqueueArgs(queue, global, local),
                    Mdim, Ndim, Pdim, d_a, d_b, d_c, localmem);

            queue.finish();

            run_time = wtime() - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));

            results(Mdim, Ndim, Pdim, h_C, run_time);

        } // end for loop

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ...  A and B in block form in local memory
//--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        program = cl::Program(context, util::loadProgram("../C_block_form.cl"), true);

        // Create the compute kernel from the program
        auto block_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg, cl::LocalSpaceArg>(program, "mmul");

        printf("\n===== OpenCL, A and B in block form in local memory, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++)
        {
            zero_mat(Ndim, Mdim, h_C);

            start_time = wtime();

            int blocksize = 16;
            cl::NDRange global(Ndim, Mdim);
            cl::NDRange local(blocksize, blocksize);

            cl::LocalSpaceArg localmem1 = cl::Local(sizeof(float) * blocksize * blocksize);
            cl::LocalSpaceArg localmem2 = cl::Local(sizeof(float) * blocksize * blocksize);

            block_mmul(cl::EnqueueArgs(queue, global, local),
                    Mdim, Ndim, Pdim, d_a, d_b, d_c, localmem1, localmem2);

            queue.finish();

            run_time = wtime() - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));

            results(Mdim, Ndim, Pdim, h_C, run_time);

        } // end for loop

    } catch (cl::Error err)
    {
        std::cout << "Exception\n";
        std::cerr << "ERROR: "
                  << err.what()
                  << "("
                  << err_code(err.err())
                  << ")"
                  << std::endl;
    }

    return EXIT_SUCCESS;
}
コード例 #3
0
int main(int argc, char *argv[])
{

    int N;                  // A[N][N], B[N][N], C[N][N]
    int size;               // Number of elements in each matrix


    double start_time;      // Starting time
    double run_time;        // Timing
    util::Timer timer;      // Timing

    N    = ORDER;
    size = N * N;

    std::vector<float> h_A(size); // Host memory for Matrix A
    std::vector<float> h_B(size); // Host memory for Matrix B
    std::vector<float> h_C(size); // Host memory for Matrix C

    cl::Buffer d_a, d_b, d_c;   // Matrices in device memory

//--------------------------------------------------------------------------------
// Create a context and queue
//--------------------------------------------------------------------------------

    try
    {

        cl_uint deviceIndex = 0;
        parseArguments(argc, argv, &deviceIndex);

        // Get list of devices
        std::vector<cl::Device> devices;
        unsigned numDevices = getDeviceList(devices);

        // Check device index in range
        if (deviceIndex >= numDevices)
        {
          std::cout << "Invalid device index (try '--list')\n";
          return EXIT_FAILURE;
        }

        cl::Device device = devices[deviceIndex];

        std::string name;
        getDeviceName(device, name);
        std::cout << "\nUsing OpenCL device: " << name << "\n";

        std::vector<cl::Device> chosen_device;
        chosen_device.push_back(device);
        cl::Context context(chosen_device);
        cl::CommandQueue queue(context, device);

//--------------------------------------------------------------------------------
// Run sequential matmul
//--------------------------------------------------------------------------------


        initmat(N, h_A, h_B, h_C);

        timer.reset();

        printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",N);
        for(int i = 0; i < COUNT; i++)
        {
            zero_mat(N, h_C);

            start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

            seq_mat_mul_sdot(N, h_A, h_B, h_C);

            run_time  = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time;
            results(N, h_C, run_time);
        }

//--------------------------------------------------------------------------------
// Setup the buffers, initialize matrices, and write them into global memory
//--------------------------------------------------------------------------------

        //  Reset A, B and C matrices (just to play it safe)
        initmat(N, h_A, h_B, h_C);

        d_a = cl::Buffer(context, h_A.begin(), h_A.end(), true);

        d_b = cl::Buffer(context, h_B.begin(), h_B.end(), true);

        d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size);

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... Naive
//--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        cl::Program program(context, kernelsource, true);

        // Create the compute kernel from the program
        cl::make_kernel<int, cl::Buffer, cl::Buffer, cl::Buffer> naive_mmul(program, "mmul");

        printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",N);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++)
        {
            zero_mat(N, h_C);

            start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

            // Execute the kernel over the entire range of C matrix elements ... computing
            // a dot product for each element of the product matrix.  The local work
            // group size is set to NULL ... so I'm telling the OpenCL runtime to
            // figure out a local work group size for me.
            cl::NDRange global(N, N);
            naive_mmul(cl::EnqueueArgs(queue, global),
                    N, d_a, d_b, d_c);

            queue.finish();

            run_time  = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time;

            cl::copy(queue, d_c, h_C.begin(), h_C.end());

            results(N, h_C, run_time);

        } // end for loop

    } catch (cl::Error err)
    {
        std::cout << "Exception\n";
        std::cerr << "ERROR: "
                  << err.what()
                  << "("
                  << err_code(err.err())
                  << ")"
                  << std::endl;
    }

    return EXIT_SUCCESS;
}
コード例 #4
0
ファイル: matmul.c プロジェクト: BenElgar/Exercises-Solutions
int main(int argc, char *argv[])
{
    float *h_A;             // A matrix
    float *h_B;             // B matrix
    float *h_C;             // C = A*B matrix
    int N;                  // A[N][N], B[N][N], C[N][N]
    int size;               // number of elements in each matrix

    cl_mem d_a, d_b, d_c;   // Matrices in device memory

    double start_time;      // Starting time
    double run_time;        // timing data

    char * kernelsource;    // kernel source string

    cl_int err;             // error code returned from OpenCL calls
    cl_device_id     device;        // compute device id 
    cl_context       context;       // compute context
    cl_command_queue commands;      // compute command queue
    cl_program       program;       // compute program
    cl_kernel        kernel;        // compute kernel

    N = ORDER;

    size = N * N;

    h_A = (float *)malloc(size * sizeof(float));
    h_B = (float *)malloc(size * sizeof(float));
    h_C = (float *)malloc(size * sizeof(float));

//--------------------------------------------------------------------------------
// Create a context, queue and device.
//--------------------------------------------------------------------------------

    cl_uint deviceIndex = 0;
    parseArguments(argc, argv, &deviceIndex);

    // Get list of devices
    cl_device_id devices[MAX_DEVICES];
    unsigned numDevices = getDeviceList(devices);

    // Check device index in range
    if (deviceIndex >= numDevices)
    {
      printf("Invalid device index (try '--list')\n");
      return EXIT_FAILURE;
    }

    device = devices[deviceIndex];

    char name[MAX_INFO_STRING];
    getDeviceName(device, name);
    printf("\nUsing OpenCL device: %s\n", name);

    // Create a compute context
    context = clCreateContext(0, 1, &device, NULL, NULL, &err);
    checkError(err, "Creating context");
    // Create a command queue
    commands = clCreateCommandQueue(context, device, 0, &err);
    checkError(err, "Creating command queue");


//--------------------------------------------------------------------------------
// Run sequential version on the host
//--------------------------------------------------------------------------------

    initmat(N, h_A, h_B, h_C);

    printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER);
    for(int i = 0; i < COUNT; i++)
    {
        zero_mat(N, h_C);
        start_time = wtime();

        seq_mat_mul_sdot(N, h_A, h_B, h_C);

        run_time  = wtime() - start_time;
        results(N, h_C, run_time);
    }

//--------------------------------------------------------------------------------
// Setup the buffers, initialize matrices, and write them into global memory
//--------------------------------------------------------------------------------

    //  Reset A, B and C matrices (just to play it safe)
    initmat(N, h_A, h_B, h_C);

    d_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                            sizeof(float) * size, h_A, &err);
    checkError(err, "Creating buffer d_a");
    d_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                            sizeof(float) * size, h_B, &err);
    checkError(err, "Creating buffer d_b");
    d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                            sizeof(float) * size, NULL, &err);
    checkError(err, "Creating buffer d_c");


//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... Naive
//--------------------------------------------------------------------------------

    kernelsource = getKernelSource("../C_elem.cl");
    // Create the comput program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err);
    checkError(err, "Creating program with C_elem.cl");
    free(kernelsource);
    // Build the program
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n%s\n", err_code(err));
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        return EXIT_FAILURE;
    }

    // Create the compute kernel from the program
    kernel = clCreateKernel(program, "mmul", &err);
    checkError(err, "Creating kernel from C_elem.cl");

    printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",N);

    // Do the multiplication COUNT times
    for (int i = 0; i < COUNT; i++)
    {
        zero_mat(N, h_C);

        err =  clSetKernelArg(kernel, 0, sizeof(int),    &N);
        err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_a);
        err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_b);
        err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_c);
        checkError(err, "Setting kernel args");

        start_time = wtime();

        // Execute the kernel over the entire range of C matrix elements ... computing
        // a dot product for each element of the product matrix.  The local work
        // group size is set to NULL ... so I'm telling the OpenCL runtime to
        // figure out a local work group size for me.
        const size_t global[2] = {N, N};
        err = clEnqueueNDRangeKernel(
            commands,
            kernel,
            2, NULL,
            global, NULL,
            0, NULL, NULL);
        checkError(err, "Enqueueing kernel");

        err = clFinish(commands);
        checkError(err, "Waiting for kernel to finish");

        run_time = wtime() - start_time;

        err = clEnqueueReadBuffer(
            commands, d_c, CL_TRUE, 0,
            sizeof(float) * size, h_C,
            0, NULL, NULL);
        checkError(err, "Copying back d_c");

        results(N, h_C, run_time);

    } // end for loop

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... C row per work item
//--------------------------------------------------------------------------------
    kernelsource = getKernelSource("../C_row.cl");
    // Create the comput program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err);
    checkError(err, "Creating program with C_row.cl");
    free(kernelsource);
    // Build the program
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n%s\n", err_code(err));
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        return EXIT_FAILURE;
    }

    // Create the compute kernel from the program
    kernel = clCreateKernel(program, "mmul", &err);
    checkError(err, "Creating kernel from C_row.cl");

    printf("\n===== OpenCL, matrix mult, C row per work item, order %d ======\n",N);

    // Do the multiplication COUNT times
    for (int i = 0; i < COUNT; i++)
    {
        zero_mat(N, h_C);

        err =  clSetKernelArg(kernel, 0, sizeof(int),    &N);
        err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_a);
        err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_b);
        err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_c);
        checkError(err, "Setting kernel args");

        start_time = wtime();

        // Execute the kernel over the rows of the C matrix ... computing
        // a dot product for each element of the product matrix.
        const size_t global = N;
        err = clEnqueueNDRangeKernel(
            commands,
            kernel,
            1, NULL,
            &global, NULL,
            0, NULL, NULL);
        checkError(err, "Enqueueing kernel");

        err = clFinish(commands);
        checkError(err, "Waiting for kernel to finish");

        run_time = wtime() - start_time;

        err = clEnqueueReadBuffer(
            commands, d_c, CL_TRUE, 0,
            sizeof(float) * size, h_C,
            0, NULL, NULL);
        checkError(err, "Reading back d_c");

        results(N, h_C, run_time);

    } // end for loop


//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... C row per work item, A row in pivate memory
//--------------------------------------------------------------------------------
    kernelsource = getKernelSource("../C_row_priv.cl");
    // Create the comput program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err);
    checkError(err, "Creating program from C_row_priv.cl");
    free(kernelsource);
    // Build the program
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n%s\n", err_code(err));
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        return EXIT_FAILURE;
    }

    // Create the compute kernel from the program
    kernel = clCreateKernel(program, "mmul", &err);
    checkError(err, "Creating kernel from C_row_priv.cl");

    printf("\n===== OpenCL, matrix mult, C row, A row in priv mem, order %d ======\n",N);

    // Do the multiplication COUNT times
    for (int i = 0; i < COUNT; i++)
    {
        zero_mat(N, h_C);

        err =  clSetKernelArg(kernel, 0, sizeof(int),    &N);
        err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_a);
        err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_b);
        err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_c);
        checkError(err, "Setting kernel args");

        start_time = wtime();

        // Execute the kernel over the rows of the C matrix ... computing
        // a dot product for each element of the product matrix.
        const size_t global = N;
        const size_t local = ORDER / 16;
        err = clEnqueueNDRangeKernel(
            commands,
            kernel,
            1, NULL,
            &global, &local,
            0, NULL, NULL);
        checkError(err, "Enqueueing kernel");

        err = clFinish(commands);
        checkError(err, "Waiting for kernel to finish");

        run_time = wtime() - start_time;

        err = clEnqueueReadBuffer(
            commands, d_c, CL_TRUE, 0,
            sizeof(float) * size, h_C,
            0, NULL, NULL);
        checkError(err, "Reading back d_c");

        results(N, h_C, run_time);

    } // end for loop


//--------------------------------------------------------------------------------
// Clean up!
//--------------------------------------------------------------------------------

    free(h_A);
    free(h_B);
    free(h_C);
    clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    return EXIT_SUCCESS;
}
コード例 #5
0
int main(void)
{

    int Mdim, Ndim, Pdim;   // A[N][P], B[P][M], C[N][M]
    int szA, szB, szC;      // Number of elements in each matrix


    double start_time;      // Starting time
    double run_time;        // Timing 
    util::Timer timer;      // Timing

    Ndim = ORDER;
    Pdim = ORDER;
    Mdim = ORDER;

    szA = Ndim * Pdim;
    szB = Pdim * Mdim;
    szC = Ndim * Mdim;

    std::vector<float> h_A(szA); // Host memory for Matrix A
    std::vector<float> h_B(szB); // Host memory for Matrix B
    std::vector<float> h_C(szC); // Host memory for Matrix C

    cl::Buffer d_a, d_b, d_c;   // Matrices in device memory

    initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);

    timer.reset();

    printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER);
    for(int i = 0; i < COUNT; i++)
    {
        zero_mat(Ndim, Mdim, h_C);

        start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

        seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C);

        run_time  = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time;
        results(Mdim, Ndim, Pdim, h_C, run_time);
    }

    try
    {

//--------------------------------------------------------------------------------
// Create a context and queue for DEVICE
//--------------------------------------------------------------------------------

        cl::Context context(DEVICE);
        cl::CommandQueue queue(context);

//--------------------------------------------------------------------------------
// Setup the buffers, initialize matrices, and write them into global memory
//--------------------------------------------------------------------------------

        //  Reset A, B and C matrices (just to play it safe)
        initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);

        d_a = cl::Buffer(context, begin(h_A), end(h_A), true);

        d_b = cl::Buffer(context, begin(h_B), end(h_B), true);

        d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC);

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... Naive
//--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        cl::Program program(context, kernelsource, true);

        // Create the compute kernel from the program
        auto naive_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul");

        printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++)
        {
            zero_mat(Ndim, Mdim, h_C);

            start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

            // Execute the kernel over the entire range of C matrix elements ... computing
            // a dot product for each element of the product matrix.  The local work
            // group size is set to NULL ... so I'm telling the OpenCL runtime to
            // figure out a local work group size for me.
            cl::NDRange global(Ndim, Mdim);
            naive_mmul(cl::EnqueueArgs(queue, global),
                    Mdim, Ndim, Pdim, d_a, d_b, d_c);

            queue.finish();

            run_time  = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));

            results(Mdim, Ndim, Pdim, h_C, run_time);

        } // end for loop

    } catch (cl::Error err)
    {
        std::cout << "Exception\n";
        std::cerr << "ERROR: "
                  << err.what()
                  << "("
                  << err_code(err.err())
                  << ")"
                  << std::endl;
    }

    return EXIT_SUCCESS;
}