Esempio n. 1
0
/**
 * Times a sequential, single-precision matrix multiplication C = A * B on the
 * host and hands the result to results().
 *
 * The matrices are stored as flat row-major vectors with the shapes
 * A[Ndim][Pdim], B[Pdim][Mdim] and C[Ndim][Mdim]; all three dimensions are
 * taken from ORDER.
 *
 * @return EXIT_SUCCESS.
 */
int main(void)
{
    int Mdim, Ndim, Pdim;   // A[N][P], B[P][M], C[N][M]
    int szA, szB, szC;      // number of elements in each matrix

    double start_time;      // Starting time
    double run_time;        // timing data

    Ndim = ORDER;
    Pdim = ORDER;
    Mdim = ORDER;

    szA = Ndim * Pdim;
    szB = Pdim * Mdim;
    szC = Ndim * Mdim;

    std::vector<float> A(szA); // Host memory for Matrix A
    std::vector<float> B(szB); // Host memory for Matrix B
    std::vector<float> C(szC); // Host memory for Matrix C

    initmat(Mdim, Ndim, Pdim, A, B, C);

    printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER);
    zero_mat(Ndim, Mdim, C);
    start_time = wtime();

    for (int ii = 0; ii < Ndim; ii++) {
        for (int jj = 0; jj < Mdim; jj++) {
            /* C(ii,jj) = sum(over kk) A(ii,kk) * B(kk,jj) */
            float tmp = 0.0f;
            for (int kk = 0; kk < Pdim; kk++) {
                // The row stride of a row-major matrix is its column count:
                // Pdim for A and Mdim for B (not Ndim / Pdim, which only
                // happened to work because every dimension equals ORDER).
                tmp += A[ii * Pdim + kk] * B[kk * Mdim + jj];
            }
            // C has Mdim columns, so its row stride is Mdim.
            C[ii * Mdim + jj] = tmp;
        }
    }

    run_time  = wtime() - start_time;
    results(Mdim, Ndim, Pdim, C, run_time);

    return EXIT_SUCCESS;
}
/**
 * Times a sequential dot-product matrix multiplication of two square
 * ORDER x ORDER single-precision matrices and passes the product and the
 * elapsed time (in seconds) to results().
 */
int main(void)
{
    int dim = ORDER;          // every matrix is dim x dim
    int elems = dim * dim;    // element count of one matrix

    std::vector<float> lhs(elems);      // Matrix A
    std::vector<float> rhs(elems);      // Matrix B
    std::vector<float> product(elems);  // Matrix C

    initmat(dim, dim, dim, lhs, rhs, product);

    printf("\n===== Sequential, matrix mult (dot prod), order %d on CPU ======\n",ORDER);

    zero_mat(dim, dim, product);

    // The timer is created immediately before the multiply and read
    // immediately after it.
    util::Timer timer;

    for (int row = 0; row < dim; ++row) {
        const int rowBase = row * dim;   // offset of this row in a flat row-major array
        for (int col = 0; col < dim; ++col) {
            float acc = 0.0f;
            for (int inner = 0; inner < dim; ++inner) {
                acc += lhs[rowBase + inner] * rhs[inner * dim + col];
            }
            product[rowBase + col] = acc;
        }
    }

    // getTimeMilliseconds() is converted to seconds for results().
    double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

    results(dim, dim, dim, product, rtime);

}
/**
 * Runs the full sequence of steps that ends in construction_gradient().
 *
 * @param nnn  Call counter/flag: when it is 0 the two one-time steps
 *             initEnv() and init_temoin() are run in addition to the
 *             regular pipeline.
 */
void Calculs::compile_gradient(int nnn)
{
       const bool first_call = (nnn == 0);

       // One-time step that must precede Env().
       if (first_call)
               initEnv();

       Env();
       Biomasse();

       // One-time step that must follow Biomasse().
       if (first_call)
               init_temoin();

       initmat();

       // Derivative blocks are built in this fixed order, followed by the
       // adjoints and finally the gradient.
       construction_dGdU();
       construction_dGdX();
       construction_dFdU();
       construction_dSdX();
       construction_dFdX();
       construction_adjoints();
       construction_gradient();
}
Esempio n. 4
0
int main(void)
{
    float *h_A;             // A matrix
    float *h_B;             // B matrix
    float *h_C;             // C = A*B matrix
    int Mdim, Ndim, Pdim;   // A[N][P], B[P][M], C[N][M]
    int szA, szB, szC;      // number of elements in each matrix

    cl_mem d_a, d_b, d_c;   // Matrices in device memory

    double start_time;      // Starting time
    double run_time;        // timing data

    char * kernelsource;    // kernel source string

    cl_int err;             // error code returned from OpenCL calls
    cl_device_id     device_id;     // compute device id 
    cl_context       context;       // compute context
    cl_command_queue commands;      // compute command queue
    cl_program       program;       // compute program
    cl_kernel        kernel;        // compute kernel

    Ndim = ORDER;
    Pdim = ORDER;
    Mdim = ORDER;

    szA = Ndim * Pdim;
    szB = Pdim * Mdim;
    szC = Ndim * Mdim;

    h_A = (float *)malloc(szA * sizeof(float));
    h_B = (float *)malloc(szB * sizeof(float));
    h_C = (float *)malloc(szC * sizeof(float));

    initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);

    printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER);
    for(int i = 0; i < COUNT; i++)
    {
        zero_mat(Ndim, Mdim, h_C);
        start_time = wtime();

        seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C);

        run_time  = wtime() - start_time;
        results(Mdim, Ndim, Pdim, h_C, run_time);
    }

//--------------------------------------------------------------------------------
// Create a context, queue and device.
//--------------------------------------------------------------------------------

    // Set up OpenCL context. queue, kernel, etc.
    cl_uint numPlatforms;
    // Find number of platforms
    err = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (err != CL_SUCCESS || numPlatforms <= 0)
    {
        printf("Error: Failed to find a platform!\n%s\n",err_code(err));
        return EXIT_FAILURE;
    }
    // Get all platforms
    cl_platform_id Platform[numPlatforms];
    err = clGetPlatformIDs(numPlatforms, Platform, NULL);
    if (err != CL_SUCCESS || numPlatforms <= 0)
    {
        printf("Error: Failed to get the platform!\n%s\n",err_code(err));
        return EXIT_FAILURE;
    }
    // Secure a device
    for (int i = 0; i < numPlatforms; i++)
    {
        err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL);
        if (err == CL_SUCCESS)
            break;
    }
    if (device_id == NULL)
    {
        printf("Error: Failed to create a device group!\n%s\n",err_code(err));
        return EXIT_FAILURE;
    }

    // Create a compute context
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n%s\n",err_code(err));
        return EXIT_FAILURE;
    }
    // Create a command queue
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }

//--------------------------------------------------------------------------------
// Setup the buffers, initialize matrices, and write them into global memory
//--------------------------------------------------------------------------------

    //  Reset A, B and C matrices (just to play it safe)
    initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);

    d_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                            sizeof(float) * szA, h_A, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error: failed to create buffer\n%s\n", err_code(err));
        return EXIT_FAILURE;
    } 
    d_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                            sizeof(float) * szB, h_B, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error: failed to create buffer\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }
    d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                            sizeof(float) * szC, NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error: failed to create buffer\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }


//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... Naive
//--------------------------------------------------------------------------------

    kernelsource = getKernelSource("../C_elem.cl");
    // Create the comput program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error: could not create program\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }
    free(kernelsource);
    // Build the program
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n%s\n", err_code(err));
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        return EXIT_FAILURE;
    }

    // Create the compute kernel from the program
    kernel = clCreateKernel(program, "mmul", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }

    printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim);

    // Do the multiplication COUNT times
    for (int i = 0; i < COUNT; i++)
    {
        zero_mat(Ndim, Mdim, h_C);

        err =  clSetKernelArg(kernel, 0, sizeof(int),    &Mdim);
        err |= clSetKernelArg(kernel, 1, sizeof(int),    &Ndim);
        err |= clSetKernelArg(kernel, 2, sizeof(int),    &Pdim);
        err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_a);
        err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_b);
        err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &d_c);

        if (err != CL_SUCCESS)
        {
            printf("Error: Could not set kernel arguments\n");
            return EXIT_FAILURE;
        }

        start_time = wtime();

        // Execute the kernel over the entire range of C matrix elements ... computing
        // a dot product for each element of the product matrix.  The local work
        // group size is set to NULL ... so I'm telling the OpenCL runtime to
        // figure out a local work group size for me.
        const size_t global[2] = {Ndim, Mdim};
        err = clEnqueueNDRangeKernel(
            commands,
            kernel,
            2, NULL,
            global, NULL,
            0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to execute kernel\n%s\n", err_code(err));
            return EXIT_FAILURE;
        }

        err = clFinish(commands);
        if (err != CL_SUCCESS)
        {
            printf("Error: waiting for queue to finish failed\n%s\n", err_code(err));
            return EXIT_FAILURE;
        }

        run_time = wtime() - start_time;

        err = clEnqueueReadBuffer(
            commands, d_c, CL_TRUE, 0,
            sizeof(float) * szC, h_C,
            0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to read buffer\n%s\n", err_code(err));
            return EXIT_FAILURE;
        }

        results(Mdim, Ndim, Pdim, h_C, run_time);

    } // end for loop

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... C row per work item
//--------------------------------------------------------------------------------
    kernelsource = getKernelSource("../C_row.cl");
    // Create the comput program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error: could not create program\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }
    free(kernelsource);
    // Build the program
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n%s\n", err_code(err));
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        return EXIT_FAILURE;
    }

    // Create the compute kernel from the program
    kernel = clCreateKernel(program, "mmul", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }

    printf("\n===== OpenCL, matrix mult, C row per work item, order %d ======\n",Ndim);

    // Do the multiplication COUNT times
    for (int i = 0; i < COUNT; i++)
    {
        zero_mat(Ndim, Mdim, h_C);

        err =  clSetKernelArg(kernel, 0, sizeof(int),    &Mdim);
        err |= clSetKernelArg(kernel, 1, sizeof(int),    &Ndim);
        err |= clSetKernelArg(kernel, 2, sizeof(int),    &Pdim);
        err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_a);
        err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_b);
        err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &d_c);

        if (err != CL_SUCCESS)
        {
            printf("Error: Could not set kernel arguments\n");
            return EXIT_FAILURE;
        }

        start_time = wtime();

        // Execute the kernel over the rows of the C matrix ... computing
        // a dot product for each element of the product matrix.
        const size_t global = Ndim;
        err = clEnqueueNDRangeKernel(
            commands,
            kernel,
            1, NULL,
            &global, NULL,
            0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to execute kernel\n%s\n", err_code(err));
            return EXIT_FAILURE;
        }

        err = clFinish(commands);
        if (err != CL_SUCCESS)
        {
            printf("Error: waiting for queue to finish failed\n%s\n", err_code(err));
            return EXIT_FAILURE;
        }

        run_time = wtime() - start_time;

        err = clEnqueueReadBuffer(
            commands, d_c, CL_TRUE, 0,
            sizeof(float) * szC, h_C,
            0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to read buffer\n%s\n", err_code(err));
            return EXIT_FAILURE;
        }

        results(Mdim, Ndim, Pdim, h_C, run_time);

    } // end for loop


//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... C row per work item, A row in pivate memory
//--------------------------------------------------------------------------------
    kernelsource = getKernelSource("../C_row_priv.cl");
    // Create the comput program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error: could not create program\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }
    free(kernelsource);
    // Build the program
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n%s\n", err_code(err));
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        return EXIT_FAILURE;
    }

    // Create the compute kernel from the program
    kernel = clCreateKernel(program, "mmul", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }

    printf("\n===== OpenCL, matrix mult, C row, A row in priv mem, order %d ======\n",Ndim);

    // Do the multiplication COUNT times
    for (int i = 0; i < COUNT; i++)
    {
        zero_mat(Ndim, Mdim, h_C);

        err =  clSetKernelArg(kernel, 0, sizeof(int),    &Mdim);
        err |= clSetKernelArg(kernel, 1, sizeof(int),    &Ndim);
        err |= clSetKernelArg(kernel, 2, sizeof(int),    &Pdim);
        err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_a);
        err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_b);
        err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &d_c);

        if (err != CL_SUCCESS)
        {
            printf("Error: Could not set kernel arguments\n");
            return EXIT_FAILURE;
        }

        start_time = wtime();

        // Execute the kernel over the rows of the C matrix ... computing
        // a dot product for each element of the product matrix.
        const size_t global = Ndim;
        const size_t local = ORDER / 16;
        err = clEnqueueNDRangeKernel(
            commands,
            kernel,
            1, NULL,
            &global, &local,
            0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to execute kernel\n%s\n", err_code(err));
            return EXIT_FAILURE;
        }

        err = clFinish(commands);
        if (err != CL_SUCCESS)
        {
            printf("Error: waiting for queue to finish failed\n%s\n", err_code(err));
            return EXIT_FAILURE;
        }

        run_time = wtime() - start_time;

        err = clEnqueueReadBuffer(
            commands, d_c, CL_TRUE, 0,
            sizeof(float) * szC, h_C,
            0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to read buffer\n%s\n", err_code(err));
            return EXIT_FAILURE;
        }

        results(Mdim, Ndim, Pdim, h_C, run_time);

    } // end for loop


//--------------------------------------------------------------------------------
// Clean up!
//--------------------------------------------------------------------------------

    free(h_A);
    free(h_B);
    free(h_C);
    clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    return EXIT_SUCCESS;
}
Esempio n. 5
0
int main(void)
{

    int Mdim, Ndim, Pdim;   // A[N][P], B[P][M], C[N][M]
    int szA, szB, szC;      // number of elements in each matrix


    double start_time;      // Starting time
    double run_time;        // timing data

    Ndim = ORDER;
    Pdim = ORDER;
    Mdim = ORDER;

    szA = Ndim * Pdim;
    szB = Pdim * Mdim;
    szC = Ndim * Mdim;

    std::vector<float> h_A(szA); // Host memory for Matrix A
    std::vector<float> h_B(szB); // Host memory for Matrix B
    std::vector<float> h_C(szC); // Host memory for Matrix C

    cl::Buffer d_a, d_b, d_c;   // Matrices in device memory

    initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);

    printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER);
    for(int i = 0; i < COUNT; i++)
    {
        zero_mat(Ndim, Mdim, h_C);
        start_time = wtime();

        seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C);

        run_time  = wtime() - start_time;
        results(Mdim, Ndim, Pdim, h_C, run_time);
    }

    try
    {

//--------------------------------------------------------------------------------
// Create a context and queue for DEVICE
//--------------------------------------------------------------------------------

        cl::Context context(DEVICE);
        cl::CommandQueue queue(context);

//--------------------------------------------------------------------------------
// Setup the buffers, initialize matrices, and write them into global memory
//--------------------------------------------------------------------------------

        //  Reset A, B and C matrices (just to play it safe)
        initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);

        d_a = cl::Buffer(context, begin(h_A), end(h_A), true);

        d_b = cl::Buffer(context, begin(h_B), end(h_B), true);

        d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC);

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... Naive
//--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        cl::Program program(context, util::loadProgram("../C_elem.cl"), true);

        // Create the compute kernel from the program
        auto naive_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul");

        printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++)
        {
            zero_mat(Ndim, Mdim, h_C);

            start_time = wtime();

            // Execute the kernel over the entire range of C matrix elements ... computing
            // a dot product for each element of the product matrix.  The local work
            // group size is set to NULL ... so I'm telling the OpenCL runtime to
            // figure out a local work group size for me.
            cl::NDRange global(Ndim, Mdim);
            naive_mmul(cl::EnqueueArgs(queue, global),
                    Mdim, Ndim, Pdim, d_a, d_b, d_c);

            queue.finish();

            run_time = wtime() - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));

            results(Mdim, Ndim, Pdim, h_C, run_time);

        } // end for loop

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... C row per work item
//--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        program = cl::Program(context, util::loadProgram("../C_row.cl"), true);

        // Create the compute kernel from the program
        auto crow_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul");

         printf("\n===== OpenCL, matrix mult, C row per work item, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++)
        {
            zero_mat(Ndim, Mdim, h_C);

            start_time = wtime();

            cl::NDRange global(Ndim);
            crow_mmul(cl::EnqueueArgs(queue, global),
                    Mdim, Ndim, Pdim, d_a, d_b, d_c);

            queue.finish();

            run_time = wtime() - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));

            results(Mdim, Ndim, Pdim, h_C, run_time);

        } // end for loop

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... C row per work item, A row in pivate memory
//--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        program = cl::Program(context, util::loadProgram("../C_row_priv.cl"), true);

        // Create the compute kernel from the program
        auto arowpriv_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul");

        printf("\n===== OpenCL, matrix mult, C row, A row in priv mem, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++)
        {
            zero_mat(Ndim, Mdim, h_C);

            start_time = wtime();

            cl::NDRange global(Ndim);
            cl::NDRange local(ORDER / 16);
            arowpriv_mmul(cl::EnqueueArgs(queue, global, local),
                    Mdim, Ndim, Pdim, d_a, d_b, d_c);

            queue.finish();

            run_time = wtime() - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));

            results(Mdim, Ndim, Pdim, h_C, run_time);

        } // end for loop

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... C row per work item, A row pivate, B col local
//--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        program = cl::Program(context, util::loadProgram("../C_row_priv_bloc.cl"), true);

        // Create the compute kernel from the program
        auto browloc_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg>(program, "mmul");

        printf("\n===== OpenCL, mat mult, C row, priv A, B cols loc, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++)
        {
            zero_mat(Ndim, Mdim, h_C);

            start_time = wtime();

            cl::NDRange global(Ndim);
            cl::NDRange local(ORDER / 16);

            cl::LocalSpaceArg localmem = cl::Local(sizeof(float) * Pdim);

            browloc_mmul(cl::EnqueueArgs(queue, global, local),
                    Mdim, Ndim, Pdim, d_a, d_b, d_c, localmem);

            queue.finish();

            run_time = wtime() - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));

            results(Mdim, Ndim, Pdim, h_C, run_time);

        } // end for loop

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ...  A and B in block form in local memory
//--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        program = cl::Program(context, util::loadProgram("../C_block_form.cl"), true);

        // Create the compute kernel from the program
        auto block_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg, cl::LocalSpaceArg>(program, "mmul");

        printf("\n===== OpenCL, A and B in block form in local memory, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++)
        {
            zero_mat(Ndim, Mdim, h_C);

            start_time = wtime();

            int blocksize = 16;
            cl::NDRange global(Ndim, Mdim);
            cl::NDRange local(blocksize, blocksize);

            cl::LocalSpaceArg localmem1 = cl::Local(sizeof(float) * blocksize * blocksize);
            cl::LocalSpaceArg localmem2 = cl::Local(sizeof(float) * blocksize * blocksize);

            block_mmul(cl::EnqueueArgs(queue, global, local),
                    Mdim, Ndim, Pdim, d_a, d_b, d_c, localmem1, localmem2);

            queue.finish();

            run_time = wtime() - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));

            results(Mdim, Ndim, Pdim, h_C, run_time);

        } // end for loop

    } catch (cl::Error err)
    {
        std::cout << "Exception\n";
        std::cerr << "ERROR: "
                  << err.what()
                  << "("
                  << err_code(err.err())
                  << ")"
                  << std::endl;
    }

    return EXIT_SUCCESS;
}
Esempio n. 6
0
/*
 * Counts PAPI preset events around a matrix-matrix multiply.
 *
 * Allocates the three matrices, builds a multiplexed PAPI event set holding as
 * many non-derived preset events as PAPI_MAX_MPX_CTRS allows, runs matmul()
 * between PAPI_start and PAPI_stop, then prints the events with non-zero
 * counts, the events with zero counts, and a caution if any two counters ended
 * with the same value.
 *
 * Returns 0 on success and 1 if a matrix cannot be allocated. PAPI failures
 * are reported through test_fail() (defined elsewhere; presumably it
 * terminates the program - confirm).
 */
int main()
{
    double *a = NULL;
    double *b = NULL;
    double *c = NULL;
    int i = 0, j = 0, k = 0;
    int *events;                        // Array of events
    long long *values;                  // Array of values events
    int EventSet = PAPI_NULL;           // Handle for a PAPI event set as created by PAPI_create_eventset (3)
    int retval;                         // Test fail function
    int num_event = 0;                  // Number of events
    int max_event;                      // Number of available events
    int EventCode = 0;                  // Event code
    PAPI_event_info_t pset;             // PAPI_event_info_t Struct Reference
    char evname[PAPI_MAX_STR_LEN];      // Symbol event

    /* Allocate the matrices: a is mrows x ncolumns, b is ncolumns x pcolumns,
       c is mrows x pcolumns */
    a = (double *)malloc(mrows * ncolumns * sizeof(double));
    b = (double *)malloc(ncolumns * pcolumns * sizeof(double));
    c = (double *)malloc(mrows * pcolumns * sizeof(double));
    if (a == NULL || b == NULL || c == NULL) {
        /* Report each failed allocation with its own element count and stop:
           continuing would dereference a NULL matrix. */
        if (a == NULL)
            printf("Error malloc matrix a[%d]\n", mrows * ncolumns);
        if (b == NULL)
            printf("Error malloc matrix b[%d]\n", ncolumns * pcolumns);
        if (c == NULL)
            printf("Error malloc matrix c[%d]\n", mrows * pcolumns);
        free(a);
        free(b);
        free(c);
        return 1;
    }

    /* Initialize the Matrix arrays */
    initmat(a, b, mrows, ncolumns, pcolumns);

    /* Initialize the PAPI library */
    retval = PAPI_library_init(PAPI_VER_CURRENT);
    if (retval != PAPI_VER_CURRENT)
        test_fail( __FILE__, __LINE__, "PAPI_library_init", retval );

    /* Enable and initialize multiplex support */
    retval = PAPI_multiplex_init();
    if ( retval != PAPI_OK )
        test_fail( __FILE__, __LINE__, "PAPI_multiplex_init", retval );

    /* Create an EventSet */
    retval = PAPI_create_eventset(&EventSet);
    if ( retval != PAPI_OK )
        test_fail( __FILE__, __LINE__, "PAPI_create_eventset", retval );

    /* Assign it to the CPU component */
    retval = PAPI_assign_eventset_component(EventSet, 0);
    if ( retval != PAPI_OK )
        test_fail( __FILE__, __LINE__, "PAPI_assign_eventset_component", retval );

    /* Convert the EventSet to a multiplexed event set */
    retval = PAPI_set_multiplex(EventSet);
    if ( retval != PAPI_OK )
        test_fail( __FILE__, __LINE__, "PAPI_set_multiplex", retval );

    /* Obtaining the number of available events */
    max_event = PAPI_get_opt( PAPI_MAX_MPX_CTRS, NULL );
    printf("\nNumber of available events: %d", max_event );

    /* Fill up the event set with as many non-derived events as we can */
    EventCode = PAPI_PRESET_MASK;
    do {
        if ( PAPI_get_event_info( EventCode, &pset ) == PAPI_OK ) {
            if ( pset.count && ( strcmp( pset.derived, "NOT_DERIVED" ) == 0 ) ) {
                retval = PAPI_add_event( EventSet, ( int ) pset.event_code );
                if ( retval != PAPI_OK )
                    test_fail( __FILE__, __LINE__, "PAPI_add_event", retval );
                else {
                    //printf( "Added %s\n", pset.symbol );
                    num_event++;
                }
            }
        }
    } while ( ( PAPI_enum_event( &EventCode, PAPI_PRESET_ENUM_AVAIL ) == PAPI_OK ) && ( num_event < max_event ) );

    /* Allocate one slot per added event for the codes and the counts */
    events = ( int * ) malloc( ( size_t ) num_event * sizeof ( int ) );
    if ( events == NULL )
        test_fail( __FILE__, __LINE__, "Error malloc events", 0 );
    values = ( long long * ) malloc( ( size_t ) num_event * sizeof ( long long ) );
    if ( values == NULL )
        test_fail( __FILE__, __LINE__, "Erro malloc values", 0 );

    /* Start counting events */
    if ((retval=PAPI_start(EventSet)) != PAPI_OK)
        test_fail(__FILE__, __LINE__, "PAPI_start", retval);

    /* Matrix-Matrix multiply */
    matmul(a, b, c, mrows, ncolumns, pcolumns);

    /* Read the counters */
    if ((retval=PAPI_read( EventSet, values )) != PAPI_OK)
        test_fail(__FILE__, __LINE__, "PAPI_read_counters", retval);

    /* Stop counting events */
    if ((retval=PAPI_stop( EventSet, values )) != PAPI_OK)
        test_fail(__FILE__, __LINE__, "PAPI_stop_counters", retval);

    /* List the events in the event set */
    retval = PAPI_list_events( EventSet, events, &num_event );
    if ( retval != PAPI_OK )
        test_fail( __FILE__, __LINE__, "PAPI_list_events", retval );

    /* Print results */
    printf("\nNumber of non-zero events: %d\n", num_event );
    printf( "\nCounts of non-zero available events........................................................\n" );
    printf("Name: \t\t\t  Value: \t Description:\n");
    for ( i = 0; i < num_event; i++ ) {
        PAPI_event_code_to_name( events[i], evname );   // Obtaining name of available events
        PAPI_get_event_info(events[i], &pset);
        if ( values[i] != 0 )  printf("%s \t %15lld \t %s\n", evname, values[i], pset.long_descr);
    }
    printf( "\nCounts of zero available events............................................................\n" );
    printf("Name: \t\t\t  Value: \t Description:\n");
    for ( i = 0; i < num_event; i++ ) {
        PAPI_event_code_to_name( events[i], evname );   // Obtaining name of available events
        PAPI_get_event_info(events[i], &pset);
        if ( values[i] == 0 )  printf("%s \t %15lld \t %s\n", evname, values[i], pset.long_descr);
    }

    /* Check if counter pair(s) had identical values: visit every unordered
       pair (i, j) with i < j exactly once. The inner loop must initialise j,
       not i. */
    for ( i = 0; i < num_event; i++ ) {
        for ( j = i + 1; j < num_event; j++ ) {
            if ( values[i] == values[j] ) k++;
        }
    }
    if ( k != 0 ) {
        printf( "\nCaution: %d counter pair(s) had identical values\n", k );
    }
    printf("\n");

    /* Free memory */
    free( events );
    free( values );
    free( a );
    free( b );
    free( c );

    /* Cleaning events */
    retval = PAPI_cleanup_eventset( EventSet );
    if ( retval != PAPI_OK )
        test_fail( __FILE__, __LINE__, "PAPI_cleanup_eventset", retval );

    /* Destroying events */
    retval = PAPI_destroy_eventset( &EventSet );
    if ( retval != PAPI_OK )
        test_fail( __FILE__, __LINE__, "PAPI_destroy_eventset", retval );

    return 0;
}
Esempio n. 7
0
//--------------------------------------------------------------------------------
// Matrix multiplication driver: C = A * B for square N x N matrices (N = ORDER).
//
// Sequence:
//   1. Select an OpenCL device using the index filled in by parseArguments()
//      and create a context and command queue on it.
//   2. Run the sequential host version COUNT times.
//   3. Build and run, COUNT times each, the "mmul" kernel found in
//      ../C_elem.cl, ../C_row.cl and ../C_row_priv.cl.
// Each run is timed with wtime() and reported through results().  For the
// OpenCL variants the timed region spans the kernel enqueue up to clFinish();
// the read-back of C is not included.
//
// Returns EXIT_SUCCESS when every variant ran, EXIT_FAILURE on host allocation
// failure, an out-of-range device index, or a kernel build failure.
//--------------------------------------------------------------------------------
int main(int argc, char *argv[])
{
    float *h_A;             // A matrix
    float *h_B;             // B matrix
    float *h_C;             // C = A*B matrix
    int N;                  // A[N][N], B[N][N], C[N][N]
    int size;               // number of elements in each matrix

    cl_mem d_a, d_b, d_c;   // Matrices in device memory

    double start_time;      // Starting time
    double run_time;        // timing data

    char * kernelsource;    // kernel source string

    cl_int err;             // error code returned from OpenCL calls
    cl_device_id     device;        // compute device id 
    cl_context       context;       // compute context
    cl_command_queue commands;      // compute command queue
    cl_program       program;       // compute program
    cl_kernel        kernel;        // compute kernel

    N = ORDER;

    size = N * N;

    h_A = (float *)malloc(size * sizeof(float));
    h_B = (float *)malloc(size * sizeof(float));
    h_C = (float *)malloc(size * sizeof(float));

    // Bail out early instead of dereferencing a NULL matrix later on.
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        printf("Error: failed to allocate host memory for the matrices\n");
        free(h_A);
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

//--------------------------------------------------------------------------------
// Create a context, queue and device.
//--------------------------------------------------------------------------------

    cl_uint deviceIndex = 0;
    parseArguments(argc, argv, &deviceIndex);

    // Get list of devices
    cl_device_id devices[MAX_DEVICES];
    unsigned numDevices = getDeviceList(devices);

    // Check device index in range
    if (deviceIndex >= numDevices)
    {
      printf("Invalid device index (try '--list')\n");
      return EXIT_FAILURE;
    }

    device = devices[deviceIndex];

    char name[MAX_INFO_STRING];
    getDeviceName(device, name);
    printf("\nUsing OpenCL device: %s\n", name);

    // Create a compute context
    context = clCreateContext(0, 1, &device, NULL, NULL, &err);
    checkError(err, "Creating context");
    // Create a command queue
    commands = clCreateCommandQueue(context, device, 0, &err);
    checkError(err, "Creating command queue");


//--------------------------------------------------------------------------------
// Run sequential version on the host
//--------------------------------------------------------------------------------

    initmat(N, h_A, h_B, h_C);

    printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER);
    for(int i = 0; i < COUNT; i++)
    {
        zero_mat(N, h_C);
        start_time = wtime();

        seq_mat_mul_sdot(N, h_A, h_B, h_C);

        run_time  = wtime() - start_time;
        results(N, h_C, run_time);
    }

//--------------------------------------------------------------------------------
// Setup the buffers, initialize matrices, and write them into global memory
//--------------------------------------------------------------------------------

    //  Reset A, B and C matrices (just to play it safe)
    initmat(N, h_A, h_B, h_C);

    d_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                            sizeof(float) * size, h_A, &err);
    checkError(err, "Creating buffer d_a");
    d_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                            sizeof(float) * size, h_B, &err);
    checkError(err, "Creating buffer d_b");
    d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                            sizeof(float) * size, NULL, &err);
    checkError(err, "Creating buffer d_c");


//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... Naive
//--------------------------------------------------------------------------------

    kernelsource = getKernelSource("../C_elem.cl");
    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err);
    checkError(err, "Creating program with C_elem.cl");
    free(kernelsource);
    // Build the program
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n%s\n", err_code(err));
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        return EXIT_FAILURE;
    }

    // Create the compute kernel from the program
    kernel = clCreateKernel(program, "mmul", &err);
    checkError(err, "Creating kernel from C_elem.cl");

    printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",N);

    // Do the multiplication COUNT times
    for (int i = 0; i < COUNT; i++)
    {
        zero_mat(N, h_C);

        err =  clSetKernelArg(kernel, 0, sizeof(int),    &N);
        err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_a);
        err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_b);
        err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_c);
        checkError(err, "Setting kernel args");

        start_time = wtime();

        // Execute the kernel over the entire range of C matrix elements ... computing
        // a dot product for each element of the product matrix.  The local work
        // group size is set to NULL ... so I'm telling the OpenCL runtime to
        // figure out a local work group size for me.
        const size_t global[2] = {(size_t)N, (size_t)N};
        err = clEnqueueNDRangeKernel(
            commands,
            kernel,
            2, NULL,
            global, NULL,
            0, NULL, NULL);
        checkError(err, "Enqueueing kernel");

        err = clFinish(commands);
        checkError(err, "Waiting for kernel to finish");

        run_time = wtime() - start_time;

        err = clEnqueueReadBuffer(
            commands, d_c, CL_TRUE, 0,
            sizeof(float) * size, h_C,
            0, NULL, NULL);
        checkError(err, "Copying back d_c");

        results(N, h_C, run_time);

    } // end for loop

    // Done with the naive variant: release its kernel and program before the
    // handles are reused for the next variant, otherwise they are leaked.
    clReleaseKernel(kernel);
    clReleaseProgram(program);

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... C row per work item
//--------------------------------------------------------------------------------
    kernelsource = getKernelSource("../C_row.cl");
    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err);
    checkError(err, "Creating program with C_row.cl");
    free(kernelsource);
    // Build the program
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n%s\n", err_code(err));
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        return EXIT_FAILURE;
    }

    // Create the compute kernel from the program
    kernel = clCreateKernel(program, "mmul", &err);
    checkError(err, "Creating kernel from C_row.cl");

    printf("\n===== OpenCL, matrix mult, C row per work item, order %d ======\n",N);

    // Do the multiplication COUNT times
    for (int i = 0; i < COUNT; i++)
    {
        zero_mat(N, h_C);

        err =  clSetKernelArg(kernel, 0, sizeof(int),    &N);
        err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_a);
        err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_b);
        err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_c);
        checkError(err, "Setting kernel args");

        start_time = wtime();

        // Execute the kernel over the rows of the C matrix ... computing
        // a dot product for each element of the product matrix.
        const size_t global = N;
        err = clEnqueueNDRangeKernel(
            commands,
            kernel,
            1, NULL,
            &global, NULL,
            0, NULL, NULL);
        checkError(err, "Enqueueing kernel");

        err = clFinish(commands);
        checkError(err, "Waiting for kernel to finish");

        run_time = wtime() - start_time;

        err = clEnqueueReadBuffer(
            commands, d_c, CL_TRUE, 0,
            sizeof(float) * size, h_C,
            0, NULL, NULL);
        checkError(err, "Reading back d_c");

        results(N, h_C, run_time);

    } // end for loop

    // Release the row-per-work-item kernel and program before reusing the handles.
    clReleaseKernel(kernel);
    clReleaseProgram(program);


//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... C row per work item, A row in private memory
//--------------------------------------------------------------------------------
    kernelsource = getKernelSource("../C_row_priv.cl");
    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err);
    checkError(err, "Creating program from C_row_priv.cl");
    free(kernelsource);
    // Build the program
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n%s\n", err_code(err));
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        return EXIT_FAILURE;
    }

    // Create the compute kernel from the program
    kernel = clCreateKernel(program, "mmul", &err);
    checkError(err, "Creating kernel from C_row_priv.cl");

    printf("\n===== OpenCL, matrix mult, C row, A row in priv mem, order %d ======\n",N);

    // Do the multiplication COUNT times
    for (int i = 0; i < COUNT; i++)
    {
        zero_mat(N, h_C);

        err =  clSetKernelArg(kernel, 0, sizeof(int),    &N);
        err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_a);
        err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_b);
        err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_c);
        checkError(err, "Setting kernel args");

        start_time = wtime();

        // Execute the kernel over the rows of the C matrix ... computing
        // a dot product for each element of the product matrix.
        // NOTE(review): an explicit local size is passed here, so ORDER / 16
        // must be non-zero and is expected to divide N evenly -- confirm for
        // the ORDER this is built with.
        const size_t global = N;
        const size_t local = ORDER / 16;
        err = clEnqueueNDRangeKernel(
            commands,
            kernel,
            1, NULL,
            &global, &local,
            0, NULL, NULL);
        checkError(err, "Enqueueing kernel");

        err = clFinish(commands);
        checkError(err, "Waiting for kernel to finish");

        run_time = wtime() - start_time;

        err = clEnqueueReadBuffer(
            commands, d_c, CL_TRUE, 0,
            sizeof(float) * size, h_C,
            0, NULL, NULL);
        checkError(err, "Reading back d_c");

        results(N, h_C, run_time);

    } // end for loop


//--------------------------------------------------------------------------------
// Clean up!
//--------------------------------------------------------------------------------

    free(h_A);
    free(h_B);
    free(h_C);
    clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
    // Only the last variant's kernel/program are still live at this point.
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    return EXIT_SUCCESS;
}
int main(void)
{

    int N;   // A[N][N], B[N][N], C[N][N]
    int sz;  // number of elements in each matrix
    float tmp;

    N = ORDER;

    sz = N * N;

    std::vector<float> h_A(sz); // Matrix A on the host
    std::vector<float> h_B(sz); // Matrix B on the host
    std::vector<float> h_C(sz); // Matrix C on the host

    cl::Buffer d_A;    // matrix A on the device
    cl::Buffer d_B;    // matrix B on the device
    cl::Buffer d_C;    // matrix C on the device

    initmat(N, N, N, h_A, h_B, h_C);

    printf("\n===== Sequential, matrix mult (dot prod), order %d on CPU ======\n",ORDER);
 
    zero_mat(N, N, h_C);

    util::Timer timer;

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            tmp = 0.0f;
            for (int k = 0; k < N; k++) {
                tmp += h_A[i*N+k] * h_B[k*N+j];
            }
            h_C[i*N+j] = tmp;
        }
    }
              
    double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

    results(N, N, N, h_C, rtime);

    printf("\n===== Parallel matrix mult (dot prod), order %d on device ======\n",ORDER);

    switch (DEVICE) {
      case CL_DEVICE_TYPE_DEFAULT: printf("DEVICE=DEFAULT\n"); break;
      case CL_DEVICE_TYPE_CPU:     printf("DEVICE=CPU\n"); break;
      case CL_DEVICE_TYPE_GPU:     printf("DEVICE=GPU\n"); break;
      default:                     printf("DEVICE=%d\n", DEVICE); break;
    }
 
    zero_mat(N, N, h_C);
    try
    {
   
       cl::Context context(DEVICE);

       // Load in kernel source, creating a program object for the context.
       // Build program explicitly so I can catch errors and display
       // compiler error messages (should any be generated)

       cl::Program program(context, util::loadProgram("matmul_kernel.cl"));
       try
       {
           program.build();
       }
       catch (cl::Error error)
       {
          // If it was a build error then show the error
          if (error.err() == CL_BUILD_PROGRAM_FAILURE)
           {
               std::vector<cl::Device> devices;
               devices = context.getInfo<CL_CONTEXT_DEVICES>();
               std::string built = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);
               std::cerr << built << "\n";
           }
           throw error;
       }


        // Get the command queue
        cl::CommandQueue queue(context);


        // Create the kernel functor
 
        auto mmul = cl::make_kernel<int, cl::Buffer, cl::Buffer, cl::Buffer, 
                                    cl::LocalSpaceArg, cl::LocalSpaceArg>   
                                    (program, "mmul");

        util::Timer timer;


        d_A   = cl::Buffer(context, begin(h_A), end(h_A), true);
        d_B   = cl::Buffer(context, begin(h_B), end(h_B), true);
        d_C   = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * sz);

        // Work-group computes a block of C.  This size is also set
        // in a #define inside the kernel function.  Note this blocksize
        // must evenly divide the matrix order
        int blocksize = 16;  

        cl::LocalSpaceArg A_block = cl::Local(sizeof(float) * blocksize*blocksize);
        cl::LocalSpaceArg B_block = cl::Local(sizeof(float) * blocksize*blocksize);
 
        mmul(
            cl::EnqueueArgs(
            queue,
            cl::NDRange(N,N),
            cl::NDRange(blocksize,blocksize)),
            N, 
            d_A,
            d_B,
            d_C,
            A_block,
            B_block);

        cl::copy(queue, d_C, begin(h_C), end(h_C));

        double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

        results(N, N, N, h_C, rtime);
          
    }
    catch (cl::Error err) {
        std::cout << "Exception\n";
        std::cerr 
            << "ERROR: "
            << err.what()
            << std::endl;
 
    }

}
Esempio n. 9
0
int main(int argc, char *argv[])
{

    int N;                  // A[N][N], B[N][N], C[N][N]
    int size;               // Number of elements in each matrix


    double start_time;      // Starting time
    double run_time;        // Timing
    util::Timer timer;      // Timing

    N    = ORDER;
    size = N * N;

    std::vector<float> h_A(size); // Host memory for Matrix A
    std::vector<float> h_B(size); // Host memory for Matrix B
    std::vector<float> h_C(size); // Host memory for Matrix C

    cl::Buffer d_a, d_b, d_c;   // Matrices in device memory

//--------------------------------------------------------------------------------
// Create a context and queue
//--------------------------------------------------------------------------------

    try
    {

        cl_uint deviceIndex = 0;
        parseArguments(argc, argv, &deviceIndex);

        // Get list of devices
        std::vector<cl::Device> devices;
        unsigned numDevices = getDeviceList(devices);

        // Check device index in range
        if (deviceIndex >= numDevices)
        {
          std::cout << "Invalid device index (try '--list')\n";
          return EXIT_FAILURE;
        }

        cl::Device device = devices[deviceIndex];

        std::string name;
        getDeviceName(device, name);
        std::cout << "\nUsing OpenCL device: " << name << "\n";

        std::vector<cl::Device> chosen_device;
        chosen_device.push_back(device);
        cl::Context context(chosen_device);
        cl::CommandQueue queue(context, device);

//--------------------------------------------------------------------------------
// Run sequential matmul
//--------------------------------------------------------------------------------


        initmat(N, h_A, h_B, h_C);

        timer.reset();

        printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",N);
        for(int i = 0; i < COUNT; i++)
        {
            zero_mat(N, h_C);

            start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

            seq_mat_mul_sdot(N, h_A, h_B, h_C);

            run_time  = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time;
            results(N, h_C, run_time);
        }

//--------------------------------------------------------------------------------
// Setup the buffers, initialize matrices, and write them into global memory
//--------------------------------------------------------------------------------

        //  Reset A, B and C matrices (just to play it safe)
        initmat(N, h_A, h_B, h_C);

        d_a = cl::Buffer(context, h_A.begin(), h_A.end(), true);

        d_b = cl::Buffer(context, h_B.begin(), h_B.end(), true);

        d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size);

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... Naive
//--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        cl::Program program(context, kernelsource, true);

        // Create the compute kernel from the program
        cl::make_kernel<int, cl::Buffer, cl::Buffer, cl::Buffer> naive_mmul(program, "mmul");

        printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",N);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++)
        {
            zero_mat(N, h_C);

            start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

            // Execute the kernel over the entire range of C matrix elements ... computing
            // a dot product for each element of the product matrix.  The local work
            // group size is set to NULL ... so I'm telling the OpenCL runtime to
            // figure out a local work group size for me.
            cl::NDRange global(N, N);
            naive_mmul(cl::EnqueueArgs(queue, global),
                    N, d_a, d_b, d_c);

            queue.finish();

            run_time  = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time;

            cl::copy(queue, d_c, h_C.begin(), h_C.end());

            results(N, h_C, run_time);

        } // end for loop

    } catch (cl::Error err)
    {
        std::cout << "Exception\n";
        std::cerr << "ERROR: "
                  << err.what()
                  << "("
                  << err_code(err.err())
                  << ")"
                  << std::endl;
    }

    return EXIT_SUCCESS;
}
Esempio n. 10
0
int main(void)
{

    int Mdim, Ndim, Pdim;   // A[N][P], B[P][M], C[N][M]
    int szA, szB, szC;      // Number of elements in each matrix


    double start_time;      // Starting time
    double run_time;        // Timing 
    util::Timer timer;      // Timing

    Ndim = ORDER;
    Pdim = ORDER;
    Mdim = ORDER;

    szA = Ndim * Pdim;
    szB = Pdim * Mdim;
    szC = Ndim * Mdim;

    std::vector<float> h_A(szA); // Host memory for Matrix A
    std::vector<float> h_B(szB); // Host memory for Matrix B
    std::vector<float> h_C(szC); // Host memory for Matrix C

    cl::Buffer d_a, d_b, d_c;   // Matrices in device memory

    initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);

    timer.reset();

    printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER);
    for(int i = 0; i < COUNT; i++)
    {
        zero_mat(Ndim, Mdim, h_C);

        start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

        seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C);

        run_time  = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time;
        results(Mdim, Ndim, Pdim, h_C, run_time);
    }

    try
    {

//--------------------------------------------------------------------------------
// Create a context and queue for DEVICE
//--------------------------------------------------------------------------------

        cl::Context context(DEVICE);
        cl::CommandQueue queue(context);

//--------------------------------------------------------------------------------
// Setup the buffers, initialize matrices, and write them into global memory
//--------------------------------------------------------------------------------

        //  Reset A, B and C matrices (just to play it safe)
        initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);

        d_a = cl::Buffer(context, begin(h_A), end(h_A), true);

        d_b = cl::Buffer(context, begin(h_B), end(h_B), true);

        d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC);

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... Naive
//--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        cl::Program program(context, kernelsource, true);

        // Create the compute kernel from the program
        auto naive_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul");

        printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++)
        {
            zero_mat(Ndim, Mdim, h_C);

            start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

            // Execute the kernel over the entire range of C matrix elements ... computing
            // a dot product for each element of the product matrix.  The local work
            // group size is set to NULL ... so I'm telling the OpenCL runtime to
            // figure out a local work group size for me.
            cl::NDRange global(Ndim, Mdim);
            naive_mmul(cl::EnqueueArgs(queue, global),
                    Mdim, Ndim, Pdim, d_a, d_b, d_c);

            queue.finish();

            run_time  = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));

            results(Mdim, Ndim, Pdim, h_C, run_time);

        } // end for loop

    } catch (cl::Error err)
    {
        std::cout << "Exception\n";
        std::cerr << "ERROR: "
                  << err.what()
                  << "("
                  << err_code(err.err())
                  << ")"
                  << std::endl;
    }

    return EXIT_SUCCESS;
}