C++ (Cpp) zero_mat Examples

Programming Language: C++ (Cpp)

Method/Function: zero_mat

Examples at hotexamples.com: 5

C++ (Cpp) zero_mat - 5 examples found. These are the top rated real world C++ (Cpp) examples of zero_mat extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

// exponentiate exactly a hermitian matrix "Q" into SU(NC) matrix "U"
void
exponentiate( GLU_complex U[ NCNC ] , 
	      const GLU_complex Q[ NCNC ] )
{
#if NC == 3
  GLU_real *qq = ( GLU_real* )Q ;
  const double REQ0 = *( qq + 0 ) ;
  const double REQ1 = *( qq + 2 ) ;
  const double IMQ1 = *( qq + 3 ) ;
  const double REQ2 = *( qq + 4 ) ;
  const double IMQ2 = *( qq + 5 ) ;
  const double REQ4 = *( qq + 8 ) ;
  const double REQ5 = *( qq + 10 ) ;
  const double IMQ5 = *( qq + 11 ) ;
  const double REQ8 = *( qq + 16 ) ;

  // speed this up too (use determinant relation)
  const double c1 = ( REQ0 * REQ0 + REQ0 * REQ4 + REQ4 * REQ4		\
		      + REQ1 * REQ1 + IMQ1 * IMQ1			\
		      + REQ2 * REQ2 + IMQ2 * IMQ2			\
		      + REQ5 * REQ5 + IMQ5 * IMQ5 ) * OneO3 ; 
 
  //Iff c0_max < ( smallest representable double) the matrix Q is zero and its
  //exponential is the identity matrix ..
  if( unlikely( c1 < DBL_MIN ) ) {
    *( U + 0 ) = 1. ; 
    *( U + 1 ) = 0. ; 
    *( U + 2 ) = 0. ; 
    *( U + 3 ) = 0. ; 
    *( U + 4 ) = 1. ; 
    *( U + 5 ) = 0. ; 
    *( U + 6 ) = 0. ; 
    *( U + 7 ) = 0. ; 
    *( U + 8 ) = 1. ; 
    return ;
  }

  // will write this out as it can be done cheaper
  // 1/3 * tr AAA is just det( A ) 
  // Below is a quickened determinant
  double c0 =  REQ0  * ( REQ4 * REQ8				\
			 - REQ5  * REQ5  - IMQ5  * IMQ5  ) ;
  // from the middle
  c0 -= REQ1  * ( REQ1 * REQ8 		\
		  - REQ5  * REQ2  - IMQ5  * IMQ2  ) ;
  c0 += IMQ1  * ( - IMQ1 * REQ8		\
		  + REQ5  * IMQ2  - IMQ5  * REQ2 ) ;
  // final column
  c0 += REQ2  * ( - REQ4  * REQ2			\
		  + REQ1 * REQ5  - IMQ1 * IMQ5  ) ;
  c0 -= IMQ2  * ( REQ4  * IMQ2				\
		  - REQ1 * IMQ5  - IMQ1 * REQ5 ) ;

  // so if c0 is negative we flip the sign ...
  const double flag = c0 < 0 ? -1.0 : 1.0 ;
  c0 *= flag ;
 
  // compute the constants c0_max and the root of c1 ...
  const double rc1 = sqrt( c1 ) ;
  const double c0_max = 2. * rc1 * c1 ; 
  const double theta = acos( c0 / c0_max ) * OneO3 ; 
  const double ctheta = cos( theta ) ;
  register const double u = rc1 * ctheta ; 
  register const double w = r3 * rc1 * sin( theta ) ;
  const double uu = u * u  ,  ww = w * w  ,  cw = cos( w ) ; 
  const double denom = 1.0 / ( 9. * uu - ww ) ;
  const double cu = cos( u ) ;
  const double su = sin( u ) ;
  // and I thought double angle formulas were useless!
  //double complex one , two ;
  const double complex one = cu - I * su ;
  double complex two = conj( one ) ; //cu + I * su ;
  two *= two ;
 
  // taylor expand if getting toward the numerically unstable end
  const double E0 = fabs( w ) < SINTOL ? ( 1 - ww / 6. * ( 1 - ww / 20. * ( 1 - ww / 42. ) ) ) : sin( w ) / w ; 

  double complex f0 = ( uu - ww ) * two + one * ( 8. * uu * cw + 2. * I * u * ( 3. * uu + ww ) * E0 ) ; 
  double complex f1 = 2. * u * two - one * ( 2. * u * cw - I * ( 3. * uu - ww ) * E0 ) ; 
  double complex f2 = two - one * ( cw + 3. * I * u * E0 ) ; 

  f0 = denom * ( creal( f0 ) + I * cimag( f0 ) * flag ) ;
  f1 = denom * ( flag * creal( f1 ) + I * cimag( f1 ) ) ;
  f2 = denom * ( creal( f2 ) + I * cimag( f2 ) * flag ) ;

  // QQ[0].
  const double temp0 = REQ0 * REQ0 + REQ1 * REQ1 +	\
    IMQ1 * IMQ1 + REQ2 * REQ2 + IMQ2 * IMQ2 ;
  // QQ[1]
  const double complex temp1 = -REQ1 * ( REQ8 ) + REQ2 * REQ5 + IMQ2 * IMQ5 
    + I * ( REQ5 * IMQ2 - REQ2 * IMQ5 - IMQ1 * REQ8 ) ;
  // QQ[2]
  const double complex temp2 = REQ1 * REQ5 - IMQ1 * IMQ5 - REQ2 * REQ4 +
    I * ( IMQ1 * REQ5 + IMQ5 * REQ1 - IMQ2 * REQ4 ) ;
  // QQ[4]
  const double temp3 = REQ4 * REQ4 + REQ1 * REQ1	\
    + IMQ1 * IMQ1 + REQ5 * REQ5 + IMQ5 * IMQ5 ;
  // QQ[5]
  const double complex temp4 = REQ1 * REQ2 + IMQ2 * IMQ1 - REQ0 * REQ5 +
    I * ( REQ1 * IMQ2 - REQ2 * IMQ1 - REQ0 * IMQ5 ) ;
  // QQ[8]
  const double temp5 = REQ8 * REQ8 + REQ2 * REQ2 +	\
    IMQ2 * IMQ2 + REQ5 * REQ5 + IMQ5 * IMQ5 ;
  // U = f0I + f1 Q + f2 QQ 
  *( U + 0 ) = f0 + f1 * REQ0 + f2 * temp0  ; 
  *( U + 1 ) = f1 * Q[1] + f2 * temp1 ; 
  *( U + 2 ) = f1 * Q[2] + f2 * temp2 ; 
  //
  *( U + 3 ) = f1 * Q[3] + f2 * conj( temp1 ) ; 
  *( U + 4 ) = f0 + f1 * REQ4 + f2 * temp3 ; 
  *( U + 5 ) = f1 * Q[5] + f2 * temp4 ; 
  //
  *( U + 6 ) = f1 * Q[6] + f2 * conj( temp2 ) ; 
  *( U + 7 ) = f1 * Q[7] + f2 * conj( temp4 ) ; 
  *( U + 8 ) = f0 + f1 * REQ8 + f2 * temp5 ; 
#elif NC == 2
  double f0 , f1 ; // f1 is purely imaginary
  // eigenvalues are pretty simple +/- sqrt( |a|^2 + |b|^2 ) Only need one
  const double z = sqrt( creal( Q[0] ) * creal( Q[0] ) +	\
			 creal( Q[1] ) * creal( Q[1] ) +	\
			 cimag( Q[1] ) * cimag( Q[1] ) ) ;

  // have eigenvalues, now for the "fun" bit.
  f0 = cos( z ) ;
  // taylor expand 
  f1 = fabs ( z ) < SINTOLSU2 ? ( 1 - z / 6. * ( 1 - z / 20. * ( 1 - z / 42. ) ) ) : sin( z ) / z ;

  const double complex f1Q0 = I * f1 * creal( Q[0] ) ;
  const double complex f1Q1 = I * f1 * Q[1] ;
  *( U + 0 ) = (GLU_complex)( f0 + f1Q0 ) ;
  *( U + 1 ) = (GLU_complex)( f1Q1 ) ;
  *( U + 2 ) = (GLU_complex)( -conj( f1Q1 ) ) ;
  *( U + 3 ) = (GLU_complex)( f0 - f1Q0 ) ; 
#else
  // hmmm could be a toughy
  #if ( defined HAVE_LAPACKE_H || defined HAVE_GSL )
  double complex f[ NC ] ;
  double z[ NC ] ;
  Eigenvalues_hermitian( z , Q ) ;
  calculate_effs_VDM_herm( f , z ) ;
  // matrix expansion reversing horner's rule
  int i , j ;
  diag( U , f[ NC - 1 ] ) ; 
  for( i = NC-1 ; i > 0 ; i-- ) {
    multab_atomic_left( U , Q ) ; // left multiply U with Q
    for( j = 0 ; j < NC ; j++ ) { U[ j*(NC+1) ] += f[ i-1 ] ; }
  }
  #else
  // exponentiate routine from stephan durr's paper
  // Performs the nesting
  // U = ( exp{ A / DIV^n ) ) ^ ( DIV * n )
  GLU_complex EOLD[ NCNC ] GLUalign , SN[ NCNC ] GLUalign ;
  GLU_complex RN_MIN[ NCNC ] GLUalign , RN[ NCNC ] GLUalign ;

  // set to zero
  zero_mat( EOLD ) ;

  // set up the divisor and the minimum 
  double sum = 0.0 ;
  size_t j , n ;
  const int nmin = 3 ;

  // use precomputed factorials  
  for( n = nmin ; n < 10 ; n++ ) {

    // compute the multiplicative factor ...
    const int iter = 2 << ( n - 1 ) ;
    const GLU_complex fact = I / (GLU_real)iter ;

    // and the rational approximations
#ifdef USE_PADE
    for( j = 0 ; j < NCNC ; j++ ) { SN[ j ] = ( Q[j] * fact ) ; }
    horners_pade( RN , SN ) ;

    for( j = 0 ; j < NCNC ; j++ ) { SN[ j ] *= -1.0 ; }
    horners_pade( RN_MIN , SN ) ; 
#else
    for( j = 0 ; j < NCNC ; j++ ) { SN[ j ] = ( Q[j] * fact ) / 2.0 ; }
    horners_exp( RN , SN , 14 ) ;

    for( j = 0 ; j < NCNC ; j++ ) { SN[ j ] *= -1.0 ; }
    horners_exp( RN_MIN , SN , 14 ) ; 
#endif

    inverse( SN , RN_MIN ) ;         // uses our numerical inverse
    multab_atomic_right( RN , SN ) ; // gets the correct rational approx

    // and remove the nested scalings ...
    matrix_power( U , RN , iter ) ; // uses a fast-power like routine 
    
    // for the convergence criteria, I use the absolute difference between
    // evaluations
    sum = 0.0 ;
    for( j = 0 ; j < NCNC ; j++ ) {
      sum += (double)cabs( EOLD[j] - U[j] ) ;
      EOLD[ j ] = U[ j ] ;
    }
    sum /= NCNC ;

    // convergence ....
    if( sum < PREC_TOL ) { break ; } 
    // warning for non-convergence ..
    if( n >= ( MAX_FACTORIAL - 1 ) ) { 
      printf( "[EXPONENTIAL] not converging .. %zu %e \n" , n , sum ) ; 
      break ;
    }
  }
  // gramschmidt orthogonalisation just to make sure
  // has been seen to help preserve gauge invariance of log smearing
  gram_reunit( U ) ;
  #endif
#endif
  return ;
}

Example #2

Show file

File: matmul.cpp Project: BenElgar/Exercises-Solutions

int main(int argc, char *argv[])
{

    int N;                  // A[N][N], B[N][N], C[N][N]
    int size;               // Number of elements in each matrix


    double start_time;      // Starting time
    double run_time;        // Timing
    util::Timer timer;      // Timing

    N    = ORDER;
    size = N * N;

    std::vector<float> h_A(size); // Host memory for Matrix A
    std::vector<float> h_B(size); // Host memory for Matrix B
    std::vector<float> h_C(size); // Host memory for Matrix C

    cl::Buffer d_a, d_b, d_c;   // Matrices in device memory

//--------------------------------------------------------------------------------
// Create a context and queue
//--------------------------------------------------------------------------------

    try
    {

        cl_uint deviceIndex = 0;
        parseArguments(argc, argv, &deviceIndex);

        // Get list of devices
        std::vector<cl::Device> devices;
        unsigned numDevices = getDeviceList(devices);

        // Check device index in range
        if (deviceIndex >= numDevices)
        {
          std::cout << "Invalid device index (try '--list')\n";
          return EXIT_FAILURE;
        }

        cl::Device device = devices[deviceIndex];

        std::string name;
        getDeviceName(device, name);
        std::cout << "\nUsing OpenCL device: " << name << "\n";

        std::vector<cl::Device> chosen_device;
        chosen_device.push_back(device);
        cl::Context context(chosen_device);
        cl::CommandQueue queue(context, device);

//--------------------------------------------------------------------------------
// Run sequential matmul
//--------------------------------------------------------------------------------


        initmat(N, h_A, h_B, h_C);

        timer.reset();

        printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",N);
        for(int i = 0; i < COUNT; i++)
        {
            zero_mat(N, h_C);

            start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

            seq_mat_mul_sdot(N, h_A, h_B, h_C);

            run_time  = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time;
            results(N, h_C, run_time);
        }

//--------------------------------------------------------------------------------
// Setup the buffers, initialize matrices, and write them into global memory
//--------------------------------------------------------------------------------

        //  Reset A, B and C matrices (just to play it safe)
        initmat(N, h_A, h_B, h_C);

        d_a = cl::Buffer(context, h_A.begin(), h_A.end(), true);

        d_b = cl::Buffer(context, h_B.begin(), h_B.end(), true);

        d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size);

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... Naive
//--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        cl::Program program(context, kernelsource, true);

        // Create the compute kernel from the program
        cl::make_kernel<int, cl::Buffer, cl::Buffer, cl::Buffer> naive_mmul(program, "mmul");

        printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",N);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++)
        {
            zero_mat(N, h_C);

            start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

            // Execute the kernel over the entire range of C matrix elements ... computing
            // a dot product for each element of the product matrix.  The local work
            // group size is set to NULL ... so I'm telling the OpenCL runtime to
            // figure out a local work group size for me.
            cl::NDRange global(N, N);
            naive_mmul(cl::EnqueueArgs(queue, global),
                    N, d_a, d_b, d_c);

            queue.finish();

            run_time  = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time;

            cl::copy(queue, d_c, h_C.begin(), h_C.end());

            results(N, h_C, run_time);

        } // end for loop

    } catch (cl::Error err)
    {
        std::cout << "Exception\n";
        std::cerr << "ERROR: "
                  << err.what()
                  << "("
                  << err_code(err.err())
                  << ")"
                  << std::endl;
    }

    return EXIT_SUCCESS;
}

Example #3

Show file

File: mm.c Project: g-koutsou/CoS-2

int
main(int argc, char *argv[])
{
  int nargs = 3;  
  if(argc != nargs) {
    usage(argv);
    return 1;
  } 
  int M = atoi(argv[1]);
  int N = atoi(argv[2]);
  
  double *A = alloc(sizeof(double)*M*N);
  double *B = alloc(sizeof(double)*M*N);

  rand_mat(A, M, N);
  rand_mat(B, N, M);

  double *C = alloc(sizeof(double)*M*M);
  zero_mat(C, M, M);
  
  mat_mul(C, M, N, A, B);
  {
    double t0 = stop_watch(0);
    mat_mul(C, M, N, A, B);
    t0 = stop_watch(t0);
    double beta_fp = 0 /* _TODO_A_ calculate beta_fp from timing t0 */;
    printf(" ORIG: M = %d, N = %d,", M, N);
    printf(" took: %4.2e sec,", t0);
    printf(" P = %4.2e Mflop/s\n", beta_fp);
  }

#ifdef BLCK
  double *Cb = alloc(sizeof(double)*M*M);
  zero_mat(Cb, M, M);

  mat_mul_blocked(Cb, M, N, A, B);
  {
    double t0 = stop_watch(0);
    mat_mul_blocked(Cb, M, N, A, B);
    t0 = stop_watch(t0);
    double beta_fp = 0 /* _TODO_A_ calculate beta_fp from timing t0 */;    
    printf(" BLCK: M = %d, N = %d,", M, N);
    printf(" took: %4.2e sec,", t0);
    printf(" P = %4.2e Mflop/s, BM = %d, BN = %d\n", beta_fp, BM, BN);
  }
#endif
  
#ifdef BLCK
  double eps = 1e-12;
  double diff = 0;
  for(int i=0; i<M*M; i++) {
    diff += fabs((C[i] - Cb[i])/C[i]);
  }
  /*
   * If the difference between the flat and blocked result is larger
   * than eps, complain to stdout and write the two matrices to file
   * "diffs.out".
   */
  diff /= (double)M*M;
  if(diff > eps) {
    printf(" Non zero diff: %e\n", diff);
    FILE *fp = fopen("diffs.out", "w");
    for(int i=0; i<M*M; i++)
      fprintf(fp, "%e\n", fabs((C[i]-Cb[i])/C[i]));
    fclose(fp);
  }
#endif
  
  free(A);
  free(B);
  free(C);

#ifdef BLCK
  free(Cb);
#endif 
  return 0;
}

Example #4

Show file

File: matmul.cpp Project: OminiaVincit/Exercises-Solutions

int main(void)
{

    int Mdim, Ndim, Pdim;   // A[N][P], B[P][M], C[N][M]
    int szA, szB, szC;      // Number of elements in each matrix


    double start_time;      // Starting time
    double run_time;        // Timing 
    util::Timer timer;      // Timing

    Ndim = ORDER;
    Pdim = ORDER;
    Mdim = ORDER;

    szA = Ndim * Pdim;
    szB = Pdim * Mdim;
    szC = Ndim * Mdim;

    std::vector<float> h_A(szA); // Host memory for Matrix A
    std::vector<float> h_B(szB); // Host memory for Matrix B
    std::vector<float> h_C(szC); // Host memory for Matrix C

    cl::Buffer d_a, d_b, d_c;   // Matrices in device memory

    initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);

    timer.reset();

    printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER);
    for(int i = 0; i < COUNT; i++)
    {
        zero_mat(Ndim, Mdim, h_C);

        start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

        seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C);

        run_time  = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time;
        results(Mdim, Ndim, Pdim, h_C, run_time);
    }

    try
    {

//--------------------------------------------------------------------------------
// Create a context and queue for DEVICE
//--------------------------------------------------------------------------------

        cl::Context context(DEVICE);
        cl::CommandQueue queue(context);

//--------------------------------------------------------------------------------
// Setup the buffers, initialize matrices, and write them into global memory
//--------------------------------------------------------------------------------

        //  Reset A, B and C matrices (just to play it safe)
        initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);

        d_a = cl::Buffer(context, begin(h_A), end(h_A), true);

        d_b = cl::Buffer(context, begin(h_B), end(h_B), true);

        d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC);

//--------------------------------------------------------------------------------
// OpenCL matrix multiplication ... Naive
//--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        cl::Program program(context, kernelsource, true);

        // Create the compute kernel from the program
        auto naive_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul");

        printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++)
        {
            zero_mat(Ndim, Mdim, h_C);

            start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

            // Execute the kernel over the entire range of C matrix elements ... computing
            // a dot product for each element of the product matrix.  The local work
            // group size is set to NULL ... so I'm telling the OpenCL runtime to
            // figure out a local work group size for me.
            cl::NDRange global(Ndim, Mdim);
            naive_mmul(cl::EnqueueArgs(queue, global),
                    Mdim, Ndim, Pdim, d_a, d_b, d_c);

            queue.finish();

            run_time  = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));

            results(Mdim, Ndim, Pdim, h_C, run_time);

        } // end for loop

    } catch (cl::Error err)
    {
        std::cout << "Exception\n";
        std::cerr << "ERROR: "
                  << err.what()
                  << "("
                  << err_code(err.err())
                  << ")"
                  << std::endl;
    }

    return EXIT_SUCCESS;
}

Example #5

Show file

File: matmul_par_blocked.cpp Project: djmylt/High-Performance-Computing

int main(void)
{

    int N;   // A[N][N], B[N][N], C[N][N]
    int sz;  // number of elements in each matrix
    float tmp;

    N = ORDER;

    sz = N * N;

    std::vector<float> h_A(sz); // Matrix A on the host
    std::vector<float> h_B(sz); // Matrix B on the host
    std::vector<float> h_C(sz); // Matrix C on the host

    cl::Buffer d_A;    // matrix A on the device
    cl::Buffer d_B;    // matrix B on the device
    cl::Buffer d_C;    // matrix C on the device

    initmat(N, N, N, h_A, h_B, h_C);

    printf("\n===== Sequential, matrix mult (dot prod), order %d on CPU ======\n",ORDER);
 
    zero_mat(N, N, h_C);

    util::Timer timer;

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            tmp = 0.0f;
            for (int k = 0; k < N; k++) {
                tmp += h_A[i*N+k] * h_B[k*N+j];
            }
            h_C[i*N+j] = tmp;
        }
    }
              
    double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

    results(N, N, N, h_C, rtime);

    printf("\n===== Parallel matrix mult (dot prod), order %d on CPU ======\n",ORDER);
 
    zero_mat(N, N, h_C);
    try
    {
   
       cl::Context context(DEVICE);

       // Load in kernel source, creating a program object for the context.
       // Build program explicitly so I can catch errors and display
       // compiler error messages (should any be generated)

       cl::Program program(context, util::loadProgram("matmul_blocked.cl"));
       try
       {
           program.build();
       }
       catch (cl::Error error)
       {
          // If it was a build error then show the error
          if (error.err() == CL_BUILD_PROGRAM_FAILURE)
           {
               std::vector<cl::Device> devices;
               devices = context.getInfo<CL_CONTEXT_DEVICES>();
               std::string built = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);
               std::cerr << built << "\n";
           }
           throw error;
       }


        // Get the command queue
        cl::CommandQueue queue(context);


        // Create the kernel functor
 
        auto mmul = cl::make_kernel<int, cl::Buffer, cl::Buffer, cl::Buffer, 
                                      cl::LocalSpaceArg, cl::LocalSpaceArg>   
                                                            (program, "mmul");

        util::Timer timer;


        d_A   = cl::Buffer(context, begin(h_A), end(h_A), true);
        d_B   = cl::Buffer(context, begin(h_B), end(h_B), true);
        d_C   = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * sz);

        // Work-group computes a block of C.  This size is also set
        // in a #define inside the kernel function.  Note this blocksize
        // must evenly divide the matrix order
        int blocksize = 16;  

        cl::LocalSpaceArg A_block = cl::Local(sizeof(float) * blocksize*blocksize);
        cl::LocalSpaceArg B_block = cl::Local(sizeof(float) * blocksize*blocksize);
 
        mmul(
            cl::EnqueueArgs(
            queue,
            cl::NDRange(N,N),
            cl::NDRange(blocksize,blocksize)),
            N, 
            d_A,
            d_B,
            d_C,
            A_block,
            B_block);

        cl::copy(queue, d_C, begin(h_C), end(h_C));

        double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

        results(N, N, N, h_C, rtime);
          
    }
    catch (cl::Error err) {
        std::cout << "Exception\n";
        std::cerr 
            << "ERROR: "
            << err.what()
            << std::endl;
 
    }

}