// exponentiate exactly a hermitian matrix "Q" into SU(NC) matrix "U" void exponentiate( GLU_complex U[ NCNC ] , const GLU_complex Q[ NCNC ] ) { #if NC == 3 GLU_real *qq = ( GLU_real* )Q ; const double REQ0 = *( qq + 0 ) ; const double REQ1 = *( qq + 2 ) ; const double IMQ1 = *( qq + 3 ) ; const double REQ2 = *( qq + 4 ) ; const double IMQ2 = *( qq + 5 ) ; const double REQ4 = *( qq + 8 ) ; const double REQ5 = *( qq + 10 ) ; const double IMQ5 = *( qq + 11 ) ; const double REQ8 = *( qq + 16 ) ; // speed this up too (use determinant relation) const double c1 = ( REQ0 * REQ0 + REQ0 * REQ4 + REQ4 * REQ4 \ + REQ1 * REQ1 + IMQ1 * IMQ1 \ + REQ2 * REQ2 + IMQ2 * IMQ2 \ + REQ5 * REQ5 + IMQ5 * IMQ5 ) * OneO3 ; //Iff c0_max < ( smallest representable double) the matrix Q is zero and its //exponential is the identity matrix .. if( unlikely( c1 < DBL_MIN ) ) { *( U + 0 ) = 1. ; *( U + 1 ) = 0. ; *( U + 2 ) = 0. ; *( U + 3 ) = 0. ; *( U + 4 ) = 1. ; *( U + 5 ) = 0. ; *( U + 6 ) = 0. ; *( U + 7 ) = 0. ; *( U + 8 ) = 1. ; return ; } // will write this out as it can be done cheaper // 1/3 * tr AAA is just det( A ) // Below is a quickened determinant double c0 = REQ0 * ( REQ4 * REQ8 \ - REQ5 * REQ5 - IMQ5 * IMQ5 ) ; // from the middle c0 -= REQ1 * ( REQ1 * REQ8 \ - REQ5 * REQ2 - IMQ5 * IMQ2 ) ; c0 += IMQ1 * ( - IMQ1 * REQ8 \ + REQ5 * IMQ2 - IMQ5 * REQ2 ) ; // final column c0 += REQ2 * ( - REQ4 * REQ2 \ + REQ1 * REQ5 - IMQ1 * IMQ5 ) ; c0 -= IMQ2 * ( REQ4 * IMQ2 \ - REQ1 * IMQ5 - IMQ1 * REQ5 ) ; // so if c0 is negative we flip the sign ... const double flag = c0 < 0 ? -1.0 : 1.0 ; c0 *= flag ; // compute the constants c0_max and the root of c1 ... const double rc1 = sqrt( c1 ) ; const double c0_max = 2. * rc1 * c1 ; const double theta = acos( c0 / c0_max ) * OneO3 ; const double ctheta = cos( theta ) ; register const double u = rc1 * ctheta ; register const double w = r3 * rc1 * sin( theta ) ; const double uu = u * u , ww = w * w , cw = cos( w ) ; const double denom = 1.0 / ( 9. 
* uu - ww ) ; const double cu = cos( u ) ; const double su = sin( u ) ; // and I thought double angle formulas were useless! //double complex one , two ; const double complex one = cu - I * su ; double complex two = conj( one ) ; //cu + I * su ; two *= two ; // taylor expand if getting toward the numerically unstable end const double E0 = fabs( w ) < SINTOL ? ( 1 - ww / 6. * ( 1 - ww / 20. * ( 1 - ww / 42. ) ) ) : sin( w ) / w ; double complex f0 = ( uu - ww ) * two + one * ( 8. * uu * cw + 2. * I * u * ( 3. * uu + ww ) * E0 ) ; double complex f1 = 2. * u * two - one * ( 2. * u * cw - I * ( 3. * uu - ww ) * E0 ) ; double complex f2 = two - one * ( cw + 3. * I * u * E0 ) ; f0 = denom * ( creal( f0 ) + I * cimag( f0 ) * flag ) ; f1 = denom * ( flag * creal( f1 ) + I * cimag( f1 ) ) ; f2 = denom * ( creal( f2 ) + I * cimag( f2 ) * flag ) ; // QQ[0]. const double temp0 = REQ0 * REQ0 + REQ1 * REQ1 + \ IMQ1 * IMQ1 + REQ2 * REQ2 + IMQ2 * IMQ2 ; // QQ[1] const double complex temp1 = -REQ1 * ( REQ8 ) + REQ2 * REQ5 + IMQ2 * IMQ5 + I * ( REQ5 * IMQ2 - REQ2 * IMQ5 - IMQ1 * REQ8 ) ; // QQ[2] const double complex temp2 = REQ1 * REQ5 - IMQ1 * IMQ5 - REQ2 * REQ4 + I * ( IMQ1 * REQ5 + IMQ5 * REQ1 - IMQ2 * REQ4 ) ; // QQ[4] const double temp3 = REQ4 * REQ4 + REQ1 * REQ1 \ + IMQ1 * IMQ1 + REQ5 * REQ5 + IMQ5 * IMQ5 ; // QQ[5] const double complex temp4 = REQ1 * REQ2 + IMQ2 * IMQ1 - REQ0 * REQ5 + I * ( REQ1 * IMQ2 - REQ2 * IMQ1 - REQ0 * IMQ5 ) ; // QQ[8] const double temp5 = REQ8 * REQ8 + REQ2 * REQ2 + \ IMQ2 * IMQ2 + REQ5 * REQ5 + IMQ5 * IMQ5 ; // U = f0I + f1 Q + f2 QQ *( U + 0 ) = f0 + f1 * REQ0 + f2 * temp0 ; *( U + 1 ) = f1 * Q[1] + f2 * temp1 ; *( U + 2 ) = f1 * Q[2] + f2 * temp2 ; // *( U + 3 ) = f1 * Q[3] + f2 * conj( temp1 ) ; *( U + 4 ) = f0 + f1 * REQ4 + f2 * temp3 ; *( U + 5 ) = f1 * Q[5] + f2 * temp4 ; // *( U + 6 ) = f1 * Q[6] + f2 * conj( temp2 ) ; *( U + 7 ) = f1 * Q[7] + f2 * conj( temp4 ) ; *( U + 8 ) = f0 + f1 * REQ8 + f2 * temp5 ; #elif NC == 2 double f0 , f1 ; // 
f1 is purely imaginary // eigenvalues are pretty simple +/- sqrt( |a|^2 + |b|^2 ) Only need one const double z = sqrt( creal( Q[0] ) * creal( Q[0] ) + \ creal( Q[1] ) * creal( Q[1] ) + \ cimag( Q[1] ) * cimag( Q[1] ) ) ; // have eigenvalues, now for the "fun" bit. f0 = cos( z ) ; // taylor expand f1 = fabs ( z ) < SINTOLSU2 ? ( 1 - z / 6. * ( 1 - z / 20. * ( 1 - z / 42. ) ) ) : sin( z ) / z ; const double complex f1Q0 = I * f1 * creal( Q[0] ) ; const double complex f1Q1 = I * f1 * Q[1] ; *( U + 0 ) = (GLU_complex)( f0 + f1Q0 ) ; *( U + 1 ) = (GLU_complex)( f1Q1 ) ; *( U + 2 ) = (GLU_complex)( -conj( f1Q1 ) ) ; *( U + 3 ) = (GLU_complex)( f0 - f1Q0 ) ; #else // hmmm could be a toughy #if ( defined HAVE_LAPACKE_H || defined HAVE_GSL ) double complex f[ NC ] ; double z[ NC ] ; Eigenvalues_hermitian( z , Q ) ; calculate_effs_VDM_herm( f , z ) ; // matrix expansion reversing horner's rule int i , j ; diag( U , f[ NC - 1 ] ) ; for( i = NC-1 ; i > 0 ; i-- ) { multab_atomic_left( U , Q ) ; // left multiply U with Q for( j = 0 ; j < NC ; j++ ) { U[ j*(NC+1) ] += f[ i-1 ] ; } } #else // exponentiate routine from stephan durr's paper // Performs the nesting // U = ( exp{ A / DIV^n ) ) ^ ( DIV * n ) GLU_complex EOLD[ NCNC ] GLUalign , SN[ NCNC ] GLUalign ; GLU_complex RN_MIN[ NCNC ] GLUalign , RN[ NCNC ] GLUalign ; // set to zero zero_mat( EOLD ) ; // set up the divisor and the minimum double sum = 0.0 ; size_t j , n ; const int nmin = 3 ; // use precomputed factorials for( n = nmin ; n < 10 ; n++ ) { // compute the multiplicative factor ... 
const int iter = 2 << ( n - 1 ) ; const GLU_complex fact = I / (GLU_real)iter ; // and the rational approximations #ifdef USE_PADE for( j = 0 ; j < NCNC ; j++ ) { SN[ j ] = ( Q[j] * fact ) ; } horners_pade( RN , SN ) ; for( j = 0 ; j < NCNC ; j++ ) { SN[ j ] *= -1.0 ; } horners_pade( RN_MIN , SN ) ; #else for( j = 0 ; j < NCNC ; j++ ) { SN[ j ] = ( Q[j] * fact ) / 2.0 ; } horners_exp( RN , SN , 14 ) ; for( j = 0 ; j < NCNC ; j++ ) { SN[ j ] *= -1.0 ; } horners_exp( RN_MIN , SN , 14 ) ; #endif inverse( SN , RN_MIN ) ; // uses our numerical inverse multab_atomic_right( RN , SN ) ; // gets the correct rational approx // and remove the nested scalings ... matrix_power( U , RN , iter ) ; // uses a fast-power like routine // for the convergence criteria, I use the absolute difference between // evaluations sum = 0.0 ; for( j = 0 ; j < NCNC ; j++ ) { sum += (double)cabs( EOLD[j] - U[j] ) ; EOLD[ j ] = U[ j ] ; } sum /= NCNC ; // convergence .... if( sum < PREC_TOL ) { break ; } // warning for non-convergence .. if( n >= ( MAX_FACTORIAL - 1 ) ) { printf( "[EXPONENTIAL] not converging .. %zu %e \n" , n , sum ) ; break ; } } // gramschmidt orthogonalisation just to make sure // has been seen to help preserve gauge invariance of log smearing gram_reunit( U ) ; #endif #endif return ; }
int main(int argc, char *argv[]) { int N; // A[N][N], B[N][N], C[N][N] int size; // Number of elements in each matrix double start_time; // Starting time double run_time; // Timing util::Timer timer; // Timing N = ORDER; size = N * N; std::vector<float> h_A(size); // Host memory for Matrix A std::vector<float> h_B(size); // Host memory for Matrix B std::vector<float> h_C(size); // Host memory for Matrix C cl::Buffer d_a, d_b, d_c; // Matrices in device memory //-------------------------------------------------------------------------------- // Create a context and queue //-------------------------------------------------------------------------------- try { cl_uint deviceIndex = 0; parseArguments(argc, argv, &deviceIndex); // Get list of devices std::vector<cl::Device> devices; unsigned numDevices = getDeviceList(devices); // Check device index in range if (deviceIndex >= numDevices) { std::cout << "Invalid device index (try '--list')\n"; return EXIT_FAILURE; } cl::Device device = devices[deviceIndex]; std::string name; getDeviceName(device, name); std::cout << "\nUsing OpenCL device: " << name << "\n"; std::vector<cl::Device> chosen_device; chosen_device.push_back(device); cl::Context context(chosen_device); cl::CommandQueue queue(context, device); //-------------------------------------------------------------------------------- // Run sequential matmul //-------------------------------------------------------------------------------- initmat(N, h_A, h_B, h_C); timer.reset(); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",N); for(int i = 0; i < COUNT; i++) { zero_mat(N, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; seq_mat_mul_sdot(N, h_A, h_B, h_C); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; results(N, h_C, run_time); } //-------------------------------------------------------------------------------- // Setup the buffers, initialize matrices, and write 
them into global memory //-------------------------------------------------------------------------------- // Reset A, B and C matrices (just to play it safe) initmat(N, h_A, h_B, h_C); d_a = cl::Buffer(context, h_A.begin(), h_A.end(), true); d_b = cl::Buffer(context, h_B.begin(), h_B.end(), true); d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size); //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... Naive //-------------------------------------------------------------------------------- // Create the compute program from the source buffer cl::Program program(context, kernelsource, true); // Create the compute kernel from the program cl::make_kernel<int, cl::Buffer, cl::Buffer, cl::Buffer> naive_mmul(program, "mmul"); printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",N); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(N, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; // Execute the kernel over the entire range of C matrix elements ... computing // a dot product for each element of the product matrix. The local work // group size is set to NULL ... so I'm telling the OpenCL runtime to // figure out a local work group size for me. cl::NDRange global(N, N); naive_mmul(cl::EnqueueArgs(queue, global), N, d_a, d_b, d_c); queue.finish(); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; cl::copy(queue, d_c, h_C.begin(), h_C.end()); results(N, h_C, run_time); } // end for loop } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl; } return EXIT_SUCCESS; }
/*
 * Benchmark driver for mat_mul and, when compiled with BLCK defined,
 * mat_mul_blocked.
 *
 * Usage: two command-line arguments, M and N. A is filled as an M x N
 * random matrix and B as an N x M random matrix; the result C is M x M.
 * Each multiplication is run once untimed and then once timed with
 * stop_watch. With BLCK defined the mean relative difference between the
 * two results is checked against eps and, when it is larger, the
 * per-element differences are written to "diffs.out".
 *
 * Returns 0 on completion, 1 when the arguments are missing or are not
 * positive integers.
 */
int
main(int argc, char *argv[])
{
  int nargs = 3;
  if(argc != nargs) {
    usage(argv);
    return 1;
  }

  int M = atoi(argv[1]);
  int N = atoi(argv[2]);

  /*
   * atoi returns 0 for non-numeric text, and a negative value would be
   * converted to an enormous size_t in the allocation sizes below, so
   * reject anything that is not strictly positive.
   */
  if(M <= 0 || N <= 0) {
    fprintf(stderr, " M and N must be positive integers\n");
    usage(argv);
    return 1;
  }

  double *A = alloc(sizeof(double)*M*N);
  double *B = alloc(sizeof(double)*M*N);

  rand_mat(A, M, N);
  rand_mat(B, N, M);

  double *C = alloc(sizeof(double)*M*M);
  zero_mat(C, M, M);

  /* Untimed first call, then the timed one */
  mat_mul(C, M, N, A, B);
  {
    double t0 = stop_watch(0);
    mat_mul(C, M, N, A, B);
    t0 = stop_watch(t0);
    double beta_fp = 0 /* _TODO_A_ calculate beta_fp from timing t0 */;
    printf(" ORIG: M = %d, N = %d,", M, N);
    printf(" took: %4.2e sec,", t0);
    printf(" P = %4.2e Mflop/s\n", beta_fp);
  }

#ifdef BLCK
  double *Cb = alloc(sizeof(double)*M*M);
  zero_mat(Cb, M, M);

  /* Untimed first call, then the timed one */
  mat_mul_blocked(Cb, M, N, A, B);
  {
    double t0 = stop_watch(0);
    mat_mul_blocked(Cb, M, N, A, B);
    t0 = stop_watch(t0);
    double beta_fp = 0 /* _TODO_A_ calculate beta_fp from timing t0 */;
    printf(" BLCK: M = %d, N = %d,", M, N);
    printf(" took: %4.2e sec,", t0);
    printf(" P = %4.2e Mflop/s, BM = %d, BN = %d\n", beta_fp, BM, BN);
  }
#endif

#ifdef BLCK
  double eps = 1e-12;
  double diff = 0;
  for(int i=0; i<M*M; i++) {
    diff += fabs((C[i] - Cb[i])/C[i]);
  }

  /*
   * If the difference between the flat and blocked result is larger
   * than eps, complain to stdout and write the two matrices to file
   * "diffs.out".
   */
  diff /= (double)M*M;
  if(diff > eps) {
    printf(" Non zero diff: %e\n", diff);
    FILE *fp = fopen("diffs.out", "w");
    if(fp == NULL) {
      /* Do not write through a NULL stream; the diff was already reported */
      perror("diffs.out");
    } else {
      for(int i=0; i<M*M; i++)
	fprintf(fp, "%e\n", fabs((C[i]-Cb[i])/C[i]));
      fclose(fp);
    }
  }
#endif

  free(A);
  free(B);
  free(C);
#ifdef BLCK
  free(Cb);
#endif
  return 0;
}
int main(void) { int Mdim, Ndim, Pdim; // A[N][P], B[P][M], C[N][M] int szA, szB, szC; // Number of elements in each matrix double start_time; // Starting time double run_time; // Timing util::Timer timer; // Timing Ndim = ORDER; Pdim = ORDER; Mdim = ORDER; szA = Ndim * Pdim; szB = Pdim * Mdim; szC = Ndim * Mdim; std::vector<float> h_A(szA); // Host memory for Matrix A std::vector<float> h_B(szB); // Host memory for Matrix B std::vector<float> h_C(szC); // Host memory for Matrix C cl::Buffer d_a, d_b, d_c; // Matrices in device memory initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); timer.reset(); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER); for(int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; results(Mdim, Ndim, Pdim, h_C, run_time); } try { //-------------------------------------------------------------------------------- // Create a context and queue for DEVICE //-------------------------------------------------------------------------------- cl::Context context(DEVICE); cl::CommandQueue queue(context); //-------------------------------------------------------------------------------- // Setup the buffers, initialize matrices, and write them into global memory //-------------------------------------------------------------------------------- // Reset A, B and C matrices (just to play it safe) initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); d_a = cl::Buffer(context, begin(h_A), end(h_A), true); d_b = cl::Buffer(context, begin(h_B), end(h_B), true); d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC); //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... 
Naive //-------------------------------------------------------------------------------- // Create the compute program from the source buffer cl::Program program(context, kernelsource, true); // Create the compute kernel from the program auto naive_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul"); printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; // Execute the kernel over the entire range of C matrix elements ... computing // a dot product for each element of the product matrix. The local work // group size is set to NULL ... so I'm telling the OpenCL runtime to // figure out a local work group size for me. cl::NDRange global(Ndim, Mdim); naive_mmul(cl::EnqueueArgs(queue, global), Mdim, Ndim, Pdim, d_a, d_b, d_c); queue.finish(); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; cl::copy(queue, d_c, begin(h_C), end(h_C)); results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl; } return EXIT_SUCCESS; }
int main(void) { int N; // A[N][N], B[N][N], C[N][N] int sz; // number of elements in each matrix float tmp; N = ORDER; sz = N * N; std::vector<float> h_A(sz); // Matrix A on the host std::vector<float> h_B(sz); // Matrix B on the host std::vector<float> h_C(sz); // Matrix C on the host cl::Buffer d_A; // matrix A on the device cl::Buffer d_B; // matrix B on the device cl::Buffer d_C; // matrix C on the device initmat(N, N, N, h_A, h_B, h_C); printf("\n===== Sequential, matrix mult (dot prod), order %d on CPU ======\n",ORDER); zero_mat(N, N, h_C); util::Timer timer; for (int i = 0; i < N; i++) { for (int j = 0; j < N; j++) { tmp = 0.0f; for (int k = 0; k < N; k++) { tmp += h_A[i*N+k] * h_B[k*N+j]; } h_C[i*N+j] = tmp; } } double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; results(N, N, N, h_C, rtime); printf("\n===== Parallel matrix mult (dot prod), order %d on CPU ======\n",ORDER); zero_mat(N, N, h_C); try { cl::Context context(DEVICE); // Load in kernel source, creating a program object for the context. 
// Build program explicitly so I can catch errors and display // compiler error messages (should any be generated) cl::Program program(context, util::loadProgram("matmul_blocked.cl")); try { program.build(); } catch (cl::Error error) { // If it was a build error then show the error if (error.err() == CL_BUILD_PROGRAM_FAILURE) { std::vector<cl::Device> devices; devices = context.getInfo<CL_CONTEXT_DEVICES>(); std::string built = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]); std::cerr << built << "\n"; } throw error; } // Get the command queue cl::CommandQueue queue(context); // Create the kernel functor auto mmul = cl::make_kernel<int, cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg, cl::LocalSpaceArg> (program, "mmul"); util::Timer timer; d_A = cl::Buffer(context, begin(h_A), end(h_A), true); d_B = cl::Buffer(context, begin(h_B), end(h_B), true); d_C = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * sz); // Work-group computes a block of C. This size is also set // in a #define inside the kernel function. Note this blocksize // must evenly divide the matrix order int blocksize = 16; cl::LocalSpaceArg A_block = cl::Local(sizeof(float) * blocksize*blocksize); cl::LocalSpaceArg B_block = cl::Local(sizeof(float) * blocksize*blocksize); mmul( cl::EnqueueArgs( queue, cl::NDRange(N,N), cl::NDRange(blocksize,blocksize)), N, d_A, d_B, d_C, A_block, B_block); cl::copy(queue, d_C, begin(h_C), end(h_C)); double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; results(N, N, N, h_C, rtime); } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << std::endl; } }