void AxBTtoC(const matw &A, const matw &B, matw &C, bool isOverWrite) { // A [M, K], B: [N, K], C: [M, N] ptrdiff_t M = A.H; ptrdiff_t K = A.W; ptrdiff_t N = B.H; ptrdiff_t ldA = M; ptrdiff_t ldB = N; ptrdiff_t ldC = M; float alpha = 1.0; float beta = isOverWrite? 0.0 : 1.0; sgemm( "n", "t", &M, &N, &K, &alpha, (float*)A.beg, &ldA, (float*)B.beg, &ldB, &beta, (float*)C.beg, &ldC); return; }
/*
 * Single-precision GEMM front end.
 *
 * Adjusts the leading dimensions for degenerate shapes, verifies that every
 * size fits in an int, and then dispatches either to the Fortran-style
 * sgemm_ (when USEBLAS is defined) or to the sgemm taking arguments by value.
 * If any size exceeds INT_MAX, THError("Wrong parameters to gemm") is raised
 * and no multiplication is performed.
 *
 * transa/transb: 't' or 'T' selects the transposed operand; any other value
 *                is treated as non-transposed by the adjustments below.
 */
static void THBlas_gemm(char transa, char transb, long m, long n, long k, float alpha, float *a, long lda, float *b, long ldb, float beta, float *c, long ldc)
{
  const int a_is_transposed = (transa == 't' || transa == 'T');
  const int b_is_transposed = (transb == 't' || transb == 'T');

  /*
   * When a matrix has a single column as stored, its leading dimension is
   * replaced by its stored row count.
   * NOTE(review): presumably this keeps ld >= rows for callers that pass an
   * arbitrary stride in the one-column case -- confirm against callers.
   */
  if(n == 1)
    ldc = m;

  if(a_is_transposed && m == 1)
    lda = k;
  else if(!a_is_transposed && k == 1)
    lda = m;

  if(b_is_transposed && k == 1)
    ldb = n;
  else if(!b_is_transposed && n == 1)
    ldb = k;

  /* Reject sizes that cannot be narrowed to int for the BLAS call. */
  if( (m > INT_MAX) || (n > INT_MAX) || (k > INT_MAX) || (lda > INT_MAX) || (ldb > INT_MAX) || (ldc > INT_MAX) )
  {
    THError("Wrong parameters to gemm");
    return;
  }

#ifdef USEBLAS
  {
    int i_m = (int)m;
    int i_n = (int)n;
    int i_k = (int)k;
    int i_lda = (int)lda;
    int i_ldb = (int)ldb;
    int i_ldc = (int)ldc;

    sgemm_(&transa, &transb, &i_m, &i_n, &i_k, &alpha, a, &i_lda, b, &i_ldb, &beta, c, &i_ldc);
  }
#else
  sgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
#endif
}
static vl::Error gemm(vl::Context& context, char op1, char op2, ptrdiff_t m, ptrdiff_t n, ptrdiff_t k, type alpha, type const * a, ptrdiff_t lda, type const * b, ptrdiff_t ldb, type beta, type * c, ptrdiff_t ldc) { sgemm(&op1, &op2, &m, &n, &k, &alpha, (type*)a, &lda, (type*)b, &ldb, &beta, c, &ldc) ; return vl::vlSuccess ; }
/*
 * Blocked Cholesky factorisation expressed as OpenMP tasks.
 *
 * A is an NB x NB grid of tiles. For every step k the code spawns:
 *   - one spotrf task on the diagonal tile A[k][k],
 *   - one strsm task per tile A[k][i], i > k, reading A[k][k],
 *   - one sgemm task per tile A[j][i], k < j < i, reading A[k][i] and A[k][j],
 *   - one ssyrk task per diagonal tile A[i][i], i > k, reading A[k][i].
 * Ordering between tasks comes only from the depend() clauses; there is no
 * taskwait here, so the caller must wait for the tasks to complete.
 *
 * Every loop is bounded by NB, the declared extent of A. The inner loops were
 * previously bounded by an identifier NT that is not declared in this
 * function, which disagreed with the array extent and the outer loop.
 *
 * NB: number of tiles per dimension.
 * A : tile grid, updated in place.
 */
void blocked_cholesky( int NB, float A[NB][NB] )
{
  int i, j, k;

  for (k = 0; k < NB; k++) {
    /* diagonal tile of step k */
    #pragma omp task depend(inout:A[k][k])
    spotrf (A[k][k]) ;

    /* tiles A[k][i] that depend on the freshly updated diagonal tile */
    for (i = k+1; i < NB; i++) {
      #pragma omp task depend(in:A[k][k]) depend(inout:A[k][i])
      strsm (A[k][k], A[k][i]);
    }

    /* update trailing submatrix */
    for (i = k+1; i < NB; i++) {
      for (j = k+1; j < i; j++) {
        #pragma omp task depend(in:A[k][i],A[k][j]) depend(inout:A[j][i])
        sgemm( A[k][i], A[k][j], A[j][i]);
      }
      #pragma omp task depend(in:A[k][i]) depend(inout:A[i][i])
      ssyrk (A[k][i], A[i][i]);
    }
  }
}
void Compute::doWork() { if(countA == num_chare_z-1 && countB == num_chare_x-1) { #if CMK_BLUEGENEP || CMK_VERSION_BLUEGENE const char trans = 'N'; const double alpha = 1.0; const double beta = 0.0; sgemm(&trans, &trans, blockDimX, blockDimZ, blockDimY, alpha, A, blockDimX, B, blockDimY, beta, C, blockDimX); #else for(int i=0; i<blockDimX; i++) for(int j=0; j<blockDimY; j++) for(int k=0; k<blockDimZ; k++) C[i*blockDimZ+k] += A[i*blockDimY+j] * B[j*blockDimZ+k]; #endif receiveC(&C[(thisIndex.y)*subBlockDimXy*blockDimZ], subBlockDimXy*blockDimZ, 0); sendC(); } }
void AxBtoC(const matw &A, const matw &B, matw &C, bool isOverWrite) { // A: [M, K], B: [K, N] ptrdiff_t M = A.H; // assert (M == C.H) ptrdiff_t K = A.W; // assert (K == B.H) ptrdiff_t N = B.W; // assert (N == C.W) float alpha = 1.0; float beta = isOverWrite? 0.0 : 1.0; sgemm( "n", "n", &M, &N, &K, &alpha, (float*)A.beg, &M, (float*)B.beg, &K, &beta, (float*)C.beg, &M); return; }
/*
 * Parameterless callback that runs one sgemm call.
 *
 * Every argument (colMajor, transa, transb, m, n, k, alpha, a, lda, b, ldb,
 * beta, c, ldc, SUBMATRIX_SIZE, bufferA, bufferB) is declared outside this
 * function, so it must be set up before the callback is invoked.
 * NOTE(review): bufferA/bufferB look like scratch buffers sized by
 * SUBMATRIX_SIZE -- confirm against the sgemm definition.
 */
void kernelCallback() { sgemm(colMajor, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, SUBMATRIX_SIZE, bufferA, bufferB); }
int main( int argc, char **argv ) { srand(time(NULL)); double counter_random = 0.0; double total_random = 0.0; double sub5_random = 0; double total_60 = 0.0; double counter_60 = 0.0; int m = 60, n = 60; printf("\nTesting 60 by 60 matrices 20 times\n"); for (int i = 0; i < 20; i++){ /* Allocate and fill 2 random matrices A, C */ float *A = (float*) malloc( m * n * sizeof(float) ); float *C = (float*) malloc( m * m * sizeof(float) ); for( int i = 0; i < m*n; i++ ) A[i] = 2 * drand48() - 1; for( int i = 0; i < m*m; i++ ) C[i] = 2 * drand48() - 1; /* measure Gflop/s rate; time a sufficiently long sequence of calls to eliminate noise */ double Gflop_s, seconds = -1.0; for( int n_iterations = 1; seconds < 0.1; n_iterations *= 2 ) { /* warm-up */ sgemm( m, n, A, C ); /* measure time */ struct timeval start, end; gettimeofday( &start, NULL ); for( int i = 0; i < n_iterations; i++ ) sgemm( m,n, A, C ); gettimeofday( &end, NULL ); seconds = (end.tv_sec - start.tv_sec) + 1.0e-6 * (end.tv_usec - start.tv_usec); /* compute Gflop/s rate */ Gflop_s = 2e-9 * n_iterations * m * m * n / seconds; } printf( "%d by %d matrix \t %g Gflop/s\n", m, n, Gflop_s ); total_60 = total_60 + Gflop_s; counter_60 = counter_60 + 1; /* Ensure that error does not exceed the theoretical error bound */ /* Set initial C to 0 and do matrix multiply of A*B */ memset( C, 0, sizeof( float ) * m * m ); sgemm( m,n, A, C ); /* Subtract A*B from C using standard sgemm (note that this should be 0 to within machine roundoff) */ cblas_sgemm( CblasColMajor,CblasNoTrans,CblasTrans, m,m,n, -1, A,m, A,m, 1, C,m ); /* Subtract the maximum allowed roundoff from each element of C */ for( int i = 0; i < m*n; i++ ) A[i] = fabs( A[i] ); for( int i = 0; i < m*m; i++ ) C[i] = fabs( C[i] ); cblas_sgemm( CblasColMajor,CblasNoTrans,CblasTrans, m,m,n, -3.0*FLT_EPSILON*n, A,m, A,m, 1, C,m ); /* After this test if any element in C is still positive something went wrong in square_sgemm */ for( int i = 0; i < m * m; i++ ) if( C[i] > 0 
) { printf( "FAILURE: error in matrix multiply exceeds an acceptable margin\n" ); return -1; } /* release memory */ free( C ); free( A ); } printf("\nTesting random sizes from m = [32, 100] to n = [32, 300] 100 times\n"); /* Try different m */ for ( int i = 0; i < 100; i++ ){ int n = 32 + (rand() % 269); int m = 32 + (rand() % 69); /* Allocate and fill 2 random matrices A, C */ float *A = (float*) malloc( m * n * sizeof(float) ); float *C = (float*) malloc( m * m * sizeof(float) ); for( int i = 0; i < m*n; i++ ) A[i] = 2 * drand48() - 1; for( int i = 0; i < m*m; i++ ) C[i] = 2 * drand48() - 1; /* measure Gflop/s rate; time a sufficiently long sequence of calls to eliminate noise */ double Gflop_s, seconds = -1.0; for( int n_iterations = 1; seconds < 0.1; n_iterations *= 2 ) { /* warm-up */ sgemm( m, n, A, C ); /* measure time */ struct timeval start, end; gettimeofday( &start, NULL ); for( int i = 0; i < n_iterations; i++ ) sgemm( m,n, A, C ); gettimeofday( &end, NULL ); seconds = (end.tv_sec - start.tv_sec) + 1.0e-6 * (end.tv_usec - start.tv_usec); /* compute Gflop/s rate */ Gflop_s = 2e-9 * n_iterations * m * m * n / seconds; } printf( "%d by %d matrix \t %g Gflop/s\n", m, n, Gflop_s ); total_random = total_random + Gflop_s; counter_random = counter_random++; if (Gflop_s < 5.0){ sub5_random++; } /* Ensure that error does not exceed the theoretical error bound */ /* Set initial C to 0 and do matrix multiply of A*B */ memset( C, 0, sizeof( float ) * m * m ); sgemm( m,n, A, C ); /* Subtract A*B from C using standard sgemm (note that this should be 0 to within machine roundoff) */ cblas_sgemm( CblasColMajor,CblasNoTrans,CblasTrans, m,m,n, -1, A,m, A,m, 1, C,m ); /* Subtract the maximum allowed roundoff from each element of C */ for( int i = 0; i < m*n; i++ ) A[i] = fabs( A[i] ); for( int i = 0; i < m*m; i++ ) C[i] = fabs( C[i] ); cblas_sgemm( CblasColMajor,CblasNoTrans,CblasTrans, m,m,n, -3.0*FLT_EPSILON*n, A,m, A,m, 1, C,m ); /* After this test if any element in C 
is still positive something went wrong in square_sgemm */ for( int i = 0; i < m * m; i++ ) if( C[i] > 0 ) { printf( "FAILURE: error in matrix multiply exceeds an acceptable margin\n" ); return -1; } /* release memory */ free( C ); free( A ); } double average_random = total_random/counter_random; double average_60 = total_60/counter_60; int total = 0; printf("\n\nAverage for 60 by 60: %.8f Gflop/s\n", average_60); printf("Average for random sizes: %.8f Gflop/s\n", average_random); printf("\nPotential Grade for 60 by 60:\n"); if (average_60 >= 10.5){ total = 35; printf("%d/35\n", total); } else if (average_60 >= 10){ total = 34; printf("%d/35\n", total); } else if (average_60 >= 9){ total = 32; printf("%d/35\n", total); } else if (average_60 >= 8){ total = 30; printf("%d/35\n", total); } else if (average_60 >= 7){ total = 25; printf("%d/35\n", total); } else if (average_60 >= 6){ total = 20; printf("%d/35\n", total); } else if (average_60 >= 5){ total = 15; printf("%d/35\n", total); } else if (average_60 >= 4){ total = 10; printf("%d/35\n", total); } else if (average_60 >= 3){ total = 7; printf("%d/35\n", total); } else if (average_60 >= 2){ total = 4; printf("%d/35\n", total); } else { total = 1; printf("%d/35\n", total); } printf("All or nothing grade for random matrices:\n"); if (average_random >= 5.0){ total = total + 20; printf("20/20\n"); } else { printf("0/20\n"); } printf("\nPotential Total Grade: %d/55\n", total); printf("\nNo partial credit because I have no clue how that's going to work out.\n"); }
void run_test(void) { /* allocate */ #ifdef STREAM_A_B REALTYPE* l_a = (REALTYPE*)_mm_malloc(MY_LDA * MY_K * sizeof(REALTYPE) * STREAM_A_B_SIZE, 64); REALTYPE* l_b = (REALTYPE*)_mm_malloc(MY_LDB * MY_N * sizeof(REALTYPE) * STREAM_A_B_SIZE, 64); unsigned int l_s; #else REALTYPE* l_a = (REALTYPE*)_mm_malloc(MY_LDA * MY_K * sizeof(REALTYPE), 64); REALTYPE* l_b = (REALTYPE*)_mm_malloc(MY_LDB * MY_N * sizeof(REALTYPE), 64); #endif REALTYPE* l_c = (REALTYPE*)_mm_malloc(MY_LDC * MY_N * sizeof(REALTYPE), 64); REALTYPE* l_c_gold = (REALTYPE*)_mm_malloc(MY_LDC * MY_N * sizeof(REALTYPE), 64); REALTYPE l_max_error = 0.0; unsigned int l_i; unsigned int l_j; unsigned int l_t; unsigned int l_m; unsigned int l_n; unsigned int l_k; struct timeval l_start, l_end; double l_total; #ifdef STREAM_A_B for ( l_s = 0; l_s < STREAM_A_B_SIZE; l_s++ ) { REALTYPE* l_p_a = l_a + (l_s * MY_K * MY_LDA); #else REALTYPE* l_p_a = l_a; #endif /* touch A */ for ( l_i = 0; l_i < MY_LDA; l_i++) { for ( l_j = 0; l_j < MY_K; l_j++) { #if REPS==1 l_p_a[(l_j * MY_LDA) + l_i] = (REALTYPE)libxsmm_rng_f64(); #else l_p_a[(l_j * MY_LDA) + l_i] = (REALTYPE)(l_i + (l_j * MY_M)); #endif } } #ifdef STREAM_A_B } #endif #ifdef STREAM_A_B for ( l_s = 0; l_s < STREAM_A_B_SIZE; l_s++ ) { REALTYPE* l_p_b = l_b + (l_s * MY_N * MY_LDB); #else { REALTYPE* l_p_b = l_b; #endif /* touch B */ for ( l_i = 0; l_i < MY_LDB; l_i++ ) { for ( l_j = 0; l_j < MY_N; l_j++ ) { #if REPS==1 l_p_b[(l_j * MY_LDB) + l_i] = (REALTYPE)libxsmm_rng_f64(); #else l_p_b[(l_j * MY_LDB) + l_i] = (REALTYPE)(l_i + (l_j * MY_K)); #endif } } } #ifdef STREAM_A_B } #endif /* touch C */ for ( l_i = 0; l_i < MY_LDC; l_i++) { for ( l_j = 0; l_j < MY_N; l_j++) { l_c[(l_j * MY_LDC) + l_i] = (REALTYPE)0.0; l_c_gold[(l_j * MY_LDC) + l_i] = (REALTYPE)0.0; } } #ifdef __USE_MKL { char l_trans = 'N'; int l_M = MY_M; int l_N = MY_N; int l_K = MY_K; int l_lda = MY_LDA; int l_ldb = MY_LDB; int l_ldc = MY_LDC; if (sizeof(REALTYPE) == sizeof(double)) { double l_one = 1.0; 
dgemm(&l_trans, &l_trans, &l_M, &l_N, &l_K, &l_one, (double*)l_a, &l_lda, (double*)l_b, &l_ldb, &l_one, (double*)l_c_gold, &l_ldc); } else { float l_one = 1.0f; sgemm(&l_trans, &l_trans, &l_M, &l_N, &l_K, &l_one, (float*)l_a, &l_lda, (float*)l_b, &l_ldb, &l_one, (float*)l_c_gold, &l_ldc); } } /* touch C */ for ( l_i = 0; l_i < MY_LDC; l_i++) { for ( l_j = 0; l_j < MY_N; l_j++) { l_c[(l_j * MY_LDC) + l_i] = (REALTYPE)0.0; l_c_gold[(l_j * MY_LDC) + l_i] = (REALTYPE)0.0; } } #endif /* C routine */ gettimeofday(&l_start, NULL); #ifndef __USE_MKL #pragma nounroll_and_jam for ( l_t = 0; l_t < REPS; l_t++ ) { #ifdef STREAM_A_B REALTYPE* l_p_a = l_a - (MY_K * MY_LDA); REALTYPE* l_p_b = l_b - (MY_N * MY_LDB); for ( l_s = 0; l_s < STREAM_A_B_SIZE; l_s++ ) { l_p_a += (MY_K * MY_LDA); l_p_b += (MY_N * MY_LDB); #else REALTYPE* l_p_a = l_a; REALTYPE* l_p_b = l_b; #endif for ( l_n = 0; l_n < MY_N; l_n++ ) { for ( l_k = 0; l_k < MY_K; l_k++ ) { #pragma vector always for ( l_m = 0; l_m < MY_M; l_m++ ) { l_c_gold[(l_n * MY_LDC) + l_m] += l_p_a[(l_k * MY_LDA) + l_m] * l_p_b[(l_n * MY_LDB) + l_k]; } } } #ifdef STREAM_A_B } #endif } #else char l_trans = 'N'; int l_M = MY_M; int l_N = MY_N; int l_K = MY_K; int l_lda = MY_LDA; int l_ldb = MY_LDB; int l_ldc = MY_LDC; if (sizeof(REALTYPE) == sizeof(double)) { double l_one = 1.0; for ( l_t = 0; l_t < REPS; l_t++ ) { #ifdef STREAM_A_B REALTYPE* l_p_a = l_a - (MY_K * MY_LDA); REALTYPE* l_p_b = l_b - (MY_N * MY_LDB); for ( l_s = 0; l_s < STREAM_A_B_SIZE; l_s++ ) { l_p_a += (MY_K * MY_LDA); l_p_b += (MY_N * MY_LDB); #else REALTYPE* l_p_a = l_a; REALTYPE* l_p_b = l_b; #endif dgemm(&l_trans, &l_trans, &l_M, &l_N, &l_K, &l_one, (double*)l_p_a, &l_lda, (double*)l_p_b, &l_ldb, &l_one, (double*)l_c_gold, &l_ldc); #ifdef STREAM_A_B } #endif } } else { float l_one = 1.0f; for ( l_t = 0; l_t < REPS; l_t++ ) { #ifdef STREAM_A_B REALTYPE* l_p_a = l_a - (MY_K * MY_LDA); REALTYPE* l_p_b = l_b - (MY_N * MY_LDB); for ( l_s = 0; l_s < STREAM_A_B_SIZE; l_s++ ) 
{ l_p_a += (MY_K * MY_LDA); l_p_b += (MY_N * MY_LDB); #else REALTYPE* l_p_a = l_a; REALTYPE* l_p_b = l_b; #endif sgemm(&l_trans, &l_trans, &l_M, &l_N, &l_K, &l_one, (float*)l_p_a, &l_lda, (float*)l_p_b, &l_ldb, &l_one, (float*)l_c_gold, &l_ldc); #ifdef STREAM_A_B } #endif } } #endif gettimeofday(&l_end, NULL); l_total = sec(l_start, l_end); #ifndef __USE_MKL printf("%fs for C\n", l_total); #ifdef STREAM_A_B printf("%f GFLOPS for C\n", ((double)((double)REPS * (double)MY_M * (double)MY_N * (double)MY_K) * 2.0 * ((double)STREAM_A_B_SIZE)) / (l_total * 1.0e9)); #else printf("%f GFLOPS for C\n", ((double)((double)REPS * (double)MY_M * (double)MY_N * (double)MY_K) * 2.0) / (l_total * 1.0e9)); #endif #else printf("%fs for MKL\n", l_total); #ifdef STREAM_A_B printf("%f GFLOPS for MKL\n", ((double)((double)REPS * (double)MY_M * (double)MY_N * (double)MY_K) * 2.0 * ((double)STREAM_A_B_SIZE)) / (l_total * 1.0e9)); #else printf("%f GFLOPS for MKL\n", ((double)((double)REPS * (double)MY_M * (double)MY_N * (double)MY_K) * 2.0) / (l_total * 1.0e9)); #endif #endif gettimeofday(&l_start, NULL); libxsmm_timer_tickint l_cyc_start = libxsmm_timer_cycles(); for ( l_t = 0; l_t < REPS; l_t++ ) { #ifdef STREAM_A_B REALTYPE* l_p_a = l_a - (MY_K * MY_LDA); REALTYPE* l_p_b = l_b - (MY_N * MY_LDB); for ( l_s = 0; l_s < STREAM_A_B_SIZE; l_s++ ) { l_p_a += (MY_K * MY_LDA); l_p_b += (MY_N * MY_LDB); #else REALTYPE* l_p_a = l_a; REALTYPE* l_p_b = l_b; #endif #ifdef STREAM_A_B_PREFETCH dense_test_mul(l_p_a, l_p_b, l_c, l_p_a + (MY_K * MY_LDA), l_p_b + (MY_N * MY_LDB), NULL); #else dense_test_mul(l_p_a, l_p_b, l_c); #endif #ifdef STREAM_A_B } #endif } libxsmm_timer_tickint l_cyc_end = libxsmm_timer_cycles(); gettimeofday(&l_end, NULL); l_total = sec(l_start, l_end); printf("%fs for assembly\n", l_total); #ifdef STREAM_A_B printf("%f GFLOPS for assembly\n", ((double)((double)REPS * (double)MY_M * (double)MY_N * (double)MY_K) * 2.0 * ((double)STREAM_A_B_SIZE)) / (l_total * 1.0e9)); #else printf("%f 
GFLOPS for assembly\n", ((double)((double)REPS * (double)MY_M * (double)MY_N * (double)MY_K) * 2.0) / (l_total * 1.0e9)); printf("%f FLOPS/cycle for assembly (using libxsmm_timer_cycles())\n", ((double)((double)REPS * (double)MY_M * (double)MY_N * (double)MY_K) * 2.0) / ((double)(l_cyc_end - l_cyc_start))); #endif /* check result */ for ( l_i = 0; l_i < MY_M; l_i++) { for ( l_j = 0; l_j < MY_N; l_j++) { #if 0 printf("Entries in row %i, column %i, gold: %f, assembly: %f\n", l_i+1, l_j+1, l_c_gold[(l_j*MY_M)+l_i], l_c[(l_j*MY_M)+l_i]); #endif if (l_max_error < fabs( l_c_gold[(l_j * MY_LDC) + l_i] - l_c[(l_j * MY_LDC) + l_i])) l_max_error = fabs( l_c_gold[(l_j * MY_LDC) + l_i] - l_c[(l_j * MY_LDC) + l_i]); } } printf("max. error: %f\n", l_max_error); /* free */ _mm_free(l_a); _mm_free(l_b); _mm_free(l_c); _mm_free(l_c_gold); }
int main( int argc, char **argv ) { srand(time(NULL)); int n = 32; /* Try different m */ for( int m = 32; m < 10000; m = m+1+m/3 ) { /* Allocate and fill 2 random matrices A, C */ float *A = (float*) malloc( m * n * sizeof(float) ); float *C = (float*) malloc( m * m * sizeof(float) ); for( int i = 0; i < m*n; i++ ) A[i] = 2 * drand48() - 1; for( int i = 0; i < m*m; i++ ) C[i] = 2 * drand48() - 1; /* measure Gflop/s rate; time a sufficiently long sequence of calls to eliminate noise */ double Gflop_s, seconds = -1.0; for( int n_iterations = 1; seconds < 0.1; n_iterations *= 2 ) { /* warm-up */ sgemm( m, n, A, C ); /* measure time */ struct timeval start, end; gettimeofday( &start, NULL ); for( int i = 0; i < n_iterations; i++ ) sgemm( m,n, A, C ); gettimeofday( &end, NULL ); seconds = (end.tv_sec - start.tv_sec) + 1.0e-6 * (end.tv_usec - start.tv_usec); /* compute Gflop/s rate */ Gflop_s = 2e-9 * n_iterations * m * m * n / seconds; } printf( "%d by %d matrix \t %g Gflop/s\n", m, n, Gflop_s ); /* Ensure that error does not exceed the theoretical error bound */ /* Set initial C to 0 and do matrix multiply of A*B */ memset( C, 0, sizeof( float ) * m * m ); sgemm( m,n, A, C ); /* Subtract A*B from C using standard sgemm (note that this should be 0 to within machine roundoff) */ cblas_sgemm( CblasColMajor,CblasNoTrans,CblasTrans, m,m,n, -1, A,m, A,m, 1, C,m ); /* Subtract the maximum allowed roundoff from each element of C */ for( int i = 0; i < m*n; i++ ) A[i] = fabs( A[i] ); for( int i = 0; i < m*m; i++ ) C[i] = fabs( C[i] ); cblas_sgemm( CblasColMajor,CblasNoTrans,CblasTrans, m,m,n, -3.0*FLT_EPSILON*n, A,m, A,m, 1, C,m ); /* After this test if any element in C is still positive something went wrong in square_sgemm */ for( int i = 0; i < m * m; i++ ) if( C[i] > 0 ) { printf( "FAILURE: error in matrix multiply exceeds an acceptable margin\n" ); return -1; } /* release memory */ free( C ); free( A ); } return 0; }
/*Multivariate polynomial*/ void mvarPolynomial( struct matrixM *A_in, struct matrixM *B_in, struct matrixM *model_out, struct matrixM *model_in, struct matrixM *err_out, unsigned int *rand_set, unsigned int nd ) { float A[M*N], B[M]; float work[LWORK]; mwSize lwork = LWORK; mwSize m = nd, Arows = A_in->dimElems[0], Acols = A_in->dimElems[1], Brows = B_in->dimElems[0], Bcols = B_in->dimElems[1], Mrows = model_out->dimElems[0], Mcols = model_out->dimElems[1], ipiv[N], info; unsigned int i, j, colOffset, colOffset2, colOffset3, colOffset4, colOffset5; char *chn = "N"; /*for dgemm */ float alpha = 1.0f, beta = -1.0f; colOffset = A_in->dimElems[0]; colOffset2 = 2*colOffset; colOffset3 = 3*colOffset; colOffset4 = 4*colOffset; colOffset5 = 5*colOffset; /*rand_set[0] = 0; rand_set[1] = 1; rand_set[2] = 2; rand_set[3] = 3; rand_set[4] = 4; rand_set[5] = 5; rand_set[6] = 6;*/ if( model_in == NULL ) { /*First or second order multivariate polynomial*/ switch( model_out->dimElems[0] ) { /*1st order*/ case 3: for(i=0;i<nd;i++) { A[ i ] = A_in->data[ rand_set[i] ]; A[ i + nd ] = A_in->data[ rand_set[i] + colOffset ]; A[ i + 2*nd ] = A_in->data[ rand_set[i] + colOffset2 ]; B[i] = B_in->data[ rand_set[i] ]; } /* A[0] = A_in->data[rand_set[0]]; A[1] = A_in->data[rand_set[1]]; A[2] = A_in->data[rand_set[2]]; A[3] = A_in->data[rand_set[0]+colOffset]; A[4] = A_in->data[rand_set[1]+colOffset]; A[5] = A_in->data[rand_set[2]+colOffset]; A[6] = 1.0f; A[7] = 1.0f; A[8] = 1.0f; model_out->data[0] = B_in->data[rand_set[0]]; model_out->data[1] = B_in->data[rand_set[1]]; model_out->data[2] = B_in->data[rand_set[2]]; */ break; /*2nd order*/ case 6: for(i=0;i<nd;i++) { A[ i ] = A_in->data[ rand_set[i] ]; A[ i + nd ] = A_in->data[ rand_set[i] + colOffset ]; A[ i + 2*nd ] = A_in->data[ rand_set[i] + colOffset2 ]; A[ i + 3*nd ] = A_in->data[ rand_set[i] + colOffset3 ]; A[ i + 4*nd ] = A_in->data[ rand_set[i] + colOffset4 ]; A[ i + 5*nd ] = A_in->data[ rand_set[i] + colOffset5 ]; B[i] = B_in->data[ 
rand_set[i] ]; } /* A[0] = A_in->data[ rand_set[0] ]; A[1] = A_in->data[ rand_set[1] ]; A[2] = A_in->data[ rand_set[2] ]; A[3] = A_in->data[ rand_set[3] ]; A[4] = A_in->data[ rand_set[4] ]; A[5] = A_in->data[ rand_set[5] ]; A[6] = A_in->data[ rand_set[0] + colOffset ]; A[7] = A_in->data[ rand_set[1] + colOffset ]; A[8] = A_in->data[ rand_set[2] + colOffset ]; A[9] = A_in->data[ rand_set[3] + colOffset ]; A[10] = A_in->data[ rand_set[4] + colOffset ]; A[11] = A_in->data[ rand_set[5] + colOffset ]; A[12] = A_in->data[ rand_set[0] + colOffset2 ]; A[13] = A_in->data[ rand_set[1] + colOffset2 ]; A[14] = A_in->data[ rand_set[2] + colOffset2 ]; A[15] = A_in->data[ rand_set[3] + colOffset2 ]; A[16] = A_in->data[ rand_set[4] + colOffset2 ]; A[17] = A_in->data[ rand_set[5] + colOffset2 ]; A[18] = A_in->data[ rand_set[0] + colOffset3 ]; A[19] = A_in->data[ rand_set[1] + colOffset3 ]; A[20] = A_in->data[ rand_set[2] + colOffset3 ]; A[21] = A_in->data[ rand_set[3] + colOffset3 ]; A[22] = A_in->data[ rand_set[4] + colOffset3 ]; A[23] = A_in->data[ rand_set[5] + colOffset3 ]; A[24] = A_in->data[ rand_set[0] + colOffset4 ]; A[25] = A_in->data[ rand_set[1] + colOffset4 ]; A[26] = A_in->data[ rand_set[2] + colOffset4 ]; A[27] = A_in->data[ rand_set[3] + colOffset4 ]; A[28] = A_in->data[ rand_set[4] + colOffset4 ]; A[29] = A_in->data[ rand_set[5] + colOffset4 ]; A[30] = A_in->data[ rand_set[0] + colOffset5 ]; A[31] = A_in->data[ rand_set[1] + colOffset5 ]; A[32] = A_in->data[ rand_set[2] + colOffset5 ]; A[33] = A_in->data[ rand_set[3] + colOffset5 ]; A[34] = A_in->data[ rand_set[4] + colOffset5 ]; A[35] = A_in->data[ rand_set[5] + colOffset5 ]; model_out->data[0] = B_in->data[rand_set[0]]; model_out->data[1] = B_in->data[rand_set[1]]; model_out->data[2] = B_in->data[rand_set[2]]; model_out->data[3] = B_in->data[rand_set[3]]; model_out->data[4] = B_in->data[rand_set[4]]; model_out->data[5] = B_in->data[rand_set[5]]; */ break; default: mexErrMsgTxt("mvarPolynomial: only 1st and 2nd 
order multivariate polynomials are implemented!!"); } /* Solve the linear equation A*model_out = B ( B is stored in model_out ) */ /* sgesv( &n, /*the number of linear equations */ /* &Bcols, /*number of columns in B (nrhs) */ /* A, /* &n, /*leading dimension of A, lda = max(1,n) */ /* ipiv, /*pivot indices, size n */ /* model_out->data, /* &n, /*leading dimension of B, ldb = max(1,n) */ /* &info ); */ sgels( chn, &m, &Acols, &Bcols, A, &m, B, &m, work, &lwork, &info ); memcpy( model_out->data, B, model_out->dimElems[0]*sizeof(float) ); }else { memcpy( model_out->data, model_in->data, model_in->dimElems[0]*sizeof(float) ); info = 0; } if( info==0 ) { /* C = alpha*A*B + beta*C */ /* Using out variables: err_out = alpha*A*model_out + beta*err_out */ /*Calculate the error between the model and the data*/ memcpy( err_out->data, B_in->data, B_in->dimElems[0]*B_in->dimElems[1]*sizeof(float) ); sgemm( chn, /*transA*/ chn, /*transB*/ &Arows, /*m, number of rows of A and C*/ &Mcols, /*n, number of columns of B*/ &Acols, /*k, number of columns of A and number of rows of B*/ &alpha, /*alpha*/ A_in->data, /*A*/ &Arows, /*lda = max(1,m)*/ model_out->data, /*B*/ &Acols, /*ldb = max(1,k)*/ &beta, /*beta*/ err_out->data, /*C*/ &Arows); /*ldc = max(1,m)*/ /*Quadratic error*/ for(i=0;i<err_out->dimElems[0];i++) err_out->data[i] *= err_out->data[i]; }else { for(i=0;i<err_out->dimElems[0];i++) err_out->data[i] = FLT_MAX; } }
int main(int argc, char * argv[]) { CBlasTranspose transA, transB; size_t m, n, k; if (argc != 6) { fprintf(stderr, "Usage: %s <transA> <transB> <m> <n> <k>\n" "where:\n" " transA and transB are 'n' or 'N' for CBlasNoTrans, 't' or 'T' for CBlasTrans or 'c' or 'C' for CBlasConjTrans\n" " m, n and k are the sizes of the matrices\n", argv[0]); return 1; } char t; if (sscanf(argv[1], "%c", &t) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[1]); return 1; } switch (t) { case 'N': case 'n': transA = CBlasNoTrans; break; case 'T': case 't': transA = CBlasTrans; break; case 'C': case 'c': transA = CBlasConjTrans; break; default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 1; } if (sscanf(argv[2], "%c", &t) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[2]); return 2; } switch (t) { case 'N': case 'n': transB = CBlasNoTrans; break; case 'T': case 't': transB = CBlasTrans; break; case 'C': case 'c': transB = CBlasConjTrans; break; default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 1; } if (sscanf(argv[3], "%zu", &m) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[3]); return 3; } if (sscanf(argv[4], "%zu", &n) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[4]); return 4; } if (sscanf(argv[5], "%zu", &k) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[5]); return 5; } srand(0); float alpha, beta, * A, * B, * C, * refC; size_t lda, ldb, ldc; alpha = (float)rand() / (float)RAND_MAX; beta = (float)rand() / (float)RAND_MAX; if (transA == CBlasNoTrans) { lda = (m + 3u) & ~3u; if ((A = malloc(lda * k * sizeof(float))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } for (size_t j = 0; j < k; j++) { for (size_t i = 0; i < m; i++) A[j * lda + i] = (float)rand() / (float)RAND_MAX; } } else { lda = (k + 3u) & ~3u; if ((A = malloc(lda * m * sizeof(float))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } for (size_t j = 0; j < m; 
j++) { for (size_t i = 0; i < k; i++) A[j * lda + i] = (float)rand() / (float)RAND_MAX; } } if (transB == CBlasNoTrans) { ldb = (k + 3u) & ~3u; if ((B = malloc(ldb * n * sizeof(float))) == NULL) { fputs("Unable to allocate B\n", stderr); return -2; } for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < k; i++) B[j * ldb + i] = (float)rand() / (float)RAND_MAX; } } else { ldb = (n + 3u) & ~3u; if ((B = malloc(ldb * k * sizeof(float))) == NULL) { fputs("Unable to allocate B\n", stderr); return -2; } for (size_t j = 0; j < k; j++) { for (size_t i = 0; i < n; i++) B[j * ldb + i] = (float)rand() / (float)RAND_MAX; } } ldc = (m + 3u) & ~3u; if ((C = malloc(ldc * n * sizeof(float))) == NULL) { fputs("Unable to allocate C\n", stderr); return -3; } if ((refC = malloc(ldc * n * sizeof(float))) == NULL) { fputs("Unable to allocate refC\n", stderr); return -4; } for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < m; i++) refC[j * ldc + i] = C[j * ldc + i] = (float)rand() / (float)RAND_MAX; } sgemm_ref(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, refC, ldc); sgemm(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); float diff = 0.0f; for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < m; i++) { float d = fabsf(C[j * ldc + i] - refC[j * ldc + i]); if (d > diff) diff = d; } } struct timeval start, stop; if (gettimeofday(&start, NULL) != 0) { fputs("gettimeofday failed\n", stderr); return -5; } for (size_t i = 0; i < 20; i++) sgemm(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); if (gettimeofday(&stop, NULL) != 0) { fputs("gettimeofday failed\n", stderr); return -6; } double time = ((double)(stop.tv_sec - start.tv_sec) + (double)(stop.tv_usec - start.tv_usec) * 1.e-6) / 20.0; size_t flops = 2 * k - 1; // k multiplies and k - 1 adds per element if (alpha != 1.0f) flops += 1; // additional multiply by alpha if (beta != 0.0f) flops += 2; // additional multiply and add by beta float error = (float)flops * 2.0f * FLT_EPSILON; // maximum per 
element error flops *= m * n; // m * n elements bool passed = (diff <= error); fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e\n%sED!\n", time, ((double)flops * 1.e-9) / time, diff, (passed) ? "PASS" : "FAIL"); free(A); free(B); free(C); free(refC); return (int)!passed; }
/* Kabsch alignment */ void kabsch_alignment( std::vector<float> ref, std::vector<float> tar, t_tiltdata &data, gmx_bool bVerbose) { if (ref.size() != tar.size()) { std::cerr << "\nError! Sizes of reference coordinate matrix and simulated structure coordinate matrices do not match!" << std::endl; std::exit(1); } int ncoords = ref.size(); int natoms = ncoords/3; // Center the two selections std::vector<float> stsel1(ncoords,0), stsel2(ncoords,0), stsel2T(ncoords,0); std::vector<float> ref_com(3,0), tar_com(3,0); average_coordinate(ref, ref_com); average_coordinate(tar, tar_com); for (int i=0; i<natoms; i++) { for (int j=0; j<3; j++) { stsel1[i+j*natoms] = ref[i+j*natoms] - ref_com[j]; stsel2[i+j*natoms] = tar[i+j*natoms] - tar_com[j]; } } // Initial residual float E0 = sdot(ncoords,&stsel1[0],1,&stsel1[0],1)+sdot(ncoords,&stsel2[0],1,&stsel2[0],1) ; // dot(target_transpose,reference) std::vector<float> T1_dot_2(3*natoms,0); sgemm('T','N',3,natoms,natoms,1,&stsel2[0],natoms,&stsel1[0],natoms,1,&T1_dot_2[0],3); // SVD of the dot product std::vector<float> U(9,0), S(3,0), V(9,0), work(5*9,0); int info; sgesvd('A','A',3,3,&T1_dot_2[0],3,&S[0],&U[0],3,&V[0],3,&work[0],9*5,info); /*std::cout << "\n S: "; for (int i=0;i<3;i++) { std::cout << S[i] << " "; } std::cout << "\n U: "; for (int i=0;i<9;i++) { std::cout << U[i] << " "; }*/ float reflect = det3x3(&U[0]) * det3x3(&V[0]); if ( 1 - reflect > 1e-5) { S[2] = -S[2]; U[6] = -U[6]; U[7] = -U[7]; U[8] = -U[8]; } float rmsd = sqrt(fabs( E0 - (2.0 * (S[0]+S[1]+S[2]) ) ) /natoms); // Rotation matrix is dot(U,V) std::vector<float> M(9,0); sgemm('N','N',3,3,3,1,&U[0],3,&V[0],3,1,&M[0],3); /* M = [ 0 3 6 ] = [ 00 01 02 ] [ 1 4 7 ] [ 10 11 12 ] [ 2 5 8 ] [ 20 21 22 ] */ float trace = M[0]+M[4]+M[8]; float angle = acos((trace-1)/2)*RAD2DEG; float rx,ry,rz,ux,uy,uz; rx = atan2(M[5],M[8])*RAD2DEG; ry = atan2(-M[2],sqrt(M[5]*M[5]+M[8]*M[8]))*RAD2DEG; rz = atan2(M[1],M[0])*RAD2DEG; float zeta = sqrt( (M[5]-M[7])*(M[5]-M[7]) + 
(M[6]-M[2])*(M[6]-M[2]) + (M[3]-M[1])*(M[3]-M[1]) ); //std::cout << "\n" << M[5] << " - " << M[7] << " = " << M[5]-M[7]; //std::cout << "\n" << M[6] << " - " << M[2] << " = " << M[6]-M[2]; //std::cout << "\n" << M[3] << " - " << M[1] << " = " << M[3]-M[1] << std::endl; ux = (M[5]-M[7])/zeta; uy = (M[6]-M[2])/zeta; uz = (M[3]-M[1])/zeta; //std::cout << zeta << " { " << ux << " " << uy << " " << uz << " }" << sqrt(ux*ux+uy*uy+uz*uz) << std:: endl; if (bVerbose) { fprintf(stdout,"%12s%12s%12s%12s%12s%12s%12s%12s\n","Angle(deg)","rmsd(nm)","x(deg)","y(deg)","z(deg)","ux(nm)","uy(nm)","uz(nm)"); fprintf(stdout,"%12.3f%12.6f%12.4f%12.4f%12.4f%12.4f%12.4f%12.4f\n",angle,rmsd,rx,ry,rz,ux,uy,uz); } data.rotation.push_back(angle); data.rmsd.push_back(rmsd); data.x_rotation.push_back(rx); data.y_rotation.push_back(ry); data.z_rotation.push_back(rz); data.x_rotation_axis.push_back(ux); data.y_rotation_axis.push_back(uy); data.z_rotation_axis.push_back(uz); return; }
/*
 * Offloaded SGEMM example: C = alpha*A*B + beta*C on N x N matrices.
 *
 * N defaults to 5 and can be given as the first argument (a usage line is
 * printed when it is missing, and the default is used). A is filled with 1,
 * B with 2 and C with 0; the multiplication runs inside an offload region
 * and the result, or its upper-left 10x10 block when N > 10, is printed.
 * Returns 0 on success, -1 on an invalid or too large size or when an
 * allocation fails.
 */
int main(int argc, char **argv)
{
    float *A, *B, *C; /* Matrices */

    MKL_INT N=5, NP; /* Matrix dimensions */
    size_t matrix_bytes; /* Matrix size in bytes */
    int matrix_elements; /* Matrix size in elements */

    float alpha = 1.0, beta = 1.0; /* Scaling factors */
    char transa = 'N', transb = 'N'; /* Transposition options */

    int i, j; /* Counters */

    /* Check command line arguments */
    if (argc < 2) {
        printf("\nUsage: %s <N>\n\n", argv[0]);
    } else {
        /* Parse command line arguments */
        N = atoi(argv[1]);
    }

    if (N <= 0) {
        printf("Invalid matrix size\n");
        return -1;
    }

    /* matrix_elements is an int (it is also used as the offload length), so
       N*N must not exceed INT_MAX; 46340 is the largest such N for a 32-bit int */
    if (N > 46340) {
        printf("Matrix size too large\n");
        return -1;
    }

    printf("\nMatrix dimension is being set to %d \n\n", (int)N);

    matrix_elements = (int)(N * N);
    /* byte count is kept in size_t so it cannot wrap where the element count fits */
    matrix_bytes = sizeof(float) * (size_t)matrix_elements;

    /* Allocate the matrices */
    A = (float *)malloc(matrix_bytes);
    if (A == NULL) {
        printf("Could not allocate matrix A\n");
        return -1;
    }

    B = (float *)malloc(matrix_bytes);
    if (B == NULL) {
        printf("Could not allocate matrix B\n");
        free(A);
        return -1;
    }

    C = (float *)malloc(matrix_bytes);
    if (C == NULL) {
        printf("Could not allocate matrix C\n");
        free(A);
        free(B);
        return -1;
    }

    /* Initialize the matrices */
    for (i = 0; i < matrix_elements; i++) {
        A[i] = 1.0; B[i] = 2.0; C[i] = 0.0;
    }

    /* C is sent in and copied back out of the offload region */
    #pragma offload target(mic) \
        in(transa, transb, N, alpha, beta) \
        in(A:length(matrix_elements)) \
        in(B:length(matrix_elements)) \
        in(C:length(matrix_elements)) \
        out(C:length(matrix_elements) alloc_if(0))
    {
        sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N,
              &beta, C, &N);
    }

    /* Display the result */
    printf("Resulting matrix C:\n");
    if (N>10) {
        printf("NOTE: C is too large, so print only its upper-left 10x10 block...\n");
        NP=10;
    } else {
        NP=N;
    }
    printf("\n");
    for (i = 0; i < NP; i++) {
        for (j = 0; j < NP; j++)
            printf("%7.3f ", C[i + j * N]);
        printf("\n");
    }

    /* Free the matrix memory */
    free(A); free(B); free(C);

    return 0;
}