Esempi in C++ (Cpp) per sgemm

Esempio n. 1

0

Mostra file

File: wrapperBlas_cpu.cpp Progetto: caomw/MexConv3D

// Calls sgemm with op(A) = "n" and op(B) = "t", i.e. C = A * B^T when
// isOverWrite is true (beta = 0) and C = A * B^T + C otherwise (beta = 1).
// Sizes are read from the operands: A is [M, K], B is [N, K], C is [M, N].
void AxBTtoC(const matw &A, const matw &B, matw &C, bool isOverWrite)
{
  // Problem sizes: M = rows of A, K = columns of A, N = rows of B.
  ptrdiff_t rowsA = A.H;
  ptrdiff_t colsA = A.W;
  ptrdiff_t rowsB = B.H;

  // Leading dimensions handed to sgemm for A, B and C respectively.
  ptrdiff_t strideA = rowsA;
  ptrdiff_t strideB = rowsB;
  ptrdiff_t strideC = rowsA;

  // Scale of the product and of the previous content of C.
  float scaleAB = 1.0;
  float scaleC;
  if (isOverWrite)
    scaleC = 0.0;
  else
    scaleC = 1.0;

  sgemm("n", "t",
        &rowsA, &rowsB, &colsA,
        &scaleAB,
        (float*)A.beg, &strideA,
        (float*)B.beg, &strideB,
        &scaleC,
        (float*)C.beg, &strideC);
}

Esempio n. 2

0

Mostra file

File: thbasic.c Progetto: lijian8/thnets

/*
 * Single-precision GEMM front end.
 *
 * Adjusts the leading dimensions for vector-shaped operands, checks that
 * every size fits in an int, and then forwards the call to sgemm_ (when
 * USEBLAS is defined, using int copies of the sizes) or to sgemm otherwise.
 * THError is raised when any size or leading dimension exceeds INT_MAX.
 */
static void THBlas_gemm(char transa, char transb, long m, long n, long k, float alpha, float *a, long lda, float *b, long ldb, float beta, float *c, long ldc)
{
	const int a_transposed = (transa == 't') || (transa == 'T');
	const int b_transposed = (transb == 't') || (transb == 'T');

	/* Single-column result: ldc is replaced by m. */
	if(n == 1)
		ldc = m;

	/* A collapses to a single column (as stored): replace lda. */
	if(a_transposed && m == 1)
		lda = k;
	else if(!a_transposed && k == 1)
		lda = m;

	/* Same adjustment for B. */
	if(b_transposed && k == 1)
		ldb = n;
	else if(!b_transposed && n == 1)
		ldb = k;

	/* Reject anything that cannot be represented as an int. */
	if( (m > INT_MAX) || (n > INT_MAX) || (k > INT_MAX) || (lda > INT_MAX) || (ldb > INT_MAX) || (ldc > INT_MAX) )
	{
		THError("Wrong parameters to gemm");
		return;
	}

#ifdef USEBLAS
	{
		/* sgemm_ takes every argument by address and the sizes as int. */
		int m_int = (int)m;
		int n_int = (int)n;
		int k_int = (int)k;
		int lda_int = (int)lda;
		int ldb_int = (int)ldb;
		int ldc_int = (int)ldc;
		sgemm_(&transa, &transb, &m_int, &n_int, &k_int, &alpha, a, &lda_int, b, &ldb_int, &beta, c, &ldc_int);
	}
#else
	sgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
#endif
}

Esempio n. 3

0

Mostra file

File: blashelper.hpp Progetto: BenJamesbabala/twostreamfusion

 // Forwards a matrix-matrix multiply to sgemm. Every scalar argument is
 // passed by address; the const qualifier is cast away from a and b because
 // sgemm is called with non-const pointers. `context` is not used here.
 // Always returns vl::vlSuccess.
 static vl::Error
 gemm(vl::Context& context,
      char op1, char op2,
      ptrdiff_t m, ptrdiff_t n, ptrdiff_t k,
      type alpha,
      type const * a, ptrdiff_t lda,
      type const * b, ptrdiff_t ldb,
      type beta,
      type * c, ptrdiff_t ldc)
 {
   type * aArg = (type*)a ;
   type * bArg = (type*)b ;

   sgemm(&op1, &op2, &m, &n, &k,
         &alpha, aArg, &lda, bArg, &ldb,
         &beta, c, &ldc) ;

   return vl::vlSuccess ;
 }

Esempio n. 4

0

Mostra file

File: task_dep2.c Progetto: matzke1/rose-develop

/*
 * Task-based driver over an NB x NB array of floats.
 *
 * For every step k it spawns OpenMP tasks that call, in order:
 *   - spotrf on A[k][k],
 *   - strsm on (A[k][k], A[k][i]) for every i > k,
 *   - sgemm on (A[k][i], A[k][j], A[j][i]) for every k < j < i,
 *   - ssyrk on (A[k][i], A[i][i]) for every i > k.
 * The depend clauses order the tasks through the entries they read (in)
 * and update (inout).
 *
 * Every loop is bounded by NB, the declared extent of A, so no index can
 * leave the array.
 */
void blocked_cholesky( int NB, float A[NB][NB] ) {
  int i, j, k;

  for (k=0; k<NB; k++) {
#pragma omp task depend(inout:A[k][k])
    spotrf (A[k][k]) ;

    for (i=k+1; i<NB; i++) {
#pragma omp task depend(in:A[k][k]) depend(inout:A[k][i])
      strsm (A[k][k], A[k][i]);
    }

    // update trailing submatrix
    for (i=k+1; i<NB; i++) {
      for (j=k+1; j<i; j++) {
#pragma omp task depend(in:A[k][i],A[k][j]) depend(inout:A[j][i])
        sgemm( A[k][i], A[k][j], A[j][i]);
      }
#pragma omp task depend(in:A[k][i]) depend(inout:A[i][i])
      ssyrk (A[k][i], A[i][i]);
    }
  }
}

Esempio n. 5

0

Mostra file

File: matmul3d.C Progetto: quinoacomputing/quinoa

// Runs the local block multiply once both counters have reached their
// targets (countA == num_chare_z-1 and countB == num_chare_x-1); otherwise
// does nothing. On Blue Gene builds the product is delegated to sgemm, on
// all other builds a plain triple loop accumulates A*B into C. Afterwards
// this element's own slice of C is passed to receiveC and sendC is called.
void Compute::doWork() {
    const bool ready = (countA == num_chare_z-1 && countB == num_chare_x-1);
    if(!ready)
        return;

#if CMK_BLUEGENEP || CMK_VERSION_BLUEGENE
    const char trans = 'N';
    const double alpha = 1.0;
    const double beta = 0.0;

    sgemm(&trans, &trans, blockDimX, blockDimZ, blockDimY, alpha, A, blockDimX, B, blockDimY, beta, C, blockDimX);
#else
    // C[row][col] += A[row][inner] * B[inner][col]; C is accumulated into.
    for(int row=0; row<blockDimX; row++) {
        for(int inner=0; inner<blockDimY; inner++) {
            for(int col=0; col<blockDimZ; col++) {
                C[row*blockDimZ+col] += A[row*blockDimY+inner] * B[inner*blockDimZ+col];
            }
        }
    }
#endif

    receiveC(&C[(thisIndex.y)*subBlockDimXy*blockDimZ], subBlockDimXy*blockDimZ, 0);
    sendC();
}

Esempio n. 6

0

Mostra file

File: wrapperBlas_cpu.cpp Progetto: caomw/MexConv3D

// Calls sgemm with op(A) = op(B) = "n", i.e. C = A * B when isOverWrite is
// true (beta = 0) and C = A * B + C otherwise (beta = 1).
// Sizes are read from the operands: A is [M, K], B is [K, N].
void AxBtoC(const matw &A, const matw &B, matw &C, bool isOverWrite)
{
  // Problem sizes. The caller is expected to supply matching operands:
  // rowsA == C.H, colsA == B.H, colsB == C.W (not checked here).
  ptrdiff_t rowsA = A.H;
  ptrdiff_t colsA = A.W;
  ptrdiff_t colsB = B.W;

  // Leading dimensions handed to sgemm for A, B and C respectively.
  ptrdiff_t strideA = rowsA;
  ptrdiff_t strideB = colsA;
  ptrdiff_t strideC = rowsA;

  // Scale of the product and of the previous content of C.
  float scaleAB = 1.0;
  float scaleC;
  if (isOverWrite)
    scaleC = 0.0;
  else
    scaleC = 1.0;

  sgemm("n", "n",
        &rowsA, &colsB, &colsA,
        &scaleAB,
        (float*)A.beg, &strideA,
        (float*)B.beg, &strideB,
        &scaleC,
        (float*)C.beg, &strideC);
}

Esempio n. 7

0

Mostra file

File: sgemm_kernel_test.c Progetto: patricius972/OpenCL-BLAS

/*
 * Parameterless callback that runs one sgemm call. Every argument is a
 * name defined outside this function.
 */
void kernelCallback()
{
	sgemm(colMajor,
	      transa, transb,
	      m, n, k,
	      alpha,
	      a, lda,
	      b, ldb,
	      beta,
	      c, ldc,
	      SUBMATRIX_SIZE,
	      bufferA, bufferB);
}

Esempio n. 8

0

Mostra file

File: benchmark.c Progetto: mahi29/MatMul-Optimization

/*
 * Benchmarks and validates sgemm( m, n, A, C ) for one problem size.
 *
 * A (m*n floats) and C (m*m floats) are filled with values in [-1, 1).
 * The call is timed over a doubling number of iterations until one batch
 * takes at least 0.1 s, the rate is printed and stored in *gflops_out, and
 * the result is then compared against cblas_sgemm using a bound of
 * 3 * FLT_EPSILON * n on |A| * |A|^T.
 *
 * Returns 0 on success and -1 when allocation fails or the error bound is
 * exceeded (a FAILURE line is printed in both cases). Both buffers are
 * released on every path.
 */
static int run_sgemm_case( int m, int n, double *gflops_out )
{
  /* Allocate and fill 2 random matrices A, C */
  float *A = malloc( (size_t) m * n * sizeof *A );
  float *C = malloc( (size_t) m * m * sizeof *C );
  if( A == NULL || C == NULL ) {
    printf( "FAILURE: unable to allocate the test matrices\n" );
    free( C );
    free( A );
    return -1;
  }

  for( int i = 0; i < m*n; i++ ) A[i] = 2 * drand48() - 1;
  for( int i = 0; i < m*m; i++ ) C[i] = 2 * drand48() - 1;

  /* measure Gflop/s rate; time a sufficiently long sequence of calls to eliminate noise */
  double Gflop_s = 0.0, seconds = -1.0;
  for( int n_iterations = 1; seconds < 0.1; n_iterations *= 2 )
  {
    /* warm-up */
    sgemm( m, n, A, C );

    /* measure time */
    struct timeval start, end;
    gettimeofday( &start, NULL );
    for( int i = 0; i < n_iterations; i++ )
      sgemm( m, n, A, C );
    gettimeofday( &end, NULL );
    seconds = (end.tv_sec - start.tv_sec) + 1.0e-6 * (end.tv_usec - start.tv_usec);

    /* compute Gflop/s rate */
    Gflop_s = 2e-9 * n_iterations * m * m * n / seconds;
  }

  printf( "%d by %d matrix \t %g Gflop/s\n", m, n, Gflop_s );
  *gflops_out = Gflop_s;

  /* Ensure that error does not exceed the theoretical error bound */

  /* Set initial C to 0 and do matrix multiply of A*B */
  memset( C, 0, sizeof( float ) * m * m );
  sgemm( m, n, A, C );

  /* Subtract A*B from C using standard sgemm (note that this should be 0 to within machine roundoff) */
  cblas_sgemm( CblasColMajor,CblasNoTrans,CblasTrans, m,m,n, -1, A,m, A,m, 1, C,m );

  /* Subtract the maximum allowed roundoff from each element of C */
  for( int i = 0; i < m*n; i++ ) A[i] = fabs( A[i] );
  for( int i = 0; i < m*m; i++ ) C[i] = fabs( C[i] );
  cblas_sgemm( CblasColMajor,CblasNoTrans,CblasTrans, m,m,n, -3.0*FLT_EPSILON*n, A,m, A,m, 1, C,m );

  /* After this test if any element in C is still positive something went wrong in sgemm */
  int status = 0;
  for( int i = 0; i < m * m; i++ ) {
    if( C[i] > 0 ) {
      printf( "FAILURE: error in matrix multiply exceeds an acceptable margin\n" );
      status = -1;
      break;
    }
  }

  /* release memory */
  free( C );
  free( A );
  return status;
}

/* Maps the average 60 by 60 rate (Gflop/s) to its score out of 35. */
static int grade_for_60( double average_60 )
{
  if (average_60 >= 10.5) return 35;
  if (average_60 >= 10)   return 34;
  if (average_60 >= 9)    return 32;
  if (average_60 >= 8)    return 30;
  if (average_60 >= 7)    return 25;
  if (average_60 >= 6)    return 20;
  if (average_60 >= 5)    return 15;
  if (average_60 >= 4)    return 10;
  if (average_60 >= 3)    return 7;
  if (average_60 >= 2)    return 4;
  return 1;
}

/*
 * Benchmark driver: 20 runs at 60 by 60, then 100 runs at random sizes
 * (m in [32, 100], n in [32, 300]); prints per-run rates, both averages and
 * the resulting grade. Returns -1 as soon as one run fails, 0 otherwise.
 */
int main( int argc, char **argv )
{
  srand(time(NULL));

  double total_60 = 0.0;
  double counter_60 = 0.0;
  double total_random = 0.0;
  double counter_random = 0.0;
  int m = 60, n = 60;

  printf("\nTesting 60 by 60 matrices 20 times\n");

  for (int i = 0; i < 20; i++) {
    double Gflop_s = 0.0;
    if (run_sgemm_case( m, n, &Gflop_s ) != 0)
      return -1;
    total_60 = total_60 + Gflop_s;
    counter_60 = counter_60 + 1;
  }

  printf("\nTesting random sizes from m = [32, 100] to n = [32, 300] 100 times\n");

  for (int i = 0; i < 100; i++) {
    /* n is drawn before m, keeping the rand() sequence of the sizes stable */
    int rand_n = 32 + (rand() % 269);
    int rand_m = 32 + (rand() % 69);

    double Gflop_s = 0.0;
    if (run_sgemm_case( rand_m, rand_n, &Gflop_s ) != 0)
      return -1;
    total_random = total_random + Gflop_s;
    /* plain increment: the run count is the divisor of the average below */
    counter_random = counter_random + 1;
  }

  double average_random = total_random/counter_random;
  double average_60 = total_60/counter_60;

  printf("\n\nAverage for 60 by 60: %.8f Gflop/s\n", average_60);
  printf("Average for random sizes: %.8f Gflop/s\n", average_random);
  printf("\nPotential Grade for 60 by 60:\n");

  int total = grade_for_60( average_60 );
  printf("%d/35\n", total);

  printf("All or nothing grade for random matrices:\n");
  if (average_random >= 5.0) {
    total = total + 20;
    printf("20/20\n");
  }
  else {
    printf("0/20\n");
  }

  printf("\nPotential Total Grade: %d/55\n", total);
  printf("\nNo partial credit because I have no clue how that's going to work out.\n");
  return 0;
}

Esempio n. 9

0

Mostra file

File: validation.c Progetto: hfp/libxsmm

/*
 * Validates and times dense_test_mul against a reference GEMM.
 *
 * Steps:
 *   1. allocate 64-byte aligned A, B, C and a reference C ("gold");
 *      with STREAM_A_B, A and B hold STREAM_A_B_SIZE consecutive matrices;
 *   2. fill A and B (random values when REPS==1, index-derived values
 *      otherwise) and zero both C buffers;
 *   3. compute the reference result REPS times, either with a plain triple
 *      loop or, with __USE_MKL, through dgemm/sgemm (after one untimed
 *      call whose result is cleared again), and print time and GFLOPS;
 *   4. run dense_test_mul REPS times, print time and GFLOPS (and FLOPS per
 *      cycle when STREAM_A_B is not defined);
 *   5. print the largest absolute difference between the two results over
 *      the MY_M x MY_N part of C and release all buffers.
 *
 * All sizes come from the MY_* and REPS macros; REALTYPE selects float or
 * double.
 */
void run_test(void) {
  /* allocate */
#ifdef STREAM_A_B
  REALTYPE* l_a = (REALTYPE*)_mm_malloc(MY_LDA * MY_K * sizeof(REALTYPE) * STREAM_A_B_SIZE, 64);
  REALTYPE* l_b = (REALTYPE*)_mm_malloc(MY_LDB * MY_N * sizeof(REALTYPE) * STREAM_A_B_SIZE, 64);
  unsigned int l_s;
#else
  REALTYPE* l_a = (REALTYPE*)_mm_malloc(MY_LDA * MY_K * sizeof(REALTYPE), 64);
  REALTYPE* l_b = (REALTYPE*)_mm_malloc(MY_LDB * MY_N * sizeof(REALTYPE), 64);
#endif
  REALTYPE* l_c = (REALTYPE*)_mm_malloc(MY_LDC * MY_N * sizeof(REALTYPE), 64);
  REALTYPE* l_c_gold = (REALTYPE*)_mm_malloc(MY_LDC * MY_N * sizeof(REALTYPE), 64);
  REALTYPE l_max_error = 0.0;

  unsigned int l_i;
  unsigned int l_j;
  unsigned int l_t;
  unsigned int l_m;
  unsigned int l_n;
  unsigned int l_k;

  struct timeval l_start, l_end;
  double l_total;

#ifdef STREAM_A_B
  for ( l_s = 0; l_s < STREAM_A_B_SIZE; l_s++ ) {
    REALTYPE* l_p_a = l_a + (l_s * MY_K * MY_LDA);
#else
    REALTYPE* l_p_a = l_a;
#endif
    /* touch A */
    for ( l_i = 0; l_i < MY_LDA; l_i++) {
      for ( l_j = 0; l_j < MY_K; l_j++) {
#if REPS==1
        l_p_a[(l_j * MY_LDA) + l_i] = (REALTYPE)libxsmm_rng_f64();
#else
        l_p_a[(l_j * MY_LDA) + l_i] = (REALTYPE)(l_i + (l_j * MY_M));
#endif
      }
    }
#ifdef STREAM_A_B
  }
#endif

  /* Both configurations open exactly one scope here (the stream loop or a
     plain block), so the single closing brace after the fill loops is the
     only one needed. */
#ifdef STREAM_A_B
  for ( l_s = 0; l_s < STREAM_A_B_SIZE; l_s++ ) {
    REALTYPE* l_p_b = l_b + (l_s * MY_N * MY_LDB);
#else
  {
    REALTYPE* l_p_b = l_b;
#endif
    /* touch B */
    for ( l_i = 0; l_i < MY_LDB; l_i++ ) {
      for ( l_j = 0; l_j < MY_N; l_j++ ) {
#if REPS==1
        l_p_b[(l_j * MY_LDB) + l_i] = (REALTYPE)libxsmm_rng_f64();
#else
        l_p_b[(l_j * MY_LDB) + l_i] = (REALTYPE)(l_i + (l_j * MY_K));
#endif
      }
    }
  }

  /* touch C */
  for ( l_i = 0; l_i < MY_LDC; l_i++) {
    for ( l_j = 0; l_j < MY_N; l_j++) {
      l_c[(l_j * MY_LDC) + l_i] = (REALTYPE)0.0;
      l_c_gold[(l_j * MY_LDC) + l_i] = (REALTYPE)0.0;
    }
  }

#ifdef __USE_MKL
  /* One untimed library call before the measurement; its result is
     discarded by the re-initialisation of C right below. */
  {
    char l_trans = 'N';
    int l_M = MY_M;
    int l_N = MY_N;
    int l_K = MY_K;
    int l_lda = MY_LDA;
    int l_ldb = MY_LDB;
    int l_ldc = MY_LDC;
    if (sizeof(REALTYPE) == sizeof(double)) {
      double l_one = 1.0;
      dgemm(&l_trans, &l_trans, &l_M, &l_N, &l_K, &l_one, (double*)l_a, &l_lda, (double*)l_b, &l_ldb, &l_one, (double*)l_c_gold, &l_ldc);
    } else {
      float l_one = 1.0f;
      sgemm(&l_trans, &l_trans, &l_M, &l_N, &l_K, &l_one, (float*)l_a, &l_lda, (float*)l_b, &l_ldb, &l_one, (float*)l_c_gold, &l_ldc);
    }
  }

  /* touch C */
  for ( l_i = 0; l_i < MY_LDC; l_i++) {
    for ( l_j = 0; l_j < MY_N; l_j++) {
      l_c[(l_j * MY_LDC) + l_i] = (REALTYPE)0.0;
      l_c_gold[(l_j * MY_LDC) + l_i] = (REALTYPE)0.0;
    }
  }
#endif

  /* C routine */
  gettimeofday(&l_start, NULL);
#ifndef __USE_MKL
  #pragma nounroll_and_jam
  for ( l_t = 0; l_t < REPS; l_t++ ) {
#ifdef STREAM_A_B
    /* start one matrix before the buffers; the stream loop advances the
       pointers before their first use */
    REALTYPE* l_p_a = l_a - (MY_K * MY_LDA);
    REALTYPE* l_p_b = l_b - (MY_N * MY_LDB);
    for ( l_s = 0; l_s < STREAM_A_B_SIZE; l_s++ ) {
      l_p_a += (MY_K * MY_LDA);
      l_p_b += (MY_N * MY_LDB);
#else
      REALTYPE* l_p_a = l_a;
      REALTYPE* l_p_b = l_b;
#endif
      for ( l_n = 0; l_n < MY_N; l_n++ ) {
        for ( l_k = 0; l_k < MY_K; l_k++ ) {
          #pragma vector always
          for ( l_m = 0; l_m < MY_M; l_m++ ) {
            l_c_gold[(l_n * MY_LDC) + l_m] += l_p_a[(l_k * MY_LDA) + l_m] * l_p_b[(l_n * MY_LDB) + l_k];
          }
        }
      }
#ifdef STREAM_A_B
    }
#endif
  }
#else
  char l_trans = 'N';
  int l_M = MY_M;
  int l_N = MY_N;
  int l_K = MY_K;
  int l_lda = MY_LDA;
  int l_ldb = MY_LDB;
  int l_ldc = MY_LDC;
  if (sizeof(REALTYPE) == sizeof(double)) {
    double l_one = 1.0;
    for ( l_t = 0; l_t < REPS; l_t++ ) {
#ifdef STREAM_A_B
      REALTYPE* l_p_a = l_a - (MY_K * MY_LDA);
      REALTYPE* l_p_b = l_b - (MY_N * MY_LDB);
      for ( l_s = 0; l_s < STREAM_A_B_SIZE; l_s++ ) {
        l_p_a += (MY_K * MY_LDA);
        l_p_b += (MY_N * MY_LDB);
#else
        REALTYPE* l_p_a = l_a;
        REALTYPE* l_p_b = l_b;
#endif
        dgemm(&l_trans, &l_trans, &l_M, &l_N, &l_K, &l_one, (double*)l_p_a, &l_lda, (double*)l_p_b, &l_ldb, &l_one, (double*)l_c_gold, &l_ldc);
#ifdef STREAM_A_B
      }
#endif
    }
  } else {
    float l_one = 1.0f;
    for ( l_t = 0; l_t < REPS; l_t++ ) {
#ifdef STREAM_A_B
      REALTYPE* l_p_a = l_a - (MY_K * MY_LDA);
      REALTYPE* l_p_b = l_b - (MY_N * MY_LDB);
      for ( l_s = 0; l_s < STREAM_A_B_SIZE; l_s++ ) {
        l_p_a += (MY_K * MY_LDA);
        l_p_b += (MY_N * MY_LDB);
#else
        REALTYPE* l_p_a = l_a;
        REALTYPE* l_p_b = l_b;
#endif
        sgemm(&l_trans, &l_trans, &l_M, &l_N, &l_K, &l_one, (float*)l_p_a, &l_lda, (float*)l_p_b, &l_ldb, &l_one, (float*)l_c_gold, &l_ldc);
#ifdef STREAM_A_B
      }
#endif
    }
  }
#endif
  gettimeofday(&l_end, NULL);

  l_total = sec(l_start, l_end);
#ifndef __USE_MKL
  printf("%fs for C\n", l_total);
#ifdef STREAM_A_B
  printf("%f GFLOPS for C\n", ((double)((double)REPS * (double)MY_M * (double)MY_N * (double)MY_K) * 2.0 * ((double)STREAM_A_B_SIZE)) / (l_total * 1.0e9));
#else
  printf("%f GFLOPS for C\n", ((double)((double)REPS * (double)MY_M * (double)MY_N * (double)MY_K) * 2.0) / (l_total * 1.0e9));
#endif
#else
  printf("%fs for MKL\n", l_total);
#ifdef STREAM_A_B
  printf("%f GFLOPS for MKL\n", ((double)((double)REPS * (double)MY_M * (double)MY_N * (double)MY_K) * 2.0 * ((double)STREAM_A_B_SIZE)) / (l_total * 1.0e9));
#else
  printf("%f GFLOPS for MKL\n", ((double)((double)REPS * (double)MY_M * (double)MY_N * (double)MY_K) * 2.0) / (l_total * 1.0e9));
#endif
#endif

  /* kernel under test, timed both by wall clock and by cycle counter */
  gettimeofday(&l_start, NULL);
  libxsmm_timer_tickint l_cyc_start = libxsmm_timer_cycles();

  for ( l_t = 0; l_t < REPS; l_t++ ) {
#ifdef STREAM_A_B
    REALTYPE* l_p_a = l_a - (MY_K * MY_LDA);
    REALTYPE* l_p_b = l_b - (MY_N * MY_LDB);
    for ( l_s = 0; l_s < STREAM_A_B_SIZE; l_s++ ) {
      l_p_a += (MY_K * MY_LDA);
      l_p_b += (MY_N * MY_LDB);
#else
      REALTYPE* l_p_a = l_a;
      REALTYPE* l_p_b = l_b;
#endif
#ifdef STREAM_A_B_PREFETCH
      dense_test_mul(l_p_a, l_p_b, l_c, l_p_a + (MY_K * MY_LDA), l_p_b + (MY_N * MY_LDB), NULL);
#else
      dense_test_mul(l_p_a, l_p_b, l_c);
#endif
#ifdef STREAM_A_B
    }
#endif
  }
  libxsmm_timer_tickint l_cyc_end = libxsmm_timer_cycles();
  gettimeofday(&l_end, NULL);
  l_total = sec(l_start, l_end);

  printf("%fs for assembly\n", l_total);
#ifdef STREAM_A_B
  printf("%f GFLOPS for assembly\n", ((double)((double)REPS * (double)MY_M * (double)MY_N * (double)MY_K) * 2.0 * ((double)STREAM_A_B_SIZE)) / (l_total * 1.0e9));
#else
  printf("%f GFLOPS for assembly\n", ((double)((double)REPS * (double)MY_M * (double)MY_N * (double)MY_K) * 2.0) / (l_total * 1.0e9));
  printf("%f FLOPS/cycle for assembly (using libxsmm_timer_cycles())\n", ((double)((double)REPS * (double)MY_M * (double)MY_N * (double)MY_K) * 2.0) / ((double)(l_cyc_end - l_cyc_start)));
#endif

  /* check result */
  for ( l_i = 0; l_i < MY_M; l_i++) {
    for ( l_j = 0; l_j < MY_N; l_j++) {
#if 0
      printf("Entries in row %i, column %i, gold: %f, assembly: %f\n", l_i+1, l_j+1, l_c_gold[(l_j*MY_M)+l_i], l_c[(l_j*MY_M)+l_i]);
#endif
      if (l_max_error < fabs( l_c_gold[(l_j * MY_LDC) + l_i] - l_c[(l_j * MY_LDC) + l_i]))
        l_max_error = fabs( l_c_gold[(l_j * MY_LDC) + l_i] - l_c[(l_j * MY_LDC) + l_i]);
    }
  }

  printf("max. error: %f\n", l_max_error);

  /* free */
  _mm_free(l_a);
  _mm_free(l_b);
  _mm_free(l_c);
  _mm_free(l_c_gold);
}

Esempio n. 10

0

Mostra file

File: benchmark.c Progetto: KevinJia191/CS61C

/*
 * Benchmark driver for sgemm( m, n, A, C ) with n fixed at 32 and m growing
 * from 32 to just under 10000 (m = m+1+m/3 per step).
 *
 * For each size it fills A (m*n) and C (m*m) with values in [-1, 1), times
 * a doubling number of calls until one batch lasts at least 0.1 s, prints
 * the Gflop/s rate, and checks the result against cblas_sgemm with a bound
 * of 3 * FLT_EPSILON * n on |A| * |A|^T.
 *
 * Returns 0 when every size passes, -1 when an allocation fails or the
 * error bound is exceeded. The buffers are released on every path.
 */
int main( int argc, char **argv )
{
  srand(time(NULL));

  int n = 32;

  /* Try different m */
  for( int m = 32; m < 10000; m = m+1+m/3 )
  {
    /* Allocate and fill 2 random matrices A, C */
    float *A = malloc( (size_t) m * n * sizeof *A );
    float *C = malloc( (size_t) m * m * sizeof *C );
    if( A == NULL || C == NULL ) {
      /* C grows to hundreds of MB for the largest m, so this can happen */
      printf( "FAILURE: unable to allocate the test matrices\n" );
      free( C );
      free( A );
      return -1;
    }

    for( int i = 0; i < m*n; i++ ) A[i] = 2 * drand48() - 1;
    for( int i = 0; i < m*m; i++ ) C[i] = 2 * drand48() - 1;

    /* measure Gflop/s rate; time a sufficiently long sequence of calls to eliminate noise */
    double Gflop_s = 0.0, seconds = -1.0;
    for( int n_iterations = 1; seconds < 0.1; n_iterations *= 2 )
    {
      /* warm-up */
      sgemm( m, n, A, C );

      /* measure time */
      struct timeval start, end;
      gettimeofday( &start, NULL );
      for( int i = 0; i < n_iterations; i++ )
        sgemm( m, n, A, C );
      gettimeofday( &end, NULL );
      seconds = (end.tv_sec - start.tv_sec) + 1.0e-6 * (end.tv_usec - start.tv_usec);

      /* compute Gflop/s rate */
      Gflop_s = 2e-9 * n_iterations * m * m * n / seconds;
    }

    printf( "%d by %d matrix \t %g Gflop/s\n", m, n, Gflop_s );

    /* Ensure that error does not exceed the theoretical error bound */

    /* Set initial C to 0 and do matrix multiply of A*B */
    memset( C, 0, sizeof( float ) * m * m );
    sgemm( m, n, A, C );

    /* Subtract A*B from C using standard sgemm (note that this should be 0 to within machine roundoff) */
    cblas_sgemm( CblasColMajor,CblasNoTrans,CblasTrans, m,m,n, -1, A,m, A,m, 1, C,m );

    /* Subtract the maximum allowed roundoff from each element of C */
    for( int i = 0; i < m*n; i++ ) A[i] = fabs( A[i] );
    for( int i = 0; i < m*m; i++ ) C[i] = fabs( C[i] );
    cblas_sgemm( CblasColMajor,CblasNoTrans,CblasTrans, m,m,n, -3.0*FLT_EPSILON*n, A,m, A,m, 1, C,m );

    /* After this test if any element in C is still positive something went wrong in sgemm */
    int failed = 0;
    for( int i = 0; i < m * m; i++ ) {
      if( C[i] > 0 ) {
        printf( "FAILURE: error in matrix multiply exceeds an acceptable margin\n" );
        failed = 1;
        break;
      }
    }

    /* release memory */
    free( C );
    free( A );

    if( failed )
      return -1;
  }

  return 0;
}

Esempio n. 11

0

Mostra file

File: SurfaceEquation.c Progetto: Guokr1991/PDE-based-image-processing

/*
 * Multivariate polynomial model: fit (optional) and per-row squared error.
 *
 * When model_in is NULL, nd rows selected by rand_set are copied from A_in
 * and B_in into local buffers (3 columns of A_in when model_out has 3 rows,
 * 6 columns when it has 6 rows; any other size is reported through
 * mexErrMsgTxt), sgels is called on them and the leading
 * model_out->dimElems[0] floats of its right-hand-side buffer are copied to
 * model_out. When model_in is given, its data is copied to model_out
 * instead and no fit is performed.
 *
 * If the fit reported info == 0 (or no fit was needed), err_out receives
 * A_in * model_out - B_in via sgemm and its first err_out->dimElems[0]
 * entries are squared in place; otherwise those entries are set to FLT_MAX.
 */
void mvarPolynomial( struct matrixM *A_in,
		     struct matrixM *B_in,
		     struct matrixM *model_out,
		     struct matrixM *model_in,
		     struct matrixM *err_out, 
		     unsigned int *rand_set, 
		     unsigned int nd )
{
	/* Local copies of the sampled system handed to sgels. */
	float A[M*N], B[M];
	float work[LWORK];
	mwSize lwork = LWORK;
	mwSize m = nd;
	mwSize Arows = A_in->dimElems[0];
	mwSize Acols = A_in->dimElems[1];
	mwSize Bcols = B_in->dimElems[1];
	mwSize Mcols = model_out->dimElems[1];
	mwSize info;

	unsigned int i, col;
	unsigned int ncoef = 0;
	/* Distance between two consecutive columns of A_in->data. */
	unsigned int colOffset = A_in->dimElems[0];
	char *chn = "N";
	/* sgemm scalars: err_out = 1 * A_in * model_out + (-1) * err_out */
	float alpha = 1.0f, beta = -1.0f;

	if( model_in != NULL )
	{
		/* A model was supplied: use it as is. */
		memcpy( model_out->data, model_in->data, model_in->dimElems[0]*sizeof(float) );
		info = 0;
	}
	else
	{
		/* Number of columns to sample: 3 for 1st order, 6 for 2nd order. */
		switch( model_out->dimElems[0] )
		{
			case 3:
				ncoef = 3;
				break;
			case 6:
				ncoef = 6;
				break;
			default:
				mexErrMsgTxt("mvarPolynomial: only 1st and 2nd order multivariate polynomials are implemented!!");
		}

		/* Gather the sampled rows; column col of the local A starts at col*nd. */
		if( ncoef != 0 )
		{
			for( i = 0; i < nd; i++ )
			{
				for( col = 0; col < ncoef; col++ )
					A[ i + col*nd ] = A_in->data[ rand_set[i] + col*colOffset ];

				B[i] = B_in->data[ rand_set[i] ];
			}
		}

		/* sgels overwrites B; the model is read back from its leading entries. */
		sgels( chn, &m, &Acols, &Bcols, A, &m, B, &m, work, &lwork, &info );
		memcpy( model_out->data, B, model_out->dimElems[0]*sizeof(float) );
	}

	if( info != 0 )
	{
		/* The fit failed: mark every error entry with the largest float. */
		for( i = 0; i < err_out->dimElems[0]; i++ )
			err_out->data[i] = FLT_MAX;
	}
	else
	{
		/* Start from B_in so that sgemm with beta = -1 leaves A*model - B. */
		memcpy( err_out->data, B_in->data, B_in->dimElems[0]*B_in->dimElems[1]*sizeof(float) );
		sgemm( chn,			/* transA */
		       chn,			/* transB */
		       &Arows,			/* m */
		       &Mcols,			/* n */
		       &Acols,			/* k */
		       &alpha,
		       A_in->data,		/* A */
		       &Arows,			/* lda */
		       model_out->data,		/* B */
		       &Acols,			/* ldb */
		       &beta,
		       err_out->data,		/* C */
		       &Arows );		/* ldc */

		/* Quadratic error */
		for( i = 0; i < err_out->dimElems[0]; i++ )
			err_out->data[i] *= err_out->data[i];
	}
}

Esempio n. 12

0

Mostra file

File: sgemm.c Progetto: garymacindoe/cuda-cholesky

/*
 * Rounds x up to the next multiple of four. The mask is built in size_t so
 * that no high-order bits of x are cleared on platforms where size_t is
 * wider than unsigned int.
 */
static size_t round_up_4(size_t x) {
  return (x + 3u) & ~(size_t)3;
}

/*
 * Allocates ld * cols floats and stores rand() / RAND_MAX into the first
 * `rows` entries of each of the `cols` columns (column j starts at j * ld).
 * Returns NULL when the byte count would overflow size_t or malloc fails.
 */
static float * alloc_random_matrix(size_t rows, size_t cols, size_t ld) {
  const size_t max_size = (size_t)-1;
  if (cols != 0 && ld > max_size / sizeof(float) / cols)
    return NULL;

  float * X = malloc(ld * cols * sizeof(float));
  if (X == NULL)
    return NULL;

  for (size_t j = 0; j < cols; j++) {
    for (size_t i = 0; i < rows; i++)
      X[j * ld + i] = (float)rand() / (float)RAND_MAX;
  }
  return X;
}

/*
 * Test driver: sgemm <transA> <transB> <m> <n> <k>.
 *
 * Builds random operands (seeded with srand(0)), runs sgemm_ref and sgemm
 * on identical copies of C, records the largest absolute difference, times
 * 20 further sgemm calls and prints time, GFlops/s, the error and PASSED or
 * FAILED.
 *
 * Exit status: 0 when the error is within bound and 1 when it is not;
 * 1 to 5 for command-line errors; -1 to -4 when A, B, C or refC cannot be
 * allocated; -5 or -6 when gettimeofday fails. All buffers are released
 * before returning.
 */
int main(int argc, char * argv[]) {
  CBlasTranspose transA, transB;
  size_t m, n, k;

  if (argc != 6) {
    fprintf(stderr, "Usage: %s <transA> <transB> <m> <n> <k>\n"
                    "where:\n"
                    "  transA and transB  are 'n' or 'N' for CBlasNoTrans, 't' or 'T' for CBlasTrans or 'c' or 'C' for CBlasConjTrans\n"
                    "  m, n and k         are the sizes of the matrices\n", argv[0]);
    return 1;
  }

  char t;
  if (sscanf(argv[1], "%c", &t) != 1) {
    fprintf(stderr, "Unable to read character from '%s'\n", argv[1]);
    return 1;
  }
  switch (t) {
    case 'N': case 'n': transA = CBlasNoTrans; break;
    case 'T': case 't': transA = CBlasTrans; break;
    case 'C': case 'c': transA = CBlasConjTrans; break;
    default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 1;
  }

  if (sscanf(argv[2], "%c", &t) != 1) {
    fprintf(stderr, "Unable to read character from '%s'\n", argv[2]);
    return 2;
  }
  switch (t) {
    case 'N': case 'n': transB = CBlasNoTrans; break;
    case 'T': case 't': transB = CBlasTrans; break;
    case 'C': case 'c': transB = CBlasConjTrans; break;
    default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 1;
  }

  if (sscanf(argv[3], "%zu", &m) != 1) {
    fprintf(stderr, "Unable to parse number from '%s'\n", argv[3]);
    return 3;
  }

  if (sscanf(argv[4], "%zu", &n) != 1) {
    fprintf(stderr, "Unable to parse number from '%s'\n", argv[4]);
    return 4;
  }

  if (sscanf(argv[5], "%zu", &k) != 1) {
    fprintf(stderr, "Unable to parse number from '%s'\n", argv[5]);
    return 5;
  }

  srand(0);

  /* alpha and beta are drawn first, then A, B and C, in that order. */
  const float alpha = (float)rand() / (float)RAND_MAX;
  const float beta = (float)rand() / (float)RAND_MAX;

  /* Stored shapes: A is m x k (k x m when transposed), B is k x n (n x k
     when transposed), C is m x n; leading dimensions are padded to 4. */
  const bool plainA = (transA == CBlasNoTrans);
  const bool plainB = (transB == CBlasNoTrans);
  const size_t lda = round_up_4(plainA ? m : k);
  const size_t ldb = round_up_4(plainB ? k : n);
  const size_t ldc = round_up_4(m);

  float * A = NULL, * B = NULL, * C = NULL, * refC = NULL;
  int status = 0;

  A = plainA ? alloc_random_matrix(m, k, lda) : alloc_random_matrix(k, m, lda);
  if (A == NULL) {
    fputs("Unable to allocate A\n", stderr);
    status = -1;
    goto cleanup;
  }

  B = plainB ? alloc_random_matrix(k, n, ldb) : alloc_random_matrix(n, k, ldb);
  if (B == NULL) {
    fputs("Unable to allocate B\n", stderr);
    status = -2;
    goto cleanup;
  }

  C = alloc_random_matrix(m, n, ldc);
  if (C == NULL) {
    fputs("Unable to allocate C\n", stderr);
    status = -3;
    goto cleanup;
  }

  /* Same byte count as C, which has just been validated and allocated. */
  refC = malloc(ldc * n * sizeof(float));
  if (refC == NULL) {
    fputs("Unable to allocate refC\n", stderr);
    status = -4;
    goto cleanup;
  }

  /* refC starts as an element-wise copy of C. */
  for (size_t j = 0; j < n; j++) {
    for (size_t i = 0; i < m; i++)
      refC[j * ldc + i] = C[j * ldc + i];
  }

  {
    sgemm_ref(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, refC, ldc);
    sgemm(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);

    /* Largest absolute deviation from the reference. */
    float diff = 0.0f;
    for (size_t j = 0; j < n; j++) {
      for (size_t i = 0; i < m; i++) {
        float d = fabsf(C[j * ldc + i] - refC[j * ldc + i]);
        if (d > diff)
          diff = d;
      }
    }

    struct timeval start, stop;
    if (gettimeofday(&start, NULL) != 0) {
      fputs("gettimeofday failed\n", stderr);
      status = -5;
      goto cleanup;
    }
    for (size_t i = 0; i < 20; i++)
      sgemm(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
    if (gettimeofday(&stop, NULL) != 0) {
      fputs("gettimeofday failed\n", stderr);
      status = -6;
      goto cleanup;
    }

    double seconds = ((double)(stop.tv_sec - start.tv_sec) +
                      (double)(stop.tv_usec - start.tv_usec) * 1.e-6) / 20.0;

    size_t flops = 2 * k - 1;     // k multiplies and k - 1 adds per element
    if (alpha != 1.0f)
      flops += 1;                 // additional multiply by alpha
    if (beta != 0.0f)
      flops += 2;                 // additional multiply and add by beta
    float error = (float)flops * 2.0f * FLT_EPSILON;     // maximum per element error
    flops *= m * n;               // m * n elements

    bool passed = (diff <= error);
    fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e\n%sED!\n", seconds,
            ((double)flops * 1.e-9) / seconds, diff, (passed) ? "PASS" : "FAIL");

    status = (int)!passed;
  }

cleanup:
  /* free(NULL) is a no-op, so partially built state is handled as well. */
  free(A);
  free(B);
  free(C);
  free(refC);

  return status;
}

Esempio n. 13

0

Mostra file

File: gmx_tilt.cpp Progetto: awritchie/my_gmx

/**
 * Kabsch alignment of a target structure onto a reference structure.
 *
 * Both coordinate vectors are indexed as [atom + component*natoms], i.e. all
 * x values first, then all y values, then all z values.  The function
 *   1. subtracts from each set the per-component values returned by
 *      average_coordinate() (presumably the centroid -- confirm against
 *      that helper),
 *   2. forms the 3x3 product of the centered target (transposed) with the
 *      centered reference,
 *   3. runs sgesvd on that product,
 *   4. derives an rmsd, a total rotation angle, per-axis angles and a
 *      rotation axis from the result and appends them to `data`.
 *
 * NOTE(review): the sgemm/sgesvd wrappers are assumed to follow the standard
 * BLAS/LAPACK argument order (column-major); LAPACK's sgesvd returns V
 * transposed -- confirm the wrapper does the same.
 *
 * @param ref       reference coordinates, 3*natoms values
 * @param tar       target coordinates, must have the same size as `ref`
 * @param data      receives one new entry in each of its rotation, rmsd,
 *                  x/y/z_rotation and x/y/z_rotation_axis containers
 * @param bVerbose  when true, a two-line table of the results is printed
 *                  to stdout
 *
 * Terminates the program with exit status 1 when the sizes differ, when
 * fewer than three coordinates are supplied, or when sgesvd reports failure.
 */
void kabsch_alignment( std::vector<float> ref, std::vector<float> tar, t_tiltdata &data, gmx_bool bVerbose)
{
    if (ref.size() != tar.size())
    {
        std::cerr << "\nError! Sizes of reference coordinate matrix and simulated structure coordinate matrices do not match!" << std::endl;
        std::exit(1);
    }
    int ncoords = ref.size();
    int natoms = ncoords/3;
    // An empty selection would index empty vectors and divide by zero below.
    if (natoms < 1)
    {
        std::cerr << "\nError! Kabsch alignment needs at least one atom (three coordinates)!" << std::endl;
        std::exit(1);
    }
    // Center the two selections
    std::vector<float> stsel1(ncoords,0), stsel2(ncoords,0);
    std::vector<float> ref_com(3,0), tar_com(3,0);
    average_coordinate(ref, ref_com);
    average_coordinate(tar, tar_com);
    for (int i=0; i<natoms; i++)
    {
        for (int j=0; j<3; j++)
        {
            stsel1[i+j*natoms] = ref[i+j*natoms] - ref_com[j];
            stsel2[i+j*natoms] = tar[i+j*natoms] - tar_com[j];
        }
    }
    // Initial residual
    float E0 = sdot(ncoords,&stsel1[0],1,&stsel1[0],1)+sdot(ncoords,&stsel2[0],1,&stsel2[0],1) ;
    // dot(target_transpose,reference):
    //   stsel2^T is 3 x natoms, stsel1 is natoms x 3, so the product is 3 x 3
    //   (m = 3, n = 3, k = natoms).  Passing n = natoms would make sgemm
    //   treat stsel1 as natoms x natoms and read past its 3*natoms elements.
    std::vector<float> T1_dot_2(9,0);
    sgemm('T','N',3,3,natoms,1,&stsel2[0],natoms,&stsel1[0],natoms,1,&T1_dot_2[0],3);
    // SVD of the dot product
    std::vector<float> U(9,0), S(3,0), V(9,0), work(5*9,0);
    int info = 0;
    sgesvd('A','A',3,3,&T1_dot_2[0],3,&S[0],&U[0],3,&V[0],3,&work[0],9*5,info);
    if (info != 0)
    {
        std::cerr << "\nError! Singular value decomposition failed in Kabsch alignment (info = " << info << ")!" << std::endl;
        std::exit(1);
    }
    // A determinant product that is not +1 signals an improper rotation
    // (reflection); flip the last singular value and the last column of U.
    float reflect = det3x3(&U[0]) * det3x3(&V[0]);
    if ( 1 - reflect > 1e-5)
    {
        S[2] = -S[2];
        U[6] = -U[6];
        U[7] = -U[7];
        U[8] = -U[8];
    }
    float rmsd = sqrt(fabs(
                           E0
                           - (2.0 *
                              (S[0]+S[1]+S[2])
                              )
                           )
                      /natoms);
    // Rotation matrix is dot(U,V)
    std::vector<float> M(9,0);
    sgemm('N','N',3,3,3,1,&U[0],3,&V[0],3,1,&M[0],3);
    /*
     M = [ 0 3 6 ] = [ 00 01 02 ]
         [ 1 4 7 ]   [ 10 11 12 ]
         [ 2 5 8 ]   [ 20 21 22 ]
     */
    float trace = M[0]+M[4]+M[8];
    // Rounding can push the cosine marginally outside [-1,1], which would
    // make acos return NaN; clamp before taking the angle.
    float cos_angle = (trace-1)/2;
    if (cos_angle > 1)
    {
        cos_angle = 1;
    }
    else if (cos_angle < -1)
    {
        cos_angle = -1;
    }
    float angle = acos(cos_angle)*RAD2DEG;
    float rx,ry,rz,ux,uy,uz;
    rx = atan2(M[5],M[8])*RAD2DEG;
    ry = atan2(-M[2],sqrt(M[5]*M[5]+M[8]*M[8]))*RAD2DEG;
    rz = atan2(M[1],M[0])*RAD2DEG;
    // Rotation axis from the antisymmetric part of M.
    float ax = M[5]-M[7];
    float ay = M[6]-M[2];
    float az = M[3]-M[1];
    float zeta = sqrt(ax*ax + ay*ay + az*az);
    // When M is symmetric the antisymmetric part vanishes and the axis is
    // undefined; report a zero vector instead of dividing by zero.
    ux = uy = uz = 0;
    if (zeta > 0)
    {
        ux = ax/zeta;
        uy = ay/zeta;
        uz = az/zeta;
    }
    if (bVerbose)
    {
        fprintf(stdout,"%12s%12s%12s%12s%12s%12s%12s%12s\n","Angle(deg)","rmsd(nm)","x(deg)","y(deg)","z(deg)","ux(nm)","uy(nm)","uz(nm)");
        fprintf(stdout,"%12.3f%12.6f%12.4f%12.4f%12.4f%12.4f%12.4f%12.4f\n",angle,rmsd,rx,ry,rz,ux,uy,uz);
    }
    data.rotation.push_back(angle);
    data.rmsd.push_back(rmsd);
    data.x_rotation.push_back(rx);
    data.y_rotation.push_back(ry);
    data.z_rotation.push_back(rz);
    data.x_rotation_axis.push_back(ux);
    data.y_rotation_axis.push_back(uy);
    data.z_rotation_axis.push_back(uz);

    return;
}

Esempio n. 14

0

Mostra file

File: sgemm.c Progetto: Quiota/DNN

int main(int argc, char **argv)
{
	float *A, *B, *C; /* Matrices */

	MKL_INT N=5, NP; /* Matrix dimensions */
	int matrix_bytes; /* Matrix size in bytes */
	int matrix_elements; /* Matrix size in elements */

	float alpha = 1.0, beta = 1.0; /* Scaling factors */
	char transa = 'N', transb = 'N'; /* Transposition options */

	int i, j; /* Counters */

	/* Check command line arguments */
	if (argc < 2) {
		printf("\nUsage: %s <N>\n\n", argv[0]);
	} else {
	/* Parse command line arguments */
		N = atoi(argv[1]);
	}

	if (N <= 0) {
		printf("Invalid matrix size\n");
		return -1;
	}

	printf("\nMatrix dimension is being set to %d \n\n", (int)N);

	matrix_elements = N * N;
	matrix_bytes = sizeof(float) * matrix_elements;

	/* Allocate the matrices */
	A = malloc(matrix_bytes);
	if (A == NULL) {
		printf("Could not allocate matrix A\n");
		return -1;
	}

	B = malloc(matrix_bytes);
	if (B == NULL) {
		printf("Could not allocate matrix B\n");
		return -1;
	}

	C = malloc(matrix_bytes);
	if (C == NULL) {
		printf("Could not allocate matrix C\n");
		return -1;
	}

	/* Initialize the matrices */
	for (i = 0; i < matrix_elements; i++) {
		A[i] = 1.0; B[i] = 2.0; C[i] = 0.0;
	}

#pragma offload target(mic) \
	in(transa, transb, N, alpha, beta) \
	in(A:length(matrix_elements)) \
	in(B:length(matrix_elements)) \
	in(C:length(matrix_elements)) \
	out(C:length(matrix_elements) alloc_if(0))
	{
		sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N,
				&beta, C, &N);
	}

	/* Display the result */
	printf("Resulting matrix C:\n");
	if (N>10) {
		printf("NOTE: C is too large, so print only its upper-left 10x10 block...\n");
		NP=10;
	} else {
		NP=N;
	}
	printf("\n");
	for (i = 0; i < NP; i++) {
		for (j = 0; j < NP; j++)
			printf("%7.3f ", C[i + j * N]);
		printf("\n");
	}

	/* Free the matrix memory */
	free(A); free(B); free(C);

	return 0;
}