/* --
 * Verify that square_dgemm produces C = A*B up to rounding error.
 *
 * Background: a floating-point dot product satisfies
 *
 *   float(sum a_i * b_i) = sum a_i * b_i * (1 + delta_i)
 *
 * with delta_i <= n * epsilon.  Every entry of the product is therefore
 * recomputed here with a plain dot product and compared with the entry
 * stored in C; the two may differ by at most three times that bound.
 * The factor of three accounts for the three places rounding creeps in:
 *
 *   - rounding in the multiply under test
 *   - rounding in the reference dot product computed below
 *   - rounding while accumulating the bound itself
 *
 * (The last contribution is minor.)
 *
 * Entry (row, col) of each M x M matrix is read at index col*M + row.
 * C is cleared with matrix_clear and overwritten by square_dgemm first.
 * On the first entry that violates the bound, a diagnostic is written to
 * stderr, diff_dgemm is invoked, and the process exits with status -1.
 */
void validate_dgemm(const int M, const double *A, const double *B, double *C)
{
    matrix_clear(C);
    square_dgemm(M, A, B, C);

    for (int row = 0; row < M; ++row) {
        for (int col = 0; col < M; ++col) {
            double reference = 0;
            double abs_sum = 0;

            for (int k = 0; k < M; ++k) {
                const double term = A[k*M + row] * B[col*M + k];
                reference += term;
                abs_sum += fabs(term);
            }

            /* Scale the sum of magnitudes into the dot-product bound. */
            abs_sum *= (M * DBL_EPSILON);

            const double limit = 3*abs_sum;
            const double computed = C[col*M + row];
            const double err = fabs(computed - reference);

            /* Written as a negated '>' so a NaN error is treated as passing,
             * exactly like a plain 'if (err > limit)' failure test would. */
            if (!(err > limit)) {
                continue;
            }

            fprintf(stderr, "Matrix multiply failed.\n");
            fprintf(stderr, "C(%d,%d) should be %lg, was %lg\n",
                    row, col, reference, computed);
            fprintf(stderr, "Error of %lg, acceptable limit %lg\n",
                    err, limit);
            diff_dgemm(M, A, B, C);
            exit(-1);
        }
    }
}
/* --
 * Dump the product computed by square_dgemm, a reference product, and
 * their element-wise difference to text files for offline inspection.
 *
 * C is cleared with matrix_clear and recomputed with square_dgemm, then
 * every entry is compared against a plain dot product.  Entry (i, j) of
 * each M x M matrix is read at index j*M + i.  Three files are written
 * in the current directory, one text line per matrix row i:
 *
 *   dump_our.txt   entries of C as produced by square_dgemm
 *   dump_ref.txt   entries of the reference dot products
 *   dump_diff.txt  C(i,j) - reference(i,j)
 *
 * If any of the files cannot be opened, a message is written to stderr
 * and the function returns without touching C or writing anything.
 */
void diff_dgemm(const int M, const double *A, const double *B, double *C)
{
    FILE *fp_our = fopen("dump_our.txt", "w");
    FILE *fp_ref = fopen("dump_ref.txt", "w");
    FILE *fp_diff = fopen("dump_diff.txt", "w");

    /* fopen returns NULL on failure; writing to a NULL stream is undefined. */
    if (fp_our == NULL || fp_ref == NULL || fp_diff == NULL) {
        fprintf(stderr, "diff_dgemm: could not open dump files for writing\n");
        if (fp_diff != NULL) {
            fclose(fp_diff);
        }
        if (fp_ref != NULL) {
            fclose(fp_ref);
        }
        if (fp_our != NULL) {
            fclose(fp_our);
        }
        return;
    }

    matrix_clear(C);
    square_dgemm(M, A, B, C);

    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < M; ++j) {
            double dotprod = 0;
            for (int k = 0; k < M; ++k) {
                dotprod += A[k*M + i] * B[j*M + k];
            }
            fprintf(fp_our, " %g", C[j*M+i]);
            fprintf(fp_ref, " %g", dotprod);
            fprintf(fp_diff, " % 0.0e", C[j*M+i]-dotprod);
        }
        fprintf(fp_our, "\n");
        fprintf(fp_ref, "\n");
        fprintf(fp_diff, "\n");
    }

    fclose(fp_diff);
    fclose(fp_ref);
    fclose(fp_our);
}
/* --
 * Measure the MFlop/s rate of C += A*B as performed by square_dgemm.
 *
 * A batch of multiplications is timed with omp_get_wtime.  The first
 * batch has MIN_RUNS calls; while a batch finishes in less than MIN_SECS
 * the batch size is doubled and the measurement repeated, which works
 * around limited timer resolution.  C is cleared with matrix_clear before
 * each batch.  The flop count assumes 2*M^3 operations per call.
 *
 * Returns the rate measured for the last batch.
 */
double time_dgemm(const int M, const double *A, const double *B, double *C)
{
    double rate;            /* assigned on every pass through the loop */
    double elapsed = -1.0;  /* sentinel below any batch time, forces a first pass */

    for (int reps = MIN_RUNS; elapsed < MIN_SECS; reps *= 2) {
        matrix_clear(C);

        const double t_begin = omp_get_wtime();
        int done = 0;
        while (done < reps) {
            square_dgemm(M, A, B, C);
            ++done;
        }
        const double t_end = omp_get_wtime();

        const double mflops = 2.0 * reps * M * M * M / 1.0e6;
        elapsed = t_end - t_begin;
        rate = mflops / elapsed;
    }

    return rate;
}
/* The benchmarking program */ int main (int argc, char **argv) { printf ("Description:\t%s\n\n", dgemm_desc); /* Test sizes should highlight performance dips at multiples of certain powers-of-two */ int test_sizes[] = /* Multiples-of-32, +/- 1. Currently commented. */ {31,32,33,63,64,65,95,96,97,127,128,129,159,160,161,191,192,193,223,224,225,255,256,257,287,288,289,319,320,321,351,352,353,383,384,385,415,416,417,447,448,449,479,480,481,511,512,513,543,544,545,575,576,577,607,608,609,639,640,641,671,672,673,703,704,705,735,736,737,767,768,769,799,800,801,831,832,833,863,864,865,895,896,897,927,928,929,959,960,961,991,992,993,1023,1024,1025}; /* A representative subset of the first list. Currently uncommented. */ //{ 31, 32, 96, 97, 127, 128, 129, 191, 192, 229, 255, 256, 257, // 319, 320, 321, 417, 479, 480, 511, 512, 639, 640, 767, 768, 769 }; int nsizes = sizeof(test_sizes)/sizeof(test_sizes[0]); /* assume last size is also the largest size */ int nmax = test_sizes[nsizes-1]; /* allocate memory for all problems */ double* buf = NULL; buf = (double*) malloc (3 * nmax * nmax * sizeof(double)); if (buf == NULL) die ("failed to allocate largest problem size"); double Mflops_s[nsizes],per[nsizes],aveper; /* For each test size */ for (int isize = 0; isize < sizeof(test_sizes)/sizeof(test_sizes[0]); ++isize) { for( int block_size = 3;block_size<200;block_size++) { /* Create and fill 3 random matrices A,B,C*/ int n = test_sizes[isize]; double* A = buf + 0; double* B = A + nmax*nmax; double* C = B + nmax*nmax; fill (A, n*n); fill (B, n*n); fill (C, n*n); /* Measure performance (in Gflops/s). */ /* Time a "sufficiently long" sequence of calls to reduce noise */ double Gflops_s, seconds = -1.0; double timeout = 0.1; // "sufficiently long" := at least 1/10 second. 
for (int n_iterations = 1; seconds < timeout; n_iterations *= 2) { /* Warm-up */ square_dgemm (block_size,n, A, B, C); /* Benchmark n_iterations runs of square_dgemm */ seconds = -wall_time(); for (int it = 0; it < n_iterations; ++it) square_dgemm (block_size,n, A, B, C); seconds += wall_time(); /* compute Gflop/s rate */ Gflops_s = 2.e-9 * n_iterations * n * n * n / seconds; } /* Storing Mflop rate and calculating percentage of peak */ Mflops_s[isize] = Gflops_s*1000; per[isize] = Gflops_s*100/MAX_SPEED; printf ("Size: %d\t Block Size: %d\t Mflop/s: %8g\tPercentage:%6.2lf\n", n, block_size,Mflops_s[isize],per[isize]); /* Ensure that error does not exceed the theoretical error bound. */ /* C := A * B, computed with square_dgemm */ memset (C, 0, n * n * sizeof(double)); square_dgemm (block_size,n, A, B, C); /* Do not explicitly check that A and B were unmodified on square_dgemm exit * - if they were, the following will most likely detect it: * C := C - A * B, computed with reference_dgemm */ reference_dgemm(n, -1., A, B, C); /* A := |A|, B := |B|, C := |C| */ absolute_value (A, n * n); absolute_value (B, n * n); absolute_value (C, n * n); /* C := |C| - 3 * e_mach * n * |A| * |B|, computed with reference_dgemm */ reference_dgemm (n, -3.*DBL_EPSILON*n, A, B, C); /* If any element in C is positive, then something went wrong in square_dgemm */ for (int i = 0; i < n * n; ++i) if (C[i] > 0) die("*** FAILURE *** Error in matrix multiply exceeds componentwise error bounds.\n" ); } } free (buf); return 0; }
void main(void) { // Malloc spaces for four matrix double *A = malloc(sizeof(double) * SIZE * SIZE); fill_matrix(A, SIZE); double *B = malloc(sizeof(double) * SIZE * SIZE); fill_matrix(B, SIZE); double *C = malloc(sizeof(double) * SIZE * SIZE); memset(C, 0, sizeof(double) * SIZE * SIZE); double *D = malloc(sizeof(double) * SIZE * SIZE); memset(D, 0, sizeof(double) * SIZE * SIZE); // struct to timing struct timeval begin, end; // test function gettimeofday(&begin, NULL); square_dgemm(SIZE, A, B, C); gettimeofday(&end, NULL); // niave multipily naive_multiply(A, B, D, SIZE); // validate result, if wrong, print four matrix for(int i=0; i<SIZE*SIZE; i++) { if(C[i] != D[i]) { printf("WRONG.\n"); for(int x=0; x<SIZE; x++) { for(int y=0; y<SIZE; y++) { printf("%f ", A[x*SIZE+y]); } printf("\n"); } printf("-----------\n"); for(int x=0; x<SIZE; x++) { for(int y=0; y<SIZE; y++) { printf("%f ", B[x*SIZE+y]); } printf("\n"); } printf("-----------\n"); for(int x=0; x<SIZE; x++) { for(int y=0; y<SIZE; y++) { printf("%f ", C[x*SIZE+y]); } printf("\n"); } printf("-----------\n"); for(int x=0; x<SIZE; x++) { for(int y=0; y<SIZE; y++) { printf("%f ", D[x*SIZE+y]); } printf("\n"); } return; } } printf("CORRECT.^_^\n"); printf("Single Round Time use: %ld usec.\n", (end.tv_sec-begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec)); /* Time a "sufficiently long" sequence of calls to reduce noise */ double Gflops_s, seconds = -1.0; double timeout = 0.1; // "sufficiently long" := at least 1/10 second. for (int n_iterations = 1; seconds < timeout; n_iterations *= 2) { /* Warm-up */ square_dgemm (SIZE, A, B, C); /* Benchmark n_iterations runs of square_dgemm */ seconds = -wall_time(); for (int it = 0; it < n_iterations; ++it) square_dgemm (SIZE, A, B, C); seconds += wall_time(); /* compute Mflop/s rate */ Gflops_s = 2.e-9 * n_iterations * SIZE * SIZE * SIZE / seconds; } printf ("Size: %d\tGflop/s: %.3g\n", SIZE, Gflops_s); }
/* Benchmark and validation driver for square_dgemm.
 *
 * For each size n in test_sizes:
 *   1. allocate and fill three n x n matrices via fill(),
 *   2. time square_dgemm( n, A, B, C ) over a doubling number of
 *      iterations until one timed batch lasts at least 0.1 s and print
 *      the resulting Mflop/s rate,
 *   3. recompute C from zero and compare it with dgemm() using the bound
 *      3 * DBL_EPSILON * n * |A| * |B|.
 *
 * The process exits with status -1 if a matrix cannot be allocated or an
 * entry exceeds the bound; otherwise main returns 0.
 */
int main( int argc, char **argv )
{
    printf ("Description:\t%s\n\n", dgemm_desc);

    /* These sizes should highlight performance dips at multiples of certain powers-of-two */
    int test_sizes[] = {
        31, 32, 96, 97, 127, 128, 129, 191, 192, 229, 255, 256, 257,
        319, 320, 321, 417, 479, 480, 511, 512, 639, 640, 767, 768, 769,
    };
    const int nsizes = sizeof(test_sizes)/sizeof(test_sizes[0]);

    /* For each test size */
    for( int isize = 0; isize < nsizes; isize++ )
    {
        /* Create and fill 3 matrices A,B,C */
        const int n = test_sizes[isize];
        const size_t count = (size_t) n * (size_t) n;

        double *A = malloc( count * sizeof *A );
        double *B = malloc( count * sizeof *B );
        double *C = malloc( count * sizeof *C );
        if( A == NULL || B == NULL || C == NULL )
        {
            fprintf( stderr, "FAILURE: could not allocate %d x %d matrices\n", n, n );
            free( C );
            free( B );
            free( A );
            exit(-1);
        }

        fill( A, n * n );
        fill( B, n * n );
        fill( C, n * n );

        /* measure Mflop/s rate; time a sufficiently long sequence of calls to eliminate noise */
        double Mflop_s = 0.0, seconds = -1.0;
        for( int n_iterations = 1; seconds < 0.1; n_iterations *= 2 )
        {
            /* warm-up */
            square_dgemm( n, A, B, C );

            /* measure time */
            seconds = read_timer( );
            for( int i = 0; i < n_iterations; i++ )
                square_dgemm( n, A, B, C );
            seconds = read_timer( ) - seconds;

            /* compute Mflop/s rate */
            Mflop_s = 2e-6 * n_iterations * n * n * n / seconds;
        }
        printf ("Size: %d\tMflop/s: %g\n", n, Mflop_s);

        /* Ensure that error does not exceed the theoretical error bound */

        /* Set initial C to 0 and do matrix multiply of A*B */
        memset( C, 0, sizeof( double ) * n * n );
        square_dgemm( n, A, B, C );

        /* Subtract A*B from C using standard dgemm (note that this should be 0 to within machine roundoff) */
        dgemm( 'N','N', n,n,n, -1, A,n, B,n, 1, C,n );

        /* Subtract the maximum allowed roundoff from each element of C */
        absolute_value( A, n * n );
        absolute_value( B, n * n );
        absolute_value( C, n * n );
        dgemm( 'N','N', n,n,n, -3.0*DBL_EPSILON*n, A,n, B,n, 1, C,n );

        /* After this test if any element in C is still positive something went wrong in square_dgemm */
        for( int i = 0; i < n * n; i++ )
            if( C[i] > 0 )
            {
                printf( "FAILURE: error in matrix multiply exceeds an acceptable margin\n" );
                exit(-1);
            }

        /* Deallocate memory */
        free( C );
        free( B );
        free( A );
    }

    return 0;
}
int main( int argc, char **argv ) { int done = 0, myid, numprocs, i; int from, to; int namelen; char processor_name[MPI_MAX_PROCESSOR_NAME]; double seconds, Mflop_s;; int root_process = 0; int n_iterations = 1, iter = 0; int n = 1600; double *A = (double*) malloc( n * n * sizeof(double) ); double *B = (double*) malloc( n * n * sizeof(double) ); double *C = (double*) malloc( n * n * sizeof(double) ); MPI_Init(&argc,&argv); MPI_Comm_size(MPI_COMM_WORLD,&numprocs); MPI_Comm_rank(MPI_COMM_WORLD,&myid); MPI_Get_processor_name(processor_name,&namelen); /* These sizes should highlight performance dips at multiples of certain powers-of-two */ /*Craete and fill 3 random matrices A,B,C*/ from = myid * n/numprocs; to = (myid+1) * n/numprocs; if(myid == root_process){ printf ("Description:\t%s\n\n", dgemm_desc); n_iterations = 1; } START: if (myid == root_process){ fill( A, n * n ); fill( B, n * n ); //fill( C, n * n ); memset( C, 0, sizeof( double ) * n * n ); } MPI_Bcast(A, n * n, MPI_DOUBLE, 0,MPI_COMM_WORLD); MPI_Bcast(B, n * n, MPI_DOUBLE, 0,MPI_COMM_WORLD); MPI_Bcast(C, n * n, MPI_DOUBLE, 0,MPI_COMM_WORLD); if(myid == root_process){ iter = 0; } double *T = (double*) malloc( n * n * sizeof(double) ); ITERATION: if(myid == root_process){ seconds = MPI_Wtime(); } square_dgemm(n, from, to, A, B, C, T); // MPI_Barrier(MPI_COMM_WORLD); MPI_Gather(T + from * n, n * (n / numprocs), MPI_DOUBLE, C + from * n, n * (n / numprocs), MPI_DOUBLE, 0, MPI_COMM_WORLD); /* if (iter < n_iterations){ iter++; goto ITERATION; } seconds = MPI_Wtime() - seconds; if (seconds < 0.1){ n_iterations *= 2; goto START; } */ seconds = MPI_Wtime() - seconds; Mflop_s = 1e-6 * n_iterations * n * n * n / seconds; printf("Mflops: %g time: %g \n", Mflop_s, seconds); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n,n,n, -1, A,n, B,n, 1, C,n ); /*Subtract the maximum allowed roundoff from each element of C*/ absolute_value( A, n * n ); absolute_value( B, n * n ); absolute_value( C, n * n ); //dgemm( 
'N','N', n,n,n, -3.0*DBL_EPSILON*n, A,n, B,n, 1, C,n ); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n,n,n, -3.0*DBL_EPSILON*n, A,n, B,n, 1, C,n ); /*After this test if any element in C is still positive something went wrong in square_dgemm*/ for( int i = 0; i < n * n; i++ ) if( C[i] > 0 ) { printf( "FAILURE: error in matrix multiply exceeds an acceptable margin\n" ); exit(-1); } /* if (iter < n_iterations){ iter++; goto ITERATION; } seconds = MPI_Wtime() - seconds; if (seconds < 0.1){ n_iterations *= 2; goto START; } */ }