Example #1
0
/* --
 * Check that C = A*B to within roundoff error.
 *
 * We use the fact that dot products satisfy the error bound
 *
 *   float(sum a_i * b_i) = sum a_i * b_i * (1 + delta_i)
 *
 * where |delta_i| <= n * epsilon.  In order to check your matrix
 * multiply, we compute each element in turn and make sure that
 * your product is within three times the given error bound.
 * We make it three times because there are three sources of
 * error:
 *
 *  - the roundoff error in your multiply
 *  - the roundoff error in our multiply
 *  - the roundoff error in computing the error bound
 *
 *  That last source of error is not so significant, but that's a
 *  story for another day.
 *
 * C is cleared and recomputed with square_dgemm before it is checked.
 * Element (i,j) of each M-by-M matrix is read from index [j*M + i].
 *
 * On the first entry that is out of bounds the function prints a report
 * to stderr, calls diff_dgemm to dump the matrices, and exits with
 * status -1.  A NaN in either product counts as a failure.
 */
void validate_dgemm(const int M, const double *A, const double *B, double *C)
{
    matrix_clear(C);
    square_dgemm(M, A, B, C);

    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < M; ++j) {
            /* Reference dot product and the sum of |terms| for the bound. */
            double dotprod = 0;
            double errorbound = 0;
            for (int k = 0; k < M; ++k) {
                double prod = A[k*M + i] * B[j*M + k];
                dotprod += prod;
                errorbound += fabs(prod);
            }
            errorbound *= (M * DBL_EPSILON);
            double err = fabs(C[j*M + i] - dotprod);

            /* Written as a negated "<=" on purpose: every ordered comparison
             * with NaN is false, so "err > bound" would let a NaN through. */
            if (!(err <= 3*errorbound)) {
                fprintf(stderr, "Matrix multiply failed.\n");
                fprintf(stderr, "C(%d,%d) should be %lg, was %lg\n", i, j,
                        dotprod, C[j*M + i]);
                fprintf(stderr, "Error of %lg, acceptable limit %lg\n",
                        err, 3*errorbound);
                diff_dgemm(M, A, B, C);
                exit(-1);
            }
        }
    }
}
Example #2
0
/* --
 * Dump our product, a reference product and their difference to text files.
 *
 * C is cleared and recomputed with square_dgemm.  For every element (i,j)
 * a reference value is formed as the plain dot product
 * sum_k A[k*M + i] * B[j*M + k], and three files are written in the
 * current directory, one line per i:
 *
 *  - dump_our.txt   the values found in C
 *  - dump_ref.txt   the reference dot products
 *  - dump_diff.txt  C minus the reference
 *
 * If any of the files cannot be opened, a message is printed to stderr and
 * nothing is written; C has still been recomputed in that case.
 */
void diff_dgemm(const int M, const double *A, const double *B, double *C)
{
    matrix_clear(C);
    square_dgemm(M, A, B, C);

    FILE* fp_our  = fopen("dump_our.txt", "w");
    FILE* fp_ref  = fopen("dump_ref.txt", "w");
    FILE* fp_diff = fopen("dump_diff.txt", "w");
    if (fp_our == NULL || fp_ref == NULL || fp_diff == NULL) {
        /* Writing through a NULL stream is undefined; bail out cleanly. */
        fprintf(stderr, "diff_dgemm: could not open dump files for writing\n");
        if (fp_diff != NULL) fclose(fp_diff);
        if (fp_ref  != NULL) fclose(fp_ref);
        if (fp_our  != NULL) fclose(fp_our);
        return;
    }

    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < M; ++j) {
            double dotprod = 0;
            for (int k = 0; k < M; ++k) {
                dotprod += A[k*M + i] * B[j*M + k];
            }
            fprintf(fp_our,  " %g", C[j*M+i]);
            fprintf(fp_ref,  " %g", dotprod);
            fprintf(fp_diff, " % 0.0e", C[j*M+i]-dotprod);
        }
        fprintf(fp_our, "\n");
        fprintf(fp_ref, "\n");
        fprintf(fp_diff, "\n");
    }
    fclose(fp_diff);
    fclose(fp_ref);
    fclose(fp_our);
}
Example #3
0
/* --
 * Measure the MFlop/s rate of square_dgemm on an M-by-M problem.
 *
 * Starting from MIN_RUNS repetitions, C is cleared and the multiply is
 * called back-to-back that many times between two omp_get_wtime() readings.
 * While a batch takes less than MIN_SECS, the repetition count is doubled
 * and the measurement is redone, which works around limited timer
 * resolution.  The rate of the last batch is returned.
 */
double time_dgemm(const int M, const double *A, const double *B, double *C)
{
    double rate;
    double elapsed = -1.0;

    for (int reps = MIN_RUNS; elapsed < MIN_SECS; reps *= 2) {
        matrix_clear(C);

        const double t0 = omp_get_wtime();
        int run = 0;
        while (run < reps) {
            square_dgemm(M, A, B, C);
            ++run;
        }
        const double t1 = omp_get_wtime();

        /* 2*M^3 floating-point operations per call, in millions. */
        const double mflop = 2.0 * reps * M * M * M / 1.0e6;
        elapsed = t1 - t0;
        rate = mflop / elapsed;
    }

    return rate;
}
/*
 * The benchmarking program.
 *
 * For every problem size in test_sizes and every block size in [3, 200):
 *   1. fill A, B and C,
 *   2. time square_dgemm over a doubling number of calls until one batch
 *      lasts at least 0.1 s, and print the rate and percentage of MAX_SPEED,
 *   3. recompute C from zero and check it against reference_dgemm using a
 *      componentwise bound of 3 * DBL_EPSILON * n * |A| * |B|.
 * Any violation of the bound ends the program through die().
 */
int main (int argc, char **argv)
{
  printf ("Description:\t%s\n\n", dgemm_desc);

  /* Test sizes should highlight performance dips at multiples of certain powers-of-two */

  int test_sizes[] =

  /* Multiples-of-32, +/- 1. Currently active. */
   {31,32,33,63,64,65,95,96,97,127,128,129,159,160,161,191,192,193,223,224,225,255,256,257,287,288,289,319,320,321,351,352,353,383,384,385,415,416,417,447,448,449,479,480,481,511,512,513,543,544,545,575,576,577,607,608,609,639,640,641,671,672,673,703,704,705,735,736,737,767,768,769,799,800,801,831,832,833,863,864,865,895,896,897,927,928,929,959,960,961,991,992,993,1023,1024,1025};

  /* A representative subset of the first list. Currently commented out. */
  //{ 31, 32, 96, 97, 127, 128, 129, 191, 192, 229, 255, 256, 257,
  //  319, 320, 321, 417, 479, 480, 511, 512, 639, 640, 767, 768, 769 };

  int nsizes = sizeof(test_sizes)/sizeof(test_sizes[0]);

  /* assume last size is also the largest size */
  int nmax = test_sizes[nsizes-1];

  /* allocate memory for all problems: three nmax-by-nmax matrices */
  double* buf = (double*) malloc ((size_t) 3 * nmax * nmax * sizeof(double));
  if (buf == NULL) die ("failed to allocate largest problem size");

  /* Most recent rate / percentage of peak recorded for each size. */
  double Mflops_s[nsizes], per[nsizes];

  /* For each test size */
  for (int isize = 0; isize < nsizes; ++isize) {
    /* For each candidate block size */
    for (int block_size = 3; block_size < 200; block_size++) {
      /* Create and fill 3 random matrices A,B,C */
      int n = test_sizes[isize];

      double* A = buf + 0;
      double* B = A + nmax*nmax;
      double* C = B + nmax*nmax;

      fill (A, n*n);
      fill (B, n*n);
      fill (C, n*n);

      /* Measure performance (in Gflops/s). */

      /* Time a "sufficiently long" sequence of calls to reduce noise */
      double Gflops_s = 0.0, seconds = -1.0;
      double timeout = 0.1; // "sufficiently long" := at least 1/10 second.
      for (int n_iterations = 1; seconds < timeout; n_iterations *= 2) {
        /* Warm-up */
        square_dgemm (block_size, n, A, B, C);

        /* Benchmark n_iterations runs of square_dgemm */
        seconds = -wall_time();
        for (int it = 0; it < n_iterations; ++it)
          square_dgemm (block_size, n, A, B, C);
        seconds += wall_time();

        /*  compute Gflop/s rate */
        Gflops_s = 2.e-9 * n_iterations * n * n * n / seconds;
      }

      /* Storing Mflop rate and calculating percentage of peak */
      Mflops_s[isize] = Gflops_s*1000;
      per[isize] = Gflops_s*100/MAX_SPEED;

      printf ("Size: %d\t Block Size: %d\t Mflop/s: %8g\tPercentage:%6.2lf\n", n, block_size,Mflops_s[isize],per[isize]);

      /* Ensure that error does not exceed the theoretical error bound. */

      /* C := A * B, computed with square_dgemm */
      memset (C, 0, n * n * sizeof(double));
      square_dgemm (block_size, n, A, B, C);

      /* Do not explicitly check that A and B were unmodified on square_dgemm exit
       *  - if they were, the following will most likely detect it:
       * C := C - A * B, computed with reference_dgemm */
      reference_dgemm (n, -1., A, B, C);

      /* A := |A|, B := |B|, C := |C| */
      absolute_value (A, n * n);
      absolute_value (B, n * n);
      absolute_value (C, n * n);

      /* C := |C| - 3 * e_mach * n * |A| * |B|, computed with reference_dgemm */
      reference_dgemm (n, -3.*DBL_EPSILON*n, A, B, C);

      /* Any element of C that is not <= 0 means something went wrong in
       * square_dgemm.  The test is a negated "<=" so that NaN, which
       * compares false against everything, is reported as well. */
      for (int i = 0; i < n * n; ++i)
        if (!(C[i] <= 0))
          die("*** FAILURE *** Error in matrix multiply exceeds componentwise error bounds.\n" );
    }
  }

  free (buf);

  return 0;
}
/* Print a size-by-size matrix: element [x*size + y], one line per x. */
static void dump_square_matrix(const double *mat, int size)
{
	for (int x = 0; x < size; x++) {
		for (int y = 0; y < size; y++) {
			printf("%f ", mat[x*size + y]);
		}
		printf("\n");
	}
}

/*
 * Stand-alone check and timing of square_dgemm on SIZE-by-SIZE matrices.
 *
 * Computes C with square_dgemm and D with naive_multiply from the same
 * A and B, then compares them element by element.  On a mismatch it prints
 * "WRONG." followed by A, B, C and D and returns EXIT_FAILURE.  Otherwise it
 * prints the time of the single checked call and a Gflop/s figure measured
 * over a doubling number of calls (at least 0.1 s per batch), and returns 0.
 *
 * NOTE(review): the comparison is exact (C[i] != D[i]); this only holds if
 * both routines round identically - confirm that is the intent.
 */
int main(void) {

	// Allocate the four matrices and make sure every allocation succeeded
	double *A = malloc(sizeof(double) * SIZE * SIZE);
	double *B = malloc(sizeof(double) * SIZE * SIZE);
	double *C = malloc(sizeof(double) * SIZE * SIZE);
	double *D = malloc(sizeof(double) * SIZE * SIZE);
	if (A == NULL || B == NULL || C == NULL || D == NULL) {
		fprintf(stderr, "failed to allocate matrices\n");
		free(D);
		free(C);
		free(B);
		free(A);
		return EXIT_FAILURE;
	}

	fill_matrix(A, SIZE);
	fill_matrix(B, SIZE);
	memset(C, 0, sizeof(double) * SIZE * SIZE);
	memset(D, 0, sizeof(double) * SIZE * SIZE);

	// timestamps around the single checked call
	struct timeval begin, end;

	// function under test
	gettimeofday(&begin, NULL);
	square_dgemm(SIZE, A, B, C);
	gettimeofday(&end, NULL);

	// naive multiply used as the reference
	naive_multiply(A, B, D, SIZE);

	// validate result; if wrong, print all four matrices
	for(int i=0; i<SIZE*SIZE; i++) {
		if(C[i] != D[i]) {
			printf("WRONG.\n");
			dump_square_matrix(A, SIZE);
			printf("-----------\n");
			dump_square_matrix(B, SIZE);
			printf("-----------\n");
			dump_square_matrix(C, SIZE);
			printf("-----------\n");
			dump_square_matrix(D, SIZE);

			free(D);
			free(C);
			free(B);
			free(A);
			return EXIT_FAILURE;
		}
	}
	printf("CORRECT.^_^\n");

	// Cast so the argument really is a long, whatever time_t/suseconds_t are
	long elapsed_usec = (long)(end.tv_sec - begin.tv_sec) * 1000000L
	                  + (long)(end.tv_usec - begin.tv_usec);
	printf("Single Round Time use: %ld usec.\n", elapsed_usec);

	/* Time a "sufficiently long" sequence of calls to reduce noise */
	double Gflops_s = 0.0, seconds = -1.0;
	double timeout = 0.1; // "sufficiently long" := at least 1/10 second.
	for (int n_iterations = 1; seconds < timeout; n_iterations *= 2)
	{
		/* Warm-up */
		square_dgemm (SIZE, A, B, C);

		/* Benchmark n_iterations runs of square_dgemm */
		seconds = -wall_time();
		for (int it = 0; it < n_iterations; ++it)
			square_dgemm (SIZE, A, B, C);
		seconds += wall_time();

		/*  compute Gflop/s rate */
		Gflops_s = 2.e-9 * n_iterations * SIZE * SIZE * SIZE / seconds;
	}
	printf ("Size: %d\tGflop/s: %.3g\n", SIZE, Gflops_s);

	free(D);
	free(C);
	free(B);
	free(A);
	return 0;
}
/*
 * Benchmark driver: for each size in test_sizes, allocate and fill three
 * n-by-n matrices, time square_dgemm over a doubling number of calls until
 * one batch lasts at least 0.1 s, print the Mflop/s rate, and then verify
 * the product against dgemm with a componentwise bound of
 * 3 * DBL_EPSILON * n * |A| * |B|.  Exits with status -1 on an allocation
 * failure or when the bound is exceeded.
 */
int main( int argc, char **argv )
{
    printf ("Description:\t%s\n\n", dgemm_desc);

    /* These sizes should highlight performance dips at multiples of certain powers-of-two */
    int test_sizes[] = {
        31, 32, 96, 97, 127, 128, 129, 191, 192, 229, 255, 256, 257,
        319, 320, 321, 417, 479, 480, 511, 512, 639, 640, 767, 768, 769,
    };
    const int nsizes = (int) (sizeof(test_sizes)/sizeof(test_sizes[0]));

    /* For each test size */
    for( int isize = 0; isize < nsizes; isize++ )
    {
        /* Create and fill 3 random matrices A,B,C */
        int n = test_sizes[isize];
        size_t bytes = (size_t) n * n * sizeof(double);

        double *A = (double*) malloc( bytes );
        double *B = (double*) malloc( bytes );
        double *C = (double*) malloc( bytes );
        if( A == NULL || B == NULL || C == NULL )
        {
            /* fill() would otherwise write through a null pointer */
            fprintf( stderr, "FAILURE: could not allocate matrices of size %d\n", n );
            free( C );
            free( B );
            free( A );
            exit(-1);
        }

        fill( A, n * n );
        fill( B, n * n );
        fill( C, n * n );

        /*  measure Mflop/s rate; time a sufficiently long sequence of calls to eliminate noise */
        double Mflop_s = 0.0, seconds = -1.0;
        for( int n_iterations = 1; seconds < 0.1; n_iterations *= 2 )
        {
            /* warm-up */
            square_dgemm( n, A, B, C );

            /*  measure time */
            seconds = read_timer( );
            for( int i = 0; i < n_iterations; i++ )
                square_dgemm( n, A, B, C );
            seconds = read_timer( ) - seconds;

            /*  compute Mflop/s rate */
            Mflop_s = 2e-6 * n_iterations * n * n * n / seconds;
        }
        printf ("Size: %d\tMflop/s: %g\n", n, Mflop_s);

        /*  Ensure that error does not exceed the theoretical error bound */

        /* Set initial C to 0 and do matrix multiply of A*B */
        memset( C, 0, bytes );
        square_dgemm( n, A, B, C );
        /* Subtract A*B from C using standard dgemm (note that this should be 0 to within machine roundoff) */
        dgemm( 'N','N', n,n,n, -1, A,n, B,n, 1, C,n );
        /* Subtract the maximum allowed roundoff from each element of C */
        absolute_value( A, n * n );
        absolute_value( B, n * n );
        absolute_value( C, n * n );
        dgemm( 'N','N', n,n,n, -3.0*DBL_EPSILON*n, A,n, B,n, 1, C,n );
        /* After this, any element of C that is not <= 0 means something went
         * wrong in square_dgemm.  The negated "<=" also catches NaN, which a
         * plain "> 0" test would silently accept. */
        for( int i = 0; i < n * n; i++ )
            if( !( C[i] <= 0 ) )
            {
                printf( "FAILURE: error in matrix multiply exceeds an acceptable margin\n" );
                exit(-1);
            }

        /* Deallocate memory */
        free( C );
        free( B );
        free( A );
    }

    return 0;
}
Example #7
0
/*
 * MPI driver for a distributed square_dgemm on fixed n = 1600 matrices.
 *
 * The root rank fills A and B and zeroes C; all three are broadcast.  Every
 * rank then calls square_dgemm with its own [from, to) range and its part of
 * T is gathered into C on the root.  Only the root reports the timing and
 * validates the gathered result against cblas_dgemm, using the componentwise
 * bound 3 * DBL_EPSILON * n * |A| * |B|; a violation aborts the whole job.
 *
 * A single timed run is performed (n_iterations is fixed at 1).
 *
 * NOTE(review): the gather sends n * (n / numprocs) values from offset
 * from * n, which only tiles C exactly when numprocs divides n - confirm
 * the intended process counts.
 */
int main( int argc, char **argv )
{
    int myid, numprocs;
    int namelen;
    char processor_name[MPI_MAX_PROCESSOR_NAME];
    double seconds = 0.0;

    const int root_process = 0;
    const int n_iterations = 1;
    const int n = 1600;

    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD,&myid);
    /* Queried as in the original driver; the name is not reported anywhere. */
    MPI_Get_processor_name(processor_name,&namelen);

    double *A = (double*) malloc( n * n * sizeof(double) );
    double *B = (double*) malloc( n * n * sizeof(double) );
    double *C = (double*) malloc( n * n * sizeof(double) );
    double *T = (double*) malloc( n * n * sizeof(double) );
    if( A == NULL || B == NULL || C == NULL || T == NULL )
    {
        fprintf( stderr, "FAILURE: could not allocate matrices of size %d\n", n );
        MPI_Abort( MPI_COMM_WORLD, 1 );
    }

    /* Range handed to square_dgemm by this rank. */
    const int from = myid * n/numprocs;
    const int to = (myid+1) * n/numprocs;

    if( myid == root_process )
    {
        printf ("Description:\t%s\n\n", dgemm_desc);

        /* Create the inputs on the root; C starts at zero */
        fill( A, n * n );
        fill( B, n * n );
        memset( C, 0, sizeof( double ) * n * n );
    }
    MPI_Bcast(A, n * n, MPI_DOUBLE, root_process, MPI_COMM_WORLD);
    MPI_Bcast(B, n * n, MPI_DOUBLE, root_process, MPI_COMM_WORLD);
    MPI_Bcast(C, n * n, MPI_DOUBLE, root_process, MPI_COMM_WORLD);

    /* Only the root keeps time; other ranks never read `seconds`. */
    if( myid == root_process )
        seconds = MPI_Wtime();

    square_dgemm(n, from, to, A, B, C, T);

    /* The receive buffer only matters on the root, where it is the start
     * of C (the root's own `from` is 0). */
    MPI_Gather(T + from * n, n * (n / numprocs), MPI_DOUBLE,
               C,            n * (n / numprocs), MPI_DOUBLE,
               root_process, MPI_COMM_WORLD);

    /* Timing and validation are meaningful only where the gathered C lives. */
    if( myid == root_process )
    {
        seconds = MPI_Wtime() - seconds;
        /* An n-by-n multiply costs 2*n^3 flops (one multiply + one add per term). */
        double Mflop_s = 2e-6 * n_iterations * n * n * n / seconds;
        printf("Mflops: %g time: %g \n", Mflop_s, seconds);

        /* Subtract A*B from C using the reference dgemm */
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n,n,n, -1, A,n, B,n, 1, C,n );
        /* Subtract the maximum allowed roundoff from each element of C */
        absolute_value( A, n * n );
        absolute_value( B, n * n );
        absolute_value( C, n * n );
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n,n,n, -3.0*DBL_EPSILON*n, A,n, B,n, 1, C,n );
        /* Any element that is not <= 0 (including NaN) means something went
         * wrong in square_dgemm */
        for( int i = 0; i < n * n; i++ )
            if( !( C[i] <= 0 ) )
            {
                printf( "FAILURE: error in matrix multiply exceeds an acceptable margin\n" );
                /* Take the other ranks down too instead of leaving them
                 * waiting in MPI_Finalize. */
                MPI_Abort( MPI_COMM_WORLD, 1 );
            }
    }

    free( T );
    free( C );
    free( B );
    free( A );

    MPI_Finalize();
    return 0;
}