int main(int argc, char *argv[]) { int n, i; double *a, *b, *c0, *c1, *c2, *c3; double t0, t1, t2, t3; double diff1, diff2, diff3; struct timespec start, end; n = atoi(argv[1]); a = (double *) malloc(n*n*sizeof(double)); b = (double *) malloc(n*n*sizeof(double)); c0 = (double *)malloc(n*n*sizeof(double)); c1 = (double *)malloc(n*n*sizeof(double)); c2 = (double *)malloc(n*n*sizeof(double)); c3 = (double *)malloc(n*n*sizeof(double)); RandMatrixGen(a, n); RandMatrixGen(b, n); clock_gettime(CLOCK_MONOTONIC, &start); dgemm0(a, b, c0, n); clock_gettime(CLOCK_MONOTONIC, &end); t0 = (end.tv_sec - start.tv_sec)*BILLION + end.tv_nsec - start.tv_nsec; clock_gettime(CLOCK_MONOTONIC, &start); dgemm1(a, b, c1, n); clock_gettime(CLOCK_MONOTONIC, &end); t1 = (end.tv_sec - start.tv_sec)*BILLION + end.tv_nsec - start.tv_nsec; clock_gettime(CLOCK_MONOTONIC, &start); dgemm2(a, b, c2, n); clock_gettime(CLOCK_MONOTONIC, &end); t2 = (end.tv_sec - start.tv_sec)*BILLION + end.tv_nsec - start.tv_nsec; clock_gettime(CLOCK_MONOTONIC, &start); dgemm3(a, b, c3, n); clock_gettime(CLOCK_MONOTONIC, &end); t3 = (end.tv_sec - start.tv_sec)*BILLION + end.tv_nsec - start.tv_nsec; diff1 = verification(c0, c1, n); diff2 = verification(c0, c2, n); diff3 = verification(c0, c3, n); printf ("matrix size: %d\n", n); printf ("dgemm0 runtime: %llu nanoseconds\n", (long long unsigned int) t0); printf ("dgemm1 runtime: %llu nanoseconds\n", (long long unsigned int) t1); printf ("dgemm2 runtime: %llu nanoseconds\n", (long long unsigned int) t2); printf ("dgemm3 runtime: %llu nanoseconds\n", (long long unsigned int) t3); printf ("maximum difference between dgemm0 and dgemm1: %f\n", diff1); printf ("maximum difference between dgemm0 and dgemm2: %f\n", diff2); printf ("maximum difference between dgemm0 and dgemm3: %f\n", diff3); return 0; }
/*
 * Run dgemm3 on a single tile of the lda-sized operands.
 *
 * lda      leading dimension used for all three arrays; also the bound the
 *          tile extents are clipped against
 * A, B     input arrays (read-only here)
 * C        output array
 * i, j, k  starting offsets of the tile along the three loop dimensions
 *
 * Each tile extent is BLOCK_SIZE3, shortened to the remaining length when
 * the tile would run past lda. Element offsets are formed as
 * "first index + second index * lda" (presumably column-major storage --
 * confirm against the callers).
 *
 * NOTE(review): dgemm3 is invoked here with seven arguments
 * (lda, M, N, K, A-tile, B-tile, C-tile); confirm this matches its
 * declaration.
 */
void do_block3(const int lda, const double *A, const double *B, double *C,
               const int i, const int j, const int k)
{
    int M = BLOCK_SIZE3;
    int N = BLOCK_SIZE3;
    int K = BLOCK_SIZE3;

    /* Clip tiles that would extend beyond the edge. */
    if (i + BLOCK_SIZE3 > lda) {
        M = lda - i;
    }
    if (j + BLOCK_SIZE3 > lda) {
        N = lda - j;
    }
    if (k + BLOCK_SIZE3 > lda) {
        K = lda - k;
    }

    /* Pointers to the first element of each operand's tile. */
    const double *a_tile = A + i + k*lda;
    const double *b_tile = B + k + j*lda;
    double *c_tile = C + i + j*lda;

    dgemm3(lda, M, N, K, a_tile, b_tile, c_tile);
}