/*
 * do_block - run basic_dgemm on the block whose origin is (i, j, k).
 *
 * The block is nominally BLOCK_SIZE in each dimension, but each extent is
 * clipped to what is left before lda, so partial ("fringe") blocks at the
 * edge are handled with smaller extents.  For example, a 7x7 problem with
 * 3x3 blocks yields 3x1, 1x3 and 1x1 fringe blocks in addition to the full
 * ones.  Handling the fringes this way is simple rather than fast; a more
 * machine-efficient treatment would need noticeably more code.
 *
 * lda      passed through to basic_dgemm and used as the stride of the
 *          second index (presumably column-major storage -- confirm
 *          against basic_dgemm).
 * A, B, C  base pointers of the operands.
 * i, j, k  starting indices of the block.
 */
void do_block( int lda, double *A, double *B, double *C, int i, int j, int k )
{
    /* Extent left in each dimension, capped at BLOCK_SIZE. */
    int rows = lda - i;
    if (BLOCK_SIZE < rows) {
        rows = BLOCK_SIZE;
    }

    int cols = lda - j;
    if (BLOCK_SIZE < cols) {
        cols = BLOCK_SIZE;
    }

    int depth = lda - k;
    if (BLOCK_SIZE < depth) {
        depth = BLOCK_SIZE;
    }

    /* First element of each sub-block. */
    double *a_blk = A + i + k*lda;
    double *b_blk = B + k + j*lda;
    double *c_blk = C + i + j*lda;

    basic_dgemm(lda, rows, cols, depth, a_blk, b_blk, c_blk);
}
/*
 * do_block - run basic_dgemm on the block whose origin is (i, j, k).
 *
 * Each extent defaults to BLOCK_SIZE and is shortened to the remainder
 * (lda - index) when a full block would run past lda.
 *
 * lda      passed through to basic_dgemm and used as the stride of the
 *          second index (presumably column-major storage -- confirm
 *          against basic_dgemm).
 * A, B     read-only operand base pointers.
 * C        output operand base pointer.
 * i, j, k  starting indices of the block.
 */
void do_block(const int lda, const double *A, const double *B, double *C, const int i, const int j, const int k)
{
    int rows = BLOCK_SIZE;
    int cols = BLOCK_SIZE;
    int depth = BLOCK_SIZE;

    /* Shorten any extent that would overrun the matrix edge. */
    if (i+BLOCK_SIZE > lda) {
        rows = lda-i;
    }
    if (j+BLOCK_SIZE > lda) {
        cols = lda-j;
    }
    if (k+BLOCK_SIZE > lda) {
        depth = lda-k;
    }

    /* First element of each sub-block. */
    const double *a_blk = A + i + k*lda;
    const double *b_blk = B + k + j*lda;
    double *c_blk = C + i + j*lda;

    basic_dgemm(lda, rows, cols, depth, a_blk, b_blk, c_blk);
}
/*
 * do_block1 - run basic_dgemm on a BLOCK_SIZE1 block inside an MM x NN x KK
 * region.
 *
 * Works like a block step with tile size BLOCK_SIZE1, except that the
 * extents are clipped against the caller-supplied limits MM, NN and KK
 * instead of against lda.
 *
 * lda         passed through to basic_dgemm and used as the stride of the
 *             second index (presumably column-major storage -- confirm
 *             against basic_dgemm).
 * MM, NN, KK  upper limits for the i, j and k indices respectively.
 * A, B        read-only operand base pointers.
 * C           output operand base pointer.
 * i, j, k     starting indices of the block.
 */
void do_block1(const int lda, const int MM, const int NN, const int KK, const double *A, const double *B, double *C, const int i, const int j, const int k)
{
    int rows = BLOCK_SIZE1;
    int cols = BLOCK_SIZE1;
    int depth = BLOCK_SIZE1;

    /* Shorten any extent that would overrun its limit. */
    if (i+BLOCK_SIZE1 > MM) {
        rows = MM-i;
    }
    if (j+BLOCK_SIZE1 > NN) {
        cols = NN-j;
    }
    if (k+BLOCK_SIZE1 > KK) {
        depth = KK-k;
    }

    /* First element of each sub-block. */
    const double *a_blk = A + i + k*lda;
    const double *b_blk = B + k + j*lda;
    double *c_blk = C + i + j*lda;

    basic_dgemm(lda, rows, cols, depth, a_blk, b_blk, c_blk);
}
/*
 * do_block - run basic_dgemm on the block whose origin is (i, j, k).
 *
 * Each extent is the smaller of BLOCK_SIZE and the remainder (lda - index),
 * so partial blocks at the matrix edge are passed with reduced extents.
 *
 * lda      passed through to basic_dgemm and used as the stride of the
 *          second index (presumably column-major storage -- confirm
 *          against basic_dgemm).
 * A, B, C  base pointers of the operands.
 * i, j, k  starting indices of the block.
 */
void do_block( int lda, double *A, double *B, double *C, int i, int j, int k )
{
    /* Clip the block extents at the matrix edge. */
    int M = min( BLOCK_SIZE, lda-i );
    int N = min( BLOCK_SIZE, lda-j );
    int K = min( BLOCK_SIZE, lda-k );

    basic_dgemm( lda, M, N, K,
                 A + i + k*lda,
                 B + k + j*lda,
                 C + i + j*lda );
}
/*
 * do_block - run basic_dgemm on the block whose origin is (i, j, k).
 *
 * Each extent is the smaller of BLOCK_SIZE and the remainder (lda - index),
 * so partial ("fringe") blocks at the matrix edge are passed with reduced
 * extents.  For example, a 7x7 problem with 3x3 blocks yields 3x1, 1x3 and
 * 1x1 fringe blocks in addition to the full ones.  Handling the fringes
 * this way is simple rather than fast; a more machine-efficient treatment
 * would need noticeably more code.
 *
 * lda      passed through to basic_dgemm and used as the stride of the
 *          second index (presumably column-major storage -- confirm
 *          against basic_dgemm).
 * A, B, C  base pointers of the operands.
 * i, j, k  starting indices of the block.
 */
void do_block( int lda, double *A, double *B, double *C, int i, int j, int k )
{
    /* Clip the block extents at the matrix edge. */
    int M = min( BLOCK_SIZE, lda-i );
    int N = min( BLOCK_SIZE, lda-j );
    int K = min( BLOCK_SIZE, lda-k );

    basic_dgemm( lda, M, N, K,
                 A + i + k*lda,
                 B + k + j*lda,
                 C + i + j*lda );
}
void square_dgemm(const int M, const double *A, const double *B, double *C) { // Number of blocks total const int n_blocks = M / BLOCK_SIZE + (M % BLOCK_SIZE? 1 : 0); const int n_size = n_blocks * BLOCK_SIZE; const int n_mem = n_size * n_size * sizeof(double); // Copied A matrix double * CA = (double *) malloc(n_mem); // Copied B matrix double * CB = (double *) malloc(n_mem); // Copied C matrix double * CC = (double *) malloc(n_mem); // Initialize matrices int bi, bj, bk, i, j, k; int copyoffset; int offset; for (bi = 0; bi < n_blocks; ++bi) { for (bj = 0; bj < n_blocks; ++bj) { int oi = bi * BLOCK_SIZE; int oj = bj * BLOCK_SIZE; copyoffset = (bi + bj * n_blocks) * BLOCK_SIZE * BLOCK_SIZE; for (j = 0; j < BLOCK_SIZE; ++j) { for (i = 0; i < BLOCK_SIZE; ++i) { offset = (oi + i) + (oj + j) * M; // Check bounds if (oi + i < M && oj + j < M) { CA[copyoffset] = A[offset]; CB[copyoffset] = B[offset]; CC[copyoffset] = 0; offset++; } else { CA[copyoffset] = 0; CB[copyoffset] = 0; CC[copyoffset] = 0; } copyoffset++; } } } } for (bi = 0; bi < n_blocks; ++bi) { for (bj = 0; bj < n_blocks; ++bj) { for (bk = 0; bk < n_blocks; ++bk) { //CC[(bi + bj * n_blocks) * BLOCK_SIZE * BLOCK_SIZE]++; //* basic_dgemm( BLOCK_SIZE, CA + (bi + bk * n_blocks) * BLOCK_SIZE * BLOCK_SIZE, CB + (bk + bj * n_blocks) * BLOCK_SIZE * BLOCK_SIZE, CC + (bi + bj * n_blocks) * BLOCK_SIZE * BLOCK_SIZE ); //*/ } } } /* for (bj = 0; bj < n_size; bj++) { for (bi = 0; bi < n_size; bi++) { printf("%.1f ", CC[bi + bj * n_size]); } printf("\n"); } */ // Copy results back for (bi = 0; bi < n_blocks; ++bi) { for (bj = 0; bj < n_blocks; ++bj) { int oi = bi * BLOCK_SIZE; int oj = bj * BLOCK_SIZE; copyoffset = (bi + bj * n_blocks) * BLOCK_SIZE * BLOCK_SIZE; for (j = 0; j < BLOCK_SIZE; ++j) { for (i = 0; i < BLOCK_SIZE; ++i) { offset = (oi + i) + (oj + j) * M; // Check bounds if (oi + i < M && oj + j < M) { C[offset] = CC[copyoffset]; } copyoffset++; } } } } }