/*
 * Multiply one tile of A by one tile of B and hand the result to
 * basic_dgemm together with the matching tile of C.
 *
 *   lda      - matrix dimension; also used as the stride between
 *              consecutive values of the second index
 *   A, B, C  - full matrices; element (r, c) is addressed as r + c*lda
 *   i, j, k  - offsets of the tile: A at (i, k), B at (k, j), C at (i, j)
 */
void do_block( int lda, double *A, double *B, double *C,
               int i, int j, int k )
{
     /*
       Tiles on the right and bottom edges ("fringes") are smaller than
       BLOCK_SIZE whenever lda is not a multiple of BLOCK_SIZE.

       Example: a 7x7 matrix cut into 3x3 blocks leaves 1x3, 3x1 and
       1x1 fringe blocks.

             xxxoooX
             xxxoooX
             xxxoooX
             oooxxxO
             oooxxxO
             oooxxxO
             XXXOOOX

       Clamping each extent as done below is the simple way to cope.
       A faster scheme exists, but it costs considerably more
       programmer effort.
     */
     const int rows  = min( BLOCK_SIZE, lda - i );
     const int cols  = min( BLOCK_SIZE, lda - j );
     const int depth = min( BLOCK_SIZE, lda - k );

     /* First element of each tile inside its parent matrix. */
     double *a_tile = A + i + k*lda;
     double *b_tile = B + k + j*lda;
     double *c_tile = C + i + j*lda;

     basic_dgemm( lda, rows, cols, depth, a_tile, b_tile, c_tile );
}
Example #2
0
/*
 * Run basic_dgemm on a single tile triple.
 *
 * The tile of A starts at (i, k), the tile of B at (k, j) and the tile
 * of C at (i, j), with element (r, c) addressed as r + c*lda.  Each
 * extent is BLOCK_SIZE unless the tile would run past lda, in which
 * case only the remaining part is used.
 */
void do_block(const int lda,
              const double *A, const double *B, double *C,
              const int i, const int j, const int k)
{
    /* Start from a full tile and shrink only at the matrix edge. */
    int M = BLOCK_SIZE;
    int N = BLOCK_SIZE;
    int K = BLOCK_SIZE;

    if (i + BLOCK_SIZE > lda) {
        M = lda - i;
    }
    if (j + BLOCK_SIZE > lda) {
        N = lda - j;
    }
    if (k + BLOCK_SIZE > lda) {
        K = lda - k;
    }

    const double *a_tile = A + i + k*lda;
    const double *b_tile = B + k + j*lda;
    double *c_tile = C + i + j*lda;

    basic_dgemm(lda, M, N, K, a_tile, b_tile, c_tile);
}
Example #3
0
/*
 * Run basic_dgemm on one BLOCK_SIZE1-sized tile triple of a region
 * whose extents are MM, NN and KK.
 *
 *   lda         - stride between consecutive values of the second index
 *   MM, NN, KK  - upper bounds for i, j and k respectively; a tile that
 *                 would cross its bound is cut down to what remains
 *   A, B, C     - base pointers; element (r, c) is addressed as r + c*lda
 *   i, j, k     - tile offsets: A at (i, k), B at (k, j), C at (i, j)
 */
void do_block1(const int lda, const int MM, const int NN, const int KK,
               const double *A, const double *B, double *C,
               const int i, const int j, const int k)
{
    /* Full tile when it fits, otherwise the leftover up to the bound. */
    const int M = (i + BLOCK_SIZE1 <= MM) ? BLOCK_SIZE1 : MM - i;
    const int N = (j + BLOCK_SIZE1 <= NN) ? BLOCK_SIZE1 : NN - j;
    const int K = (k + BLOCK_SIZE1 <= KK) ? BLOCK_SIZE1 : KK - k;

    const double *a_tile = A + i + k*lda;
    const double *b_tile = B + k + j*lda;
    double *c_tile = C + i + j*lda;

    basic_dgemm(lda, M, N, K, a_tile, b_tile, c_tile);
}
Example #4
0
/*
 * Multiply one tile of A by one tile of B via basic_dgemm, passing the
 * matching tile of C.
 *
 *   lda      - matrix dimension; also the stride between consecutive
 *              values of the second index
 *   A, B, C  - full matrices; element (r, c) is addressed as r + c*lda
 *   i, j, k  - tile offsets: A at (i, k), B at (k, j), C at (i, j)
 *
 * Each tile extent is clamped so that tiles on the matrix edge do not
 * read or write past lda.
 */
void do_block( int lda, double *A, double *B, double *C,
               int i, int j, int k )
{
     /* Extent of this tile in each dimension (smaller on the fringe). */
     int M = min( BLOCK_SIZE, lda-i );
     int N = min( BLOCK_SIZE, lda-j );
     int K = min( BLOCK_SIZE, lda-k );

     basic_dgemm( lda, M, N, K, A + i + k*lda, B + k + j*lda, C + i + j*lda);
}
Example #5
0
/*
 * Multiply one tile of A by one tile of B via basic_dgemm, passing the
 * matching tile of C.
 *
 *   lda      - matrix dimension; also the stride between consecutive
 *              values of the second index
 *   A, B, C  - full matrices; element (r, c) is addressed as r + c*lda
 *   i, j, k  - tile offsets: A at (i, k), B at (k, j), C at (i, j)
 */
void do_block( int lda, double *A, double *B, double *C,
               int i, int j, int k )
{
     /*
       Fringes must be handled in every dimension.

       If the matrix is 7x7 and the blocks are 3x3, there are 1x3,
       3x1, and 1x1 fringe blocks.

             xxxoooX
             xxxoooX
             xxxoooX
             oooxxxO
             oooxxxO
             oooxxxO
             XXXOOOX

       Clamping the extents below is correct but not fast; a quicker
       treatment of the fringe blocks is more machine-efficient and
       much less programmer-efficient.
     */
     int M = min( BLOCK_SIZE, lda-i );
     int N = min( BLOCK_SIZE, lda-j );
     int K = min( BLOCK_SIZE, lda-k );

     basic_dgemm( lda, M, N, K, A + i + k*lda, B + k + j*lda, C + i + j*lda);
}
Example #6
0
/*
 * Blocked multiply of two M-by-M matrices using packed tiles.
 *
 * A and B are repacked into contiguous BLOCK_SIZE x BLOCK_SIZE tiles,
 * zero-padded up to a whole number of tiles, so that basic_dgemm can
 * always run on full tiles.  The result tiles start at zero, are updated
 * by basic_dgemm for every (bi, bj, bk) combination, and are finally
 * copied into C, overwriting its previous contents.
 *
 *   M     - dimension of the matrices; M <= 0 is a no-op
 *   A, B  - inputs; element (r, c) is read at r + c*M
 *   C     - output; element (r, c) is written at r + c*M
 *
 * Tile (bi, bj) lives at offset (bi + bj*n_blocks) * BLOCK_SIZE^2 in the
 * packed buffers, and within a tile element (i, j) is at i + j*BLOCK_SIZE.
 *
 * The scratch buffers are released before returning.  If they cannot be
 * allocated the process is aborted, since this function has no way to
 * report failure to its caller.
 */
void square_dgemm(const int M, const double *A, const double *B, double *C)
{
    // Nothing to do, and avoids asking the allocator for zero bytes
    if (M <= 0) {
        return;
    }

    const int bs = BLOCK_SIZE;
    // Number of tiles per dimension, rounded up to cover the fringe
    const int n_blocks = M / bs + (M % bs ? 1 : 0);
    // Sizes are computed in size_t so large M cannot overflow int
    const size_t tile_elems = (size_t) bs * (size_t) bs;
    const size_t n_elems = (size_t) n_blocks * (size_t) n_blocks * tile_elems;

    // calloc gives zero-filled buffers (all-bits-zero is 0.0 for IEEE-754
    // doubles), which provides both the padding and the initial CC
    double *CA = (double *) calloc(n_elems, sizeof *CA);
    double *CB = (double *) calloc(n_elems, sizeof *CB);
    double *CC = (double *) calloc(n_elems, sizeof *CC);
    if (CA == NULL || CB == NULL || CC == NULL) {
        abort();
    }

    // Pack A and B tile by tile; only in-range elements are copied
    for (int bi = 0; bi < n_blocks; ++bi) {
        for (int bj = 0; bj < n_blocks; ++bj) {
            const int oi = bi * bs;
            const int oj = bj * bs;
            const int rows = (M - oi < bs) ? M - oi : bs;
            const int cols = (M - oj < bs) ? M - oj : bs;
            const size_t tile =
                ((size_t) bi + (size_t) bj * (size_t) n_blocks) * tile_elems;

            for (int j = 0; j < cols; ++j) {
                const size_t src = (size_t) oi + (size_t) (oj + j) * (size_t) M;
                const size_t dst = tile + (size_t) j * (size_t) bs;
                for (int i = 0; i < rows; ++i) {
                    CA[dst + i] = A[src + i];
                    CB[dst + i] = B[src + i];
                }
            }
        }
    }

    // Tile-level triple loop; bk runs innermost, as in the packed layout
    for (int bi = 0; bi < n_blocks; ++bi) {
        for (int bj = 0; bj < n_blocks; ++bj) {
            for (int bk = 0; bk < n_blocks; ++bk) {
                basic_dgemm(
                    BLOCK_SIZE,
                    CA + ((size_t) bi + (size_t) bk * (size_t) n_blocks) * tile_elems,
                    CB + ((size_t) bk + (size_t) bj * (size_t) n_blocks) * tile_elems,
                    CC + ((size_t) bi + (size_t) bj * (size_t) n_blocks) * tile_elems
                );
            }
        }
    }

    // Copy results back, skipping the padding
    for (int bi = 0; bi < n_blocks; ++bi) {
        for (int bj = 0; bj < n_blocks; ++bj) {
            const int oi = bi * bs;
            const int oj = bj * bs;
            const int rows = (M - oi < bs) ? M - oi : bs;
            const int cols = (M - oj < bs) ? M - oj : bs;
            const size_t tile =
                ((size_t) bi + (size_t) bj * (size_t) n_blocks) * tile_elems;

            for (int j = 0; j < cols; ++j) {
                const size_t dst = (size_t) oi + (size_t) (oj + j) * (size_t) M;
                const size_t src = tile + (size_t) j * (size_t) bs;
                for (int i = 0; i < rows; ++i) {
                    C[dst + i] = CC[src + i];
                }
            }
        }
    }

    free(CA);
    free(CB);
    free(CC);
}