Beispiel #1
0
/*
 * transpose_submit - Store the transpose of A in B.
 *
 * A is indexed as A[row][col] with N rows and M columns; B is indexed with
 * M rows and N columns. After the call, B[j][i] == A[i][j] for every
 * 0 <= i < N and 0 <= j < M. A is only read.
 *
 * This is the plain row-by-row baseline: each element is moved through a
 * scalar so that the read of A and the write of B are separate statements.
 *
 * REQUIRES / ENSURES / is_transpose are defined outside this block
 * (presumably contract-checking helpers -- confirm against the project
 * headers).
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
    int i, j, tmp;

    REQUIRES(M > 0);
    REQUIRES(N > 0);

    for (i = 0; i < N; i++) {
        for (j = 0; j < M; j++) {
            tmp = A[i][j];
            B[j][i] = tmp;
        }
    }

    ENSURES(is_transpose(M, N, A, B));
}
Beispiel #2
0
/*
 * trans - Write the transpose of A into B, one source row at a time.
 *
 * A has N rows of M ints, B has M rows of N ints. Each element travels
 * through the scalar `value`, so B[col][row] ends up equal to A[row][col].
 */
void trans(int M, int N, int A[N][M], int B[M][N]){
    int row, col, value;

    REQUIRES(M > 0);
    REQUIRES(N > 0);

    row = 0;
    while (row < N) {
        col = 0;
        while (col < M) {
            value = A[row][col];
            B[col][row] = value;
            ++col;
        }
        ++row;
    }

    ENSURES(is_transpose(M, N, A, B));
}
Beispiel #3
0
/*
 * basicBlockingTrans - Blocked transpose of A (N rows x M columns) into
 * B (M rows x N columns).
 *
 * The matrix is walked in 8x8 tiles. Each tile is handled as four 4x4
 * quadrants in the order top-left, top-right, bottom-left, bottom-right.
 * Every inner loop is additionally bounded by N / M, so tiles that hang
 * over the matrix edge are clipped and arbitrary positive sizes are safe.
 *
 * Each quadrant covers exactly columns [j, j+3] or [j+4, j+7]; no element
 * is copied twice.
 */
void basicBlockingTrans(int M, int N, int A[N][M], int B[M][N])
{
    int i, j, k, l, tmp;

    REQUIRES(M > 0);
    REQUIRES(N > 0);

    for (i = 0; i < N; i += 8)
    {
        for (j = 0; j < M; j += 8)
        {
            /* top-left quadrant: rows i..i+3, columns j..j+3 */
            for (k = i; k <= i + 3 && k < N; k++)
            {
                for (l = j; l <= j + 3 && l < M; l++)
                {
                    tmp = A[k][l];
                    B[l][k] = tmp;
                }
            }
            /* top-right quadrant: rows i..i+3, columns j+4..j+7 */
            for (k = i; k <= i + 3 && k < N; k++)
            {
                for (l = j + 4; l <= j + 7 && l < M; l++)
                {
                    tmp = A[k][l];
                    B[l][k] = tmp;
                }
            }
            /* bottom-left quadrant: rows i+4..i+7, columns j..j+3 */
            for (k = i + 4; k <= i + 7 && k < N; k++)
            {
                for (l = j; l <= j + 3 && l < M; l++)
                {
                    tmp = A[k][l];
                    B[l][k] = tmp;
                }
            }
            /* bottom-right quadrant: rows i+4..i+7, columns j+4..j+7 */
            for (k = i + 4; k <= i + 7 && k < N; k++)
            {
                for (l = j + 4; l <= j + 7 && l < M; l++)
                {
                    tmp = A[k][l];
                    B[l][k] = tmp;
                }
            }
        }
    }

    ENSURES(is_transpose(M, N, A, B));
}
/*
 * transpose_submit - Size-specialised transpose of A (N rows x M columns)
 * into B (M rows x N columns).
 *
 * Three shapes are handled, each by its own independent `if`:
 *   - M == 32 && N == 32 : 8x8 tiles, diagonal element written last per row
 *   - M == 64 && N == 64 : 8x8 tiles processed as 4x4 quadrants, using the
 *                          right half of B's top rows as a temporary stash
 *   - M == 61 && N == 67 : 16-row x 4-column strips with a bounded tail
 * For any other shape none of the branches runs and B is left untouched.
 *
 * A is only read. All data moves through the scalar locals below.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
    int i, j, x, y;
    int temp;
    int buff1, buff2, buff3, buff4, buff5, buff6, buff7;

    REQUIRES(M > 0);
    REQUIRES(N > 0);

    /*
     * When the matrix size is (32,32), we can divide the matrix into
     * several blocks; the block size is (8,8).
     */
    if (M == 32 && N == 32) {
        for (i = 0; i <= N - 8; i += 8 ) {
            for (j = 0; j <= M - 8; j += 8) {
                for (x = i; x < i + 8; x++) {
                    /* copy every off-diagonal element of this tile row */
                    for (y = j; y < j + 8; y++) {
                        if (x != y) {
                            temp = A[x][y];
                            B[y][x] = temp;
                        }
                    }

                    /*
                     * Only diagonal tiles (i == j) contain an element with
                     * x == y; it is copied after the rest of the row.
                     */
                    if (i == j) {
                        temp = A[x][x];
                        B[x][x] = temp;
                    }
                }
            }
        }
    }


    /*
     * When the matrix size is (64,64), each 8x8 tile is handled as four
     * 4x4 sub-blocks (the original author notes cache conflicts every
     * 5 rows -- not verifiable from this file).
     */
    if (M == 64 && N == 64) {
        for (i = 0; i <= N - 8; i += 8) {
            for (j = 0; j <= M - 8; j += 8) {
                /*
                 * Top four rows of the A tile: the left half goes to its
                 * final place in B, the right half is parked in
                 * B[j..j+3][i+4..i+7] for now.
                 */
                for (x = i; x < i + 4; x ++) {
                    temp  = A[x][j];
                    buff1 = A[x][j+1];
                    buff2 = A[x][j+2];
                    buff3 = A[x][j+3];
                    buff4 = A[x][j+4];
                    buff5 = A[x][j+5];
                    buff6 = A[x][j+6];
                    buff7 = A[x][j+7];

                    /* B[][x+4] are not in the right place yet */
                    B[j][x] = temp;
                    B[j][x+4] = buff4;

                    B[j+1][x] = buff1;
                    B[j+1][x+4] = buff5;

                    B[j+2][x] = buff2;
                    B[j+2][x+4] = buff6;

                    B[j+3][x] = buff3;
                    B[j+3][x+4] = buff7;
                }

                /*
                 * Row by row: pick up the parked values, overwrite them with
                 * the transposed bottom-left quadrant of A, then drop the
                 * parked values into their final rows j+4..j+7 of B.
                 */
                for (x = 0; x < 4; x++) {
                    buff1 = B[j+x][i+4];
                    buff2 = B[j+x][i+5];
                    buff3 = B[j+x][i+6];
                    buff4 = B[j+x][i+7];

                    buff5 = A[i+4][j+x];
                    buff6 = A[i+5][j+x];
                    buff7 = A[i+6][j+x];
                    temp  = A[i+7][j+x];

                    B[j+x][i+4] = buff5;
                    B[j+x][i+5] = buff6;
                    B[j+x][i+6] = buff7;
                    B[j+x][i+7] = temp;

                    B[j+x+4][i] = buff1;
                    B[j+x+4][i+1] = buff2;
                    B[j+x+4][i+2] = buff3;
                    B[j+x+4][i+3] = buff4;
                }

                /* bottom-right quadrant is transposed directly */
                for (x = i + 4; x < i + 8; x++) {
                    buff1 = A[x][j+4];
                    buff2 = A[x][j+5];
                    buff3 = A[x][j+6];
                    buff4 = A[x][j+7];

                    B[j+4][x] = buff1;
                    B[j+5][x] = buff2;
                    B[j+6][x] = buff3;
                    B[j+7][x] = buff4;
                }

            }
        }
    }

    /*
     * As the matrix is not square, it is hard to say what the optimal block
     * size is; 16 rows x 4 columns was chosen by experiment.
     *
     * M (61) is not a multiple of 4, so the last strip (j == 60) holds a
     * single column. The unrolled 4-column copy is only used when all four
     * columns exist; the tail is copied column by column so that neither
     * A nor B is indexed past column/row M - 1.
     */
    if (M == 61 && N == 67) {
        for (i = 0; i <= N - 1; i += 16) {
            for (j = 0; j <= M - 1; j += 4) {
                for (x = i; x < i + 16 && x < N; x++) {
                    if (j + 3 < M) {
                        buff1 = A[x][j];
                        buff2 = A[x][j+1];
                        buff3 = A[x][j+2];
                        buff4 = A[x][j+3];

                        B[j][x] = buff1;
                        B[j+1][x] = buff2;
                        B[j+2][x] = buff3;
                        B[j+3][x] = buff4;
                    } else {
                        for (y = j; y < M; y++) {
                            temp = A[x][y];
                            B[y][x] = temp;
                        }
                    }
                }
            }
        }
    }
    ENSURES(is_transpose(M, N, A, B));
}
Beispiel #5
0
/*
 * trans_old - Transpose A (N rows x M columns) into B (M rows x N columns)
 * with a separate strategy for each of three shapes:
 *   - M == 32 && N == 32
 *   - M == 64 && N == 64
 *   - M == 61 && N == 67
 * For any other shape no branch runs and B is left untouched.
 *
 * A is only read. Values are moved through the scalars t0..t7; on diagonal
 * tiles the data is first copied into B un-transposed and then transposed
 * in place inside B.
 */
void trans_old(int M, int N, int A[N][M], int B[M][N])
{
    int i, j, k, l, t0, t1, t2, t3, t4, t5, t6, t7;
    REQUIRES(M > 0);
    REQUIRES(N > 0);

    if( M == 32 && N == 32 )
    {
        // j: first column of an 8-column strip of A; i: current row of A
        for( j = 0; j < 32; j += 8 )
        {
            for( i = 0; i < 32; ++i )
            {
                // i == j marks the first row of the diagonal 8x8 tile
                if( i == j )
                {   
                    // Copy the tile from A to the same position in B, row by row
                    for( k = i; k < i + 8; ++k )
                    {
                        t0 = A[k][i];
                        t1 = A[k][i+1];
                        t2 = A[k][i+2];
                        t3 = A[k][i+3];
                        t4 = A[k][i+4];
                        t5 = A[k][i+5];
                        t6 = A[k][i+6];
                        t7 = A[k][i+7];
                        B[k][i] = t0;
                        B[k][i+1] = t1;
                        B[k][i+2] = t2;
                        B[k][i+3] = t3;
                        B[k][i+4] = t4;
                        B[k][i+5] = t5;
                        B[k][i+6] = t6;
                        B[k][i+7] = t7;
                    }

                    // Transpose the tile in place inside B (swap across its diagonal)
                    for( k = i; k < i + 8; ++k )
                    {
                        for( l = k + 1; l < i + 8; ++l )
                        {
                            t0 = B[k][l];
                            B[k][l] = B[l][k];
                            B[l][k] = t0;
                        }
                    }
                    // Together with the loop's ++i this skips the 8 rows just handled
                    i += 7;
                }
                // Off-diagonal rows: read 8 values of row i, write them as a column of B
                else
                {
                     t0 = A[i][j];
                     t1 = A[i][j+1];
                     t2 = A[i][j+2];
                     t3 = A[i][j+3];
                     t4 = A[i][j+4];
                     t5 = A[i][j+5];
                     t6 = A[i][j+6];
                     t7 = A[i][j+7];
                     B[j][i] = t0;
                     B[j+1][i] = t1;
                     B[j+2][i] = t2;
                     B[j+3][i] = t3;
                     B[j+4][i] = t4;
                     B[j+5][i] = t5;
                     B[j+6][i] = t6;
                     B[j+7][i] = t7;
                }
            }
        }
    }
    else if( M == 64 && N == 64 )
    {
        // (i, j) is the top-left corner of an 8x8 tile of A.
        // In the comments below a(r,c) / b(r,c) name the 4x4 quadrants of
        // the A tile / of the B tile being produced (r, c in {1, 2}).
        for( j = 0; j < 64; j += 8 )
        {
            for( i = 0; i < 64; i += 8 )
            {
                
                // Diagonal tile: source and destination tiles share the same indices
                if( i == j )
                {   
                    //Copy a(1,1),a(1,2) to b(1,1),b(1,2)
                    for( k = i; k < i + 4; ++k )
                    {
                        t0 = A[k][j];
                        t1 = A[k][j+1];
                        t2 = A[k][j+2];
                        t3 = A[k][j+3];
                        t4 = A[k][j+4];
                        t5 = A[k][j+5];
                        t6 = A[k][j+6];
                        t7 = A[k][j+7];
                        B[k][j] = t0;
                        B[k][j+1] = t1;
                        B[k][j+2] = t2;
                        B[k][j+3] = t3;
                        B[k][j+4] = t4;
                        B[k][j+5] = t5;
                        B[k][j+6] = t6;
                        B[k][j+7] = t7;
                    }
                    //Local transpose: b(1,1) = bT(1,1), b(1,2) = bT(1,2)
                    for( k = i; k < i + 4; ++k )
                    {
                        for( l = k + 1; l < i + 4; ++l )
                        {
                            t0 = B[k][l];
                            t1 = B[k][l+4];
                            B[k][l] = B[l][k];
                            B[k][l+4] = B[l][k+4];
                            B[l][k] = t0;
                            B[l][k+4] = t1;
                        }
                    }
                    //Copy a(2,1),a(2,2) to b(2,1),b(2,2)
                    for( k = i + 4; k < i + 8; ++k )
                    {
                        t0 = A[k][j];
                        t1 = A[k][j+1];
                        t2 = A[k][j+2];
                        t3 = A[k][j+3];
                        t4 = A[k][j+4];
                        t5 = A[k][j+5];
                        t6 = A[k][j+6];
                        t7 = A[k][j+7];
                        B[k][j] = t0;
                        B[k][j+1] = t1;
                        B[k][j+2] = t2;
                        B[k][j+3] = t3;
                        B[k][j+4] = t4;
                        B[k][j+5] = t5;
                        B[k][j+6] = t6;
                        B[k][j+7] = t7;
                    }
                    //Local transpose: b(2,2) = bT(2,2), b(2,1) = bT(2,1)
                    for( k = i + 4; k < i + 8; ++k )
                    {
                        for( l = k + 1; l < i + 8; ++l )
                        {
                            t0 = B[k][l];
                            t1 = B[k][l-4];
                            B[k][l] = B[l][k];
                            B[k][l-4] = B[l][k-4];
                            B[l][k] = t0;
                            B[l][k-4] = t1;
                        }
                    }
                    //Swap b(1,2) and b(2,1), one row of each per iteration
                    for( k = i + 4; k < i + 8; ++k )
                    {
                        t0 = B[k][j];
                        t1 = B[k][j+1];
                        t2 = B[k][j+2];
                        t3 = B[k][j+3];
                        t4 = B[k-4][j+4];
                        t5 = B[k-4][j+5];
                        t6 = B[k-4][j+6];
                        t7 = B[k-4][j+7];
                        B[k-4][j+4] = t0;
                        B[k-4][j+5] = t1;
                        B[k-4][j+6] = t2;
                        B[k-4][j+7] = t3;
                        B[k][j] = t4;
                        B[k][j+1] = t5;
                        B[k][j+2] = t6;
                        B[k][j+3] = t7;
                    }
                }
                else
                {
                    //b(1,1) = aT(1,1); aT(1,2) is parked in b(1,2) for now
                    for( k = i; k < i + 4; ++k )
                    {
                        t0 = A[k][j];
                        t1 = A[k][j+1];
                        t2 = A[k][j+2];
                        t3 = A[k][j+3];
                        t4 = A[k][j+4];
                        t5 = A[k][j+5];
                        t6 = A[k][j+6];
                        t7 = A[k][j+7];
                        B[j][k] = t0;
                        B[j+1][k] = t1;
                        B[j+2][k] = t2;
                        B[j+3][k] = t3;
                        B[j][k+4] = t4;
                        B[j+1][k+4] = t5;
                        B[j+2][k+4] = t6;
                        B[j+3][k+4] = t7;
                    }
                    //b(1,2) = aT(2,1); the parked values move to b(2,1)
                    for( k = j; k < j + 4; ++k )
                    {
                        t0 = B[k][i+4];
                        t1 = B[k][i+5];
                        t2 = B[k][i+6];
                        t3 = B[k][i+7];
                        t4 = A[i+4][k];
                        t5 = A[i+5][k];
                        t6 = A[i+6][k];
                        t7 = A[i+7][k];
                        B[k][i+4] = t4;
                        B[k][i+5] = t5;
                        B[k][i+6] = t6;
                        B[k][i+7] = t7;
                        B[k+4][i] = t0;
                        B[k+4][i+1] = t1;
                        B[k+4][i+2] = t2;
                        B[k+4][i+3] = t3;
                    }
                    //b(2,2) = aT(2,2)
                    for( k = i + 4; k < i + 8; ++k )
                    {
                        t0 = A[k][j+4];
                        t1 = A[k][j+5];
                        t2 = A[k][j+6];
                        t3 = A[k][j+7];
                        B[j+4][k] = t0;
                        B[j+5][k] = t1;
                        B[j+6][k] = t2;
                        B[j+7][k] = t3;
                    }
                }
            }
        }
    }
    else if( M == 61 && N == 67 )
    {
        // Columns 0..55 in strips of 8: each row of A contributes one
        // 8-element column segment of B
        for( j = 0; j < 56; j += 8 )
        {
            for( i = 0; i < 67; ++i )
            {
                t0 = A[i][j];
                t1 = A[i][j+1];
                t2 = A[i][j+2];
                t3 = A[i][j+3];
                t4 = A[i][j+4];
                t5 = A[i][j+5];
                t6 = A[i][j+6];
                t7 = A[i][j+7];
                B[j][i] = t0;
                B[j+1][i] = t1;
                B[j+2][i] = t2;
                B[j+3][i] = t3;
                B[j+4][i] = t4;
                B[j+5][i] = t5;
                B[j+6][i] = t6;
                B[j+7][i] = t7;
            }
        }
        // j is 56 here (left over from the loop above): handle the
        // remaining five columns 56..60
        for( i = 0; i < 67; ++i )
        {
            t0 = A[i][j];
            t1 = A[i][j+1];
            t2 = A[i][j+2];
            t3 = A[i][j+3];
            t4 = A[i][j+4];
            B[j][i] = t0;
            B[j+1][i] = t1;
            B[j+2][i] = t2;
            B[j+3][i] = t3;
            B[j+4][i] = t4;
        }
    }

    ENSURES(is_transpose(M,N,A,B));
}
Beispiel #6
0
/*
 * transpose_submit - Transpose A (N rows x M columns) into B (M rows x
 * N columns) with a separate strategy for each of three shapes:
 *   - M == 32 && N == 32
 *   - M == 64 && N == 64
 *   - M == 61 && N == 67
 * For any other shape no branch runs and B is left untouched.
 *
 * A is only read. Values are moved through the scalars t0..t7. In the 64x64
 * case the not-yet-written diagonal tiles of B at (48,48) and (56,56) are
 * used as scratch space while the first six diagonal tiles are produced.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
    int i, j, k, l, t0, t1, t2, t3, t4, t5, t6, t7;
    REQUIRES(M > 0);
    REQUIRES(N > 0);

    if( M == 32 && N == 32 )
    {
        // j: first column of an 8-column strip of A; i: current row of A
        for( j = 0; j < 32; j += 8 )
        {
            for( i = 0; i < 32; ++i )
            {
                // i == j marks the first row of the diagonal 8x8 tile
                if( i == j )
                {   
                    // Copy the tile from A to the same position in B, row by row
                    for( k = i; k < i + 8; ++k )
                    {
                        t0 = A[k][i];
                        t1 = A[k][i+1];
                        t2 = A[k][i+2];
                        t3 = A[k][i+3];
                        t4 = A[k][i+4];
                        t5 = A[k][i+5];
                        t6 = A[k][i+6];
                        t7 = A[k][i+7];
                        B[k][i] = t0;
                        B[k][i+1] = t1;
                        B[k][i+2] = t2;
                        B[k][i+3] = t3;
                        B[k][i+4] = t4;
                        B[k][i+5] = t5;
                        B[k][i+6] = t6;
                        B[k][i+7] = t7;
                    }

                    // Transpose the tile in place inside B (swap across its diagonal)
                    for( k = i; k < i + 8; ++k )
                    {
                        for( l = k + 1; l < i + 8; ++l )
                        {
                            t0 = B[k][l];
                            B[k][l] = B[l][k];
                            B[l][k] = t0;
                        }
                    }
                    // Together with the loop's ++i this skips the 8 rows just handled
                    i += 7;
                }
                // Off-diagonal rows: read 8 values of row i, write them as a column of B
                else
                {
                     t0 = A[i][j];
                     t1 = A[i][j+1];
                     t2 = A[i][j+2];
                     t3 = A[i][j+3];
                     t4 = A[i][j+4];
                     t5 = A[i][j+5];
                     t6 = A[i][j+6];
                     t7 = A[i][j+7];
                     B[j][i] = t0;
                     B[j+1][i] = t1;
                     B[j+2][i] = t2;
                     B[j+3][i] = t3;
                     B[j+4][i] = t4;
                     B[j+5][i] = t5;
                     B[j+6][i] = t6;
                     B[j+7][i] = t7;
                }
            }
        }
    }
    else if( M == 64 && N == 64 )
    {
        // Pass 1: every off-diagonal 8x8 tile. (i, j) is the top-left corner
        // of the A tile; a(r,c) / b(r,c) name the 4x4 quadrants of the A tile
        // / of the B tile being produced (r, c in {1, 2}).
        for( j = 0; j < 64; j += 8 )
        {
            for( i = 0; i < 64; i += 8 )
            {
                
                // Skip diagonal tiles here; they are handled after this loop
                if( i == j )
                    continue;
                else
                {
                    //b(1,1) = aT(1,1); aT(1,2) is parked in b(1,2) for now
                    for( k = i; k < i + 4; ++k )
                    {
                        t0 = A[k][j];
                        t1 = A[k][j+1];
                        t2 = A[k][j+2];
                        t3 = A[k][j+3];
                        t4 = A[k][j+4];
                        t5 = A[k][j+5];
                        t6 = A[k][j+6];
                        t7 = A[k][j+7];
                        B[j][k] = t0;
                        B[j+1][k] = t1;
                        B[j+2][k] = t2;
                        B[j+3][k] = t3;
                        B[j][k+4] = t4;
                        B[j+1][k+4] = t5;
                        B[j+2][k+4] = t6;
                        B[j+3][k+4] = t7;
                    }
                    //b(1,2) = aT(2,1); the parked values move to b(2,1)
                    for( k = j; k < j + 4; ++k )
                    {
                        t0 = B[k][i+4];
                        t1 = B[k][i+5];
                        t2 = B[k][i+6];
                        t3 = B[k][i+7];
                        t4 = A[i+4][k];
                        t5 = A[i+5][k];
                        t6 = A[i+6][k];
                        t7 = A[i+7][k];
                        B[k][i+4] = t4;
                        B[k][i+5] = t5;
                        B[k][i+6] = t6;
                        B[k][i+7] = t7;
                        B[k+4][i] = t0;
                        B[k+4][i+1] = t1;
                        B[k+4][i+2] = t2;
                        B[k+4][i+3] = t3;
                    }
                    //b(2,2) = aT(2,2)
                    for( k = i + 4; k < i + 8; ++k )
                    {
                        t0 = A[k][j+4];
                        t1 = A[k][j+5];
                        t2 = A[k][j+6];
                        t3 = A[k][j+7];
                        B[j+4][k] = t0;
                        B[j+5][k] = t1;
                        B[j+6][k] = t2;
                        B[j+7][k] = t3;
                    }
                }
            }
        }
        // Pass 2: diagonal tiles at 0, 8, ..., 40, built in scratch space
        for( j = 0; j < 48; j += 8 )
        {
            // Scratch areas: B[48..51][48..55] holds the upper half of the
            // result tile, B[56..59][56..63] the lower half. Both lie inside
            // diagonal tiles that pass 1 skipped and pass 3 fills last.
            i = j;
            //scratch upper-left = aT(1,1), scratch upper-right = aT(1,2) (parked)
            for( k = i; k < i + 4; ++k )
            {
                t0 = A[k][j];
                t1 = A[k][j+1];
                t2 = A[k][j+2];
                t3 = A[k][j+3];
                t4 = A[k][j+4];
                t5 = A[k][j+5];
                t6 = A[k][j+6];
                t7 = A[k][j+7];
                B[48][k-i+48] = t0;
                B[49][k-i+48] = t1;
                B[50][k-i+48] = t2;
                B[51][k-i+48] = t3;
                B[48][k-i+52] = t4;
                B[49][k-i+52] = t5;
                B[50][k-i+52] = t6;
                B[51][k-i+52] = t7;
            }
            //scratch upper-right = aT(2,1); parked aT(1,2) moves to scratch lower-left
            for( k = j; k < j + 4; ++k )
            {
                t0 = B[k-i+48][52];
                t1 = B[k-i+48][53];
                t2 = B[k-i+48][54];
                t3 = B[k-i+48][55];
                t4 = A[i+4][k];
                t5 = A[i+5][k];
                t6 = A[i+6][k];
                t7 = A[i+7][k];
                B[k-i+48][52] = t4;
                B[k-i+48][53] = t5;
                B[k-i+48][54] = t6;
                B[k-i+48][55] = t7;
                B[k-i+56][56] = t0;
                B[k-i+56][57] = t1;
                B[k-i+56][58] = t2;
                B[k-i+56][59] = t3;
            }
            //scratch lower-right (columns 60..63) = aT(2,2)
            for( k = i + 4; k < i + 8; ++k )
            {
                t0 = A[k][j+4];
                t1 = A[k][j+5];
                t2 = A[k][j+6];
                t3 = A[k][j+7];
                B[56][k-i+56] = t0;
                B[57][k-i+56] = t1;
                B[58][k-i+56] = t2;
                B[59][k-i+56] = t3;
            }
            //copy scratch rows 48..51 back to rows i..i+3 of the real tile
            for( k = i; k < i + 4; ++k )
            {
                t0 = B[48+k-i][48];
                t1 = B[48+k-i][49];
                t2 = B[48+k-i][50];
                t3 = B[48+k-i][51];
                t4 = B[48+k-i][52];
                t5 = B[48+k-i][53];
                t6 = B[48+k-i][54];
                t7 = B[48+k-i][55];
                B[k][i] = t0;
                B[k][i+1] = t1;
                B[k][i+2] = t2;
                B[k][i+3] = t3;
                B[k][i+4] = t4;
                B[k][i+5] = t5;
                B[k][i+6] = t6;
                B[k][i+7] = t7;                
            }
            //copy scratch rows 56..59 (52 + k - i with k - i in 4..7) back to rows i+4..i+7
            for( k = i + 4; k < i + 8; ++k )
            {
                t0 = B[52+k-i][56];
                t1 = B[52+k-i][57];
                t2 = B[52+k-i][58];
                t3 = B[52+k-i][59];
                t4 = B[52+k-i][60];
                t5 = B[52+k-i][61];
                t6 = B[52+k-i][62];
                t7 = B[52+k-i][63];
                B[k][i+0] = t0;
                B[k][i+1] = t1;
                B[k][i+2] = t2;
                B[k][i+3] = t3;
                B[k][i+4] = t4;
                B[k][i+5] = t5;
                B[k][i+6] = t6;
                B[k][i+7] = t7;                
            }
        }
        // Pass 3: the last two diagonal tiles (48 and 56), which served as
        // scratch above, are produced by copy + in-place transpose inside B
        for(j = 48; j < 64; j += 8 )
        {
            i = j;
            //Copy a(1,1),a(1,2) to b(1,1),b(1,2)
            for( k = i; k < i + 4; ++k )
            {
                t0 = A[k][j];
                t1 = A[k][j+1];
                t2 = A[k][j+2];
                t3 = A[k][j+3];
                t4 = A[k][j+4];
                t5 = A[k][j+5];
                t6 = A[k][j+6];
                t7 = A[k][j+7];
                B[k][j] = t0;
                B[k][j+1] = t1;
                B[k][j+2] = t2;
                B[k][j+3] = t3;
                B[k][j+4] = t4;
                B[k][j+5] = t5;
                B[k][j+6] = t6;
                B[k][j+7] = t7;
            }
            //Local transpose: b(1,1) = bT(1,1), b(1,2) = bT(1,2)
            for( k = i; k < i + 4; ++k )
            {
                for( l = k + 1; l < i + 4; ++l )
                {
                    t0 = B[k][l];
                    t1 = B[k][l+4];
                    B[k][l] = B[l][k];
                    B[k][l+4] = B[l][k+4];
                    B[l][k] = t0;
                    B[l][k+4] = t1;
                }
            }
            //Copy a(2,1),a(2,2) to b(2,1),b(2,2)
            for( k = i + 4; k < i + 8; ++k )
            {
                t0 = A[k][j];
                t1 = A[k][j+1];
                t2 = A[k][j+2];
                t3 = A[k][j+3];
                t4 = A[k][j+4];
                t5 = A[k][j+5];
                t6 = A[k][j+6];
                t7 = A[k][j+7];
                B[k][j] = t0;
                B[k][j+1] = t1;
                B[k][j+2] = t2;
                B[k][j+3] = t3;
                B[k][j+4] = t4;
                B[k][j+5] = t5;
                B[k][j+6] = t6;
                B[k][j+7] = t7;
            }
            //Local transpose: b(2,2) = bT(2,2), b(2,1) = bT(2,1)
            for( k = i + 4; k < i + 8; ++k )
            {
                for( l = k + 1; l < i + 8; ++l )
                {
                    t0 = B[k][l];
                    t1 = B[k][l-4];
                    B[k][l] = B[l][k];
                    B[k][l-4] = B[l][k-4];
                    B[l][k] = t0;
                    B[l][k-4] = t1;
                }
            }
            //Swap b(1,2) and b(2,1), one row of each per iteration
            for( k = i + 4; k < i + 8; ++k )
            {
                t0 = B[k][j];
                t1 = B[k][j+1];
                t2 = B[k][j+2];
                t3 = B[k][j+3];
                t4 = B[k-4][j+4];
                t5 = B[k-4][j+5];
                t6 = B[k-4][j+6];
                t7 = B[k-4][j+7];
                B[k-4][j+4] = t0;
                B[k-4][j+5] = t1;
                B[k-4][j+6] = t2;
                B[k-4][j+7] = t3;
                B[k][j] = t4;
                B[k][j+1] = t5;
                B[k][j+2] = t6;
                B[k][j+3] = t7;
            }

        }
    }
    else if( M == 61 && N == 67 )
    {
        // Columns 0..55 in strips of 8: each row of A contributes one
        // 8-element column segment of B
        for( j = 0; j < 56; j += 8 )
        {
            for( i = 0; i < 67; ++i )
            {
                t0 = A[i][j];
                t1 = A[i][j+1];
                t2 = A[i][j+2];
                t3 = A[i][j+3];
                t4 = A[i][j+4];
                t5 = A[i][j+5];
                t6 = A[i][j+6];
                t7 = A[i][j+7];
                B[j][i] = t0;
                B[j+1][i] = t1;
                B[j+2][i] = t2;
                B[j+3][i] = t3;
                B[j+4][i] = t4;
                B[j+5][i] = t5;
                B[j+6][i] = t6;
                B[j+7][i] = t7;
            }
        }
        // j is 56 here (left over from the loop above): handle the
        // remaining five columns 56..60
        for( i = 0; i < 67; ++i )
        {
            t0 = A[i][j];
            t1 = A[i][j+1];
            t2 = A[i][j+2];
            t3 = A[i][j+3];
            t4 = A[i][j+4];
            B[j][i] = t0;
            B[j+1][i] = t1;
            B[j+2][i] = t2;
            B[j+3][i] = t3;
            B[j+4][i] = t4;
        }
    }

    ENSURES(is_transpose(M,N,A,B));
}
Beispiel #7
0
/*
 * transpose_submit - Tiled transpose of A (N rows x M columns) into
 * B (M rows x N columns).
 *
 *   32 x 32 : 8x8 tiles
 *   64 x 64 : 8x8 tiles, each walked as four 4x4 sub-tiles
 *   other   : 16x16 tiles clipped to the matrix bounds
 *
 * Within a row of a tile, the element on the matrix diagonal (row == col)
 * is not written immediately: its value and index are remembered and it is
 * stored once the rest of the row has been copied. The deferred store is
 * issued for every row of a tile whose origin lies on the diagonal, even
 * when that particular row held no diagonal element; in that case it simply
 * repeats the previously remembered store.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
    REQUIRES(M > 0);
    REQUIRES(N > 0);

    /* Remembered diagonal element: its value and its (row == col) index. */
    int diag_val = 0;
    int diag_idx = 0;

    if (N == 32 && M == 32)
    {
        for (int row0 = 0; row0 < N; row0 += 8)
        {
            for (int col0 = 0; col0 < M; col0 += 8)
            {
                for (int r = row0; r < row0 + 8; r++)
                {
                    for (int c = col0; c < col0 + 8; c++)
                    {
                        if (r == c)
                        {
                            diag_val = A[r][c];
                            diag_idx = r;
                        }
                        else
                        {
                            B[c][r] = A[r][c];
                        }
                    }
                    if (row0 == col0)
                    {
                        B[diag_idx][diag_idx] = diag_val;
                    }
                }
            }
        }
    }
    else if (N == 64 && M == 64)
    {
        for (int row0 = 0; row0 < M; row0 += 8)
        {
            for (int col0 = 0; col0 < N; col0 += 8)
            {
                /* sub-tile columns first, then sub-tile rows */
                for (int sub_col = col0; sub_col < col0 + 8; sub_col += 4)
                {
                    for (int sub_row = row0; sub_row < row0 + 8; sub_row += 4)
                    {
                        for (int r = sub_row; r < sub_row + 4; r++)
                        {
                            for (int c = sub_col; c < sub_col + 4; c++)
                            {
                                if (r == c)
                                {
                                    diag_val = A[r][c];
                                    diag_idx = r;
                                }
                                else
                                {
                                    B[c][r] = A[r][c];
                                }
                            }
                            if (sub_row == sub_col)
                            {
                                B[diag_idx][diag_idx] = diag_val;
                            }
                        }
                    }
                }
            }
        }
    }
    else
    {
        for (int row0 = 0; row0 < N; row0 += 16)
        {
            for (int col0 = 0; col0 < M; col0 += 16)
            {
                /* rows / columns past the matrix edge are simply not visited */
                for (int r = row0; r < row0 + 16 && r < N; r++)
                {
                    for (int c = col0; c < col0 + 16 && c < M; c++)
                    {
                        if (r == c)
                        {
                            diag_val = A[r][c];
                            diag_idx = r;
                        }
                        else
                        {
                            B[c][r] = A[r][c];
                        }
                    }
                    if (row0 == col0)
                    {
                        B[diag_idx][diag_idx] = diag_val;
                    }
                }
            }
        }
    }
    ENSURES(is_transpose(M, N, A, B));
}
Beispiel #8
0
/*
 * transpose_submit - Transpose A (N rows x M columns) into B (M rows x
 * N columns).
 *
 *   32 x 32 : 8x8 tiles, each row read into eight scalars and then stored
 *   64 x 64 : diagonal tiles go through a neighbouring B tile used as
 *             scratch; off-diagonal tiles are copied as four 4x4 quadrants
 *   other   : bounds-checked 8x8 tiles split into 4x4 quadrants (this is
 *             the path taken for 61 x 67)
 *
 * A is never written: the input matrix is read-only scratch-free, so the
 * is_transpose(A, B) postcondition is checked against unmodified input.
 * Any positive M, N is safe because the fallback path clips every tile to
 * the matrix bounds.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
    int i, j, k, l;
    int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

    REQUIRES(M > 0);
    REQUIRES(N > 0);

    // for 32 * 32 matrix
    // transpose 8 * 8 block every time
    if (M == 32 && N == 32)
    {
        for (i = 0; i < 32; i += 8)
        {
            for (j = 0; j < 32; j += 8)
            {
                for (k = 0; k < 8; k++)
                {
                    // read the whole row segment before touching B
                    tmp0 = A[i + k][j];
                    tmp1 = A[i + k][j + 1];
                    tmp2 = A[i + k][j + 2];
                    tmp3 = A[i + k][j + 3];
                    tmp4 = A[i + k][j + 4];
                    tmp5 = A[i + k][j + 5];
                    tmp6 = A[i + k][j + 6];
                    tmp7 = A[i + k][j + 7];

                    B[j][i + k] = tmp0;
                    B[j + 1][i + k] = tmp1;
                    B[j + 2][i + k] = tmp2;
                    B[j + 3][i + k] = tmp3;
                    B[j + 4][i + k] = tmp4;
                    B[j + 5][i + k] = tmp5;
                    B[j + 6][i + k] = tmp6;
                    B[j + 7][i + k] = tmp7;
                }
            }
        }
    }

    // for 64 * 64 matrix
    else if (M == 64 && N == 64)
    {
        // first transpose blocks at diagonal
        for (i = 0; i < 64; i += 8)
        {
            // tmp0: first column of the B tile to the right of the diagonal
            // tile (wrapping to column 0 for the last one). That tile is
            // off-diagonal, so it is overwritten with its real contents in
            // the second phase and can serve as scratch here.
            tmp0 = (i + 8) % 64;
            // copy the A tile un-transposed into the scratch tile
            for (k = i; k < i + 8; k++)
            {
                for (l = 0; l < 8; l++)
                {
                    B[k][tmp0 + l] = A[k][l + i];
                }
            }
            // transpose from scratch into rows i..i+3 of the diagonal tile
            for (k = i; k < i + 8; k++)
            {
                for (l = 0; l < 4; l++)
                {
                    B[i + l][k]= B[k][tmp0 + l];
                }
            }
            // transpose from scratch into rows i+4..i+7 of the diagonal tile
            for (k = i; k < i + 8; k++)
            {
                for (l = 4; l < 8; l++)
                {
                    B[i + l][k]= B[k][tmp0 + l];
                }
            }
        }

        // then transpose blocks not at diagonal, one 4x4 quadrant at a
        // time; the two quadrants landing in B rows j..j+3 come first,
        // then the two landing in B rows j+4..j+7
        for (i = 0; i < N; i += 8)
        {
            for (j = 0; j < M; j += 8)
            {
                if (i != j)
                {
                    // A rows i..i+3, columns j..j+3
                    for (k = i; k < i + 4; k++)
                        for (l = j; l < j + 4; l++)
                            B[l][k] = A[k][l];

                    // A rows i+4..i+7, columns j..j+3
                    for (k = i + 4; k < i + 8; k++)
                        for (l = j; l < j + 4; l++)
                            B[l][k] = A[k][l];

                    // A rows i..i+3, columns j+4..j+7
                    for (k = i; k < i + 4; k++)
                        for (l = j + 4; l < j + 8; l++)
                            B[l][k] = A[k][l];

                    // A rows i+4..i+7, columns j+4..j+7
                    for (k = i + 4; k < i + 8; k++)
                        for (l = j + 4; l < j + 8; l++)
                            B[l][k] = A[k][l];
                }
            }
        }
    }

    // every other shape (including 61 * 67)
    // transpose 8 * 8 blocks, clipped to the matrix bounds
    else
    {
        for (i = 0; i < N; i += 8)
        {
            for (j = 0; j < M; j += 8)
            {
                // rows i..i+3, columns j..j+3
                for (k = i; k <= i + 3 && k < N; k++)
                {
                    for (l = j; l <= j + 3 && l < M; l++)
                    {
                        tmp0 = A[k][l];
                        B[l][k] = tmp0;
                    }
                }
                // rows i..i+3, columns j+4..j+7
                for (k = i; k <= i + 3 && k < N; k++)
                {
                    for (l = j + 4; l <= j + 7 && l < M; l++)
                    {
                        tmp0 = A[k][l];
                        B[l][k] = tmp0;
                    }
                }
                // rows i+4..i+7, columns j..j+3
                for (k = i + 4; k <= i + 7 && k < N; k++)
                {
                    for (l = j; l <= j + 3 && l < M; l++)
                    {
                        tmp0 = A[k][l];
                        B[l][k] = tmp0;
                    }
                }
                // rows i+4..i+7, columns j+4..j+7
                for (k = i + 4; k <= i + 7 && k < N; k++)
                {
                    for (l = j + 4; l <= j + 7 && l < M; l++)
                    {
                        tmp0 = A[k][l];
                        B[l][k] = tmp0;
                    }
                }
            }
        }
    }

    ENSURES(is_transpose(M, N, A, B));
}
Beispiel #9
0
/*
 * transpose_submit - store the transpose of the N x M matrix A in the
 * M x N matrix B (B[j][i] = A[i][j] for every 0 <= i < N, 0 <= j < M).
 *
 * The traversal order depends on the shape:
 *   - 32 x 32: 8 x 8 tiles; all off-diagonal tiles first, then the diagonal
 *     tiles, where each row's diagonal element is copied after the rest of
 *     that row.
 *   - 64 x 64: 8 x 8 tiles, each processed as four 4 x 4 sub-tiles.
 *   - every other shape (including M == 61, N == 67): 8 x 8 tiles clipped to
 *     the matrix bounds, so the result is correct for any M, N > 0.
 *
 * NOTE(review): the tile orders look chosen to limit cache conflicts between
 * A and B; that cannot be confirmed from this function alone.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N]){
    int n, sn, m, sm, i, j;
    REQUIRES(M > 0);
    REQUIRES(N > 0);

    if (M == 32 && N == 32) {
        /* Off-diagonal 8 x 8 tiles: plain element-by-element copy. */
        for (n = 0; n < 32; n += 8) {
            for (m = 0; m < 32; m += 8) {
                if (m != n) {
                    for (i = n; i < n + 8; i++) {
                        for (j = m; j < m + 8; j++) {
                            B[j][i] = A[i][j];
                        }
                    }
                }
            }
        }
        /* Diagonal tiles: copy the diagonal element of each row last. */
        for (n = 0; n < 32; n += 8) {
            for (i = n; i < n + 8; i++) {
                for (j = n; j < n + 8; j++) {
                    if (i != j) {
                        B[j][i] = A[i][j];
                    }
                }
                B[i][i] = A[i][i];
            }
        }
    } else if (M == 64 && N == 64) {
        /* Off-diagonal 8 x 8 tiles, split into 4 x 4 sub-tiles. */
        for (n = 0; n < 64; n += 8) {
            for (m = 0; m < 64; m += 8) {
                if (m != n) {
                    /* Rows n..n+3: left sub-tile, then right sub-tile. */
                    sn = n;
                    for (sm = m; sm <= m + 4; sm += 4) {
                        for (i = sn; i < sn + 4; i++) {
                            for (j = sm; j < sm + 4; j++) {
                                B[j][i] = A[i][j];
                            }
                        }
                    }
                    /* Rows n+4..n+7: right sub-tile, then left sub-tile. */
                    sn = n + 4;
                    for (sm = m + 4; sm >= m; sm -= 4) {
                        for (i = sn; i < sn + 4; i++) {
                            for (j = sm; j < sm + 4; j++) {
                                B[j][i] = A[i][j];
                            }
                        }
                    }
                }
            }
        }
        /* Diagonal 8 x 8 tiles, again as four 4 x 4 sub-tiles. */
        for (n = 0; n < 64; n += 8) {
            /* Top-left sub-tile, diagonal element of each row last. */
            sn = n;
            for (i = sn; i < sn + 4; i++) {
                for (j = sn; j < sn + 4; j++) {
                    if (i != j) {
                        B[j][i] = A[i][j];
                    }
                }
                B[i][i] = A[i][i];
            }
            /* Top-right sub-tile. */
            for (i = sn; i < sn + 4; i++) {
                for (j = sn + 4; j < sn + 8; j++) {
                    B[j][i] = A[i][j];
                }
            }
            /* Bottom-right sub-tile, diagonal element of each row last. */
            sn = n + 4;
            for (i = sn; i < sn + 4; i++) {
                for (j = sn; j < sn + 4; j++) {
                    if (i != j) {
                        B[j][i] = A[i][j];
                    }
                }
                B[i][i] = A[i][i];
            }
            /* Bottom-left sub-tile. */
            for (i = sn; i < sn + 4; i++) {
                for (j = sn - 4; j < sn; j++) {
                    B[j][i] = A[i][j];
                }
            }
        }
    } else {
        /*
         * General case: 8 x 8 tiles clipped to the matrix bounds.  For
         * M == 61, N == 67 this visits elements in the same order as the
         * former hard-coded loop; other shapes used to be left untouched.
         */
        for (m = 0; m < M; m += 8) {
            for (n = 0; n < N; n += 8) {
                for (i = n; i < n + 8 && i < N; i++) {
                    for (j = m; j < m + 8 && j < M; j++) {
                        B[j][i] = A[i][j];
                    }
                }
            }
        }
    }

    ENSURES(is_transpose(M, N, A, B));

}
Beispiel #10
0
/*
 * transpose_submit - store the transpose of the N x M matrix A in the
 * M x N matrix B (B[j][i] = A[i][j]).
 *
 * Shape-specific traversals:
 *   - 32 x 32: 8 x 8 tiles; a row's diagonal element is held in a local and
 *     written only after the rest of that row of the tile has been copied.
 *   - 64 x 64: 8 x 8 tiles; columns 0-3 of the tile are copied top to bottom,
 *     then columns 4-7 bottom to top.  Columns 4-7 of the tile's first row
 *     are read during the first pass and written at the very end.
 *   - 61 x 67 (M == 61, N == 67): 21 x 21 tiles clipped to the bounds.
 *   - anything else: straightforward row-by-row transpose.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
    int start_row, start_column, end_row, end_column;
    int i, j;
    int a0, a1, a2, a3;
    /* Assigned on the first row of every 64 x 64 tile before being read;
     * initialised so no path can observe an indeterminate value. */
    int a4 = 0, a5 = 0, a6 = 0, a7 = 0;
    int temp = 0;
    int row = 0;
    int flag = 0;   /* non-zero while a diagonal element is pending in temp */
    int step;
    REQUIRES(M > 0);
    REQUIRES(N > 0);
    if (M == 32 && N == 32) {
        step = 8;
        for (start_row = 0; start_row < N; start_row += step) {
            end_row = start_row + step;
            for (start_column = 0; start_column < M; start_column += step) {
                end_column = start_column + step;
                for (i = start_row; i < end_row; ++i) {
                    for (j = start_column; j < end_column; ++j) {
                        if (i != j) {
                            B[j][i] = A[i][j];
                        } else {
                            /* Defer the diagonal write until the row is done. */
                            flag = 1;
                            temp = A[i][i];
                            row = i;
                        }
                    }
                    if (flag) {
                        flag = 0;
                        B[row][row] = temp;
                    }
                }
            }
        }
    } else if (M == 64 && N == 64) {
        for (start_column = 0; start_column < M; start_column += 8) {
            for (start_row = 0; start_row < N; start_row += 8) {
                /* Pass 1: columns 0-3 of the tile, top to bottom. */
                for (i = start_row; i < start_row + 8; ++i) {
                    a0 = A[i][start_column];
                    a1 = A[i][start_column + 1];
                    a2 = A[i][start_column + 2];
                    a3 = A[i][start_column + 3];
                    if (i == start_row) {
                        /* Also grab columns 4-7 of the first row now. */
                        a4 = A[i][start_column + 4];
                        a5 = A[i][start_column + 5];
                        a6 = A[i][start_column + 6];
                        a7 = A[i][start_column + 7];
                    }
                    B[start_column][i] = a0;
                    B[start_column + 1][i] = a1;
                    B[start_column + 2][i] = a2;
                    B[start_column + 3][i] = a3;
                }
                /* Pass 2: columns 4-7, bottom to top, skipping the first row. */
                for (i = start_row + 7; i > start_row; --i) {
                    a0 = A[i][start_column + 4];
                    a1 = A[i][start_column + 5];
                    a2 = A[i][start_column + 6];
                    a3 = A[i][start_column + 7];
                    B[start_column + 4][i] = a0;
                    B[start_column + 5][i] = a1;
                    B[start_column + 6][i] = a2;
                    B[start_column + 7][i] = a3;
                }
                /* First row's columns 4-7, saved during pass 1. */
                B[start_column + 4][start_row] = a4;
                B[start_column + 5][start_row] = a5;
                B[start_column + 6][start_row] = a6;
                B[start_column + 7][start_row] = a7;
            }
        }
    } else if (M == 61 && N == 67) {
        step = 21;
        for (start_column = 0; start_column < M; start_column += step) {
            end_column = start_column + step;
            if (end_column > M)
                end_column = M;     /* clip the last tile column-wise */
            for (start_row = 0; start_row < N; start_row += step) {
                end_row = start_row + step;
                if (end_row > N)
                    end_row = N;    /* clip the last tile row-wise */
                for (i = start_row; i < end_row; ++i) {
                    for (j = start_column; j < end_column; ++j) {
                        B[j][i] = A[i][j];
                    }
                }
            }
        }
    } else {
        /* A has N rows and M columns: i walks rows, j walks columns. */
        for (i = 0; i < N; ++i)
            for (j = 0; j < M; ++j)
                B[j][i] = A[i][j];
    }

    ENSURES(is_transpose(M, N, A, B));
}
Beispiel #11
0
/*
 * transpose_submit - store the transpose of the N x M matrix A in the
 * M x N matrix B (B[l][k] = A[k][l]).
 *
 *   - 32 x 32: square tiles of side `block`, copied element by element.
 *   - 64 x 64: tiles of `block` rows; columns 0-3 of each tile are copied
 *     top to bottom, then columns 4-7 bottom to top, four values at a time.
 *   - every other shape (including M == 61, N == 67): 17 x 17 tiles clipped
 *     to the matrix bounds.
 *
 * NOTE(review): `block` is not declared in this function and must come from
 * file scope.  The 64 x 64 path walks rows 7..0 in its second pass, so it is
 * only consistent if `block` is 8 -- confirm against the definition.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
    REQUIRES(M > 0);
    REQUIRES(N > 0);
    /* i: first column of the current tile of A, j: first row of that tile;
     * k and l walk rows and columns inside the tile. */
    int i, j, k, l;
    /* staging values: four elements are read from A before any is written */
    int x0, x1, x2, x3;

    if (M == 32 && N == 32) {
        for (i = 0; i < 32; i += block) {
            for (j = 0; j < 32; j += block) {
                for (k = j; k < j + block; k++) {
                    for (l = i; l < i + block; l++) {
                        B[l][k] = A[k][l];
                    }
                }
            }
        }
    } else if (M == 64 && N == 64) {
        for (i = 0; i < 64; i += block) {
            for (j = 0; j < 64; j += block) {
                /* Columns i..i+3 of the tile, rows top to bottom. */
                for (k = 0; k < block; k++) {
                    x0 = A[k + j][i];
                    x1 = A[k + j][i + 1];
                    x2 = A[k + j][i + 2];
                    x3 = A[k + j][i + 3];
                    B[i][k + j] = x0;
                    B[i + 1][k + j] = x1;
                    B[i + 2][k + j] = x2;
                    B[i + 3][k + j] = x3;
                }
                /* Columns i+4..i+7 of the tile, rows bottom to top. */
                for (l = 7; l >= 0; l--) {
                    x0 = A[l + j][i + 4];
                    x1 = A[l + j][i + 5];
                    x2 = A[l + j][i + 6];
                    x3 = A[l + j][i + 7];
                    B[i + 4][l + j] = x0;
                    B[i + 5][l + j] = x1;
                    B[i + 6][l + j] = x2;
                    B[i + 7][l + j] = x3;
                }
            }
        }
    } else {
        /*
         * General case: 17 x 17 tiles clipped to the bounds.  For M == 61,
         * N == 67 the element order matches the former hard-coded loop; the
         * bounds now come from M and N so other shapes are transposed too
         * instead of leaving B untouched.
         */
        for (i = 0; i < M; i += 17) {
            for (j = 0; j < N; j += 17) {
                for (k = j; k < j + 17 && k < N; k++) {
                    for (l = i; l < i + 17 && l < M; l++) {
                        B[l][k] = A[k][l];
                    }
                }
            }
        }
    }
    ENSURES(is_transpose(M, N, A, B));
}
/*
 * transpose_tile4 - transpose the 4 x 4 tile of A whose top-left element is
 * A[ii][jj] into B, reading eight values into locals before writing them.
 *
 * Tiles with ii != jj are read two columns at a time; tiles with ii == jj
 * are read two rows at a time.  The caller must guarantee ii + 3 < N and
 * jj + 3 < M.
 */
static void transpose_tile4(int M, int N, int A[N][M], int B[M][N],
                            int ii, int jj)
{
    int i, j;
    int s0, s1, s2, s3, s4, s5, s6, s7;

    if (ii != jj) {
        for (j = jj; j < jj + 4; j += 2) {
            s0 = A[ii][j];         s1 = A[ii + 1][j];
            s2 = A[ii + 2][j];     s3 = A[ii + 3][j];
            s4 = A[ii][j + 1];     s5 = A[ii + 1][j + 1];
            s6 = A[ii + 2][j + 1]; s7 = A[ii + 3][j + 1];
            B[j][ii] = s0;         B[j][ii + 1] = s1;
            B[j][ii + 2] = s2;     B[j][ii + 3] = s3;
            B[j + 1][ii] = s4;     B[j + 1][ii + 1] = s5;
            B[j + 1][ii + 2] = s6; B[j + 1][ii + 3] = s7;
        }
    } else {
        for (i = ii; i < ii + 4; i += 2) {
            s0 = A[i][jj];         s1 = A[i][jj + 1];
            s2 = A[i][jj + 2];     s3 = A[i][jj + 3];
            s4 = A[i + 1][jj];     s5 = A[i + 1][jj + 1];
            s6 = A[i + 1][jj + 2]; s7 = A[i + 1][jj + 3];
            B[jj][i] = s0;         B[jj + 1][i] = s1;
            B[jj + 2][i] = s2;     B[jj + 3][i] = s3;
            B[jj][i + 1] = s4;     B[jj + 1][i + 1] = s5;
            B[jj + 2][i + 1] = s6; B[jj + 3][i + 1] = s7;
        }
    }
}

/*
 * transpose_submit - store the transpose of the N x M matrix A in the
 * M x N matrix B (B[j][i] = A[i][j]).
 *
 *   - M <= 32 and M a multiple of 8 (written for 32 x 32): 8 x 8 tiles.
 *     Tiles with ii == jj read a whole 8-element row into locals before
 *     writing; the others are copied element by element.
 *   - M == 64 and N a multiple of 8 (written for 64 x 64): each 8 x 8 tile
 *     is done as four 4 x 4 sub-tiles in the order top-left, bottom-left,
 *     bottom-right, top-right.
 *   - M == 61, N == 67: a 64 x 56 region in 4 x 4 sub-tiles, then the
 *     remaining 5 columns of those 64 rows, then the last 3 rows.
 *   - anything else: straightforward row-by-row transpose.
 *
 * The multiple-of-8 guards exist because the tiled paths index up to
 * jj + 7 / ii + 3 without bounds checks.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
    int ii, jj;
    int i, j;
    int s0, s1, s2, s3, s4, s5, s6, s7;
    REQUIRES(M > 0);
    REQUIRES(N > 0);

    if (M <= 32 && M % 8 == 0) {
        for (ii = 0; ii < N; ii += 8) {
            for (jj = 0; jj < M; jj += 8) {
                if (ii != jj) {
                    for (i = ii; (i < N) && (i < ii + 8); i++)
                        for (j = jj; (j < M) && (j < jj + 8); j++)
                            B[j][i] = A[i][j];
                } else {
                    /* Read the full row of the tile before writing any of it. */
                    for (i = ii; (i < N) && (i < ii + 8); i++) {
                        s0 = A[i][jj];     s1 = A[i][jj + 1];
                        s2 = A[i][jj + 2]; s3 = A[i][jj + 3];
                        s4 = A[i][jj + 4]; s5 = A[i][jj + 5];
                        s6 = A[i][jj + 6]; s7 = A[i][jj + 7];
                        B[jj][i] = s0;     B[jj + 1][i] = s1;
                        B[jj + 2][i] = s2; B[jj + 3][i] = s3;
                        B[jj + 4][i] = s4; B[jj + 5][i] = s5;
                        B[jj + 6][i] = s6; B[jj + 7][i] = s7;
                    }
                }
            }
        }
    } else if (M == 64 && N % 8 == 0) {
        for (ii = 0; ii < N; ii += 8) {
            for (jj = 0; jj < M; jj += 8) {
                transpose_tile4(M, N, A, B, ii,     jj);
                transpose_tile4(M, N, A, B, ii + 4, jj);
                transpose_tile4(M, N, A, B, ii + 4, jj + 4);
                transpose_tile4(M, N, A, B, ii,     jj + 4);
            }
        }
    } else if (M == 61 && N == 67) {
        /* Rows 0..63, columns 0..55 in 4 x 4 sub-tiles. */
        for (i = 0; i < 64; i += 8)
            for (j = 0; j < 56; j += 8)
                for (ii = i; ii < i + 8; ii += 4)
                    for (jj = j; jj < j + 8; jj += 4)
                        transpose_tile4(M, N, A, B, ii, jj);

        /* Rows 0..63, columns 56..60. */
        for (i = 0; i < 64; i++) {
            s0 = A[i][56]; s1 = A[i][57]; s2 = A[i][58];
            s3 = A[i][59]; s4 = A[i][60];
            B[56][i] = s0; B[57][i] = s1; B[58][i] = s2;
            B[59][i] = s3; B[60][i] = s4;
        }

        /* Rows 64..66, all 61 columns, eight columns at a time. */
        for (jj = 0; jj < 61; jj += 8)
            for (i = 64; i < 67; i++)
                for (j = jj; (j < M) && (j < jj + 8); j++)
                    B[j][i] = A[i][j];
    } else {
        for (i = 0; i < N; i++)
            for (j = 0; j < M; j++)
                B[j][i] = A[i][j];
    }
    ENSURES(is_transpose(M, N, A, B));
}