/*
 * transpose_submit - Store the transpose of the N x M matrix A in the
 * M x N matrix B.
 *
 * M is the number of columns of A (and rows of B); N is the number of rows
 * of A (and columns of B).  Both must be positive.
 *
 * The previous body consisted only of the contract checks and never wrote
 * to B.  A plain row-by-row transpose is performed here so that B is
 * actually filled in before the postcondition is evaluated.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
    int row, col;

    REQUIRES(M > 0);
    REQUIRES(N > 0);

    for (row = 0; row < N; row++) {
        for (col = 0; col < M; col++) {
            B[col][row] = A[row][col];
        }
    }

    ENSURES(is_transpose(M, N, A, B));
}
/*
 * trans - Baseline transpose.  Walks A one row at a time and stores each
 * element into the mirrored position of B (B[col][row] = A[row][col]).
 *
 * A is N rows by M columns, B is M rows by N columns.
 */
void trans(int M, int N, int A[N][M], int B[M][N])
{
    int row = 0;
    int col;
    int value;

    REQUIRES(M > 0);
    REQUIRES(N > 0);

    while (row < N) {
        col = 0;
        while (col < M) {
            /* Read into a local first, then store, as the original did. */
            value = A[row][col];
            B[col][row] = value;
            col++;
        }
        row++;
    }

    ENSURES(is_transpose(M, N, A, B));
}
/*
 * basicBlockingTrans - Transpose the N x M matrix A into the M x N matrix B
 * by walking 8x8 blocks.  Each block is handled as four 4x4 quadrants in the
 * order top-left, top-right, bottom-left, bottom-right.
 *
 * Every loop is clipped against N (rows of A) and M (columns of A), so the
 * dimensions do not have to be multiples of 8.
 *
 * Fix: the bottom-left quadrant used to run its column loop up to and
 * including j + 4, so column j + 4 was copied there and then copied again
 * by the bottom-right quadrant.  All four quadrants now cover exactly four
 * columns; the contents of B are unchanged by this fix.
 */
void basicBlockingTrans(int M, int N, int A[N][M], int B[M][N])
{
    int i, j, k, l, tmp;

    REQUIRES(M > 0);
    REQUIRES(N > 0);

    for (i = 0; i < N; i += 8) {
        for (j = 0; j < M; j += 8) {
            /* Top-left quadrant: rows i..i+3, columns j..j+3. */
            for (k = i; k < i + 4 && k < N; k++) {
                for (l = j; l < j + 4 && l < M; l++) {
                    tmp = A[k][l];
                    B[l][k] = tmp;
                }
            }

            /* Top-right quadrant: rows i..i+3, columns j+4..j+7. */
            for (k = i; k < i + 4 && k < N; k++) {
                for (l = j + 4; l < j + 8 && l < M; l++) {
                    tmp = A[k][l];
                    B[l][k] = tmp;
                }
            }

            /* Bottom-left quadrant: rows i+4..i+7, columns j..j+3. */
            for (k = i + 4; k < i + 8 && k < N; k++) {
                for (l = j; l < j + 4 && l < M; l++) {
                    tmp = A[k][l];
                    B[l][k] = tmp;
                }
            }

            /* Bottom-right quadrant: rows i+4..i+7, columns j+4..j+7. */
            for (k = i + 4; k < i + 8 && k < N; k++) {
                for (l = j + 4; l < j + 8 && l < M; l++) {
                    tmp = A[k][l];
                    B[l][k] = tmp;
                }
            }
        }
    }

    ENSURES(is_transpose(M, N, A, B));
}
/*
 * transpose_submit - Transpose the N x M matrix A into the M x N matrix B.
 *
 * Three strategies are selected by size:
 *   - 32 x 32: 8x8 blocks; inside a diagonal block the diagonal element of
 *     each row is stored after the rest of that row.
 *   - 64 x 64: 8x8 blocks processed in three steps, using the upper-right
 *     4x4 of the destination block in B as temporary storage.
 *   - anything else (including the 61 x 67 case this was written for):
 *     strips 16 rows tall and 4 columns wide, clipped to the matrix.
 *
 * Fix: the 61 x 67 branch stepped the column index by 4 without clipping.
 * At j = 60 it read A[x][61..63] and wrote B[61..63][x], i.e. past the last
 * row of B.  Full 4-column strips are now limited to j + 3 < M and any
 * leftover columns are copied one at a time.  Sizes other than the three
 * expected ones previously left B untouched; they now take the clipped path.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
    int i, j, x, y;
    int temp;
    int buff1, buff2, buff3, buff4, buff5, buff6, buff7;

    REQUIRES(M > 0);
    REQUIRES(N > 0);

    if (M == 32 && N == 32) {
        /*
         * When the matrix size is (32,32), we can divide the matrix into
         * blocks of size (8,8).
         */
        for (i = 0; i <= N - 8; i += 8) {
            for (j = 0; j <= M - 8; j += 8) {
                for (x = i; x < i + 8; x++) {
                    for (y = j; y < j + 8; y++) {
                        if (x != y) {
                            temp = A[x][y];
                            B[y][x] = temp;
                        }
                    }
                    /* Only diagonal blocks contain an element with x == y. */
                    if (i == j) {
                        temp = A[x][x];
                        B[x][x] = temp;
                    }
                }
            }
        }
    } else if (M == 64 && N == 64) {
        /*
         * When the matrix size is (64,64), the original author notes that
         * conflicts occur every few rows, so each (8,8) block is handled as
         * (4,4) sub-blocks.
         */
        for (i = 0; i <= N - 8; i += 8) {
            for (j = 0; j <= M - 8; j += 8) {
                /*
                 * Step 1: upper four rows of the A block.  The left half
                 * lands in its final place; the right half is parked in
                 * B[j..j+3][x+4] and is not in the right place yet.
                 */
                for (x = i; x < i + 4; x++) {
                    temp  = A[x][j];
                    buff1 = A[x][j + 1];
                    buff2 = A[x][j + 2];
                    buff3 = A[x][j + 3];
                    buff4 = A[x][j + 4];
                    buff5 = A[x][j + 5];
                    buff6 = A[x][j + 6];
                    buff7 = A[x][j + 7];

                    B[j][x]         = temp;
                    B[j][x + 4]     = buff4;
                    B[j + 1][x]     = buff1;
                    B[j + 1][x + 4] = buff5;
                    B[j + 2][x]     = buff2;
                    B[j + 2][x + 4] = buff6;
                    B[j + 3][x]     = buff3;
                    B[j + 3][x + 4] = buff7;
                }

                /*
                 * Step 2: for each of the first four rows of the B block,
                 * pick up the parked values, replace them with the column
                 * of A's lower-left sub-block that belongs there, and move
                 * the parked values down to their final rows.
                 */
                for (x = 0; x < 4; x++) {
                    buff1 = B[j + x][i + 4];
                    buff2 = B[j + x][i + 5];
                    buff3 = B[j + x][i + 6];
                    buff4 = B[j + x][i + 7];

                    buff5 = A[i + 4][j + x];
                    buff6 = A[i + 5][j + x];
                    buff7 = A[i + 6][j + x];
                    temp  = A[i + 7][j + x];

                    B[j + x][i + 4] = buff5;
                    B[j + x][i + 5] = buff6;
                    B[j + x][i + 6] = buff7;
                    B[j + x][i + 7] = temp;

                    B[j + x + 4][i]     = buff1;
                    B[j + x + 4][i + 1] = buff2;
                    B[j + x + 4][i + 2] = buff3;
                    B[j + x + 4][i + 3] = buff4;
                }

                /* Step 3: the bottom-right sub-block. */
                for (x = i + 4; x < i + 8; x++) {
                    buff1 = A[x][j + 4];
                    buff2 = A[x][j + 5];
                    buff3 = A[x][j + 6];
                    buff4 = A[x][j + 7];

                    B[j + 4][x] = buff1;
                    B[j + 5][x] = buff2;
                    B[j + 6][x] = buff3;
                    B[j + 7][x] = buff4;
                }
            }
        }
    } else {
        /*
         * The matrix is not one of the square cases, so there is no obvious
         * optimal block size.  The original author tried several and kept
         * strips of 16 rows by 4 columns.
         */
        for (i = 0; i < N; i += 16) {
            /* Full 4-column strips: all of j..j+3 are valid columns. */
            for (j = 0; j + 3 < M; j += 4) {
                for (x = i; x < i + 16 && x < N; x++) {
                    buff1 = A[x][j];
                    buff2 = A[x][j + 1];
                    buff3 = A[x][j + 2];
                    buff4 = A[x][j + 3];

                    B[j][x]     = buff1;
                    B[j + 1][x] = buff2;
                    B[j + 2][x] = buff3;
                    B[j + 3][x] = buff4;
                }
            }

            /* Leftover columns (a single column, 60, when M == 61). */
            for (; j < M; j++) {
                for (x = i; x < i + 16 && x < N; x++) {
                    temp = A[x][j];
                    B[j][x] = temp;
                }
            }
        }
    }

    ENSURES(is_transpose(M, N, A, B));
}
/*
 * trans_old - Transpose the N x M matrix A into the M x N matrix B with a
 * strategy chosen by size.  Only 32x32, 64x64 and M == 61 / N == 67 are
 * handled; for any other size no element of B is written.
 *
 * Values are always pulled from A into scalar locals (up to eight at a
 * time) before anything is stored into B.
 */
void trans_old(int M, int N, int A[N][M], int B[M][N])
{
    int row, col;   /* top-left corner of the block / current row of A */
    int r, c;       /* indices inside the current block */
    int v0, v1, v2, v3, v4, v5, v6, v7;

    REQUIRES(M > 0);
    REQUIRES(N > 0);

    if (M == 32 && N == 32) {
        /* Column strips of width 8, visited from left to right. */
        for (col = 0; col < 32; col += 8) {
            row = 0;
            while (row < 32) {
                if (row != col) {
                    /* One row of A (8 values) becomes one column of B. */
                    v0 = A[row][col];     v1 = A[row][col + 1];
                    v2 = A[row][col + 2]; v3 = A[row][col + 3];
                    v4 = A[row][col + 4]; v5 = A[row][col + 5];
                    v6 = A[row][col + 6]; v7 = A[row][col + 7];

                    B[col][row]     = v0; B[col + 1][row] = v1;
                    B[col + 2][row] = v2; B[col + 3][row] = v3;
                    B[col + 4][row] = v4; B[col + 5][row] = v5;
                    B[col + 6][row] = v6; B[col + 7][row] = v7;

                    row++;
                    continue;
                }

                /*
                 * Diagonal 8x8 block: copy it into B unchanged, then
                 * transpose it in place inside B.
                 */
                for (r = col; r < col + 8; r++) {
                    v0 = A[r][col];     v1 = A[r][col + 1];
                    v2 = A[r][col + 2]; v3 = A[r][col + 3];
                    v4 = A[r][col + 4]; v5 = A[r][col + 5];
                    v6 = A[r][col + 6]; v7 = A[r][col + 7];

                    B[r][col]     = v0; B[r][col + 1] = v1;
                    B[r][col + 2] = v2; B[r][col + 3] = v3;
                    B[r][col + 4] = v4; B[r][col + 5] = v5;
                    B[r][col + 6] = v6; B[r][col + 7] = v7;
                }
                for (r = col; r < col + 8; r++) {
                    for (c = r + 1; c < col + 8; c++) {
                        v0 = B[r][c];
                        B[r][c] = B[c][r];
                        B[c][r] = v0;
                    }
                }

                /* The whole diagonal block is done; skip its eight rows. */
                row += 8;
            }
        }
    } else if (M == 64 && N == 64) {
        for (col = 0; col < 64; col += 8) {
            for (row = 0; row < 64; row += 8) {
                if (row != col) {
                    /*
                     * Off-diagonal block, step 1: upper four rows of A.
                     * Left half goes to its final place; right half is
                     * parked in B[col..col+3][r+4].
                     */
                    for (r = row; r < row + 4; r++) {
                        v0 = A[r][col];     v1 = A[r][col + 1];
                        v2 = A[r][col + 2]; v3 = A[r][col + 3];
                        v4 = A[r][col + 4]; v5 = A[r][col + 5];
                        v6 = A[r][col + 6]; v7 = A[r][col + 7];

                        B[col][r]         = v0; B[col + 1][r]     = v1;
                        B[col + 2][r]     = v2; B[col + 3][r]     = v3;
                        B[col][r + 4]     = v4; B[col + 1][r + 4] = v5;
                        B[col + 2][r + 4] = v6; B[col + 3][r + 4] = v7;
                    }

                    /*
                     * Step 2: swap the parked values out for the lower-left
                     * column of A, moving the parked values four rows down.
                     */
                    for (c = col; c < col + 4; c++) {
                        v0 = B[c][row + 4]; v1 = B[c][row + 5];
                        v2 = B[c][row + 6]; v3 = B[c][row + 7];
                        v4 = A[row + 4][c]; v5 = A[row + 5][c];
                        v6 = A[row + 6][c]; v7 = A[row + 7][c];

                        B[c][row + 4] = v4; B[c][row + 5] = v5;
                        B[c][row + 6] = v6; B[c][row + 7] = v7;
                        B[c + 4][row]     = v0; B[c + 4][row + 1] = v1;
                        B[c + 4][row + 2] = v2; B[c + 4][row + 3] = v3;
                    }

                    /* Step 3: lower-right 4x4. */
                    for (r = row + 4; r < row + 8; r++) {
                        v0 = A[r][col + 4]; v1 = A[r][col + 5];
                        v2 = A[r][col + 6]; v3 = A[r][col + 7];

                        B[col + 4][r] = v0; B[col + 5][r] = v1;
                        B[col + 6][r] = v2; B[col + 7][r] = v3;
                    }
                    continue;
                }

                /* ---- Diagonal block (row == col) ---- */

                /* Copy the upper four rows of the block verbatim. */
                for (r = col; r < col + 4; r++) {
                    v0 = A[r][col];     v1 = A[r][col + 1];
                    v2 = A[r][col + 2]; v3 = A[r][col + 3];
                    v4 = A[r][col + 4]; v5 = A[r][col + 5];
                    v6 = A[r][col + 6]; v7 = A[r][col + 7];

                    B[r][col]     = v0; B[r][col + 1] = v1;
                    B[r][col + 2] = v2; B[r][col + 3] = v3;
                    B[r][col + 4] = v4; B[r][col + 5] = v5;
                    B[r][col + 6] = v6; B[r][col + 7] = v7;
                }
                /* Transpose both upper 4x4 halves in place inside B. */
                for (r = col; r < col + 4; r++) {
                    for (c = r + 1; c < col + 4; c++) {
                        v0 = B[r][c];
                        v1 = B[r][c + 4];
                        B[r][c]     = B[c][r];
                        B[r][c + 4] = B[c][r + 4];
                        B[c][r]     = v0;
                        B[c][r + 4] = v1;
                    }
                }

                /* Copy the lower four rows of the block verbatim. */
                for (r = col + 4; r < col + 8; r++) {
                    v0 = A[r][col];     v1 = A[r][col + 1];
                    v2 = A[r][col + 2]; v3 = A[r][col + 3];
                    v4 = A[r][col + 4]; v5 = A[r][col + 5];
                    v6 = A[r][col + 6]; v7 = A[r][col + 7];

                    B[r][col]     = v0; B[r][col + 1] = v1;
                    B[r][col + 2] = v2; B[r][col + 3] = v3;
                    B[r][col + 4] = v4; B[r][col + 5] = v5;
                    B[r][col + 6] = v6; B[r][col + 7] = v7;
                }
                /* Transpose both lower 4x4 halves in place inside B. */
                for (r = col + 4; r < col + 8; r++) {
                    for (c = r + 1; c < col + 8; c++) {
                        v0 = B[r][c];
                        v1 = B[r][c - 4];
                        B[r][c]     = B[c][r];
                        B[r][c - 4] = B[c][r - 4];
                        B[c][r]     = v0;
                        B[c][r - 4] = v1;
                    }
                }

                /* Exchange the upper-right and lower-left 4x4 of B. */
                for (r = col + 4; r < col + 8; r++) {
                    v0 = B[r][col];         v1 = B[r][col + 1];
                    v2 = B[r][col + 2];     v3 = B[r][col + 3];
                    v4 = B[r - 4][col + 4]; v5 = B[r - 4][col + 5];
                    v6 = B[r - 4][col + 6]; v7 = B[r - 4][col + 7];

                    B[r - 4][col + 4] = v0; B[r - 4][col + 5] = v1;
                    B[r - 4][col + 6] = v2; B[r - 4][col + 7] = v3;
                    B[r][col]     = v4;     B[r][col + 1] = v5;
                    B[r][col + 2] = v6;     B[r][col + 3] = v7;
                }
            }
        }
    } else if (M == 61 && N == 67) {
        /* Seven column strips of width 8 cover columns 0..55. */
        for (col = 0; col < 56; col += 8) {
            for (row = 0; row < 67; row++) {
                v0 = A[row][col];     v1 = A[row][col + 1];
                v2 = A[row][col + 2]; v3 = A[row][col + 3];
                v4 = A[row][col + 4]; v5 = A[row][col + 5];
                v6 = A[row][col + 6]; v7 = A[row][col + 7];

                B[col][row]     = v0; B[col + 1][row] = v1;
                B[col + 2][row] = v2; B[col + 3][row] = v3;
                B[col + 4][row] = v4; B[col + 5][row] = v5;
                B[col + 6][row] = v6; B[col + 7][row] = v7;
            }
        }

        /* The remaining five columns, 56..60. */
        for (row = 0; row < 67; row++) {
            v0 = A[row][56];
            v1 = A[row][57];
            v2 = A[row][58];
            v3 = A[row][59];
            v4 = A[row][60];

            B[56][row] = v0;
            B[57][row] = v1;
            B[58][row] = v2;
            B[59][row] = v3;
            B[60][row] = v4;
        }
    }

    ENSURES(is_transpose(M,N,A,B));
}
/*
 * transpose_submit - Transpose the N x M matrix A into the M x N matrix B
 * with a strategy chosen by size.  Only 32x32, 64x64 and M == 61 / N == 67
 * are handled; for any other size no element of B is written.
 *
 * 64x64 is done in three passes:
 *   1. every off-diagonal 8x8 block, using part of the destination block
 *      itself as temporary storage;
 *   2. the diagonal blocks at 0, 8, ..., 40, assembled in the scratch areas
 *      B[48..51][48..55] and B[56..59][56..63] and then copied into place;
 *   3. the diagonal blocks at 48 and 56 (which contain the scratch areas),
 *      copied from A and transposed in place inside B.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
    int row, col;   /* top-left corner of the block / current row of A */
    int r, c;       /* indices or offsets inside the current block */
    int v0, v1, v2, v3, v4, v5, v6, v7;

    REQUIRES(M > 0);
    REQUIRES(N > 0);

    if (M == 32 && N == 32) {
        for (col = 0; col < 32; col += 8) {
            row = 0;
            while (row < 32) {
                if (row != col) {
                    /* One row of A (8 values) becomes one column of B. */
                    v0 = A[row][col];     v1 = A[row][col + 1];
                    v2 = A[row][col + 2]; v3 = A[row][col + 3];
                    v4 = A[row][col + 4]; v5 = A[row][col + 5];
                    v6 = A[row][col + 6]; v7 = A[row][col + 7];

                    B[col][row]     = v0; B[col + 1][row] = v1;
                    B[col + 2][row] = v2; B[col + 3][row] = v3;
                    B[col + 4][row] = v4; B[col + 5][row] = v5;
                    B[col + 6][row] = v6; B[col + 7][row] = v7;

                    row++;
                    continue;
                }

                /* Diagonal block: straight copy, then in-place transpose. */
                for (r = col; r < col + 8; r++) {
                    v0 = A[r][col];     v1 = A[r][col + 1];
                    v2 = A[r][col + 2]; v3 = A[r][col + 3];
                    v4 = A[r][col + 4]; v5 = A[r][col + 5];
                    v6 = A[r][col + 6]; v7 = A[r][col + 7];

                    B[r][col]     = v0; B[r][col + 1] = v1;
                    B[r][col + 2] = v2; B[r][col + 3] = v3;
                    B[r][col + 4] = v4; B[r][col + 5] = v5;
                    B[r][col + 6] = v6; B[r][col + 7] = v7;
                }
                for (r = col; r < col + 8; r++) {
                    for (c = r + 1; c < col + 8; c++) {
                        v0 = B[r][c];
                        B[r][c] = B[c][r];
                        B[c][r] = v0;
                    }
                }

                /* Skip the eight rows just handled. */
                row += 8;
            }
        }
    } else if (M == 64 && N == 64) {
        /* ---- Pass 1: all off-diagonal 8x8 blocks ---- */
        for (col = 0; col < 64; col += 8) {
            for (row = 0; row < 64; row += 8) {
                if (row == col) {
                    continue;   /* diagonal blocks are handled below */
                }

                /* Upper four rows of A; right half parked in B. */
                for (r = row; r < row + 4; r++) {
                    v0 = A[r][col];     v1 = A[r][col + 1];
                    v2 = A[r][col + 2]; v3 = A[r][col + 3];
                    v4 = A[r][col + 4]; v5 = A[r][col + 5];
                    v6 = A[r][col + 6]; v7 = A[r][col + 7];

                    B[col][r]         = v0; B[col + 1][r]     = v1;
                    B[col + 2][r]     = v2; B[col + 3][r]     = v3;
                    B[col][r + 4]     = v4; B[col + 1][r + 4] = v5;
                    B[col + 2][r + 4] = v6; B[col + 3][r + 4] = v7;
                }

                /* Replace parked values by A's lower-left; move them down. */
                for (c = col; c < col + 4; c++) {
                    v0 = B[c][row + 4]; v1 = B[c][row + 5];
                    v2 = B[c][row + 6]; v3 = B[c][row + 7];
                    v4 = A[row + 4][c]; v5 = A[row + 5][c];
                    v6 = A[row + 6][c]; v7 = A[row + 7][c];

                    B[c][row + 4] = v4; B[c][row + 5] = v5;
                    B[c][row + 6] = v6; B[c][row + 7] = v7;
                    B[c + 4][row]     = v0; B[c + 4][row + 1] = v1;
                    B[c + 4][row + 2] = v2; B[c + 4][row + 3] = v3;
                }

                /* Lower-right 4x4. */
                for (r = row + 4; r < row + 8; r++) {
                    v0 = A[r][col + 4]; v1 = A[r][col + 5];
                    v2 = A[r][col + 6]; v3 = A[r][col + 7];

                    B[col + 4][r] = v0; B[col + 5][r] = v1;
                    B[col + 6][r] = v2; B[col + 7][r] = v3;
                }
            }
        }

        /* ---- Pass 2: diagonal blocks 0..40 via scratch space in B ---- */
        for (col = 0; col < 48; col += 8) {
            /*
             * Same three steps as an off-diagonal block, but every store is
             * redirected: rows col..col+3 of the result are built in
             * B[48..51][48..55], rows col+4..col+7 in B[56..59][56..63].
             * Here r and c are offsets (0..7) inside the block.
             */
            for (r = 0; r < 4; r++) {
                v0 = A[col + r][col];     v1 = A[col + r][col + 1];
                v2 = A[col + r][col + 2]; v3 = A[col + r][col + 3];
                v4 = A[col + r][col + 4]; v5 = A[col + r][col + 5];
                v6 = A[col + r][col + 6]; v7 = A[col + r][col + 7];

                B[48][48 + r] = v0; B[49][48 + r] = v1;
                B[50][48 + r] = v2; B[51][48 + r] = v3;
                B[48][52 + r] = v4; B[49][52 + r] = v5;
                B[50][52 + r] = v6; B[51][52 + r] = v7;
            }

            for (c = 0; c < 4; c++) {
                v0 = B[48 + c][52]; v1 = B[48 + c][53];
                v2 = B[48 + c][54]; v3 = B[48 + c][55];
                v4 = A[col + 4][col + c]; v5 = A[col + 5][col + c];
                v6 = A[col + 6][col + c]; v7 = A[col + 7][col + c];

                B[48 + c][52] = v4; B[48 + c][53] = v5;
                B[48 + c][54] = v6; B[48 + c][55] = v7;
                B[56 + c][56] = v0; B[56 + c][57] = v1;
                B[56 + c][58] = v2; B[56 + c][59] = v3;
            }

            for (r = 4; r < 8; r++) {
                v0 = A[col + r][col + 4]; v1 = A[col + r][col + 5];
                v2 = A[col + r][col + 6]; v3 = A[col + r][col + 7];

                B[56][56 + r] = v0; B[57][56 + r] = v1;
                B[58][56 + r] = v2; B[59][56 + r] = v3;
            }

            /* Copy the finished upper four rows into the diagonal block. */
            for (r = 0; r < 4; r++) {
                v0 = B[48 + r][48]; v1 = B[48 + r][49];
                v2 = B[48 + r][50]; v3 = B[48 + r][51];
                v4 = B[48 + r][52]; v5 = B[48 + r][53];
                v6 = B[48 + r][54]; v7 = B[48 + r][55];

                B[col + r][col]     = v0; B[col + r][col + 1] = v1;
                B[col + r][col + 2] = v2; B[col + r][col + 3] = v3;
                B[col + r][col + 4] = v4; B[col + r][col + 5] = v5;
                B[col + r][col + 6] = v6; B[col + r][col + 7] = v7;
            }

            /* Copy the finished lower four rows (scratch rows 56..59). */
            for (r = 4; r < 8; r++) {
                v0 = B[52 + r][56]; v1 = B[52 + r][57];
                v2 = B[52 + r][58]; v3 = B[52 + r][59];
                v4 = B[52 + r][60]; v5 = B[52 + r][61];
                v6 = B[52 + r][62]; v7 = B[52 + r][63];

                B[col + r][col + 0] = v0; B[col + r][col + 1] = v1;
                B[col + r][col + 2] = v2; B[col + r][col + 3] = v3;
                B[col + r][col + 4] = v4; B[col + r][col + 5] = v5;
                B[col + r][col + 6] = v6; B[col + r][col + 7] = v7;
            }
        }

        /* ---- Pass 3: the last two diagonal blocks, in place ---- */
        for (col = 48; col < 64; col += 8) {
            /* Copy the upper four rows of the block verbatim. */
            for (r = col; r < col + 4; r++) {
                v0 = A[r][col];     v1 = A[r][col + 1];
                v2 = A[r][col + 2]; v3 = A[r][col + 3];
                v4 = A[r][col + 4]; v5 = A[r][col + 5];
                v6 = A[r][col + 6]; v7 = A[r][col + 7];

                B[r][col]     = v0; B[r][col + 1] = v1;
                B[r][col + 2] = v2; B[r][col + 3] = v3;
                B[r][col + 4] = v4; B[r][col + 5] = v5;
                B[r][col + 6] = v6; B[r][col + 7] = v7;
            }
            /* Transpose both upper 4x4 halves in place inside B. */
            for (r = col; r < col + 4; r++) {
                for (c = r + 1; c < col + 4; c++) {
                    v0 = B[r][c];
                    v1 = B[r][c + 4];
                    B[r][c]     = B[c][r];
                    B[r][c + 4] = B[c][r + 4];
                    B[c][r]     = v0;
                    B[c][r + 4] = v1;
                }
            }

            /* Copy the lower four rows of the block verbatim. */
            for (r = col + 4; r < col + 8; r++) {
                v0 = A[r][col];     v1 = A[r][col + 1];
                v2 = A[r][col + 2]; v3 = A[r][col + 3];
                v4 = A[r][col + 4]; v5 = A[r][col + 5];
                v6 = A[r][col + 6]; v7 = A[r][col + 7];

                B[r][col]     = v0; B[r][col + 1] = v1;
                B[r][col + 2] = v2; B[r][col + 3] = v3;
                B[r][col + 4] = v4; B[r][col + 5] = v5;
                B[r][col + 6] = v6; B[r][col + 7] = v7;
            }
            /* Transpose both lower 4x4 halves in place inside B. */
            for (r = col + 4; r < col + 8; r++) {
                for (c = r + 1; c < col + 8; c++) {
                    v0 = B[r][c];
                    v1 = B[r][c - 4];
                    B[r][c]     = B[c][r];
                    B[r][c - 4] = B[c][r - 4];
                    B[c][r]     = v0;
                    B[c][r - 4] = v1;
                }
            }

            /* Exchange the upper-right and lower-left 4x4 of B. */
            for (r = col + 4; r < col + 8; r++) {
                v0 = B[r][col];         v1 = B[r][col + 1];
                v2 = B[r][col + 2];     v3 = B[r][col + 3];
                v4 = B[r - 4][col + 4]; v5 = B[r - 4][col + 5];
                v6 = B[r - 4][col + 6]; v7 = B[r - 4][col + 7];

                B[r - 4][col + 4] = v0; B[r - 4][col + 5] = v1;
                B[r - 4][col + 6] = v2; B[r - 4][col + 7] = v3;
                B[r][col]     = v4;     B[r][col + 1] = v5;
                B[r][col + 2] = v6;     B[r][col + 3] = v7;
            }
        }
    } else if (M == 61 && N == 67) {
        /* Seven column strips of width 8 cover columns 0..55. */
        for (col = 0; col < 56; col += 8) {
            for (row = 0; row < 67; row++) {
                v0 = A[row][col];     v1 = A[row][col + 1];
                v2 = A[row][col + 2]; v3 = A[row][col + 3];
                v4 = A[row][col + 4]; v5 = A[row][col + 5];
                v6 = A[row][col + 6]; v7 = A[row][col + 7];

                B[col][row]     = v0; B[col + 1][row] = v1;
                B[col + 2][row] = v2; B[col + 3][row] = v3;
                B[col + 4][row] = v4; B[col + 5][row] = v5;
                B[col + 6][row] = v6; B[col + 7][row] = v7;
            }
        }

        /* The remaining five columns, 56..60. */
        for (row = 0; row < 67; row++) {
            v0 = A[row][56];
            v1 = A[row][57];
            v2 = A[row][58];
            v3 = A[row][59];
            v4 = A[row][60];

            B[56][row] = v0;
            B[57][row] = v1;
            B[58][row] = v2;
            B[59][row] = v3;
            B[60][row] = v4;
        }
    }

    ENSURES(is_transpose(M,N,A,B));
}
/*
 * transpose_submit - Transpose the N x M matrix A into the M x N matrix B.
 *
 *   - 32 x 32: 8x8 blocks.
 *   - 64 x 64: 8x8 blocks, each split into four 4x4 sub-blocks.
 *   - otherwise: 16x16 blocks clipped to the matrix bounds.
 *
 * In every variant, an element with equal row and column index is not
 * stored immediately.  Its value and index are remembered, and it is
 * written after the rest of its row whenever the enclosing (sub-)block
 * starts at equal row and column offsets.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
    int blk_r, blk_c;   /* top-left corner of the current outer block */
    int sub_r, sub_c;   /* top-left corner of a 4x4 sub-block (64x64 only) */
    int r, c;
    int diag_val = 0;   /* most recently seen diagonal value ... */
    int diag_idx = 0;   /* ... and the index it belongs to */

    REQUIRES(M > 0);
    REQUIRES(N > 0);

    if (N == 32 && M == 32) {
        for (blk_r = 0; blk_r < N; blk_r += 8) {
            for (blk_c = 0; blk_c < M; blk_c += 8) {
                for (r = blk_r; r < blk_r + 8; r++) {
                    for (c = blk_c; c < blk_c + 8; c++) {
                        if (r == c) {
                            diag_val = A[r][c];
                            diag_idx = r;
                        } else {
                            B[c][r] = A[r][c];
                        }
                    }
                    if (blk_r == blk_c) {
                        B[diag_idx][diag_idx] = diag_val;
                    }
                }
            }
        }
    } else if (N == 64 && M == 64) {
        for (blk_r = 0; blk_r < M; blk_r += 8) {
            for (blk_c = 0; blk_c < N; blk_c += 8) {
                /* Sub-blocks are visited column-half first, then row-half. */
                for (sub_c = blk_c; sub_c < blk_c + 8; sub_c += 4) {
                    for (sub_r = blk_r; sub_r < blk_r + 8; sub_r += 4) {
                        for (r = sub_r; r < sub_r + 4; r++) {
                            for (c = sub_c; c < sub_c + 4; c++) {
                                if (r == c) {
                                    diag_val = A[r][c];
                                    diag_idx = r;
                                } else {
                                    B[c][r] = A[r][c];
                                }
                            }
                            if (sub_r == sub_c) {
                                B[diag_idx][diag_idx] = diag_val;
                            }
                        }
                    }
                }
            }
        }
    } else {
        for (blk_r = 0; blk_r < N; blk_r += 16) {
            for (blk_c = 0; blk_c < M; blk_c += 16) {
                /* Rows and columns beyond the matrix edge are skipped. */
                for (r = blk_r; r < blk_r + 16 && r < N; r++) {
                    for (c = blk_c; c < blk_c + 16 && c < M; c++) {
                        if (r == c) {
                            diag_val = A[r][c];
                            diag_idx = r;
                        } else {
                            B[c][r] = A[r][c];
                        }
                    }
                    /*
                     * For rows of a diagonal block that have no diagonal
                     * element inside the matrix, this re-stores the last
                     * remembered diagonal value at its own position.
                     */
                    if (blk_r == blk_c) {
                        B[diag_idx][diag_idx] = diag_val;
                    }
                }
            }
        }
    }

    ENSURES(is_transpose(M, N, A, B));
}
/*
 * transpose_submit - Transpose the N x M matrix A into the M x N matrix B.
 *
 *   - 32 x 32: 8x8 blocks, one row of eight values buffered in locals.
 *   - 64 x 64: diagonal blocks first (staged through a neighbouring block of
 *     B), then every off-diagonal block in three 4-row steps.
 *   - any other size (including 61 x 67): 8x8 blocks handled as four 4x4
 *     quadrants, clipped to the matrix bounds.
 *
 * Fixes relative to the previous version:
 *   - The 64 x 64 off-diagonal path stored intermediate values into A
 *     (A[tmp0 + k][tmp1 + l] = ...), so the caller's input matrix was left
 *     modified on return.  The intermediate values are now parked inside
 *     the destination block of B instead; A is only read.
 *   - The bottom-left quadrant of the clipped path looped up to and
 *     including column j + 4, copying that column twice.
 *   - Dispatch tested only M, and the final `else` ran hard-coded 64 x 64
 *     code for every remaining size.  Both dimensions are now checked and
 *     unexpected sizes take the clipped path.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
    int i, j, k, l;
    int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

    REQUIRES(M > 0);
    REQUIRES(N > 0);

    if (M == 32 && N == 32) {
        /* Transpose one 8 x 8 block at a time, row by row. */
        for (i = 0; i < 32; i += 8) {
            for (j = 0; j < 32; j += 8) {
                for (k = 0; k < 8; k++) {
                    tmp0 = A[i + k][j];
                    tmp1 = A[i + k][j + 1];
                    tmp2 = A[i + k][j + 2];
                    tmp3 = A[i + k][j + 3];
                    tmp4 = A[i + k][j + 4];
                    tmp5 = A[i + k][j + 5];
                    tmp6 = A[i + k][j + 6];
                    tmp7 = A[i + k][j + 7];

                    B[j][i + k]     = tmp0;
                    B[j + 1][i + k] = tmp1;
                    B[j + 2][i + k] = tmp2;
                    B[j + 3][i + k] = tmp3;
                    B[j + 4][i + k] = tmp4;
                    B[j + 5][i + k] = tmp5;
                    B[j + 6][i + k] = tmp6;
                    B[j + 7][i + k] = tmp7;
                }
            }
        }
    } else if (M == 64 && N == 64) {
        /*
         * Diagonal blocks first.  Each one is copied unchanged into the
         * block of B immediately to its right (wrapping around at the last
         * block) and transposed from there into its final place.  The
         * staging block is an off-diagonal block of B and is overwritten
         * with its real contents by the second loop nest below.
         */
        for (i = 0; i < 64; i += 8) {
            tmp0 = (i + 8) % 64;    /* first column of the staging block */

            for (k = i; k < i + 8; k++) {
                for (l = 0; l < 8; l++) {
                    B[k][tmp0 + l] = A[k][l + i];
                }
            }
            for (k = i; k < i + 8; k++) {
                for (l = 0; l < 4; l++) {
                    B[i + l][k] = B[k][tmp0 + l];
                }
            }
            for (k = i; k < i + 8; k++) {
                for (l = 4; l < 8; l++) {
                    B[i + l][k] = B[k][tmp0 + l];
                }
            }
        }

        /* Then every block that is not on the diagonal. */
        for (i = 0; i < N; i += 8) {
            for (j = 0; j < M; j += 8) {
                if (i == j) {
                    continue;
                }

                /*
                 * Step 1: upper four rows of the A block.  The left half
                 * goes to its final place in B; the right half is parked in
                 * B[j..j+3][k+4], which is still unused at this point.
                 */
                for (k = i; k < i + 4; k++) {
                    tmp0 = A[k][j];
                    tmp1 = A[k][j + 1];
                    tmp2 = A[k][j + 2];
                    tmp3 = A[k][j + 3];
                    tmp4 = A[k][j + 4];
                    tmp5 = A[k][j + 5];
                    tmp6 = A[k][j + 6];
                    tmp7 = A[k][j + 7];

                    B[j][k]         = tmp0;
                    B[j + 1][k]     = tmp1;
                    B[j + 2][k]     = tmp2;
                    B[j + 3][k]     = tmp3;
                    B[j][k + 4]     = tmp4;
                    B[j + 1][k + 4] = tmp5;
                    B[j + 2][k + 4] = tmp6;
                    B[j + 3][k + 4] = tmp7;
                }

                /*
                 * Step 2: for each of the first four rows of the B block,
                 * pick up the parked values, store the matching column of
                 * A's lower-left quadrant in their place, and move the
                 * parked values four rows down to where they belong.
                 */
                for (l = j; l < j + 4; l++) {
                    tmp0 = B[l][i + 4];
                    tmp1 = B[l][i + 5];
                    tmp2 = B[l][i + 6];
                    tmp3 = B[l][i + 7];

                    tmp4 = A[i + 4][l];
                    tmp5 = A[i + 5][l];
                    tmp6 = A[i + 6][l];
                    tmp7 = A[i + 7][l];

                    B[l][i + 4] = tmp4;
                    B[l][i + 5] = tmp5;
                    B[l][i + 6] = tmp6;
                    B[l][i + 7] = tmp7;

                    B[l + 4][i]     = tmp0;
                    B[l + 4][i + 1] = tmp1;
                    B[l + 4][i + 2] = tmp2;
                    B[l + 4][i + 3] = tmp3;
                }

                /* Step 3: lower-right quadrant. */
                for (k = i + 4; k < i + 8; k++) {
                    tmp0 = A[k][j + 4];
                    tmp1 = A[k][j + 5];
                    tmp2 = A[k][j + 6];
                    tmp3 = A[k][j + 7];

                    B[j + 4][k] = tmp0;
                    B[j + 5][k] = tmp1;
                    B[j + 6][k] = tmp2;
                    B[j + 7][k] = tmp3;
                }
            }
        }
    } else {
        /*
         * General case, written for 61 x 67: 8 x 8 blocks handled as four
         * 4 x 4 quadrants, every loop clipped against N and M.
         */
        for (i = 0; i < N; i += 8) {
            for (j = 0; j < M; j += 8) {
                /* Top-left quadrant. */
                for (k = i; k < i + 4 && k < N; k++) {
                    for (l = j; l < j + 4 && l < M; l++) {
                        tmp0 = A[k][l];
                        B[l][k] = tmp0;
                    }
                }
                /* Top-right quadrant. */
                for (k = i; k < i + 4 && k < N; k++) {
                    for (l = j + 4; l < j + 8 && l < M; l++) {
                        tmp0 = A[k][l];
                        B[l][k] = tmp0;
                    }
                }
                /* Bottom-left quadrant (columns j..j+3 only). */
                for (k = i + 4; k < i + 8 && k < N; k++) {
                    for (l = j; l < j + 4 && l < M; l++) {
                        tmp0 = A[k][l];
                        B[l][k] = tmp0;
                    }
                }
                /* Bottom-right quadrant. */
                for (k = i + 4; k < i + 8 && k < N; k++) {
                    for (l = j + 4; l < j + 8 && l < M; l++) {
                        tmp0 = A[k][l];
                        B[l][k] = tmp0;
                    }
                }
            }
        }
    }

    ENSURES(is_transpose(M, N, A, B));
}
/*
 * transpose_submit - Transpose the N x M matrix A into the M x N matrix B.
 *
 *   - 32 x 32: all off-diagonal 8x8 blocks first, then the diagonal blocks,
 *     where each row's diagonal element is stored after the rest of the row.
 *   - 64 x 64: 8x8 blocks split into 4x4 quadrants.  Off-diagonal blocks
 *     visit the quadrants in the order top-left, top-right, bottom-right,
 *     bottom-left; diagonal blocks are done in a second pass.
 *   - any other size (written for 61 x 67): 8x8 blocks clipped to the
 *     matrix bounds.
 *
 * Fix: the previous version dispatched with `switch (N)` only.  It had no
 * default case, so an unexpected N left B unwritten, and it never looked at
 * M, so N == 32/64/67 paired with a different M used the wrong hard-coded
 * bounds.  Both dimensions are now checked and everything that is not
 * exactly 32 x 32 or 64 x 64 takes the clipped path, which uses M and N
 * instead of the literals 61 and 67.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
    int n, sn, m, sm, i, j;

    REQUIRES(M > 0);
    REQUIRES(N > 0);

    if (M == 32 && N == 32) {
        /* Off-diagonal 8x8 blocks. */
        for (n = 0; n < 32; n += 8) {
            for (m = 0; m < 32; m += 8) {
                if (m != n) {
                    for (i = n; i < n + 8; i++) {
                        for (j = m; j < m + 8; j++) {
                            B[j][i] = A[i][j];
                        }
                    }
                }
            }
        }

        /* Diagonal 8x8 blocks: the diagonal element goes last in each row. */
        for (n = 0; n < 32; n += 8) {
            for (i = n; i < n + 8; i++) {
                for (j = n; j < n + 8; j++) {
                    if (i != j) {
                        B[j][i] = A[i][j];
                    }
                }
                B[i][i] = A[i][i];
            }
        }
    } else if (M == 64 && N == 64) {
        /* Off-diagonal 8x8 blocks, one 4x4 quadrant at a time. */
        for (n = 0; n < 64; n += 8) {
            for (m = 0; m < 64; m += 8) {
                if (m != n) {
                    /* Upper half: left quadrant, then right quadrant. */
                    sn = n;
                    for (sm = m; sm < m + 5; sm += 4) {
                        for (i = sn; i < sn + 4; i++) {
                            for (j = sm; j < sm + 4; j++) {
                                B[j][i] = A[i][j];
                            }
                        }
                    }

                    /* Lower half: right quadrant, then left quadrant. */
                    sn = n + 4;
                    for (sm = m + 4; sm > m - 1; sm -= 4) {
                        for (i = sn; i < sn + 4; i++) {
                            for (j = sm; j < sm + 4; j++) {
                                B[j][i] = A[i][j];
                            }
                        }
                    }
                }
            }
        }

        /* Diagonal 8x8 blocks. */
        for (n = 0; n < 64; n += 8) {
            /* Upper-left quadrant (on the diagonal). */
            sn = n;
            for (i = sn; i < sn + 4; i++) {
                for (j = sn; j < sn + 4; j++) {
                    if (i != j) {
                        B[j][i] = A[i][j];
                    }
                }
                B[i][i] = A[i][i];
            }
            /* Upper-right quadrant. */
            for (i = sn; i < sn + 4; i++) {
                for (j = sn + 4; j < sn + 8; j++) {
                    B[j][i] = A[i][j];
                }
            }

            /* Lower-right quadrant (on the diagonal). */
            sn = n + 4;
            for (i = sn; i < sn + 4; i++) {
                for (j = sn; j < sn + 4; j++) {
                    if (i != j) {
                        B[j][i] = A[i][j];
                    }
                }
                B[i][i] = A[i][i];
            }
            /* Lower-left quadrant. */
            for (i = sn; i < sn + 4; i++) {
                for (j = sn - 4; j < sn; j++) {
                    B[j][i] = A[i][j];
                }
            }
        }
    } else {
        /* 8x8 blocks, column strip by column strip, clipped to the edges. */
        for (m = 0; m < M; m += 8) {
            for (n = 0; n < N; n += 8) {
                for (i = n; i < n + 8 && i < N; i++) {
                    for (j = m; j < m + 8 && j < M; j++) {
                        B[j][i] = A[i][j];
                    }
                }
            }
        }
    }

    ENSURES(is_transpose(M, N, A, B));
}
/*
 * transpose_submit - Transpose the N x M matrix A into the M x N matrix B.
 *
 *   - 32 x 32: 8x8 blocks; a diagonal element is remembered while its row
 *     is processed and stored once the row is finished.
 *   - 64 x 64: 8x8 blocks; the left four columns are transposed top-down,
 *     the right four columns bottom-up.  The right half of the block's
 *     first row is read early and stored last.
 *   - 61 x 67: 21x21 blocks with an explicit bounds check per element.
 *   - any other size: plain element-by-element transpose.
 *
 * Fixes relative to the previous version:
 *   - The fallback loop iterated rows with `i < M` instead of `i < N`.  For
 *     N < M that indexed past the last row of A and past the end of B's
 *     rows; for N > M the rows M..N-1 were never transposed.
 *   - `flag` is now initialised instead of relying on the first iteration
 *     happening to hit a diagonal element.
 *   - The block size for 32 x 32 is assigned unconditionally (the previous
 *     guard `if (M == 32)` was always true inside that branch).
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
    int start_row, start_column, end_row, end_column;
    int i, j;
    int a0, a1, a2, a3, a4, a5, a6, a7;
    int temp;
    int row;
    int flag = 0;
    int step;

    REQUIRES(M > 0);
    REQUIRES(N > 0);

    if (M == N && M == 32) {
        step = 8;
        for (start_row = 0; start_row < N; start_row += step) {
            end_row = start_row + step;
            for (start_column = 0; start_column < M; start_column += step) {
                end_column = start_column + step;
                for (i = start_row; i < end_row; ++i) {
                    for (j = start_column; j < end_column; ++j) {
                        if (i != j) {
                            B[j][i] = A[i][j];
                        } else {
                            /* Defer the diagonal element to the row's end. */
                            flag = 1;
                            temp = A[i][i];
                            row = i;
                        }
                    }
                    if (flag) {
                        flag = 0;
                        B[row][row] = temp;
                    }
                }
            }
        }
    } else if (M == N && M == 64) {
        for (start_column = 0; start_column < M; start_column += 8) {
            for (start_row = 0; start_row < N; start_row += 8) {
                /* Left four columns of the block, rows top to bottom. */
                for (i = start_row; i < start_row + 8; ++i) {
                    a0 = A[i][start_column];
                    a1 = A[i][start_column + 1];
                    a2 = A[i][start_column + 2];
                    a3 = A[i][start_column + 3];
                    if (i == start_row) {
                        /*
                         * Also grab the right half of the first row now;
                         * it is held in a4..a7 until the end of the block.
                         */
                        a4 = A[i][start_column + 4];
                        a5 = A[i][start_column + 5];
                        a6 = A[i][start_column + 6];
                        a7 = A[i][start_column + 7];
                    }
                    B[start_column][i]     = a0;
                    B[start_column + 1][i] = a1;
                    B[start_column + 2][i] = a2;
                    B[start_column + 3][i] = a3;
                }

                /* Right four columns, rows bottom to top (first row excluded). */
                for (i = start_row + 7; i > start_row; --i) {
                    a0 = A[i][start_column + 4];
                    a1 = A[i][start_column + 5];
                    a2 = A[i][start_column + 6];
                    a3 = A[i][start_column + 7];
                    B[start_column + 4][i] = a0;
                    B[start_column + 5][i] = a1;
                    B[start_column + 6][i] = a2;
                    B[start_column + 7][i] = a3;
                }

                /* Finally store the first row's right half. */
                B[start_column + 4][start_row] = a4;
                B[start_column + 5][start_row] = a5;
                B[start_column + 6][start_row] = a6;
                B[start_column + 7][start_row] = a7;
            }
        }
    } else if (M == 61 && N == 67) {
        step = 21;
        for (start_column = 0; start_column < M; start_column += step) {
            end_column = start_column + step;
            for (start_row = 0; start_row < N; start_row += step) {
                end_row = start_row + step;
                for (i = start_row; i < end_row; ++i) {
                    for (j = start_column; j < end_column; ++j) {
                        /* Blocks overhang the matrix; skip outside cells. */
                        if (i < 67 && j < 61) {
                            B[j][i] = A[i][j];
                        }
                    }
                }
            }
        }
    } else {
        /* A has N rows and M columns. */
        for (i = 0; i < N; ++i) {
            for (j = 0; j < M; ++j) {
                B[j][i] = A[i][j];
            }
        }
    }

    ENSURES(is_transpose(M, N, A, B));
}
void transpose_submit(int M, int N, int A[N][M], int B[M][N]) { REQUIRES(M > 0); REQUIRES(N > 0); //this is the index we need when calculating int i=0,j=0,k=0,l=0; //this is the temporal number we use to store the swap number int x0,x1,x2,x3; if(M==32&&N==32){ //use the block as size of 8 //generally follows the idea of the block; //But since we don't need to swap the diagnal numbers while(i<32){ j=0; while(j<32){ /* if(i==j){ k=i; while(k<i+block){ x0=A[k][j]; x1=A[k][j+1]; x2=A[k][j+2]; x3=A[k][j+3]; x4=A[k][j+4]; x5=A[k][j+5]; x6=A[k][j+6]; x7=A[k][j+7]; B[j][k]=x0; B[j+1][k]=x1; B[j+2][k]=x2; B[j+3][k]=x3; B[j+4][k]=x4; B[j+5][k]=x5; B[j+6][k]=x6; B[j+7][k]=x7; k++; } }else{ */ //where blocking begins k=j; while(k<j+block){ l=i; while(l<i+block){ B[l][k] = A[k][l]; l++; } k++; } j+=block; } i+=block; } } //This will handle the case when M=64 and N=64 //This time I also use the blocking //But since the cache size could only store 8 //So we swap it one more time if(M==64&&N==64){ i=0; while(i<64){ j=0; while(j<64){ k=0; while(k<block){ l=0; x0 = A[k+j][i]; x1 = A[k+j][i+1]; x2 = A[k+j][i+2]; x3 = A[k+j][i+3]; B[i][k+j] = x0; B[i+1][k+j] = x1; B[i+2][k+j] = x2; B[i+3][k+j] = x3; k++; } l=7; while(l>=0){ x0 = A[l+j][i+4]; x1 = A[l+j][i+5]; x2 = A[l+j][i+6]; x3 = A[l+j][i+7]; B[i+4][l+j] = x0; B[i+5][l+j] = x1; B[i+6][l+j] = x2; B[i+7][l+j] = x3; l--; } j+=block; } i+=block; } } //This will handle the situation when M=61 and N=67 //This time we don't need to handle the diagnal problem //We just need to care about the block size //So it is even easier than the 32*32 //I just tested several times and found out that 17 is the best number //Although I felt this is a magic number if(M==61){ i=0; while(i<61){ j=0; while(j<67){ k=j; while(k<(j+17)&&k<67){ l=i; while(l<(i+17)&&l<61){ B[l][k]=A[k][l]; l++; } k++; } j+=17; } i+=17; } } ENSURES(is_transpose(M, N, A, B)); }
/*
 * ts_tile4 - Transpose the 4x4 tile of A whose top-left corner is (ii, jj)
 * into B.  The caller guarantees that the tile lies entirely inside A.
 *
 * Tiles with ii != jj are read two columns at a time; tiles with ii == jj
 * are read two rows at a time.  In both cases eight values are held in
 * locals before anything is stored into B.
 */
static void ts_tile4(int M, int N, int A[N][M], int B[M][N], int ii, int jj)
{
    int i, j;
    int s0, s1, s2, s3, s4, s5, s6, s7;

    if (ii != jj) {
        for (j = jj; j < jj + 4; j += 2) {
            s0 = A[ii][j];         s1 = A[ii + 1][j];
            s2 = A[ii + 2][j];     s3 = A[ii + 3][j];
            s4 = A[ii][j + 1];     s5 = A[ii + 1][j + 1];
            s6 = A[ii + 2][j + 1]; s7 = A[ii + 3][j + 1];

            B[j][ii]         = s0; B[j][ii + 1]     = s1;
            B[j][ii + 2]     = s2; B[j][ii + 3]     = s3;
            B[j + 1][ii]     = s4; B[j + 1][ii + 1] = s5;
            B[j + 1][ii + 2] = s6; B[j + 1][ii + 3] = s7;
        }
    } else {
        for (i = ii; i < ii + 4; i += 2) {
            s0 = A[i][jj];         s1 = A[i][jj + 1];
            s2 = A[i][jj + 2];     s3 = A[i][jj + 3];
            s4 = A[i + 1][jj];     s5 = A[i + 1][jj + 1];
            s6 = A[i + 1][jj + 2]; s7 = A[i + 1][jj + 3];

            B[jj][i]         = s0; B[jj + 1][i]     = s1;
            B[jj + 2][i]     = s2; B[jj + 3][i]     = s3;
            B[jj][i + 1]     = s4; B[jj + 1][i + 1] = s5;
            B[jj + 2][i + 1] = s6; B[jj + 3][i + 1] = s7;
        }
    }
}

/*
 * ts_block8_clipped - Transpose the block of A that starts at (ii, jj) and
 * is at most 8x8, element by element, skipping anything beyond row N - 1
 * or column M - 1.
 */
static void ts_block8_clipped(int M, int N, int A[N][M], int B[M][N],
                              int ii, int jj)
{
    int i, j;

    for (i = ii; (i < N) && (i < ii + 8); i++) {
        for (j = jj; (j < M) && (j < jj + 8); j++) {
            B[j][i] = A[i][j];
        }
    }
}

/*
 * transpose_submit - Transpose the N x M matrix A into the M x N matrix B.
 *
 *   - M <= 32 (written for 32 x 32): 8x8 blocks; diagonal blocks buffer a
 *     whole 8-value row in locals before storing it.
 *   - M == 64 with N a multiple of 8 (written for 64 x 64): 8x8 blocks done
 *     as 4x4 tiles in the order top-left, bottom-left, bottom-right,
 *     top-right.
 *   - 61 x 67: rows 0..63 x columns 0..55 as 4x4 tiles, then columns
 *     56..60 of rows 0..63, then rows 64..66.
 *   - anything else: clipped 8x8 blocks.
 *
 * Fixes relative to the previous version:
 *   - In the M <= 32 path a diagonal block always read and wrote eight
 *     columns, overrunning the matrix when M is not a multiple of 8.  The
 *     buffered path is now used only when all eight columns exist.
 *   - The final `else` ran code with 61 x 67 hard-coded for every remaining
 *     size, and the 64-column path assumed N was a multiple of 8.  Sizes
 *     that fit none of the tuned paths now use clipped 8x8 blocks.
 *   - The three-row strip (rows 64..66) carried an `ii == jj` branch that
 *     could never execute, because jj never reaches 64.  It was removed.
 *   - The 4x4 tile code, previously repeated five times inline, is shared
 *     through ts_tile4.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
    int ii, jj;
    int i;
    int s0, s1, s2, s3, s4, s5, s6, s7;

    REQUIRES(M > 0);
    REQUIRES(N > 0);

    if (M <= 32) {
        for (ii = 0; ii < N; ii += 8) {
            for (jj = 0; jj < M; jj += 8) {
                if (ii != jj || jj + 8 > M) {
                    ts_block8_clipped(M, N, A, B, ii, jj);
                } else {
                    /* Diagonal block with all eight columns present. */
                    for (i = ii; (i < N) && (i < ii + 8); i++) {
                        s0 = A[i][jj];     s1 = A[i][jj + 1];
                        s2 = A[i][jj + 2]; s3 = A[i][jj + 3];
                        s4 = A[i][jj + 4]; s5 = A[i][jj + 5];
                        s6 = A[i][jj + 6]; s7 = A[i][jj + 7];

                        B[jj][i]     = s0; B[jj + 1][i] = s1;
                        B[jj + 2][i] = s2; B[jj + 3][i] = s3;
                        B[jj + 4][i] = s4; B[jj + 5][i] = s5;
                        B[jj + 6][i] = s6; B[jj + 7][i] = s7;
                    }
                }
            }
        }
    } else if (M == 64 && N % 8 == 0) {
        for (ii = 0; ii < N; ii += 8) {
            for (jj = 0; jj < M; jj += 8) {
                ts_tile4(M, N, A, B, ii,     jj);       /* top-left     */
                ts_tile4(M, N, A, B, ii + 4, jj);       /* bottom-left  */
                ts_tile4(M, N, A, B, ii + 4, jj + 4);   /* bottom-right */
                ts_tile4(M, N, A, B, ii,     jj + 4);   /* top-right    */
            }
        }
    } else if (M == 61 && N == 67) {
        /* 64 rows x 56 columns, tiled 4x4 inside 8x8 blocks. */
        for (ii = 0; ii < 64; ii += 8) {
            for (jj = 0; jj < 56; jj += 8) {
                ts_tile4(M, N, A, B, ii,     jj);
                ts_tile4(M, N, A, B, ii,     jj + 4);
                ts_tile4(M, N, A, B, ii + 4, jj);
                ts_tile4(M, N, A, B, ii + 4, jj + 4);
            }
        }

        /* 64 rows x the last 5 columns (56..60). */
        for (i = 0; i < 64; i++) {
            s0 = A[i][56];
            s1 = A[i][57];
            s2 = A[i][58];
            s3 = A[i][59];
            s4 = A[i][60];

            B[56][i] = s0;
            B[57][i] = s1;
            B[58][i] = s2;
            B[59][i] = s3;
            B[60][i] = s4;
        }

        /* The last 3 rows (64..66) x all 61 columns. */
        for (jj = 0; jj < 61; jj += 8) {
            ts_block8_clipped(M, N, A, B, 64, jj);
        }
    } else {
        for (ii = 0; ii < N; ii += 8) {
            for (jj = 0; jj < M; jj += 8) {
                ts_block8_clipped(M, N, A, B, ii, jj);
            }
        }
    }

    ENSURES(is_transpose(M, N, A, B));
}