void strassenMMult(double **C, double **A, double **B, int ml, int pl, int nl) { if (((float)ml)*((float)nl)*((float)pl) < THRESHOLD) matmultleaf(ml, nl, pl, A, B, C); else { int m2 = ml/2; int n2 = nl/2; int p2 = pl/2; double **S1 = Allocate2DArray< double >(m2, p2); double **S2 = Allocate2DArray< double >(m2, p2); double **S3 = Allocate2DArray< double >(m2, p2); double **S4 = Allocate2DArray< double >(m2, p2); double **S5 = Allocate2DArray< double >(p2, n2); double **S6 = Allocate2DArray< double >(p2, n2); double **S7 = Allocate2DArray< double >(p2, n2); double **S8 = Allocate2DArray< double >(p2, n2); double **M1 = Allocate2DArray< double >(m2, n2); double **M2 = Allocate2DArray< double >(m2, n2); double **M3 = Allocate2DArray< double >(m2, n2); double **M4 = Allocate2DArray< double >(m2, n2); double **M5 = Allocate2DArray< double >(m2, n2); double **M6 = Allocate2DArray< double >(m2, n2); double **M7 = Allocate2DArray< double >(m2, n2); double **T1 = Allocate2DArray< double >(m2, n2); double **T2 = Allocate2DArray< double >(m2, n2); double **A11 = new double*[m2]; double **A12 = new double*[m2]; double **A21 = new double*[m2]; double **A22 = new double*[m2]; double **B11 = new double*[p2]; double **B12 = new double*[p2]; double **B21 = new double*[p2]; double **B22 = new double*[p2]; double **C11 = new double*[m2]; double **C12 = new double*[m2]; double **C21 = new double*[m2]; double **C22 = new double*[m2]; copyQtrMatrix(A11, m2, A, 0, 0); copyQtrMatrix(A12, m2, A, 0, p2); copyQtrMatrix(A21, m2, A, m2, 0); copyQtrMatrix(A22, m2, A, m2, p2); copyQtrMatrix(B11, p2, B, 0, 0); copyQtrMatrix(B12, p2, B, 0, n2); copyQtrMatrix(B21, p2, B, p2, 0); copyQtrMatrix(B22, p2, B, p2, n2); copyQtrMatrix(C11, m2, C, 0, 0); copyQtrMatrix(C12, m2, C, 0, n2); copyQtrMatrix(C21, m2, C, m2, 0); copyQtrMatrix(C22, m2, C, m2, n2); #pragma omp task { // S1 = A21 + A22 AddMatBlocks(S1, m2, p2, A21, A22); // S2 = S1 - A11 SubMatBlocks(S2, m2, p2, S1, A11); // S4 = A12 - S2 SubMatBlocks(S4, m2, p2, A12, S2); } #pragma omp task { // S3 = A11 - A21 SubMatBlocks(S3, m2, p2, A11, A21); // S7 = B22 - B12 SubMatBlocks(S7, p2, n2, B22, B12); } #pragma omp task { // S5 = B12 - B11 SubMatBlocks(S5, p2, n2, B12, B11); // S6 = B22 - S5 SubMatBlocks(S6, p2, n2, B22, S5); // S8 = S6 - B21 SubMatBlocks(S8, p2, n2, S6, B21); } #pragma omp taskwait #pragma omp task { // M1 = S2 * S6 strassenMMult(M1, S2, S6, m2, p2, n2); // M2 = A11 * B11 strassenMMult(M2, A11, B11, m2, p2, n2); // M4 = S3 * S7 strassenMMult(M4, S3, S7, m2, p2, n2); // T1 = M1 + M2 AddMatBlocks(T1, m2, n2, M1, M2); // T2 = T1 + M4 AddMatBlocks(T2, m2, n2, T1, M4); } #pragma omp task { // M3 = A12 * B21 strassenMMult(M3, A12, B21, m2, p2, n2); // M5 = S1 * S5 strassenMMult(M5, S1, S5, m2, p2, n2); // M6 = S4 * B22 strassenMMult(M6, S4, B22, m2, p2, n2); // M7 = A22 * S8 strassenMMult(M7, A22, S8, m2, p2, n2); } #pragma omp taskwait // C11 = M2 + M3 // C12 = T1 + M5 + M6 // C21 = T2 - M7 // C22 = T2 + M5 //#pragma omp for for (int i = 0; i < m2; ++i) for (int j = 0; j < n2; ++j) { C11[i][j] = M2[i][j] + M3[i][j]; C12[i][j] = T1[i][j] + M5[i][j] + M6[i][j]; C21[i][j] = T2[i][j] - M7[i][j]; C22[i][j] = T2[i][j] + M5[i][j]; } Free2DArray< double >(S1); Free2DArray< double >(S2); Free2DArray< double >(S3); Free2DArray< double >(S4); Free2DArray< double >(S5); Free2DArray< double >(S6); Free2DArray< double >(S7); Free2DArray< double >(S8); Free2DArray< double >(M1); Free2DArray< double >(M2); Free2DArray< double >(M3); Free2DArray< double >(M4); Free2DArray< double >(M5); Free2DArray< double >(M6); Free2DArray< double >(M7); Free2DArray< double >(T1); Free2DArray< double >(T2); delete[] A11; delete[] A12; delete[] A21; delete[] A22; delete[] B11; delete[] B12; delete[] B21; delete[] B22; delete[] C11; delete[] C12; delete[] C21; delete[] C22; } }
void strassenMMult(int mf, int ml, int nf, int nl, int pf, int pl, int **A, int **B, int **C) { if ((long)(ml-mf)*(long)(nl-nf)*(long)(pl-pf) < GRAIN) matmultleaf(mf, ml, nf, nl, pf, pl, A, B, C); else { int m2 = (ml-mf)/2; int n2 = (nl-nf)/2; int p2 = (pl-pf)/2; int **M1 = Allocate2DArray< int >(m2, n2); int **M2 = Allocate2DArray< int >(m2, n2); int **M3 = Allocate2DArray< int >(m2, n2); int **M4 = Allocate2DArray< int >(m2, n2); int **M5 = Allocate2DArray< int >(m2, n2); int **M6 = Allocate2DArray< int >(m2, n2); int **M7 = Allocate2DArray< int >(m2, n2); int **A11 = new int*[m2]; int **A12 = new int*[m2]; int **A21 = new int*[m2]; int **A22 = new int*[m2]; int **B11 = new int*[p2]; int **B12 = new int*[p2]; int **B21 = new int*[p2]; int **B22 = new int*[p2]; int **C11 = new int*[m2]; int **C12 = new int*[m2]; int **C21 = new int*[m2]; int **C22 = new int*[m2]; int **tAM1 = Allocate2DArray< int >(m2, p2); int **tBM1 = Allocate2DArray< int >(p2, n2); int **tAM2 = Allocate2DArray< int >(m2, p2); int **tBM3 = Allocate2DArray< int >(p2, n2); int **tBM4 = Allocate2DArray< int >(p2, n2); int **tAM5 = Allocate2DArray< int >(m2, p2); int **tAM6 = Allocate2DArray< int >(m2, p2); int **tBM6 = Allocate2DArray< int >(p2, n2); int **tAM7 = Allocate2DArray< int >(m2, p2); int **tBM7 = Allocate2DArray< int >(p2, n2); #pragma omp parallel { #pragma omp sections { #pragma omp section { copyQtrMatrix(A11, m2, A, mf, pf); copyQtrMatrix(A12, m2, A, mf, p2); copyQtrMatrix(A21, m2, A, m2, pf); copyQtrMatrix(A22, m2, A, m2, p2); } #pragma omp section { copyQtrMatrix(B11, p2, B, pf, nf); copyQtrMatrix(B12, p2, B, pf, n2); copyQtrMatrix(B21, p2, B, p2, nf); copyQtrMatrix(B22, p2, B, p2, n2); } #pragma omp section { copyQtrMatrix(C11, m2, C, mf, nf); copyQtrMatrix(C12, m2, C, mf, n2); copyQtrMatrix(C21, m2, C, m2, nf); copyQtrMatrix(C22, m2, C, m2, n2); } } #pragma omp barrier #pragma omp sections { #pragma omp section { // M1 = (A11 + A22)*(B11 + B22) AddMatBlocks(tAM1, m2, p2, A11, A22); AddMatBlocks(tBM1, p2, n2, B11, B22); strassenMMult(0, m2, 0, n2, 0, p2, tAM1, tBM1, M1); } #pragma omp section { //M2 = (A21 + A22)*B11 AddMatBlocks(tAM2, m2, p2, A21, A22); strassenMMult(0, m2, 0, n2, 0, p2, tAM2, B11, M2); } #pragma omp section { //M3 = A11*(B12 - B22) SubMatBlocks(tBM3, p2, n2, B12, B22); strassenMMult(0, m2, 0, n2, 0, p2, A11, tBM3, M3); } #pragma omp section { //M4 = A22*(B21 - B11) SubMatBlocks(tBM4, p2, n2, B21, B11); strassenMMult(0, m2, 0, n2, 0, p2, A22, tBM4, M4); } #pragma omp section { //M5 = (A11 + A12)*B22 AddMatBlocks(tAM5, m2, p2, A11, A12); strassenMMult(0, m2, 0, n2, 0, p2, tAM5, B22, M5); } #pragma omp section { //M6 = (A21 - A11)*(B11 + B12) SubMatBlocks(tAM6, m2, p2, A21, A11); AddMatBlocks(tBM6, p2, n2, B11, B12); strassenMMult(0, m2, 0, n2, 0, p2, tAM6, tBM6, M6); } #pragma omp section { //M7 = (A12 - A22)*(B21 + B22) SubMatBlocks(tAM7, m2, p2, A12, A22); AddMatBlocks(tBM7, p2, n2, B21, B22); strassenMMult(0, m2, 0, n2, 0, p2, tAM7, tBM7, M7); } } #pragma omp parallel for for (int i = 0; i < m2; i++) #pragma omp parallel for for (int j = 0; j < n2; j++) { C11[i][j] = M1[i][j] + M4[i][j] - M5[i][j] + M7[i][j]; C12[i][j] = M3[i][j] + M5[i][j]; C21[i][j] = M2[i][j] + M4[i][j]; C22[i][j] = M1[i][j] - M2[i][j] + M3[i][j] + M6[i][j]; } #pragma omp barrier #pragma omp sections { #pragma omp section { Free2DArray< int >(M1); Free2DArray< int >(M2); Free2DArray< int >(M3); Free2DArray< int >(M4); Free2DArray< int >(M5); Free2DArray< int >(M6); Free2DArray< int >(M7); } #pragma omp section { delete[] A11; delete[] A12; delete[] A21; delete[] A22; delete[] B11; delete[] B12; delete[] B21; delete[] B22; delete[] C11; delete[] C12; delete[] C21; delete[] C22; } #pragma omp section { Free2DArray< int >(tAM1); Free2DArray< int >(tBM1); Free2DArray< int >(tAM2); Free2DArray< int >(tBM3); Free2DArray< int >(tBM4); Free2DArray< int >(tAM5); Free2DArray< int >(tAM6); Free2DArray< int >(tBM6); Free2DArray< int >(tAM7); Free2DArray< int >(tBM7); } } } } }