/*****************************************************************************
**
** MultiplyByDivideAndConquer
**
** Blocked multiply for medium-sized square matrices.  The operands are
** split into 2x2 quadrants and the eight quadrant products are evaluated
** in two passes:
**
**     pass 1:  C00 (+)= A00 x B00     C01 (+)= A00 x B01
**              C11 (+)= A10 x B01     C10 (+)= A10 x B00
**     pass 2:  C00  +=  A01 x B10     C01  +=  A01 x B11
**              C11  +=  A11 x B11     C10  +=  A11 x B10
**
** Pass 1 overwrites C when AdditiveMode == 0 and accumulates otherwise;
** pass 2 always accumulates.
**
** Note (from the original author): MatrixSize must be divisible by 16.
**
** INPUT:
**    C            = (READ/WRITE) address of the top-left element of C.
**    A            = (READ ONLY)  address of the top-left element of A.
**    B            = (READ ONLY)  address of the top-left element of B.
**    MatrixSize   = n for n*n matrices.
**    RowWidthC    = number of elements between C[x,y] and C[x,y+1].
**    RowWidthA    = number of elements between A[x,y] and A[x,y+1].
**    RowWidthB    = number of elements between B[x,y] and B[x,y+1].
**    AdditiveMode = 0 for C = A x B, non-zero for C += A x B.
**
** OUTPUT:
**    C (+)= A x B.  (+ if AdditiveMode != 0)
**
*****************************************************************************/
void MultiplyByDivideAndConquer(REAL *C, REAL *A, REAL *B,
                                unsigned MatrixSize,
                                unsigned RowWidthC,
                                unsigned RowWidthA,
                                unsigned RowWidthB,
                                int AdditiveMode
                               )
{
  /* Plain aliases for the top-left quadrants; intentionally left defined
     after the function, as before. */
  #define A00 A
  #define B00 B
  #define C00 C

  const unsigned Half = MatrixSize >> 1;

  /* Top-right, bottom-left and bottom-right quadrant origins. */
  REAL *const ARight = A00 + Half;
  REAL *const ALower = A00 + RowWidthA * Half;
  REAL *const ADiag  = ALower + Half;

  REAL *const BRight = B00 + Half;
  REAL *const BLower = B00 + RowWidthB * Half;
  REAL *const BDiag  = BLower + Half;

  REAL *const CRight = C00 + Half;
  REAL *const CLower = C00 + RowWidthC * Half;
  REAL *const CDiag  = CLower + Half;

  /* Destination quadrants in evaluation order: C00, C01, C11, C10. */
  REAL *const Dst[4]       = { C00,    CRight, CDiag,  CLower };
  /* Operands of the first product written into each destination. */
  REAL *const FirstLhs[4]  = { A00,    A00,    ALower, ALower };
  REAL *const FirstRhs[4]  = { B00,    BRight, BRight, B00    };
  /* Operands of the second product accumulated into each destination. */
  REAL *const SecondLhs[4] = { ARight, ARight, ADiag,  ADiag  };
  REAL *const SecondRhs[4] = { BLower, BDiag,  BDiag,  BLower };

  unsigned q;

  if (Half > SizeAtWhichNaiveAlgorithmIsMoreEfficient) {
    /* Quadrants are still large: recurse on each of them. */
    for (q = 0; q < 4; q++) {
      MultiplyByDivideAndConquer(Dst[q], FirstLhs[q], FirstRhs[q], Half,
                                 RowWidthC, RowWidthA, RowWidthB,
                                 AdditiveMode);
    }
    for (q = 0; q < 4; q++) {
      MultiplyByDivideAndConquer(Dst[q], SecondLhs[q], SecondRhs[q], Half,
                                 RowWidthC, RowWidthA, RowWidthB,
                                 1);
    }
    return;
  }

  /* Quadrants are small enough for the naive kernels. */
  for (q = 0; q < 4; q++) {
    if (AdditiveMode) {
      FastAdditiveNaiveMatrixMultiply(Dst[q], FirstLhs[q], FirstRhs[q], Half,
                                      RowWidthC, RowWidthA, RowWidthB);
    } else {
      FastNaiveMatrixMultiply(Dst[q], FirstLhs[q], FirstRhs[q], Half,
                              RowWidthC, RowWidthA, RowWidthB);
    }
  }
  for (q = 0; q < 4; q++) {
    FastAdditiveNaiveMatrixMultiply(Dst[q], SecondLhs[q], SecondRhs[q], Half,
                                    RowWidthC, RowWidthA, RowWidthB);
  }
}
/*****************************************************************************
**
** OptimizedStrassenMultiply
**
** For large matrices A, B, and C of size MatrixSize * MatrixSize this
** function performs the operation C = A x B efficiently.
**
** Outline:
**   1. If MatrixSize <= SizeAtWhichDivideAndConquerIsMoreEfficient, hand
**      the whole job to MultiplyByDivideAndConquer.
**   2. Otherwise allocate 11 quadrant-sized temporaries (S1..S8, M2, M5,
**      T1sMULT) and fill S1..S8 with sums/differences of quadrants of A
**      and B.
**   3. SPAWN seven half-size products (M2, M5, T1sMULT, and four written
**      directly into the quadrants of C), then SYNC once per SPAWN.
**   4. Combine the products in place into C11, C12, C21 and C22.
**
** If the temporaries cannot be allocated, the product is computed with
** MultiplyByDivideAndConquer instead, which needs no scratch memory.
**
** INPUT:
**    C = (*C WRITE) Address of top left element of matrix C.
**    A = (*A IS READ ONLY) Address of top left element of matrix A.
**    B = (*B IS READ ONLY) Address of top left element of matrix B.
**    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
**    RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
**    RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
**    RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
**
** OUTPUT:
**    C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.)
**
*****************************************************************************/
VOID_TASK_7(OptimizedStrassenMultiply, REAL *, C, REAL *, A, REAL *, B, unsigned, MatrixSize,
     unsigned, RowWidthC, unsigned, RowWidthA, unsigned, RowWidthB
     )
{
  unsigned QuadrantSize = MatrixSize >> 1; /* MatrixSize / 2 */
  /* Bytes per temporary.  Kept in size_t so that neither this value nor the
     total passed to malloc wraps for large quadrants.  The extra 32 bytes
     per temporary pay for the alignment round-up of the heap start below. */
  size_t QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * QuadrantSize
                               + 32;
  unsigned Column, Row;

  /************************************************************************
  ** For each matrix A, B, and C, we'll want pointers to each quadrant
  ** in the matrix. These quadrants will be addressed as follows:
  **  --        --
  **  | A11  A12 |
  **  |          |
  **  | A21  A22 |
  **  --        --
  ************************************************************************/
  REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12,
       *A21, *B21, *C21, *A22, *B22, *C22;

  REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT;
  #define T2sMULT C22
  #define NumberOfVariables 11

  /* Byte offsets of the current element: TempMatrixOffset walks the densely
     packed temporaries, MatrixOffsetA/B walk the strided quadrants of A/B. */
  PTR TempMatrixOffset = 0;
  PTR MatrixOffsetA = 0;
  PTR MatrixOffsetB = 0;

  char *Heap;
  void *StartHeap;

  /* Distance in bytes between the end of a quadrant row and the start of
     the next one.  Scaled by sizeof(REAL) (rather than a hard-coded shift)
     to agree with the per-element offset increments used below. */
  PTR RowIncrementA = (PTR) ( RowWidthA - QuadrantSize ) * sizeof(REAL);
  PTR RowIncrementB = (PTR) ( RowWidthB - QuadrantSize ) * sizeof(REAL);
  PTR RowIncrementC = (PTR) ( RowWidthC - QuadrantSize ) * sizeof(REAL);

  if (MatrixSize <= SizeAtWhichDivideAndConquerIsMoreEfficient) {
    MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0);
    return;
  }

  /* Allocate Heap Space Here */
  StartHeap = malloc(QuadrantSizeInBytes * NumberOfVariables);
  if (!StartHeap) {
    /* No scratch memory available: compute the same product with the
       allocation-free blocked algorithm instead of writing through NULL. */
    MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0);
    return;
  }
  Heap = StartHeap;

  /* Initialize quadrant matrices */
  #define A11 A
  #define B11 B
  #define C11 C
  A12 = A11 + QuadrantSize;
  B12 = B11 + QuadrantSize;
  C12 = C11 + QuadrantSize;
  A21 = A + (RowWidthA * QuadrantSize);
  B21 = B + (RowWidthB * QuadrantSize);
  C21 = C + (RowWidthC * QuadrantSize);
  A22 = A21 + QuadrantSize;
  B22 = B21 + QuadrantSize;
  C22 = C21 + QuadrantSize;

  /* Round the working pointer up to the next 32-byte boundary
     (StartHeap keeps the address that must be passed to free). */
  if ( ((PTR) Heap) & 31)
     Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) );

  /* Distribute the heap space over the variables */
  S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes;

  /***************************************************************************
  ** Outer loop: one iteration per quadrant row.
  ** (moving to the next row jumps in memory by RowWidth => bad locality,
  **  so it is kept out of the innermost loop)
  ***************************************************************************/
  for (Row = 0; Row < QuadrantSize; Row++) {

    /*************************************************************************
    ** Inner loop: step through the elements of one row.
    ** (walks linearly through memory => good locality)
    *************************************************************************/
    for (Column = 0; Column < QuadrantSize; Column++) {

      /***********************************************************
      ** Within this loop the offsets are byte offsets of element
      ** (Row, Column): Row * QuadrantSize + Column elements for the
      ** temporaries, Row * RowWidth + Column elements for A and B.
      ***********************************************************/
      /* Element of a matrix at the current offset */
      #define E(Matrix)   (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) )
      #define EA(Matrix)  (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) )
      #define EB(Matrix)  (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) )

      /* FIXME - may pay to expand these out - got higher speed-ups below */
      /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */
      E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) );

      /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */
      E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21);

      /* S3 = A11 - A21 */
      E(S3) = EA(A11) - EA(A21);

      /* S7 = B22 - B12 */
      E(S7) = EB(B22) - EB(B12);

      TempMatrixOffset += sizeof(REAL);
      MatrixOffsetA += sizeof(REAL);
      MatrixOffsetB += sizeof(REAL);
    } /* end column loop */

    /* Skip the part of the A/B rows that lies outside the quadrant. */
    MatrixOffsetA += RowIncrementA;
    MatrixOffsetB += RowIncrementB;
  } /* end row loop */

  /* M2 = A11 x B11 */
  SPAWN(OptimizedStrassenMultiply, M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB);

  /* M5 = S1 * S5 */
  SPAWN(OptimizedStrassenMultiply, M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize);

  /* Step 1 of T1 = S2 x S6 + M2 */
  SPAWN(OptimizedStrassenMultiply, T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize);

  /* Step 1 of T2 = T1 + S3 x S7 */
  SPAWN(OptimizedStrassenMultiply, C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize);

  /* Step 1 of C11 = M2 + A12 * B21 */
  SPAWN(OptimizedStrassenMultiply, C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB);

  /* Step 1 of C12 = S4 x B22 + T1 + M5 */
  SPAWN(OptimizedStrassenMultiply, C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB);

  /* Step 1 of C21 = T2 - A22 * S8 */
  SPAWN(OptimizedStrassenMultiply, C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize);

  /**********************************************
  ** Synchronization Point: one SYNC per SPAWN above
  **********************************************/
  SYNC(OptimizedStrassenMultiply);
  SYNC(OptimizedStrassenMultiply);
  SYNC(OptimizedStrassenMultiply);
  SYNC(OptimizedStrassenMultiply);
  SYNC(OptimizedStrassenMultiply);
  SYNC(OptimizedStrassenMultiply);
  SYNC(OptimizedStrassenMultiply);

  /***************************************************************************
  ** Combine the seven products into the four quadrants of C.
  ** Outer loop: one iteration per quadrant row.
  ***************************************************************************/
  for (Row = 0; Row < QuadrantSize; Row++) {

    /*************************************************************************
    ** Inner loop: step through one row, four elements per iteration
    ** (so QuadrantSize is expected to be a multiple of 4).
    ** The pointers themselves are advanced, so after this loop nest
    ** M5, M2, T1sMULT and the C quadrant pointers no longer point at
    ** the start of their matrices.
    *************************************************************************/
    for (Column = 0; Column < QuadrantSize; Column += 4) {
      REAL LocalM5_0 = *(M5);
      REAL LocalM5_1 = *(M5+1);
      REAL LocalM5_2 = *(M5+2);
      REAL LocalM5_3 = *(M5+3);
      REAL LocalM2_0 = *(M2);
      REAL LocalM2_1 = *(M2+1);
      REAL LocalM2_2 = *(M2+2);
      REAL LocalM2_3 = *(M2+3);
      /* T1 = S2 x S6 + M2 */
      REAL T1_0 = *(T1sMULT) + LocalM2_0;
      REAL T1_1 = *(T1sMULT+1) + LocalM2_1;
      REAL T1_2 = *(T1sMULT+2) + LocalM2_2;
      REAL T1_3 = *(T1sMULT+3) + LocalM2_3;
      /* T2 = S3 x S7 + T1 (C22 currently holds S3 x S7) */
      REAL T2_0 = *(C22) + T1_0;
      REAL T2_1 = *(C22+1) + T1_1;
      REAL T2_2 = *(C22+2) + T1_2;
      REAL T2_3 = *(C22+3) + T1_3;
      /* C11 = A12 x B21 + M2 */
      (*(C11))   += LocalM2_0;
      (*(C11+1)) += LocalM2_1;
      (*(C11+2)) += LocalM2_2;
      (*(C11+3)) += LocalM2_3;
      /* C12 = S4 x B22 + M5 + T1 */
      (*(C12))   += LocalM5_0 + T1_0;
      (*(C12+1)) += LocalM5_1 + T1_1;
      (*(C12+2)) += LocalM5_2 + T1_2;
      (*(C12+3)) += LocalM5_3 + T1_3;
      /* C22 = M5 + T2 */
      (*(C22))   = LocalM5_0 + T2_0;
      (*(C22+1)) = LocalM5_1 + T2_1;
      (*(C22+2)) = LocalM5_2 + T2_2;
      (*(C22+3)) = LocalM5_3 + T2_3;
      /* C21 = T2 - A22 x S8 */
      (*(C21  )) = (- *(C21  )) + T2_0;
      (*(C21+1)) = (- *(C21+1)) + T2_1;
      (*(C21+2)) = (- *(C21+2)) + T2_2;
      (*(C21+3)) = (- *(C21+3)) + T2_3;
      M5 += 4;
      M2 += 4;
      T1sMULT += 4;
      C11 += 4;
      C12 += 4;
      C21 += 4;
      C22 += 4;
    }

    /* Skip the part of each C row that lies outside the quadrant. */
    C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC);
    C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC);
    C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC);
    C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC);
  }

  free(StartHeap);
}
/*****************************************************************************
**
** OptimizedStrassenMultiply_par
**
** For large matrices A, B, and C of size MatrixSize * MatrixSize this
** function performs the operation C = A x B, using OpenMP tasks with
** dependences for the upper levels of the recursion.
**
** Outline:
**   1. If MatrixSize <= cutoff_size, hand the whole job to
**      MultiplyByDivideAndConquer.
**   2. Otherwise allocate 11 densely packed quadrant-sized temporaries
**      (S1..S8, M2, M5, T1sMULT).
**   3. If Depth < cutoff_depth: create one task per S matrix, one task per
**      half-size product and one task per output quadrant, ordered by
**      depend clauses, then taskwait.
**      Otherwise: perform exactly the same steps sequentially.
**
** If the temporaries cannot be allocated, the product is computed with
** MultiplyByDivideAndConquer instead, which needs no scratch memory.
**
** INPUT:
**    C = (*C WRITE) Address of top left element of matrix C.
**    A = (*A IS READ ONLY) Address of top left element of matrix A.
**    B = (*B IS READ ONLY) Address of top left element of matrix B.
**    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
**    RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
**    RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
**    RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
**    Depth = Current recursion depth (each recursive call passes Depth+1)
**    cutoff_depth = Tasks are created only while Depth < cutoff_depth
**    cutoff_size = Matrices of this size or smaller are multiplied by
**                  MultiplyByDivideAndConquer
**
** OUTPUT:
**    C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.)
**
*****************************************************************************/
static void OptimizedStrassenMultiply_par(double *C, double *A, double *B, unsigned MatrixSize,
     unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, unsigned int Depth, unsigned int cutoff_depth,
     unsigned cutoff_size)
{
  unsigned QuadrantSize = MatrixSize >> 1; /* MatrixSize / 2 */
  /* Bytes per temporary.  Kept in size_t so that neither this value nor the
     total passed to malloc wraps for large quadrants. */
  size_t QuadrantSizeInBytes = sizeof(double) * QuadrantSize * QuadrantSize;
  unsigned Column, Row;

  /************************************************************************
  ** For each matrix A, B, and C, we'll want pointers to each quadrant
  ** in the matrix. These quadrants will be addressed as follows:
  **  --        --
  **  | A    A12 |
  **  |          |
  **  | A21  A22 |
  **  --        --
  ************************************************************************/
  double /* *A, *B, *C, */ *A12, *B12, *C12,
         *A21, *B21, *C21, *A22, *B22, *C22;

  double *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT;
  #define T2sMULT C22
  #define NumberOfVariables 11

  char *Heap;
  void *StartHeap;

  if (MatrixSize <= cutoff_size) {
    MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0);
    return;
  }

  /* Allocate Heap Space Here */
  StartHeap = malloc(QuadrantSizeInBytes * NumberOfVariables);
  if (!StartHeap) {
    /* No scratch memory available: compute the same product with the
       allocation-free blocked algorithm instead of writing through NULL. */
    MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0);
    return;
  }
  Heap = StartHeap;

  /* Initialize quadrant matrices */
  A12 = A + QuadrantSize;
  B12 = B + QuadrantSize;
  C12 = C + QuadrantSize;
  A21 = A + (RowWidthA * QuadrantSize);
  B21 = B + (RowWidthB * QuadrantSize);
  C21 = C + (RowWidthC * QuadrantSize);
  A22 = A21 + QuadrantSize;
  B22 = B21 + QuadrantSize;
  C22 = C21 + QuadrantSize;

  /* Distribute the heap space over the variables */
  S1 = (double*) Heap; Heap += QuadrantSizeInBytes;
  S2 = (double*) Heap; Heap += QuadrantSizeInBytes;
  S3 = (double*) Heap; Heap += QuadrantSizeInBytes;
  S4 = (double*) Heap; Heap += QuadrantSizeInBytes;
  S5 = (double*) Heap; Heap += QuadrantSizeInBytes;
  S6 = (double*) Heap; Heap += QuadrantSizeInBytes;
  S7 = (double*) Heap; Heap += QuadrantSizeInBytes;
  S8 = (double*) Heap; Heap += QuadrantSizeInBytes;
  M2 = (double*) Heap; Heap += QuadrantSizeInBytes;
  M5 = (double*) Heap; Heap += QuadrantSizeInBytes;
  T1sMULT = (double*) Heap; Heap += QuadrantSizeInBytes;

  if (Depth < cutoff_depth)
  {
    /* NOTE(review): the depend list items below name the local pointer
       variables themselves (S1, A21, ...), not the arrays they point to;
       they appear to serve as ordering tokens between the sibling tasks
       created in this call - confirm against the OpenMP specification. */

    /* S1 = A21 + A22 */
    #pragma omp task depend(in: A21, A22) depend(out: S1) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column++)
        S1[Row * QuadrantSize + Column] = A21[RowWidthA * Row + Column] + A22[RowWidthA * Row + Column];

    /* S2 = S1 - A */
    #pragma omp task depend(in: S1, A) depend(out: S2) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column++)
        S2[Row * QuadrantSize + Column] = S1[Row * QuadrantSize + Column] - A[RowWidthA * Row + Column];

    /* S4 = A12 - S2 */
    #pragma omp task depend(in: A12, S2) depend(out: S4) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column++)
        S4[Row * QuadrantSize + Column] = A12[Row * RowWidthA + Column] - S2[QuadrantSize * Row + Column];

    /* S5 = B12 - B */
    #pragma omp task depend(in: B12, B) depend(out: S5) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column++)
        S5[Row * QuadrantSize + Column] = B12[Row * RowWidthB + Column] - B[Row * RowWidthB + Column];

    /* S6 = B22 - S5 */
    #pragma omp task depend(in: B22, S5) depend(out: S6) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column++)
        S6[Row * QuadrantSize + Column] = B22[Row * RowWidthB + Column] - S5[Row * QuadrantSize + Column];

    /* S8 = S6 - B21 */
    #pragma omp task depend(in: S6, B21) depend(out: S8) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column++)
        S8[Row * QuadrantSize + Column] = S6[Row * QuadrantSize + Column] - B21[Row * RowWidthB + Column];

    /* S3 = A - A21 */
    #pragma omp task depend(in: A, A21) depend(out: S3) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column++)
        S3[Row * QuadrantSize + Column] = A[RowWidthA * Row + Column] - A21[RowWidthA * Row + Column];

    /* S7 = B22 - B12 */
    #pragma omp task depend(in: B22, B12) depend(out: S7) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column++)
        S7[Row * QuadrantSize + Column] = B22[Row * RowWidthB + Column] - B12[Row * RowWidthB + Column];

    /* M2 = A x B */
    #pragma omp task depend(in: A, B) depend(out: M2)
    OptimizedStrassenMultiply_par(M2, A, B, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1, cutoff_depth, cutoff_size);

    /* M5 = S1 * S5 */
    #pragma omp task untied depend(in: S1, S5) depend(out: M5)
    OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1, cutoff_depth, cutoff_size);

    /* Step 1 of T1 = S2 x S6 + M2 */
    #pragma omp task untied depend(in: S2, S6) depend(out: T1sMULT)
    OptimizedStrassenMultiply_par(T1sMULT, S2, S6,  QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1, cutoff_depth, cutoff_size);

    /* Step 1 of T2 = T1 + S3 x S7 */
    #pragma omp task untied depend(in: S3, S7) depend(out: C22)
    OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1, cutoff_depth, cutoff_size);

    /* Step 1 of C = M2 + A12 * B21 */
    #pragma omp task untied depend(in: A12, B21) depend(out: C)
    OptimizedStrassenMultiply_par(C, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1, cutoff_depth, cutoff_size);

    /* Step 1 of C12 = S4 x B22 + T1 + M5 */
    #pragma omp task untied depend(in: S4, B22) depend(out: C12)
    OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1, cutoff_depth, cutoff_size);

    /* Step 1 of C21 = T2 - A22 * S8 */
    #pragma omp task untied depend(in: A22, S8) depend(out: C21)
    OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1, cutoff_depth, cutoff_size);

    /* C += M2 */
    #pragma omp task depend(inout: C) depend(in: M2) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column += 1)
        C[RowWidthC * Row + Column] += M2[Row * QuadrantSize + Column];

    /* C12 += M5 + T1sMULT + M2 */
    #pragma omp task depend(inout: C12) depend(in: M5, T1sMULT, M2) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column += 1)
        C12[RowWidthC * Row + Column] += M5[Row * QuadrantSize + Column] + T1sMULT[Row * QuadrantSize + Column] + M2[Row * QuadrantSize + Column];

    /* C21 = -C21 + C22 + T1sMULT + M2; reads C22 while it still holds
       S3 x S7, so this task is listed before the one that updates C22. */
    #pragma omp task depend(inout: C21) depend(in: C22, T1sMULT, M2) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column += 1)
        C21[RowWidthC * Row + Column] = -C21[RowWidthC * Row + Column] + C22[RowWidthC * Row + Column] + T1sMULT[Row * QuadrantSize + Column] + M2[Row * QuadrantSize + Column];

    /* C22 += M5 + T1sMULT + M2 */
    #pragma omp task depend(inout: C22) depend(in: M5, T1sMULT, M2) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column += 1)
        C22[RowWidthC * Row + Column] += M5[Row * QuadrantSize + Column] + T1sMULT[Row * QuadrantSize + Column] + M2[Row * QuadrantSize + Column];

    /* All child tasks must finish before the temporaries are freed. */
    #pragma omp taskwait
  }
  else
  {
    /* Sequential variant of the same computation. */
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column++) {
        S1[Row * QuadrantSize + Column] = A21[RowWidthA * Row + Column] + A22[RowWidthA * Row + Column];
        S2[Row * QuadrantSize + Column] = S1[Row * QuadrantSize + Column] - A[RowWidthA * Row + Column];
        S4[Row * QuadrantSize + Column] = A12[Row * RowWidthA + Column] - S2[QuadrantSize * Row + Column];
        S5[Row * QuadrantSize + Column] = B12[Row * RowWidthB + Column] - B[Row * RowWidthB + Column];
        S6[Row * QuadrantSize + Column] = B22[Row * RowWidthB + Column] - S5[Row * QuadrantSize + Column];
        S8[Row * QuadrantSize + Column] = S6[Row * QuadrantSize + Column] - B21[Row * RowWidthB + Column];
        S3[Row * QuadrantSize + Column] = A[RowWidthA * Row + Column] - A21[RowWidthA * Row + Column];
        S7[Row * QuadrantSize + Column] = B22[Row * RowWidthB + Column] - B12[Row * RowWidthB + Column];
      }

    /* M2 = A x B */
    OptimizedStrassenMultiply_par(M2, A, B, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1, cutoff_depth, cutoff_size);
    /* M5 = S1 * S5 */
    OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1, cutoff_depth, cutoff_size);
    /* Step 1 of T1 = S2 x S6 + M2 */
    OptimizedStrassenMultiply_par(T1sMULT, S2, S6,  QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1, cutoff_depth, cutoff_size);
    /* Step 1 of T2 = T1 + S3 x S7 */
    OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1, cutoff_depth, cutoff_size);
    /* Step 1 of C = M2 + A12 * B21 */
    OptimizedStrassenMultiply_par(C, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1, cutoff_depth, cutoff_size);
    /* Step 1 of C12 = S4 x B22 + T1 + M5 */
    OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1, cutoff_depth, cutoff_size);
    /* Step 1 of C21 = T2 - A22 * S8 */
    OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1, cutoff_depth, cutoff_size);

    /* Combine the products into the four quadrants of C.  C21 is computed
       from the old C22 element before that element is updated. */
    for (Row = 0; Row < QuadrantSize; Row++) {
      for (Column = 0; Column < QuadrantSize; Column += 1) {
        C[RowWidthC * Row + Column] += M2[Row * QuadrantSize + Column];
        C12[RowWidthC * Row + Column] += M5[Row * QuadrantSize + Column] + T1sMULT[Row * QuadrantSize + Column] + M2[Row * QuadrantSize + Column];
        C21[RowWidthC * Row + Column] = -C21[RowWidthC * Row + Column] + C22[RowWidthC * Row + Column] + T1sMULT[Row * QuadrantSize + Column] + M2[Row * QuadrantSize + Column];
        C22[RowWidthC * Row + Column] += M5[Row * QuadrantSize + Column] + T1sMULT[Row * QuadrantSize + Column] + M2[Row * QuadrantSize + Column];
      }
    }
  }

  free(StartHeap);
}