Example #1
0
/*****************************************************************************
 **
 ** MultiplyByDivideAndConquer
 **
 ** Block-recursive multiply of the square matrices A and B into C:
 **    C  = A x B   when AdditiveMode == 0
 **    C += A x B   when AdditiveMode != 0
 **
 ** Each matrix is cut into four quadrants of MatrixSize/2 elements per
 ** side. Every quadrant of C receives two quadrant products; the first one
 ** honours AdditiveMode, the second one is always accumulated on top.
 ** While the quadrant side is still larger than
 ** SizeAtWhichNaiveAlgorithmIsMoreEfficient the products are computed by
 ** recursing, otherwise by the FastNaive / FastAdditiveNaive kernels.
 **
 ** Note MatrixSize must be divisible by 16.
 **
 ** INPUT:
 **    C = (READ/WRITE) address of the top left element of matrix C.
 **    A = (READ ONLY)  address of the top left element of matrix A.
 **    B = (READ ONLY)  address of the top left element of matrix B.
 **    MatrixSize   = side length n of the n*n matrices.
 **    RowWidthC    = elements in memory between C[x,y] and C[x,y+1].
 **    RowWidthA    = elements in memory between A[x,y] and A[x,y+1].
 **    RowWidthB    = elements in memory between B[x,y] and B[x,y+1].
 **    AdditiveMode = 0 for C = A x B, anything else for C += A x B.
 **
 ** OUTPUT:
 **    C (+)= A x B. (+ if AdditiveMode != 0)
 **
 *****************************************************************************/
void MultiplyByDivideAndConquer(REAL *C, REAL *A, REAL *B,
        unsigned MatrixSize,
        unsigned RowWidthC,
        unsigned RowWidthA,
        unsigned RowWidthB,
        int AdditiveMode
        )
{
#define A00 A
#define B00 B
#define C00 C
    const unsigned Half = MatrixSize / 2;
    unsigned Quad;

    /* Top left element of the lower half of each matrix. */
    REAL *const LowerA = A00 + RowWidthA * Half;
    REAL *const LowerB = B00 + RowWidthB * Half;
    REAL *const LowerC = C00 + RowWidthC * Half;

    /* Destination quadrants, visited in the order C00, C01, C11, C10. */
    REAL *const Dst[4] = { C00, C00 + Half, LowerC + Half, LowerC };

    /* First product per destination: A00*B00, A00*B01, A10*B01, A10*B00. */
    REAL *const FirstA[4] = { A00, A00, LowerA, LowerA };
    REAL *const FirstB[4] = { B00, B00 + Half, B00 + Half, B00 };

    /* Second product per destination: A01*B10, A01*B11, A11*B11, A11*B10. */
    REAL *const SecondA[4] = { A00 + Half, A00 + Half,
                               LowerA + Half, LowerA + Half };
    REAL *const SecondB[4] = { LowerB, LowerB + Half, LowerB + Half, LowerB };

    if (Half > SizeAtWhichNaiveAlgorithmIsMoreEfficient) {
        /* Quadrants are still large: recurse on every product. */
        for (Quad = 0; Quad < 4; Quad++) {
            MultiplyByDivideAndConquer(Dst[Quad], FirstA[Quad], FirstB[Quad],
                    Half, RowWidthC, RowWidthA, RowWidthB,
                    AdditiveMode);
        }

        /* The second product must always be added to the first one. */
        for (Quad = 0; Quad < 4; Quad++) {
            MultiplyByDivideAndConquer(Dst[Quad], SecondA[Quad], SecondB[Quad],
                    Half, RowWidthC, RowWidthA, RowWidthB,
                    1);
        }
        return;
    }

    /* Quadrants are small enough for the naive kernels. */
    for (Quad = 0; Quad < 4; Quad++) {
        if (AdditiveMode) {
            FastAdditiveNaiveMatrixMultiply(Dst[Quad], FirstA[Quad],
                    FirstB[Quad], Half,
                    RowWidthC, RowWidthA, RowWidthB);
        } else {
            FastNaiveMatrixMultiply(Dst[Quad], FirstA[Quad],
                    FirstB[Quad], Half,
                    RowWidthC, RowWidthA, RowWidthB);
        }
    }

    for (Quad = 0; Quad < 4; Quad++) {
        FastAdditiveNaiveMatrixMultiply(Dst[Quad], SecondA[Quad],
                SecondB[Quad], Half,
                RowWidthC, RowWidthA, RowWidthB);
    }
}
Example #2
0
/*****************************************************************************
 **
 ** OptimizedStrassenMultiply
 **
 ** For large matrices A, B, and C of size MatrixSize * MatrixSize this
 ** function performs the operation C = A x B efficiently.
 **
 ** Matrices of at most SizeAtWhichDivideAndConquerIsMoreEfficient elements
 ** per side are handed to MultiplyByDivideAndConquer. Larger ones are split
 ** into quadrants; eight quadrant sums/differences (S1..S8) are formed in
 ** scratch memory, seven quadrant products are launched with SPAWN and
 ** joined with SYNC, and the products are then combined into C.
 **
 ** If the scratch memory cannot be allocated, the whole product is computed
 ** by MultiplyByDivideAndConquer instead, which needs no scratch memory.
 **
 ** INPUT:
 **    C = (*C WRITE) Address of top left element of matrix C.
 **    A = (*A IS READ ONLY) Address of top left element of matrix A.
 **    B = (*B IS READ ONLY) Address of top left element of matrix B.
 **    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
 **    RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
 **    RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
 **    RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
 **
 ** OUTPUT:
 **    C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.)
 **
 ** NOTE: the combination loop below processes four columns per iteration,
 ** so the quadrant size is expected to be a multiple of 4.
 **
 *****************************************************************************/
VOID_TASK_7(OptimizedStrassenMultiply, REAL *, C, REAL *, A, REAL *, B,
        unsigned, MatrixSize,
        unsigned, RowWidthC,
        unsigned, RowWidthA,
        unsigned, RowWidthB
        )
{
    unsigned QuadrantSize = MatrixSize >> 1; /* MatrixSize / 2 */
    /* Bytes per scratch quadrant (+32 bytes of slack for the alignment
     * fix-up below). Kept in size_t so that neither this product nor the
     * total allocation size wraps around for large matrices. */
    size_t QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * QuadrantSize
        + 32;
    unsigned Column, Row;

    /************************************************************************
     ** For each matrix A, B, and C, we'll want pointers to each quadrant
     ** in the matrix. These quadrants will be addressed as follows:
     **  --        --
     **  | A11  A12 |
     **  |          |
     **  | A21  A22 |
     **  --        --
     ************************************************************************/
    REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12,
         *A21, *B21, *C21, *A22, *B22, *C22;

    REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT;
#define T2sMULT C22
#define NumberOfVariables 11

    /* Byte offsets into the scratch quadrants and into A / B. */
    PTR TempMatrixOffset = 0;
    PTR MatrixOffsetA = 0;
    PTR MatrixOffsetB = 0;

    char *Heap;
    void *StartHeap;

    /* Distance in bytes between the end of a quadrant row and the start of
     * the next row. Scaled by sizeof(REAL), matching the per-element
     * offset increments used in the loops below. */
    PTR RowIncrementA = ( RowWidthA - QuadrantSize ) * sizeof(REAL);
    PTR RowIncrementB = ( RowWidthB - QuadrantSize ) * sizeof(REAL);
    PTR RowIncrementC = ( RowWidthC - QuadrantSize ) * sizeof(REAL);


    if (MatrixSize <= SizeAtWhichDivideAndConquerIsMoreEfficient) {

        MultiplyByDivideAndConquer(C, A, B,
                MatrixSize,
                RowWidthC,
                RowWidthA,
                RowWidthB,
                0);
        return;
    }

    /* Initialize quadrant matrices */
#define A11 A
#define B11 B
#define C11 C
    A12 = A11 + QuadrantSize;
    B12 = B11 + QuadrantSize;
    C12 = C11 + QuadrantSize;
    A21 = A + (RowWidthA * QuadrantSize);
    B21 = B + (RowWidthB * QuadrantSize);
    C21 = C + (RowWidthC * QuadrantSize);
    A22 = A21 + QuadrantSize;
    B22 = B21 + QuadrantSize;
    C22 = C21 + QuadrantSize;

    /* Allocate Heap Space Here */
    StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables);
    if (StartHeap == NULL) {
        /* No scratch memory available: compute the same product with the
         * divide-and-conquer routine, which works in place. */
        MultiplyByDivideAndConquer(C, A, B,
                MatrixSize,
                RowWidthC,
                RowWidthA,
                RowWidthB,
                0);
        return;
    }

    /* round the start of the scratch area up to a 32-byte boundary */
    if ( ((PTR) Heap) & 31)
        Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) );

    /* Distribute the heap space over the variables */
    S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
    S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
    S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
    S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
    S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
    S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
    S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
    S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
    M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
    M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
    T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes;

    /***************************************************************************
     ** Step through all columns row by row (vertically)
     ** (jumps in memory by RowWidth => bad locality)
     ** (but we want the best locality on the innermost loop)
     ***************************************************************************/
    for (Row = 0; Row < QuadrantSize; Row++) {

        /*************************************************************************
         ** Step through each row horizontally (addressing elements in each column)
         ** (jumps linearly through memory => good locality)
         *************************************************************************/
        for (Column = 0; Column < QuadrantSize; Column++) {

            /***********************************************************
             ** Within this loop the offsets are BYTE offsets:
             **   TempMatrixOffset addresses the densely packed scratch
             **   quadrants, MatrixOffsetA / MatrixOffsetB address the
             **   quadrants of A / B (rows RowWidthA / RowWidthB apart).
             ***********************************************************/
            /* Element of a scratch quadrant / of a quadrant of A / of B */
#define E(Matrix)   (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) )
#define EA(Matrix)  (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) )
#define EB(Matrix)  (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) )

            /* FIXME - may pay to expand these out - got higher speed-ups below */
            /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */
            E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) );

            /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */
            E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21);

            /* S3 = A11 - A21 */
            E(S3) = EA(A11) - EA(A21);

            /* S7 = B22 - B12 */
            E(S7) = EB(B22) - EB(B12);

            TempMatrixOffset += sizeof(REAL);
            MatrixOffsetA += sizeof(REAL);
            MatrixOffsetB += sizeof(REAL);
        } /* end column loop */

        /* skip the part of the A / B row that lies outside the quadrant */
        MatrixOffsetA += RowIncrementA;
        MatrixOffsetB += RowIncrementB;
    } /* end row loop */

    /* M2 = A11 x B11 */
    SPAWN(OptimizedStrassenMultiply, M2, A11, B11, QuadrantSize,
            QuadrantSize, RowWidthA, RowWidthB);

    /* M5 = S1 * S5 */
    SPAWN(OptimizedStrassenMultiply, M5, S1, S5, QuadrantSize,
            QuadrantSize, QuadrantSize, QuadrantSize);

    /* Step 1 of T1 = S2 x S6 + M2 */
    SPAWN(OptimizedStrassenMultiply, T1sMULT, S2, S6,  QuadrantSize,
            QuadrantSize, QuadrantSize, QuadrantSize);

    /* Step 1 of T2 = T1 + S3 x S7 */
    SPAWN(OptimizedStrassenMultiply, C22, S3, S7, QuadrantSize,
            RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize);

    /* Step 1 of C11 = M2 + A12 * B21 */
    SPAWN(OptimizedStrassenMultiply, C11, A12, B21, QuadrantSize,
            RowWidthC, RowWidthA, RowWidthB);

    /* Step 1 of C12 = S4 x B22 + T1 + M5 */
    SPAWN(OptimizedStrassenMultiply, C12, S4, B22, QuadrantSize,
            RowWidthC, QuadrantSize, RowWidthB);

    /* Step 1 of C21 = T2 - A22 * S8 */
    SPAWN(OptimizedStrassenMultiply, C21, A22, S8, QuadrantSize,
            RowWidthC, RowWidthA, QuadrantSize);

    /**********************************************
     ** Synchronization Point
     ** (one SYNC per SPAWN issued above)
     **********************************************/
    SYNC(OptimizedStrassenMultiply);
    SYNC(OptimizedStrassenMultiply);
    SYNC(OptimizedStrassenMultiply);
    SYNC(OptimizedStrassenMultiply);
    SYNC(OptimizedStrassenMultiply);
    SYNC(OptimizedStrassenMultiply);
    SYNC(OptimizedStrassenMultiply);


    /***************************************************************************
     ** Step through all columns row by row (vertically)
     ** (jumps in memory by RowWidth => bad locality)
     ** (but we want the best locality on the innermost loop)
     ***************************************************************************/
    for (Row = 0; Row < QuadrantSize; Row++) {

        /*************************************************************************
         ** Step through each row horizontally (addressing elements in each column)
         ** (jumps linearly through memory => good locality)
         ** Four columns are combined per iteration.
         *************************************************************************/
        for (Column = 0; Column < QuadrantSize; Column += 4) {
            REAL LocalM5_0 = *(M5);
            REAL LocalM5_1 = *(M5+1);
            REAL LocalM5_2 = *(M5+2);
            REAL LocalM5_3 = *(M5+3);
            REAL LocalM2_0 = *(M2);
            REAL LocalM2_1 = *(M2+1);
            REAL LocalM2_2 = *(M2+2);
            REAL LocalM2_3 = *(M2+3);
            /* T1 = S2 x S6 + M2 */
            REAL T1_0 = *(T1sMULT) + LocalM2_0;
            REAL T1_1 = *(T1sMULT+1) + LocalM2_1;
            REAL T1_2 = *(T1sMULT+2) + LocalM2_2;
            REAL T1_3 = *(T1sMULT+3) + LocalM2_3;
            /* T2 = T1 + S3 x S7 (the product currently sits in C22) */
            REAL T2_0 = *(C22) + T1_0;
            REAL T2_1 = *(C22+1) + T1_1;
            REAL T2_2 = *(C22+2) + T1_2;
            REAL T2_3 = *(C22+3) + T1_3;
            (*(C11))   += LocalM2_0;
            (*(C11+1)) += LocalM2_1;
            (*(C11+2)) += LocalM2_2;
            (*(C11+3)) += LocalM2_3;
            (*(C12))   += LocalM5_0 + T1_0;
            (*(C12+1)) += LocalM5_1 + T1_1;
            (*(C12+2)) += LocalM5_2 + T1_2;
            (*(C12+3)) += LocalM5_3 + T1_3;
            (*(C22))   = LocalM5_0 + T2_0;
            (*(C22+1)) = LocalM5_1 + T2_1;
            (*(C22+2)) = LocalM5_2 + T2_2;
            (*(C22+3)) = LocalM5_3 + T2_3;
            (*(C21  )) = (- *(C21  )) + T2_0;
            (*(C21+1)) = (- *(C21+1)) + T2_1;
            (*(C21+2)) = (- *(C21+2)) + T2_2;
            (*(C21+3)) = (- *(C21+3)) + T2_3;
            M5 += 4;
            M2 += 4;
            T1sMULT += 4;
            C11 += 4;
            C12 += 4;
            C21 += 4;
            C22 += 4;
        }

        /* advance the C quadrant pointers to the start of their next row */
        C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC);
        C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC);
        C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC);
        C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC);
    }

    free(StartHeap);

}
Example #3
0
/*****************************************************************************
 **
 ** OptimizedStrassenMultiply_par
 **
 ** For large matrices A, B, and C of size MatrixSize * MatrixSize this
 ** function performs the operation C = A x B efficiently.
 **
 ** Matrices of at most cutoff_size elements per side are handed to
 ** MultiplyByDivideAndConquer. Larger ones are split into quadrants; eight
 ** quadrant sums/differences (S1..S8) are formed in scratch memory, seven
 ** quadrant products are computed recursively, and the products are then
 ** combined into C. While Depth < cutoff_depth every step is issued as an
 ** OpenMP task; deeper levels run the same steps sequentially.
 **
 ** If the scratch memory cannot be allocated, the whole product is computed
 ** by MultiplyByDivideAndConquer instead, which needs no scratch memory.
 **
 ** INPUT:
 **    C = (*C WRITE) Address of top left element of matrix C.
 **    A = (*A IS READ ONLY) Address of top left element of matrix A.
 **    B = (*B IS READ ONLY) Address of top left element of matrix B.
 **    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
 **    RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
 **    RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
 **    RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
 **    Depth = Recursion level of this call (each recursive call passes
 **            Depth+1).
 **    cutoff_depth = Recursion level from which no more tasks are created.
 **    cutoff_size = Largest MatrixSize handed to MultiplyByDivideAndConquer.
 **
 ** OUTPUT:
 **    C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.)
 **
 *****************************************************************************/
static void OptimizedStrassenMultiply_par(double *C, double *A, double *B,
    unsigned MatrixSize, unsigned RowWidthC, unsigned RowWidthA,
    unsigned RowWidthB, unsigned int Depth, unsigned int cutoff_depth,
    unsigned cutoff_size)
{
  unsigned QuadrantSize = MatrixSize >> 1; /* MatrixSize / 2 */
  /* Bytes per scratch quadrant. Kept in size_t so that neither this product
   * nor the total allocation size wraps around for large matrices. */
  size_t QuadrantSizeInBytes = sizeof(double) * QuadrantSize * QuadrantSize;
  unsigned Column, Row;

  /************************************************************************
   ** For each matrix A, B, and C, we'll want pointers to each quadrant
   ** in the matrix. These quadrants will be addressed as follows:
   **  --        --
   **  | A    A12 |
   **  |          |
   **  | A21  A22 |
   **  --        --
   ************************************************************************/
  double /* *A, *B, *C, */ *A12, *B12, *C12,
         *A21, *B21, *C21, *A22, *B22, *C22;

  double *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT;
#define T2sMULT C22
#define NumberOfVariables 11

  char *Heap;
  void *StartHeap;

  if (MatrixSize <= cutoff_size) {
    MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0);
    return;
  }

  /* Initialize quadrant matrices */
  A12 = A + QuadrantSize;
  B12 = B + QuadrantSize;
  C12 = C + QuadrantSize;
  A21 = A + (RowWidthA * QuadrantSize);
  B21 = B + (RowWidthB * QuadrantSize);
  C21 = C + (RowWidthC * QuadrantSize);
  A22 = A21 + QuadrantSize;
  B22 = B21 + QuadrantSize;
  C22 = C21 + QuadrantSize;

  /* Allocate Heap Space Here */
  StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables);
  if (StartHeap == NULL) {
    /* No scratch memory available: compute the same product with the
     * divide-and-conquer routine, which works in place. */
    MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0);
    return;
  }

  /* Distribute the heap space over the variables (densely packed
   * QuadrantSize x QuadrantSize matrices) */
  S1 = (double*) Heap; Heap += QuadrantSizeInBytes;
  S2 = (double*) Heap; Heap += QuadrantSizeInBytes;
  S3 = (double*) Heap; Heap += QuadrantSizeInBytes;
  S4 = (double*) Heap; Heap += QuadrantSizeInBytes;
  S5 = (double*) Heap; Heap += QuadrantSizeInBytes;
  S6 = (double*) Heap; Heap += QuadrantSizeInBytes;
  S7 = (double*) Heap; Heap += QuadrantSizeInBytes;
  S8 = (double*) Heap; Heap += QuadrantSizeInBytes;
  M2 = (double*) Heap; Heap += QuadrantSizeInBytes;
  M5 = (double*) Heap; Heap += QuadrantSizeInBytes;
  T1sMULT = (double*) Heap; Heap += QuadrantSizeInBytes;

  if (Depth < cutoff_depth)
  {
    /* Task version. The depend clauses order the sibling tasks below.
     * NOTE(review): the list items are the local pointer variables
     * themselves rather than array sections - confirm this is intended. */

    /* S1 = A21 + A22 */
#pragma omp task depend(in: A21, A22) depend(out: S1) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column++)
        S1[Row * QuadrantSize + Column] = A21[RowWidthA * Row + Column] + A22[RowWidthA * Row + Column];

    /* S2 = S1 - A */
#pragma omp task depend(in: S1, A) depend(out: S2) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column++)
        S2[Row * QuadrantSize + Column] = S1[Row * QuadrantSize + Column] - A[RowWidthA * Row + Column];

    /* S4 = A12 - S2 */
#pragma omp task depend(in: A12, S2) depend(out: S4) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column++)
        S4[Row * QuadrantSize + Column] = A12[Row * RowWidthA + Column] - S2[QuadrantSize * Row + Column];

    /* S5 = B12 - B */
#pragma omp task depend(in: B12, B) depend(out: S5) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column++)
        S5[Row * QuadrantSize + Column] = B12[Row * RowWidthB + Column] - B[Row * RowWidthB + Column];

    /* S6 = B22 - S5 */
#pragma omp task depend(in: B22, S5) depend(out: S6) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column++)
        S6[Row * QuadrantSize + Column] = B22[Row * RowWidthB + Column] - S5[Row * QuadrantSize + Column];

    /* S8 = S6 - B21 */
#pragma omp task depend(in: S6, B21) depend(out: S8) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column++)
        S8[Row * QuadrantSize + Column] = S6[Row * QuadrantSize + Column] - B21[Row * RowWidthB + Column];

    /* S3 = A - A21 */
#pragma omp task depend(in: A, A21) depend(out: S3) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column++)
        S3[Row * QuadrantSize + Column] = A[RowWidthA * Row + Column] - A21[RowWidthA * Row + Column];

    /* S7 = B22 - B12 */
#pragma omp task depend(in: B22, B12) depend(out: S7) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column++)
        S7[Row * QuadrantSize + Column] = B22[Row * RowWidthB + Column] - B12[Row * RowWidthB + Column];

    /* M2 = A x B */
#pragma omp task depend(in: A, B) depend(out: M2)
    OptimizedStrassenMultiply_par(M2, A, B, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1, cutoff_depth, cutoff_size);

    /* M5 = S1 * S5 */
#pragma omp task untied depend(in: S1, S5) depend(out: M5)
    OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1, cutoff_depth, cutoff_size);

    /* Step 1 of T1 = S2 x S6 + M2 */
#pragma omp task untied depend(in: S2, S6) depend(out: T1sMULT)
    OptimizedStrassenMultiply_par(T1sMULT, S2, S6,  QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1, cutoff_depth, cutoff_size);

    /* Step 1 of T2 = T1 + S3 x S7 */
#pragma omp task untied depend(in: S3, S7) depend(out: C22)
    OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1, cutoff_depth, cutoff_size);

    /* Step 1 of C = M2 + A12 * B21 */
#pragma omp task untied depend(in: A12, B21) depend(out: C)
    OptimizedStrassenMultiply_par(C, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1, cutoff_depth, cutoff_size);

    /* Step 1 of C12 = S4 x B22 + T1 + M5 */
#pragma omp task untied depend(in: S4, B22) depend(out: C12)
    OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1, cutoff_depth, cutoff_size);

    /* Step 1 of C21 = T2 - A22 * S8 */
#pragma omp task untied depend(in: A22, S8) depend(out: C21)
    OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1, cutoff_depth, cutoff_size);

    /* C = A12 x B21 + M2 */
#pragma omp task depend(inout: C) depend(in: M2) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column += 1)
        C[RowWidthC * Row + Column] += M2[Row * QuadrantSize + Column];

    /* C12 = S4 x B22 + M5 + T1, with T1 = T1sMULT + M2 */
#pragma omp task depend(inout: C12) depend(in: M5, T1sMULT, M2) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column += 1)
        C12[RowWidthC * Row + Column] += M5[Row * QuadrantSize + Column] + T1sMULT[Row * QuadrantSize + Column] + M2[Row * QuadrantSize + Column];

    /* C21 = T2 - A22 x S8, with T2 = S3 x S7 + T1. This task reads C22
     * while it still holds S3 x S7, so it is created before the task that
     * updates C22. */
#pragma omp task depend(inout: C21) depend(in: C22, T1sMULT, M2) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column += 1)
        C21[RowWidthC * Row + Column] = -C21[RowWidthC * Row + Column] + C22[RowWidthC * Row + Column] + T1sMULT[Row * QuadrantSize + Column] + M2[Row * QuadrantSize + Column];

    /* C22 = T2 + M5 */
#pragma omp task depend(inout: C22) depend(in: M5, T1sMULT, M2) private(Row, Column)
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column += 1)
        C22[RowWidthC * Row + Column] += M5[Row * QuadrantSize + Column] + T1sMULT[Row * QuadrantSize + Column] + M2[Row * QuadrantSize + Column];

    /* all tasks created above must finish before the scratch memory is freed */
#pragma omp taskwait
  }
  else
  {
    /* Sequential version of the same steps. */
    for (Row = 0; Row < QuadrantSize; Row++)
      for (Column = 0; Column < QuadrantSize; Column++) {
        S1[Row * QuadrantSize + Column] = A21[RowWidthA * Row + Column] + A22[RowWidthA * Row + Column];
        S2[Row * QuadrantSize + Column] = S1[Row * QuadrantSize + Column] - A[RowWidthA * Row + Column];
        S4[Row * QuadrantSize + Column] = A12[Row * RowWidthA + Column] - S2[QuadrantSize * Row + Column];
        S5[Row * QuadrantSize + Column] = B12[Row * RowWidthB + Column] - B[Row * RowWidthB + Column];
        S6[Row * QuadrantSize + Column] = B22[Row * RowWidthB + Column] - S5[Row * QuadrantSize + Column];
        S8[Row * QuadrantSize + Column] = S6[Row * QuadrantSize + Column] - B21[Row * RowWidthB + Column];
        S3[Row * QuadrantSize + Column] = A[RowWidthA * Row + Column] - A21[RowWidthA * Row + Column];
        S7[Row * QuadrantSize + Column] = B22[Row * RowWidthB + Column] - B12[Row * RowWidthB + Column];
      }
    /* M2 = A x B */
    OptimizedStrassenMultiply_par(M2, A, B, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1, cutoff_depth, cutoff_size);
    /* M5 = S1 * S5 */
    OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1, cutoff_depth, cutoff_size);
    /* Step 1 of T1 = S2 x S6 + M2 */
    OptimizedStrassenMultiply_par(T1sMULT, S2, S6,  QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1, cutoff_depth, cutoff_size);
    /* Step 1 of T2 = T1 + S3 x S7 */
    OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1, cutoff_depth, cutoff_size);
    /* Step 1 of C = M2 + A12 * B21 */
    OptimizedStrassenMultiply_par(C, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1, cutoff_depth, cutoff_size);
    /* Step 1 of C12 = S4 x B22 + T1 + M5 */
    OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1, cutoff_depth, cutoff_size);
    /* Step 1 of C21 = T2 - A22 * S8 */
    OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1, cutoff_depth, cutoff_size);

    /* Combine the products. C21 must be updated before C22 because it
     * reads the value C22 holds before C22 itself is updated. */
    for (Row = 0; Row < QuadrantSize; Row++) {
      for (Column = 0; Column < QuadrantSize; Column += 1) {
        C[RowWidthC * Row + Column] += M2[Row * QuadrantSize + Column];
        C12[RowWidthC * Row + Column] += M5[Row * QuadrantSize + Column] + T1sMULT[Row * QuadrantSize + Column] + M2[Row * QuadrantSize + Column];
        C21[RowWidthC * Row + Column] = -C21[RowWidthC * Row + Column] + C22[RowWidthC * Row + Column] + T1sMULT[Row * QuadrantSize + Column] + M2[Row * QuadrantSize + Column];
        C22[RowWidthC * Row + Column] += M5[Row * QuadrantSize + Column] + T1sMULT[Row * QuadrantSize + Column] + M2[Row * QuadrantSize + Column];
      }
    }
  }
  free(StartHeap);
}