Esempi in C++ (Cpp) per A1_STAR_MC

Esempio n. 1

0

Mostra file

File: distance.hpp Progetto: TPNguyen/libskylark

void L1DistanceMatrix(direction_t dirA, direction_t dirB, T alpha,
    const El::ElementalMatrix<T> &APre, const El::ElementalMatrix<T> &BPre,
    T beta, El::ElementalMatrix<T> &CPre) {

    if (dirA == base::COLUMNS && dirB == base::COLUMNS) {
        // Use a SUMMA-like routine, with C as stationary
        // Basically an adaptation of Elementals TN case for stationary C.

        const El::Int m = CPre.Height();
        const El::Int n = CPre.Width();
        const El::Int sumDim = BPre.Height();
        const El::Int bsize = El::Blocksize();
        const El::Grid& g = APre.Grid();

        El::DistMatrixReadProxy<T, T, El::MC, El::MR> AProx(APre);
        El::DistMatrixReadProxy<T, T, El::MC, El::MR> BProx(BPre);
        El::DistMatrixReadWriteProxy<T, T, El::MC, El::MR> CProx(CPre);
        auto& A = AProx.GetLocked();
        auto& B = BProx.GetLocked();
        auto& C = CProx.Get();

        // Temporary distributions
        El::DistMatrix<T, El::STAR, El::MC> A1_STAR_MC(g);
        El::DistMatrix<T, El::STAR, El::MR> B1_STAR_MR(g);

        A1_STAR_MC.AlignWith(C);
        B1_STAR_MR.AlignWith(C);

        El::Scale(beta, C);
        for(El::Int k = 0; k < sumDim; k += bsize) {
            const El::Int nb = std::min(bsize,sumDim-k);
            auto A1 = A(El::IR(k,k+nb), El::IR(0,m));
            auto B1 = B(El::IR(k,k+nb), El::IR(0,n));

            A1_STAR_MC = A1;
            B1_STAR_MR = B1;
            L1DistanceMatrix(base::COLUMNS, base::COLUMNS, alpha,
                A1_STAR_MC.LockedMatrix(), B1_STAR_MR.LockedMatrix(),
                T(1.0), C.Matrix());
        }
    }

    // TODO the rest of the cases.
}

Esempio n. 2

0

Mostra file

File: TN.hpp Progetto: YingzhouLi/Elemental

void SUMMA_TNC
( Orientation orientA,
  T alpha,
  const AbstractDistMatrix<T>& APre,
  const AbstractDistMatrix<T>& BPre,
        AbstractDistMatrix<T>& CPre )
{
    DEBUG_CSE
    const Int sumDim = BPre.Height();
    const Int bsize = Blocksize();
    const Grid& g = APre.Grid();

    DistMatrixReadProxy<T,T,MC,MR> AProx( APre );
    DistMatrixReadProxy<T,T,MC,MR> BProx( BPre );
    DistMatrixReadWriteProxy<T,T,MC,MR> CProx( CPre );
    auto& A = AProx.GetLocked();
    auto& B = BProx.GetLocked();
    auto& C = CProx.Get();

    // Temporary distributions
    DistMatrix<T,STAR,MC> A1_STAR_MC(g);
    DistMatrix<T,MR,STAR> B1Trans_MR_STAR(g);

    A1_STAR_MC.AlignWith( C );
    B1Trans_MR_STAR.AlignWith( C );

    for( Int k=0; k<sumDim; k+=bsize )
    {
        const Int nb = Min(bsize,sumDim-k);
        auto A1 = A( IR(k,k+nb), ALL );
        auto B1 = B( IR(k,k+nb), ALL );

        // C[MC,MR] += alpha (A1[*,MC])^T B1[*,MR]
        //           = alpha (A1^T)[MC,*] B1[*,MR]
        A1_STAR_MC = A1; 
        Transpose( B1, B1Trans_MR_STAR );
        LocalGemm
        ( orientA, TRANSPOSE, alpha, A1_STAR_MC, B1Trans_MR_STAR, T(1), C );
    }
}

Esempio n. 3

0

Mostra file

File: LT.hpp Progetto: elemental/Elemental

void LT_C
( T alpha,
  const AbstractDistMatrix<T>& APre,
  AbstractDistMatrix<T>& CPre,
  bool conjugate=false )
{
    EL_DEBUG_CSE
    const Int r = APre.Height();
    const Int bsize = Blocksize();
    const Grid& g = APre.Grid();
    const Orientation orientation = ( conjugate ? ADJOINT : TRANSPOSE );

    DistMatrixReadProxy<T,T,MC,MR> AProx( APre );
    DistMatrixReadWriteProxy<T,T,MC,MR> CProx( CPre );
    auto& A = AProx.GetLocked();
    auto& C = CProx.Get();

    // Temporary distributions
    DistMatrix<T,MR,  STAR> A1Trans_MR_STAR(g);
    DistMatrix<T,STAR,VR  > A1_STAR_VR(g);
    DistMatrix<T,STAR,MC  > A1_STAR_MC(g);

    A1Trans_MR_STAR.AlignWith( C );
    A1_STAR_MC.AlignWith( C );

    for( Int k=0; k<r; k+=bsize )
    {
        const Int nb = Min(bsize,r-k);
        auto A1 = A( IR(k,k+nb), ALL );

        Transpose( A1, A1Trans_MR_STAR );
        Transpose( A1Trans_MR_STAR, A1_STAR_VR );
        A1_STAR_MC = A1_STAR_VR;

        LocalTrrk
        ( LOWER, orientation, TRANSPOSE,
          alpha, A1_STAR_MC, A1Trans_MR_STAR, T(1), C );
    }
}

Esempio n. 4

0

Mostra file

File: distance.hpp Progetto: TPNguyen/libskylark

void SymmetricL1DistanceMatrix(El::UpperOrLower uplo, direction_t dir, T alpha,
    const El::ElementalMatrix<T> &APre, T beta, El::ElementalMatrix<T> &CPre) {

    if (dir == base::COLUMNS) {

        const El::Int r = APre.Height();
        const El::Int bsize = El::Blocksize();
        const El::Grid& g = APre.Grid();

        El::DistMatrixReadProxy<T, T, El::MC, El::MR> AProx(APre);
        El::DistMatrixReadWriteProxy<T, T, El::MC, El::MR> CProx(CPre);
        auto& A = AProx.GetLocked();
        auto& C = CProx.Get();

        // Temporary distributions
        El::DistMatrix<T, El::STAR, El::MR> A1_STAR_MR(g);
        El::DistMatrix<T, El::STAR, El::MC> A1_STAR_MC(g);

        A1_STAR_MC.AlignWith(C);
        A1_STAR_MR.AlignWith(C);

        El::ScaleTrapezoid(beta, uplo, C);
        for(El::Int k = 0; k < r; k += bsize) {
            const El::Int nb = std::min(bsize, r - k);
            auto A1 = A(El::IR(k, k + nb), El::ALL);

            A1_STAR_MC = A1;
            A1_STAR_MR = A1;

            internal::L1DistanceMatrixTU(uplo, base::COLUMNS, base::COLUMNS, 
                alpha, A1_STAR_MC, A1_STAR_MR,
                T(1.0), C);
        }
    }

    // TODO the rest of the cases.
}

Esempio n. 5

0

Mostra file

File: UT.hpp Progetto: ahmadia/Elemental-1

inline void
Syr2kUT
( T alpha, const DistMatrix<T>& A, const DistMatrix<T>& B,
  T beta,        DistMatrix<T>& C,
  bool conjugate=false )
{
#ifndef RELEASE
    CallStackEntry entry("internal::Syr2kUT");
    if( A.Grid() != B.Grid() || B.Grid() != C.Grid() )
        throw std::logic_error
        ("{A,B,C} must be distributed over the same grid");
    if( A.Width() != C.Height() || 
        A.Width() != C.Width()  ||
        B.Width() != C.Height() ||
        B.Width() != C.Width()  ||
        A.Height() != B.Height()  )
    {
        std::ostringstream msg;
        msg << "Nonconformal Syr2kUT:\n"
            << "  A ~ " << A.Height() << " x " << A.Width() << "\n"
            << "  B ~ " << B.Height() << " x " << B.Width() << "\n"
            << "  C ~ " << C.Height() << " x " << C.Width() << "\n";
        throw std::logic_error( msg.str().c_str() );
    }
#endif
    const Grid& g = A.Grid();
    const Orientation orientation = ( conjugate ? ADJOINT : TRANSPOSE );

    // Matrix views
    DistMatrix<T> AT(g),  A0(g),
                  AB(g),  A1(g),
                          A2(g);
    DistMatrix<T> BT(g),  B0(g),
                  BB(g),  B1(g),
                          B2(g);

    // Temporary distributions
    DistMatrix<T,MR,  STAR> A1Trans_MR_STAR(g);
    DistMatrix<T,MR,  STAR> B1Trans_MR_STAR(g);
    DistMatrix<T,STAR,VR  > A1_STAR_VR(g);
    DistMatrix<T,STAR,VR  > B1_STAR_VR(g);
    DistMatrix<T,STAR,MC  > A1_STAR_MC(g);
    DistMatrix<T,STAR,MC  > B1_STAR_MC(g);

    A1Trans_MR_STAR.AlignWith( C );
    B1Trans_MR_STAR.AlignWith( C );
    A1_STAR_MC.AlignWith( C );
    B1_STAR_MC.AlignWith( C );

    // Start the algorithm
    ScaleTrapezoid( beta, LEFT, UPPER, 0, C );
    LockedPartitionDown
    ( A, AT, 
         AB, 0 );
    LockedPartitionDown
    ( B, BT,
         BB, 0 );
    while( AB.Height() > 0 )
    {
        LockedRepartitionDown
        ( AT,  A0,
         /**/ /**/
               A1,
          AB,  A2 );

        LockedRepartitionDown
        ( BT,  B0,
         /**/ /**/
               B1,
          BB,  B2 );

        //--------------------------------------------------------------------//
        A1Trans_MR_STAR.TransposeFrom( A1 );
        A1_STAR_VR.TransposeFrom( A1Trans_MR_STAR );
        A1_STAR_MC = A1_STAR_VR;

        B1Trans_MR_STAR.TransposeFrom( B1 );
        B1_STAR_VR.TransposeFrom( B1Trans_MR_STAR );
        B1_STAR_MC = B1_STAR_VR;

        LocalTrr2k
        ( UPPER, orientation, TRANSPOSE, orientation, TRANSPOSE, 
          alpha, A1_STAR_MC, B1Trans_MR_STAR,
                 B1_STAR_MC, A1Trans_MR_STAR,
          T(1),  C );
        //--------------------------------------------------------------------//

        SlideLockedPartitionDown
        ( AT,  A0,
               A1,
         /**/ /**/
          AB,  A2 );

        SlideLockedPartitionDown
        ( BT,  B0,
               B1,
         /**/ /**/
          BB,  B2 );
    }
}

Esempio n. 6

0

Mostra file

File: TT.hpp Progetto: certik/Elemental

inline void
GemmTTC
( Orientation orientationOfA, 
  Orientation orientationOfB,
  T alpha, const DistMatrix<T>& A,
           const DistMatrix<T>& B,
  T beta,        DistMatrix<T>& C )
{
#ifndef RELEASE
    PushCallStack("internal::GemmTTC");
    if( A.Grid() != B.Grid() || B.Grid() != C.Grid() )
        throw std::logic_error
        ("{A,B,C} must be distributed over the same grid");
    if( orientationOfA == NORMAL || orientationOfB == NORMAL )
        throw std::logic_error
        ("GemmTTC expects A and B to be (Conjugate)Transposed");
    if( A.Width()  != C.Height() ||
        B.Height() != C.Width()  ||
        A.Height() != B.Width()    )
    {
        std::ostringstream msg;
        msg << "Nonconformal GemmTTC: \n"
            << "  A ~ " << A.Height() << " x " << A.Width() << "\n"
            << "  B ~ " << B.Height() << " x " << B.Width() << "\n"
            << "  C ~ " << C.Height() << " x " << C.Width() << "\n";
        throw std::logic_error( msg.str() );
    }
#endif
    const Grid& g = A.Grid();

    // Matrix views
    DistMatrix<T> AT(g),  A0(g),
                  AB(g),  A1(g),
                          A2(g);
    DistMatrix<T> BL(g), BR(g),
                  B0(g), B1(g), B2(g);

    // Temporary distributions
    DistMatrix<T,STAR,MC  > A1_STAR_MC(g);
    DistMatrix<T,VR,  STAR> B1_VR_STAR(g);
    DistMatrix<T,STAR,MR  > B1AdjOrTrans_STAR_MR(g);

    A1_STAR_MC.AlignWith( C );
    B1_VR_STAR.AlignWith( C );
    B1AdjOrTrans_STAR_MR.AlignWith( C );
    
    // Start the algorithm    
    Scale( beta, C );
    LockedPartitionDown
    ( A, AT,
         AB, 0 ); 
    LockedPartitionRight( B, BL, BR, 0 );
    while( AB.Height() > 0 )
    {
        LockedRepartitionDown
        ( AT,  A0,
         /**/ /**/
               A1,
          AB,  A2 );

        LockedRepartitionRight
        ( BL, /**/     BR,
          B0, /**/ B1, B2 );

        //--------------------------------------------------------------------//
        A1_STAR_MC = A1; 
        B1_VR_STAR = B1;
        if( orientationOfB == ADJOINT )
            B1AdjOrTrans_STAR_MR.AdjointFrom( B1_VR_STAR );
        else
            B1AdjOrTrans_STAR_MR.TransposeFrom( B1_VR_STAR );

        // C[MC,MR] += alpha (A1[*,MC])^[T/H] (B1[MR,*])^[T/H]
        //           = alpha (A1^[T/H])[MC,*] (B1^[T/H])[*,MR]
        LocalGemm
        ( orientationOfA, NORMAL, 
          alpha, A1_STAR_MC, B1AdjOrTrans_STAR_MR, T(1), C );
        //--------------------------------------------------------------------//

        SlideLockedPartitionDown
        ( AT,  A0,
               A1,
         /**/ /**/
          AB,  A2 );

        SlideLockedPartitionRight
        ( BL,     /**/ BR,
          B0, B1, /**/ B2 );
    }
#ifndef RELEASE
    PopCallStack();
#endif
}

Esempio n. 7

0

Mostra file

File: NN.hpp Progetto: jimgoo/Elemental

inline void 
GemmNNB
( T alpha, const DistMatrix<T>& A,
           const DistMatrix<T>& B,
  T beta,        DistMatrix<T>& C )
{
#ifndef RELEASE
    PushCallStack("internal::GemmNNB");
    if( A.Grid() != B.Grid() || B.Grid() != C.Grid() )
        throw std::logic_error
        ("{A,B,C} must be distributed over the same grid");
    if( A.Height() != C.Height() ||
        B.Width()  != C.Width()  ||
        A.Width()  != B.Height() )
    {
        std::ostringstream msg;
        msg << "Nonconformal GemmNNB: \n"
            << "  A ~ " << A.Height() << " x " << A.Width() << "\n"
            << "  B ~ " << B.Height() << " x " << B.Width() << "\n"
            << "  C ~ " << C.Height() << " x " << C.Width() << "\n";
        throw std::logic_error( msg.str().c_str() );
    }
#endif
    const Grid& g = A.Grid();

    // Matrix views
    DistMatrix<T> AT(g),  A0(g),
                  AB(g),  A1(g),
                          A2(g);
    DistMatrix<T> CT(g),  C0(g),
                  CB(g),  C1(g),
                          C2(g);

    // Temporary distributions
    DistMatrix<T,STAR,MC> A1_STAR_MC(g);
    DistMatrix<T,MR,STAR> D1Trans_MR_STAR(g);

    A1_STAR_MC.AlignWith( B );
    D1Trans_MR_STAR.AlignWith( B );

    // Start the algorithm
    Scale( beta, C );
    LockedPartitionDown
    ( A, AT,
         AB, 0 );
    PartitionDown
    ( C, CT,
         CB, 0 );
    while( AB.Height() > 0 )
    {
        LockedRepartitionDown
        ( AT,  A0,
         /**/ /**/
               A1,
          AB,  A2 );

        RepartitionDown
        ( CT,  C0,
         /**/ /**/
               C1,
          CB,  C2 );

        Zeros( C1.Width(), C1.Height(), D1Trans_MR_STAR );
        //--------------------------------------------------------------------//
        A1_STAR_MC = A1; // A1[*,MC] <- A1[MC,MR]

        // D1^T[MR,* ] := alpha B^T[MR,MC] A1^T[MC,* ]
        LocalGemm
        ( TRANSPOSE, TRANSPOSE, alpha, B, A1_STAR_MC, T(0), D1Trans_MR_STAR );

        C1.TransposeSumScatterUpdate( T(1), D1Trans_MR_STAR );
        //--------------------------------------------------------------------//

        SlideLockedPartitionDown
        ( AT,  A0,
               A1,
         /**/ /**/
          AB,  A2 );
 
        SlidePartitionDown
        ( CT,  C0,
               C1,
         /**/ /**/
          CB,  C2 );
    }
#ifndef RELEASE
    PopCallStack();
#endif
}

Esempio n. 8

0

Mostra file

File: TTTT.hpp Progetto: jimgoo/Elemental

inline void
Trr2kTTTT
( UpperOrLower uplo,
  Orientation orientationOfA, Orientation orientationOfB,
  Orientation orientationOfC, Orientation orientationOfD, 
  T alpha, const DistMatrix<T>& A, const DistMatrix<T>& B,
           const DistMatrix<T>& C, const DistMatrix<T>& D,
  T beta,        DistMatrix<T>& E )
{
#ifndef RELEASE
    PushCallStack("internal::Trr2kTTTT");
    if( E.Height() != E.Width()  || A.Height() != C.Height() ||
        A.Width()  != E.Height() || C.Width()  != E.Height() ||
        B.Height() != E.Width()  || D.Height() != E.Width()  ||
        A.Height() != B.Width()  || C.Height() != D.Width() )
        throw std::logic_error("Nonconformal Trr2kTTTT");
#endif
    const Grid& g = E.Grid();

    DistMatrix<T> AT(g),  A0(g),
                  AB(g),  A1(g),
                          A2(g);
    DistMatrix<T> BL(g), BR(g),
                  B0(g), B1(g), B2(g);

    DistMatrix<T> CT(g),  C0(g),
                  CB(g),  C1(g),
                          C2(g);
    DistMatrix<T> DL(g), DR(g),
                  D0(g), D1(g), D2(g);

    DistMatrix<T,STAR,MC  > A1_STAR_MC(g);
    DistMatrix<T,VR,  STAR> B1_VR_STAR(g);
    DistMatrix<T,STAR,MR  > B1AdjOrTrans_STAR_MR(g);
    DistMatrix<T,STAR,MC  > C1_STAR_MC(g);
    DistMatrix<T,VR,  STAR> D1_VR_STAR(g);
    DistMatrix<T,STAR,MR  > D1AdjOrTrans_STAR_MR(g);

    A1_STAR_MC.AlignWith( E );
    B1_VR_STAR.AlignWith( E );
    B1AdjOrTrans_STAR_MR.AlignWith( E );
    C1_STAR_MC.AlignWith( E );
    D1_VR_STAR.AlignWith( E );
    D1AdjOrTrans_STAR_MR.AlignWith( E );

    LockedPartitionDown
    ( A, AT,
         AB, 0 );
    LockedPartitionRight( B, BL, BR, 0 );
    LockedPartitionDown
    ( C, CT,
         CB, 0 );
    LockedPartitionRight( D, DL, DR, 0 );
    while( AT.Height() < A.Height() )
    {
        LockedRepartitionDown
        ( AT,  A0,
         /**/ /**/
               A1,
          AB,  A2 );
        LockedRepartitionRight
        ( BL, /**/ BR,
          B0, /**/ B1, B2 );
        LockedRepartitionDown
        ( CT,  C0,
         /**/ /**/
               C1,
          CB,  C2 );
        LockedRepartitionRight
        ( DL, /**/ DR,
          D0, /**/ D1, D2 );

        //--------------------------------------------------------------------//
        A1_STAR_MC = A1;
        C1_STAR_MC = C1;
        B1_VR_STAR = B1;
        D1_VR_STAR = D1;
        if( orientationOfB == ADJOINT )
            B1AdjOrTrans_STAR_MR.AdjointFrom( B1_VR_STAR );
        else
            B1AdjOrTrans_STAR_MR.TransposeFrom( B1_VR_STAR );
        if( orientationOfD == ADJOINT )
            D1AdjOrTrans_STAR_MR.AdjointFrom( D1_VR_STAR );
        else
            D1AdjOrTrans_STAR_MR.TransposeFrom( D1_VR_STAR );
        LocalTrr2k
        ( uplo, orientationOfA, orientationOfC,
          alpha, A1_STAR_MC, B1AdjOrTrans_STAR_MR,
                 C1_STAR_MC, D1AdjOrTrans_STAR_MR,
          beta,  E );
        //--------------------------------------------------------------------//

        SlideLockedPartitionRight
        ( DL,     /**/ DR,
          D0, D1, /**/ D2 );
        SlideLockedPartitionDown
        ( CT,  C0,
               C1,
         /**/ /**/
          CB,  C2 );
        SlideLockedPartitionRight
        ( BL,     /**/ BR,
          B0, B1, /**/ B2 );
        SlideLockedPartitionDown
        ( AT,  A0,
               A1,
         /**/ /**/
          AB,  A2 );
    }
#ifndef RELEASE
    PopCallStack();
#endif
}

Esempio n. 9

0

Mostra file

File: LC.hpp Progetto: jimgoo/Elemental

inline void
HerkLC( T alpha, const DistMatrix<T>& A, T beta, DistMatrix<T>& C )
{
#ifndef RELEASE
    PushCallStack("internal::HerkLC");
    if( A.Grid() != C.Grid() )
        throw std::logic_error
        ("A and C must be distributed over the same grid");
    if( A.Width() != C.Height() || A.Width() != C.Width() )
    {
        std::ostringstream msg;
        msg << "Nonconformal HerkLC:\n"
            << "  A ~ " << A.Height() << " x " << A.Width() << "\n"
            << "  C ~ " << C.Height() << " x " << C.Width() << "\n";
        throw std::logic_error( msg.str().c_str() );
    }
#endif
    const Grid& g = A.Grid();

    // Matrix views
    DistMatrix<T> AT(g),  A0(g),
                  AB(g),  A1(g),
                          A2(g);

    // Temporary distributions
    DistMatrix<T,MR,  STAR> A1Trans_MR_STAR(g);
    DistMatrix<T,STAR,VR  > A1_STAR_VR(g);
    DistMatrix<T,STAR,MC  > A1_STAR_MC(g);

    A1Trans_MR_STAR.AlignWith( C );
    A1_STAR_MC.AlignWith( C );

    // Start the algorithm
    ScaleTrapezoid( beta, LEFT, LOWER, 0, C );
    LockedPartitionDown
    ( A, AT, 
         AB, 0 );
    while( AB.Height() > 0 )
    {
        LockedRepartitionDown
        ( AT,  A0,
         /**/ /**/
               A1,
          AB,  A2 );

        //--------------------------------------------------------------------//
        A1Trans_MR_STAR.TransposeFrom( A1 );
        A1_STAR_VR.TransposeFrom( A1Trans_MR_STAR );
        A1_STAR_MC = A1_STAR_VR;

        LocalTrrk
        ( LOWER, ADJOINT, TRANSPOSE,
          alpha, A1_STAR_MC, A1Trans_MR_STAR, T(1), C );
        //--------------------------------------------------------------------//

        SlideLockedPartitionDown
        ( AT,  A0,
               A1,
         /**/ /**/
          AB,  A2 );
    }
#ifndef RELEASE
    PopCallStack();
#endif
}

Esempio n. 10

0

Mostra file

File: GemmTN.hpp Progetto: ahmadia/elemental

inline void
internal::GemmTNC
( Orientation orientationOfA,
  T alpha, const DistMatrix<T,MC,MR>& A,
           const DistMatrix<T,MC,MR>& B,
  T beta,        DistMatrix<T,MC,MR>& C )
{
#ifndef RELEASE
    PushCallStack("internal::GemmTNC");
    if( A.Grid() != B.Grid() || B.Grid() != C.Grid() )
        throw std::logic_error
        ("{A,B,C} must be distributed over the same grid");
    if( orientationOfA == NORMAL )
        throw std::logic_error("GemmTNC assumes A is (Conjugate)Transposed");
    if( A.Width()  != C.Height() ||
        B.Width()  != C.Width()  ||
        A.Height() != B.Height()   )
    {
        std::ostringstream msg;
        msg << "Nonconformal GemmTNC: \n"
            << "  A ~ " << A.Height() << " x " << A.Width() << "\n"
            << "  B ~ " << B.Height() << " x " << B.Width() << "\n"
            << "  C ~ " << C.Height() << " x " << C.Width() << "\n";
        throw std::logic_error( msg.str().c_str() );
    }
#endif
    const Grid& g = A.Grid();

    // Matrix views
    DistMatrix<T,MC,MR> AT(g),  A0(g),
                        AB(g),  A1(g),
                                A2(g);

    DistMatrix<T,MC,MR> BT(g),  B0(g),
                        BB(g),  B1(g),
                                B2(g);

    // Temporary distributions
    DistMatrix<T,STAR,MC> A1_STAR_MC(g);
    DistMatrix<T,STAR,MR> B1_STAR_MR(g);

    // Start the algorithm
    Scal( beta, C );
    LockedPartitionDown
    ( A, AT,
         AB, 0 );
    LockedPartitionDown
    ( B, BT,
         BB, 0 );
    while( AB.Height() > 0 )
    {
        LockedRepartitionDown
        ( AT,  A0,
         /**/ /**/
               A1,
          AB,  A2 );

        LockedRepartitionDown
        ( BT,  B0,
         /**/ /**/
               B1,
          BB,  B2 );

        A1_STAR_MC.AlignWith( C );
        B1_STAR_MR.AlignWith( C );
        //--------------------------------------------------------------------//
        A1_STAR_MC = A1; // A1[*,MC] <- A1[MC,MR]
        B1_STAR_MR = B1; // B1[*,MR] <- B1[MC,MR]

        // C[MC,MR] += alpha (A1[*,MC])^T B1[*,MR]
        //           = alpha (A1^T)[MC,*] B1[*,MR]
        internal::LocalGemm
        ( orientationOfA, NORMAL, alpha, A1_STAR_MC, B1_STAR_MR, (T)1, C );
        //--------------------------------------------------------------------//
        A1_STAR_MC.FreeAlignments();
        B1_STAR_MR.FreeAlignments();

        SlideLockedPartitionDown
        ( AT,  A0,
               A1,
         /**/ /**/
          AB,  A2 );

        SlideLockedPartitionDown
        ( BT,  B0,
               B1,
         /**/ /**/
          BB,  B2 );
    }
#ifndef RELEASE
    PopCallStack();
#endif
}