void Contract
( const ElementalMatrix<T>& A,
        ElementalMatrix<T>& B )
{
    DEBUG_ONLY(CSE cse("Contract"))
    AssertSameGrids( A, B );
    const Dist U = B.ColDist();
    const Dist V = B.RowDist();
    // TODO: Shorten this implementation?
    if( A.ColDist() == U && A.RowDist() == V )
    {
        Copy( A, B );
    }
    else if( A.ColDist() == U && A.RowDist() == Partial(V) )
    {
        B.AlignAndResize
        ( A.ColAlign(), A.RowAlign(), A.Height(), A.Width(), false, false );
        Zeros( B.Matrix(), B.LocalHeight(), B.LocalWidth() );
        AxpyContract( T(1), A, B );
    }
    else if( A.ColDist() == Partial(U) && A.RowDist() == V )
    {
        B.AlignAndResize
        ( A.ColAlign(), A.RowAlign(), A.Height(), A.Width(), false, false );
        Zeros( B.Matrix(), B.LocalHeight(), B.LocalWidth() );
        AxpyContract( T(1), A, B );
    }
    else if( A.ColDist() == U && A.RowDist() == Collect(V) )
    {
        B.AlignColsAndResize
        ( A.ColAlign(), A.Height(), A.Width(), false, false );
        Zeros( B.Matrix(), B.LocalHeight(), B.LocalWidth() );
        AxpyContract( T(1), A, B );
    }
    else if( A.ColDist() == Collect(U) && A.RowDist() == V )
    {
        B.AlignRowsAndResize
        ( A.RowAlign(), A.Height(), A.Width(), false, false );
        Zeros( B.Matrix(), B.LocalHeight(), B.LocalWidth() );
        AxpyContract( T(1), A, B );
    }
    else if( A.ColDist() == Collect(U) && A.RowDist() == Collect(V) )
    {
        Zeros( B, A.Height(), A.Width() );
        AxpyContract( T(1), A, B );
    }
    else
        LogicError("Incompatible distributions");
}
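// Example (not part of the library): a minimal usage sketch of Contract,
// assuming <El.hpp> has been included and El::Environment is initialized.
// A [MC,STAR] matrix stores a full copy of its local rows on every process
// column; Contract reduces those copies (over the row communicator) into a
// result distributed as [MC,MR]. Sizes and values are purely illustrative.
inline void ExampleContractUsage( const El::Grid& grid )
{
    // Each process column holds a copy of (its rows of) the contributions
    El::DistMatrix<double,El::MC,El::STAR> A( grid );
    El::Ones( A, 64, 64 );

    // Sum the copies and leave the result distributed as [MC,MR]
    El::DistMatrix<double,El::MC,El::MR> B( grid );
    El::Contract( A, B );
}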
void PartialColScatter
( T alpha,
  const ElementalMatrix<T>& A,
        ElementalMatrix<T>& B )
{
    DEBUG_ONLY(CSE cse("axpy_contract::PartialColScatter"))
    AssertSameGrids( A, B );
    if( A.Height() != B.Height() || A.Width() != B.Width() )
        LogicError("A and B must be the same size");
#ifdef EL_CACHE_WARNINGS
    if( A.Width() != 1 && A.Grid().Rank() == 0 )
    {
        cerr <<
          "axpy_contract::PartialColScatter potentially causes a large "
          "amount of cache-thrashing. If possible, avoid it by forming the "
          "(conjugate-)transpose of the [UGath,* ] matrix instead." << endl;
    }
#endif
    if( B.ColAlign() % A.ColStride() == A.ColAlign() )
    {
        const Int colStride = B.ColStride();
        const Int colStridePart = B.PartialColStride();
        const Int colStrideUnion = B.PartialUnionColStride();
        const Int colRankPart = B.PartialColRank();
        const Int colAlign = B.ColAlign();

        const Int height = B.Height();
        const Int width = B.Width();
        const Int localHeight = B.LocalHeight();
        const Int maxLocalHeight = MaxLength( height, colStride );
        const Int recvSize = mpi::Pad( maxLocalHeight*width );
        const Int sendSize = colStrideUnion*recvSize;

        // Allocate the packed send buffer
        vector<T> buffer;
        FastResize( buffer, sendSize );

        // Pack
        copy::util::PartialColStridedPack
        ( height, width,
          colAlign, colStride,
          colStrideUnion, colStridePart, colRankPart,
          A.ColShift(),
          A.LockedBuffer(), A.LDim(),
          buffer.data(), recvSize );

        // Communicate
        mpi::ReduceScatter( buffer.data(), recvSize, B.PartialUnionColComm() );

        // Unpack our received data
        axpy::util::InterleaveMatrixUpdate
        ( alpha, localHeight, width,
          buffer.data(), 1, localHeight,
          B.Buffer(),    1, B.LDim() );
    }
    else
        LogicError("Unaligned PartialColScatter not implemented");
}
void UniformHelmholtzGreens
( ElementalMatrix<Complex<Real>>& A, Int n, Real lambda )
{
    EL_DEBUG_CSE
    typedef Complex<Real> C;
    const Real pi = 4*Atan( Real(1) );
    const Real k0 = 2*pi/lambda;
    const Grid& g = A.Grid();

    // Generate a list of n uniform samples from the 3D unit ball
    DistMatrix<Real,STAR,VR> X_STAR_VR(3,n,g);
    for( Int jLoc=0; jLoc<X_STAR_VR.LocalWidth(); ++jLoc )
    {
        Real x0, x1, x2;
        // Sample uniformly from [-1,+1]^3 until a point is drawn from the ball
        while( true )
        {
            x0 = SampleUniform( Real(-1), Real(1) );
            x1 = SampleUniform( Real(-1), Real(1) );
            x2 = SampleUniform( Real(-1), Real(1) );
            const Real radiusSq = x0*x0 + x1*x1 + x2*x2;
            if( radiusSq > 0 && radiusSq <= 1 )
                break;
        }
        X_STAR_VR.SetLocal( 0, jLoc, x0 );
        X_STAR_VR.SetLocal( 1, jLoc, x1 );
        X_STAR_VR.SetLocal( 2, jLoc, x2 );
    }
    DistMatrix<Real,STAR,STAR> X_STAR_STAR( X_STAR_VR );

    A.Resize( n, n );
    for( Int jLoc=0; jLoc<A.LocalWidth(); ++jLoc )
    {
        const Int j = A.GlobalCol(jLoc);
        const Real xj0 = X_STAR_STAR.GetLocal(0,j);
        const Real xj1 = X_STAR_STAR.GetLocal(1,j);
        const Real xj2 = X_STAR_STAR.GetLocal(2,j);
        for( Int iLoc=0; iLoc<A.LocalHeight(); ++iLoc )
        {
            const Int i = A.GlobalRow(iLoc);
            if( i == j )
            {
                A.SetLocal( iLoc, jLoc, 0 );
            }
            else
            {
                const Real d0 = X_STAR_STAR.GetLocal(0,i)-xj0;
                const Real d1 = X_STAR_STAR.GetLocal(1,i)-xj1;
                const Real d2 = X_STAR_STAR.GetLocal(2,i)-xj2;
                const Real gamma = k0*Sqrt(d0*d0+d1*d1+d2*d2);
                const Real realPart = Cos(gamma)/gamma;
                const Real imagPart = Sin(gamma)/gamma;
                A.SetLocal( iLoc, jLoc, C(realPart,imagPart) );
            }
        }
    }
}
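// Example (not part of the library): a sketch of calling the generator above.
// The matrix size and wavelength are arbitrary illustrative choices.
inline void ExampleHelmholtzGreensUsage( const El::Grid& grid )
{
    El::DistMatrix<El::Complex<double>> A( grid );
    El::UniformHelmholtzGreens( A, 1000, 0.1 );
    // A(i,j) now holds exp(i*k0*r_ij)/(k0*r_ij) for n=1000 points sampled
    // uniformly from the unit ball (with k0 = 2*pi/0.1) and a zero diagonal.
}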
void Scatter
( T alpha,
  const ElementalMatrix<T>& A,
        ElementalMatrix<T>& B )
{
    DEBUG_ONLY(CSE cse("axpy_contract::Scatter"))
    AssertSameGrids( A, B );
    if( A.Height() != B.Height() || A.Width() != B.Width() )
        LogicError("Sizes of A and B must match");
    if( !B.Participating() )
        return;

    const Int colStride = B.ColStride();
    const Int rowStride = B.RowStride();
    const Int colAlign = B.ColAlign();
    const Int rowAlign = B.RowAlign();

    const Int height = B.Height();
    const Int width = B.Width();
    const Int localHeight = B.LocalHeight();
    const Int localWidth = B.LocalWidth();
    const Int maxLocalHeight = MaxLength(height,colStride);
    const Int maxLocalWidth = MaxLength(width,rowStride);

    const Int recvSize = mpi::Pad( maxLocalHeight*maxLocalWidth );
    const Int sendSize = colStride*rowStride*recvSize;

    // Allocate the packed send buffer
    vector<T> buffer;
    FastResize( buffer, sendSize );

    // Pack
    copy::util::StridedPack
    ( height, width,
      colAlign, colStride,
      rowAlign, rowStride,
      A.LockedBuffer(), A.LDim(),
      buffer.data(), recvSize );

    // Communicate
    mpi::ReduceScatter( buffer.data(), recvSize, B.DistComm() );

    // Unpack our received data
    axpy::util::InterleaveMatrixUpdate
    ( alpha, localHeight, localWidth,
      buffer.data(), 1, localHeight,
      B.Buffer(),    1, B.LDim() );
}
void MakeExtendedKahan
( ElementalMatrix<F>& A, Base<F> phi, Base<F> mu )
{
    EL_DEBUG_CSE
    typedef Base<F> Real;

    if( A.Height() != A.Width() )
        LogicError("Extended Kahan matrices must be square");
    const Int n = A.Height();
    if( n % 3 != 0 )
        LogicError("Dimension must be an integer multiple of 3");
    const Int l = n / 3;
    if( !l || (l & (l-1)) )
        LogicError("n/3 is not a power of two");
    Int k=0;
    while( Int(1u<<k) < l )
        ++k;

    if( phi <= Real(0) || phi >= Real(1) )
        LogicError("phi must be in (0,1)");
    if( mu <= Real(0) || mu >= Real(1) )
        LogicError("mu must be in (0,1)");

    // Start by setting A to the identity, and then modify the necessary
    // l x l blocks of its 3 x 3 partitioning.
    MakeIdentity( A );
    unique_ptr<ElementalMatrix<F>> ABlock( A.Construct(A.Grid(),A.Root()) );
    View( *ABlock, A, IR(2*l,3*l), IR(2*l,3*l) );
    *ABlock *= mu;
    View( *ABlock, A, IR(0,l), IR(l,2*l) );
    Walsh( *ABlock, k );
    *ABlock *= -phi;
    View( *ABlock, A, IR(l,2*l), IR(2*l,3*l) );
    Walsh( *ABlock, k );
    *ABlock *= phi;

    // Now scale A by S
    const Real zeta = Sqrt(Real(1)-phi*phi);
    auto& ALoc = A.Matrix();
    for( Int iLoc=0; iLoc<A.LocalHeight(); ++iLoc )
    {
        const Int i = A.GlobalRow(iLoc);
        const Real gamma = Pow(zeta,Real(i));
        for( Int jLoc=0; jLoc<A.LocalWidth(); ++jLoc )
            ALoc(iLoc,jLoc) *= gamma;
    }
}
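// Example (not part of the library): MakeExtendedKahan fills an already-sized
// square matrix whose dimension is 3*2^k. The values of phi and mu below are
// arbitrary choices from the required interval (0,1).
inline void ExampleExtendedKahanUsage( const El::Grid& grid )
{
    const El::Int k = 4;                   // n/3 = 2^k = 16
    const El::Int n = 3*(El::Int(1)<<k);   // n = 48
    El::DistMatrix<double> A( grid );
    A.Resize( n, n );
    El::MakeExtendedKahan( A, 0.5, 1e-3 );
}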
void IndexDependentMap
( const ElementalMatrix<S>& A,
        ElementalMatrix<T>& B,
        function<T(Int,Int,S)> func )
{
    DEBUG_CSE
    const Int mLoc = A.LocalHeight();
    const Int nLoc = A.LocalWidth();
    B.AlignWith( A.DistData() );
    B.Resize( A.Height(), A.Width() );
    auto& ALoc = A.LockedMatrix();
    auto& BLoc = B.Matrix();
    for( Int jLoc=0; jLoc<nLoc; ++jLoc )
    {
        const Int j = A.GlobalCol(jLoc);
        for( Int iLoc=0; iLoc<mLoc; ++iLoc )
        {
            const Int i = A.GlobalRow(iLoc);
            BLoc(iLoc,jLoc) = func(i,j,ALoc(iLoc,jLoc));
        }
    }
}
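// Example (not part of the library): apply an index-dependent scaling,
// B(i,j) = (i+j+1) * A(i,j). The explicit std::function wrapper lets the
// template parameters S and T be deduced from a lambda.
inline void ExampleIndexDependentMapUsage( const El::Grid& grid )
{
    El::DistMatrix<double> A( grid ), B( grid );
    El::Uniform( A, 50, 50 );
    std::function<double(El::Int,El::Int,double)> scale =
      []( El::Int i, El::Int j, double alpha )
      { return double(i+j+1)*alpha; };
    El::IndexDependentMap( A, B, scale );
}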
void Gather
( const ElementalMatrix<T>& A,
        DistMatrix<T,CIRC,CIRC>& B )
{
    DEBUG_ONLY(CSE cse("copy::Gather"))
    AssertSameGrids( A, B );
    if( A.DistSize() == 1 && A.CrossSize() == 1 )
    {
        B.Resize( A.Height(), A.Width() );
        if( B.CrossRank() == B.Root() )
            Copy( A.LockedMatrix(), B.Matrix() );
        return;
    }

    const Int height = A.Height();
    const Int width = A.Width();
    B.SetGrid( A.Grid() );
    B.Resize( height, width );

    // Gather the colShifts and rowShifts
    // ==================================
    Int myShifts[2];
    myShifts[0] = A.ColShift();
    myShifts[1] = A.RowShift();
    vector<Int> shifts;
    const Int crossSize = B.CrossSize();
    if( B.CrossRank() == B.Root() )
        shifts.resize( 2*crossSize );
    mpi::Gather( myShifts, 2, shifts.data(), 2, B.Root(), B.CrossComm() );

    // Gather the payload data
    // =======================
    const bool irrelevant = ( A.RedundantRank()!=0 || A.CrossRank()!=A.Root() );
    int totalSend = ( irrelevant ? 0 : A.LocalHeight()*A.LocalWidth() );
    vector<int> recvCounts, recvOffsets;
    if( B.CrossRank() == B.Root() )
        recvCounts.resize( crossSize );
    mpi::Gather( &totalSend, 1, recvCounts.data(), 1, B.Root(), B.CrossComm() );
    int totalRecv = Scan( recvCounts, recvOffsets );

    // Allocate the send and receive buffers
    vector<T> sendBuf, recvBuf;
    FastResize( sendBuf, totalSend );
    FastResize( recvBuf, totalRecv );

    if( !irrelevant )
        copy::util::InterleaveMatrix
        ( A.LocalHeight(), A.LocalWidth(),
          A.LockedBuffer(), 1, A.LDim(),
          sendBuf.data(),   1, A.LocalHeight() );

    mpi::Gather
    ( sendBuf.data(), totalSend,
      recvBuf.data(), recvCounts.data(), recvOffsets.data(),
      B.Root(), B.CrossComm() );

    // Unpack
    // ======
    if( B.Root() == B.CrossRank() )
    {
        for( Int q=0; q<crossSize; ++q )
        {
            if( recvCounts[q] == 0 )
                continue;
            const Int colShift = shifts[2*q+0];
            const Int rowShift = shifts[2*q+1];
            const Int colStride = A.ColStride();
            const Int rowStride = A.RowStride();
            const Int localHeight = Length( height, colShift, colStride );
            const Int localWidth = Length( width, rowShift, rowStride );
            copy::util::InterleaveMatrix
            ( localHeight, localWidth,
              &recvBuf[recvOffsets[q]],    1,         localHeight,
              B.Buffer(colShift,rowShift), colStride, rowStride*B.LDim() );
        }
    }
}
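// Example (not part of the library): the usual way to reach copy::Gather is a
// copy into a [CIRC,CIRC] matrix; afterwards the root process owns the entire
// matrix and can, for example, print it. Sizes are illustrative.
inline void ExampleGatherUsage( const El::Grid& grid )
{
    El::DistMatrix<double> A( grid );
    El::Uniform( A, 20, 20 );

    El::DistMatrix<double,El::CIRC,El::CIRC> B( grid );
    El::Copy( A, B );   // performs a gather onto B's root process
    if( B.CrossRank() == B.Root() )
        El::Print( B.LockedMatrix(), "A gathered on the root" );
}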
void Scatter
( const DistMatrix<T,CIRC,CIRC>& A,
        ElementalMatrix<T>& B )
{
    DEBUG_CSE
    AssertSameGrids( A, B );

    const Int m = A.Height();
    const Int n = A.Width();
    const Int colStride = B.ColStride();
    const Int rowStride = B.RowStride();
    B.Resize( m, n );
    if( B.CrossSize() != 1 || B.RedundantSize() != 1 )
    {
        // TODO:
        // Broadcast over the redundant communicator and use mpi::Translate
        // rank to determine whether a process is the root of the broadcast.
        GeneralPurpose( A, B );
        return;
    }

    const Int pkgSize = mpi::Pad(MaxLength(m,colStride)*MaxLength(n,rowStride));
    const Int recvSize = pkgSize;
    const Int sendSize = B.DistSize()*pkgSize;

    // Translate the root of A into the DistComm of B (if possible)
    const Int root = A.Root();
    const Int target = mpi::Translate( A.CrossComm(), root, B.DistComm() );
    if( target == mpi::UNDEFINED )
        return;

    if( B.DistSize() == 1 )
    {
        Copy( A.LockedMatrix(), B.Matrix() );
        return;
    }

    vector<T> buffer;
    T* recvBuf=0; // some compilers (falsely) warn otherwise
    if( A.CrossRank() == root )
    {
        FastResize( buffer, sendSize+recvSize );
        T* sendBuf = &buffer[0];
        recvBuf = &buffer[sendSize];

        // Pack the send buffer
        copy::util::StridedPack
        ( m, n,
          B.ColAlign(), colStride,
          B.RowAlign(), rowStride,
          A.LockedBuffer(), A.LDim(),
          sendBuf, pkgSize );

        // Scatter from the root
        mpi::Scatter
        ( sendBuf, pkgSize, recvBuf, pkgSize, target, B.DistComm() );
    }
    else
    {
        FastResize( buffer, recvSize );
        recvBuf = &buffer[0];

        // Perform the receiving portion of the scatter from the non-root
        mpi::Scatter
        ( static_cast<T*>(0), pkgSize,
          recvBuf,            pkgSize, target, B.DistComm() );
    }

    // Unpack
    copy::util::InterleaveMatrix
    ( B.LocalHeight(), B.LocalWidth(),
      recvBuf,    1, B.LocalHeight(),
      B.Buffer(), 1, B.LDim() );
}
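// Example (not part of the library): the reverse of the gather above. A matrix
// assembled on a single root is spread over the grid by copying from
// [CIRC,CIRC] into a standard [MC,MR] distribution.
inline void ExampleCircScatterUsage( const El::Grid& grid )
{
    El::DistMatrix<double,El::CIRC,El::CIRC> A( grid );
    El::Uniform( A, 20, 20 );   // only the root stores local data in [CIRC,CIRC]

    El::DistMatrix<double> B( grid );
    El::Copy( A, B );           // performs a scatter from A's root process
}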
void RowScatter
( T alpha,
  const ElementalMatrix<T>& A,
        ElementalMatrix<T>& B )
{
    DEBUG_ONLY(CSE cse("axpy_contract::RowScatter"))
    AssertSameGrids( A, B );
    if( A.Height() != B.Height() || A.Width() != B.Width() )
        LogicError("Matrix sizes did not match");
    if( !B.Participating() )
        return;

    const Int width = B.Width();
    const Int colDiff = B.ColAlign()-A.ColAlign();
    if( colDiff == 0 )
    {
        if( width == 1 )
        {
            const Int localHeight = B.LocalHeight();
            const Int portionSize = mpi::Pad( localHeight );
            vector<T> buffer;
            FastResize( buffer, portionSize );

            // Reduce to rowAlign
            const Int rowAlign = B.RowAlign();
            mpi::Reduce
            ( A.LockedBuffer(), buffer.data(), portionSize,
              rowAlign, B.RowComm() );

            if( B.RowRank() == rowAlign )
            {
                axpy::util::InterleaveMatrixUpdate
                ( alpha, localHeight, 1,
                  buffer.data(), 1, localHeight,
                  B.Buffer(),    1, B.LDim() );
            }
        }
        else
        {
            const Int rowStride = B.RowStride();
            const Int rowAlign = B.RowAlign();

            const Int localHeight = B.LocalHeight();
            const Int localWidth = B.LocalWidth();
            const Int maxLocalWidth = MaxLength(width,rowStride);

            const Int portionSize = mpi::Pad( localHeight*maxLocalWidth );
            const Int sendSize = rowStride*portionSize;

            // Pack
            vector<T> buffer;
            FastResize( buffer, sendSize );
            copy::util::RowStridedPack
            ( localHeight, width,
              rowAlign, rowStride,
              A.LockedBuffer(), A.LDim(),
              buffer.data(), portionSize );

            // Communicate
            mpi::ReduceScatter( buffer.data(), portionSize, B.RowComm() );

            // Update with our received data
            axpy::util::InterleaveMatrixUpdate
            ( alpha, localHeight, localWidth,
              buffer.data(), 1, localHeight,
              B.Buffer(),    1, B.LDim() );
        }
    }
    else
    {
#ifdef EL_UNALIGNED_WARNINGS
        if( B.Grid().Rank() == 0 )
            cerr << "Unaligned RowScatter" << endl;
#endif
        const Int colRank = B.ColRank();
        const Int colStride = B.ColStride();

        const Int sendRow = Mod( colRank+colDiff, colStride );
        const Int recvRow = Mod( colRank-colDiff, colStride );

        const Int localHeight = B.LocalHeight();
        const Int localHeightA = A.LocalHeight();

        if( width == 1 )
        {
            vector<T> buffer;
            FastResize( buffer, localHeight+localHeightA );
            T* sendBuf = &buffer[0];
            T* recvBuf = &buffer[localHeightA];

            // Reduce to rowAlign
            const Int rowAlign = B.RowAlign();
            mpi::Reduce
            ( A.LockedBuffer(), sendBuf, localHeightA, rowAlign, B.RowComm() );

            if( B.RowRank() == rowAlign )
            {
                // Perform the realignment
                mpi::SendRecv
                ( sendBuf, localHeightA, sendRow,
                  recvBuf, localHeight,  recvRow, B.ColComm() );

                axpy::util::InterleaveMatrixUpdate
                ( alpha, localHeight, 1,
                  recvBuf,    1, localHeight,
                  B.Buffer(), 1, B.LDim() );
            }
        }
        else
        {
            const Int rowStride = B.RowStride();
            const Int rowAlign = B.RowAlign();

            const Int localWidth = B.LocalWidth();
            const Int maxLocalWidth = MaxLength(width,rowStride);

            const Int recvSize_RS = mpi::Pad( localHeightA*maxLocalWidth );
            const Int sendSize_RS = rowStride * recvSize_RS;
            const Int recvSize_SR = localHeight * localWidth;

            vector<T> buffer;
            FastResize( buffer, recvSize_RS + Max(sendSize_RS,recvSize_SR) );
            T* firstBuf = &buffer[0];
            T* secondBuf = &buffer[recvSize_RS];

            // Pack
            copy::util::RowStridedPack
            ( localHeightA, width,
              rowAlign, rowStride,
              A.LockedBuffer(), A.LDim(),
              secondBuf, recvSize_RS );

            // Reduce-scatter over each process row
            mpi::ReduceScatter( secondBuf, firstBuf, recvSize_RS, B.RowComm() );

            // Trade reduced data with the appropriate process row
            mpi::SendRecv
            ( firstBuf,  localHeightA*localWidth, sendRow,
              secondBuf, localHeight*localWidth,  recvRow, B.ColComm() );

            // Update with our received data
            axpy::util::InterleaveMatrixUpdate
            ( alpha, localHeight, localWidth,
              secondBuf,  1, localHeight,
              B.Buffer(), 1, B.LDim() );
        }
    }
}
void ColScatter
( T alpha,
  const ElementalMatrix<T>& A,
        ElementalMatrix<T>& B )
{
    DEBUG_ONLY(CSE cse("axpy_contract::ColScatter"))
    AssertSameGrids( A, B );
    if( A.Height() != B.Height() || A.Width() != B.Width() )
        LogicError("A and B must be the same size");
#ifdef EL_VECTOR_WARNINGS
    if( A.Width() == 1 && B.Grid().Rank() == 0 )
    {
        cerr <<
          "ColScatter does not yet have a specialized vector implementation, "
          "but it would only require a modification of the vector version of "
          "RowScatter" << endl;
    }
#endif
#ifdef EL_CACHE_WARNINGS
    if( A.Width() != 1 && B.Grid().Rank() == 0 )
    {
        cerr <<
          "axpy_contract::ColScatter potentially causes a large "
          "amount of cache-thrashing. If possible, avoid it by forming the "
          "(conjugate-)transpose of the [* ,V] matrix instead." << endl;
    }
#endif
    if( !B.Participating() )
        return;
    const Int height = B.Height();
    const Int localHeight = B.LocalHeight();
    const Int localWidth = B.LocalWidth();

    const Int colAlign = B.ColAlign();
    const Int colStride = B.ColStride();

    const Int rowDiff = B.RowAlign()-A.RowAlign();
    // TODO: Allow for modular equivalence if possible
    if( rowDiff == 0 )
    {
        const Int maxLocalHeight = MaxLength(height,colStride);

        const Int recvSize = mpi::Pad( maxLocalHeight*localWidth );
        const Int sendSize = colStride*recvSize;

        vector<T> buffer;
        FastResize( buffer, sendSize );

        // Pack
        copy::util::ColStridedPack
        ( height, localWidth,
          colAlign, colStride,
          A.LockedBuffer(), A.LDim(),
          buffer.data(), recvSize );

        // Communicate
        mpi::ReduceScatter( buffer.data(), recvSize, B.ColComm() );

        // Update with our received data
        axpy::util::InterleaveMatrixUpdate
        ( alpha, localHeight, localWidth,
          buffer.data(), 1, localHeight,
          B.Buffer(),    1, B.LDim() );
    }
    else
    {
#ifdef EL_UNALIGNED_WARNINGS
        if( B.Grid().Rank() == 0 )
            cerr << "Unaligned ColScatter" << endl;
#endif
        const Int localWidthA = A.LocalWidth();
        const Int maxLocalHeight = MaxLength(height,colStride);

        const Int recvSize_RS = mpi::Pad( maxLocalHeight*localWidthA );
        const Int sendSize_RS = colStride*recvSize_RS;
        const Int recvSize_SR = localHeight*localWidth;

        vector<T> buffer;
        FastResize( buffer, recvSize_RS + Max(sendSize_RS,recvSize_SR) );
        T* firstBuf = &buffer[0];
        T* secondBuf = &buffer[recvSize_RS];

        // Pack
        copy::util::ColStridedPack
        ( height, localWidth,
          colAlign, colStride,
          A.LockedBuffer(), A.LDim(),
          secondBuf, recvSize_RS );

        // Reduce-scatter over each col
        mpi::ReduceScatter( secondBuf, firstBuf, recvSize_RS, B.ColComm() );

        // Trade reduced data with the appropriate col
        const Int sendCol = Mod( B.RowRank()+rowDiff, B.RowStride() );
        const Int recvCol = Mod( B.RowRank()-rowDiff, B.RowStride() );
        mpi::SendRecv
        ( firstBuf,  localHeight*localWidthA, sendCol,
          secondBuf, localHeight*localWidth,  recvCol, B.RowComm() );

        // Update with our received data
        axpy::util::InterleaveMatrixUpdate
        ( alpha, localHeight, localWidth,
          secondBuf,  1, localHeight,
          B.Buffer(), 1, B.LDim() );
    }
}
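// Example (not part of the library): the scatter routines above (ColScatter,
// RowScatter, Scatter, PartialColScatter) are the dispatch targets of
// AxpyContract. A typical use accumulates per-process contributions, held
// redundantly in a [MC,STAR] matrix, into a [MC,MR] sum; sizes are illustrative.
inline void ExampleAxpyContractUsage( const El::Grid& grid )
{
    El::DistMatrix<double,El::MC,El::STAR> partials( grid );
    El::Ones( partials, 64, 64 );          // each process column's contribution

    El::DistMatrix<double,El::MC,El::MR> sum( grid );
    El::Zeros( sum, 64, 64 );
    El::AxpyContract( 1., partials, sum ); // sum += reduction of the copies
}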