Esempio n. 1
0
/*
 * Distributes A in such a way that
 *   Layer 0 <- A(:, 0:(n/h - 1))
 *   Layer 1 <- A(:, (n/h):(2n/h - 1))
 *     .
 *     .
 *     .
 *   Layer h-1 <- A(:, ((h-1)n/h):n)
 */
void DistributeCols
( const mpi::Comm& depthComm,
  const DistMatrix<double,MC,MR>& A, 
        DistMatrix<double,MC,MR>& B )
{
    const Grid& meshGrid = A.Grid();
    const int meshSize = meshGrid.Size();
    const int depthSize = mpi::CommSize( depthComm );
    const int depthRank = mpi::CommRank( depthComm );

    const int sendCount = A.LocalHeight()*A.LocalWidth();
    const int recvCount = sendCount / depthSize;

    // For now, we will make B as large as A...
    // TODO: NOT DO THIS
    if( A.LocalHeight() != A.LocalLDim() )
        throw std::logic_error("Local height did not match local ldim");
    B.Empty();
    B.AlignWith( A );
    Zeros( A.Height(), A.Width(), B );

    // Scatter
    const int localColOffset = (A.LocalWidth()/depthSize)*depthRank;
    mpi::Scatter
    ( A.LockedLocalBuffer(), recvCount, 
      B.LocalBuffer(0,localColOffset), recvCount, 0, depthComm );
}
Esempio n. 2
0
    static void Func
    ( DistMatrix<T,MC,STAR>& A, T center, typename Base<T>::type radius )
    {
        const Grid& grid = A.Grid();
        if( grid.InGrid() )
        {
            const int n = A.Width();
            const int localHeight = A.LocalHeight();
            const int bufSize = localHeight*n;
            std::vector<T> buffer( bufSize );

            // Create random matrix on process column 0, then broadcast
            if( grid.Col() == 0 )
            {
                for( int j=0; j<n; ++j )
                    for( int iLocal=0; iLocal<localHeight; ++iLocal )
                        buffer[iLocal+j*localHeight] = 
                            center + radius*SampleUnitBall<T>();
            }
            mpi::Broadcast( &buffer[0], bufSize, 0, grid.RowComm() );

            // Unpack
            T* localBuffer = A.LocalBuffer();
            const int ldim = A.LocalLDim();
#ifdef HAVE_OPENMP
            #pragma omp parallel for
#endif
            for( int j=0; j<n; ++j )
            {
                const T* bufferCol = &buffer[j*localHeight];
                T* col = &localBuffer[j*ldim];
                MemCopy( col, bufferCol, localHeight );
            }
        }
    }
Esempio n. 3
0
// Broadcast a matrix from the root grid to the others
void DepthBroadcast
( const mpi::Comm& depthComm,
  const DistMatrix<double,MC,MR>& A, 
        DistMatrix<double,MC,MR>& B )
{
    const int rank = mpi::CommRank(mpi::COMM_WORLD);
    const Grid& meshGrid = A.Grid();
    const int meshSize = meshGrid.Size();
    const int depthRank = rank / meshSize;

    const int localSize = A.LocalHeight()*A.LocalWidth();
    if( A.LocalHeight() != A.LocalLDim() )
        throw std::logic_error("Leading dimension did not match local height");

    B.Empty();
    B.AlignWith( A );
    B.ResizeTo( A.Height(), A.Width() );

    // Have the root pack the broadcast data
    if( depthRank == 0 )
        MemCopy( B.LocalBuffer(), A.LockedLocalBuffer(), localSize );

    // Broadcast from the root
    mpi::Broadcast( B.LocalBuffer(), localSize, 0, depthComm );
}
Esempio n. 4
0
    static void Func
    ( DistMatrix<T,STAR,STAR>& A, T center, typename Base<T>::type radius )
    {
        const Grid& grid = A.Grid();
        const int m = A.Height();
        const int n = A.Width();
        const int bufSize = m*n;

        if( grid.InGrid() )
        {
            std::vector<T> buffer( bufSize );

            if( grid.Rank() == 0 )
            {
                for( int j=0; j<n; ++j )
                    for( int i=0; i<m; ++i )
                        buffer[i+j*m] = center+radius*SampleUnitBall<T>();
            }
            mpi::Broadcast( &buffer[0], bufSize, 0, grid.Comm() );

            // Unpack
            T* localBuffer = A.LocalBuffer();
            const int ldim = A.LocalLDim();
#ifdef HAVE_OPENMP
            #pragma omp parallel for
#endif
            for( int j=0; j<n; ++j )
            {
                const T* bufferCol = &buffer[j*m];
                T* col = &localBuffer[j*ldim];
                MemCopy( col, bufferCol, m );
            }
        }
    }
Esempio n. 5
0
    static void Func
    ( DistMatrix<T,STAR,MR>& A, T center, typename Base<T>::type radius )
    {
        const Grid& grid = A.Grid();
        const int m = A.Height();
        const int localWidth = A.LocalWidth();
        const int bufSize = m*localWidth;
        std::vector<T> buffer( bufSize );

        // Create random matrix on process row 0, then broadcast
        if( grid.Row() == 0 )
        {
            for( int j=0; j<localWidth; ++j )
                for( int i=0; i<m; ++i )
                    buffer[i+j*m] = center+radius*SampleUnitBall<T>();
        }
        mpi::Broadcast( &buffer[0], bufSize, 0, grid.ColComm() );

        // Unpack
        T* localBuffer = A.LocalBuffer();
        const int ldim = A.LocalLDim();
#ifdef HAVE_OPENMP
        #pragma omp parallel for
#endif
        for( int jLocal=0; jLocal<localWidth; ++jLocal )
        {
            const T* bufferCol = &buffer[jLocal*m];
            T* col = &localBuffer[jLocal*ldim];
            MemCopy( col, bufferCol, m );
        }
    }
Esempio n. 6
0
    static void Func
    ( DistMatrix<T,MR,STAR>& A, T center, typename Base<T>::type radius )
    {
        const Grid& grid = A.Grid();
        const int n = A.Width();
        const int localHeight = A.LocalHeight();
        const int bufSize = localHeight*n;
        std::vector<T> buffer( bufSize );

        // Create random matrix on process row 0, then broadcast
        if( grid.Row() == 0 )
        {
            for( int j=0; j<n; ++j )
                for( int i=0; i<localHeight; ++i )
                    buffer[i+j*localHeight] = center+radius*SampleUnitBall<T>();
        }
        mpi::Broadcast( &buffer[0], bufSize, 0, grid.ColComm() );

        // Unpack
        T* localBuffer = A.LocalBuffer();
        const int ldim = A.LocalLDim();
#ifdef HAVE_OPENMP
        #pragma omp parallel for COLLAPSE(2)
#endif
        for( int j=0; j<n; ++j )
            for( int iLocal=0; iLocal<localHeight; ++iLocal )
                localBuffer[iLocal+j*ldim] = buffer[iLocal+j*localHeight];
    }
Esempio n. 7
0
inline void
MakeTriangular( UpperOrLower uplo, DistMatrix<T,U,V>& A )
{
#ifndef RELEASE
    PushCallStack("MakeTriangular");
#endif
    const int height = A.Height();
    const int localHeight = A.LocalHeight();
    const int localWidth = A.LocalWidth();
    const int colShift = A.ColShift();
    const int rowShift = A.RowShift();
    const int colStride = A.ColStride();
    const int rowStride = A.RowStride();

    T* localBuffer = A.LocalBuffer();
    const int ldim = A.LocalLDim();

    if( uplo == LOWER )
    {

#ifdef HAVE_OPENMP
        #pragma omp parallel for
#endif
        for( int jLocal=0; jLocal<localWidth; ++jLocal )
        {
            const int j = rowShift + jLocal*rowStride;
            const int lastZeroRow = j-1;
            if( lastZeroRow >= 0 )
            {
                const int boundary = std::min( lastZeroRow+1, height );
                const int numZeroRows =
                    RawLocalLength( boundary, colShift, colStride );
                MemZero( &localBuffer[jLocal*ldim], numZeroRows );
            }
        }
    }
    else
    {
#ifdef HAVE_OPENMP
        #pragma omp parallel for
#endif
        for( int jLocal=0; jLocal<localWidth; ++jLocal )
        {
            const int j = rowShift + jLocal*rowStride;
            const int firstZeroRow = j+1;
            const int numNonzeroRows =
                RawLocalLength(firstZeroRow,colShift,colStride);
            if( numNonzeroRows < localHeight )
            {
                T* col = &localBuffer[numNonzeroRows+jLocal*ldim];
                MemZero( col, localHeight-numNonzeroRows );
            }
        }
    }
#ifndef RELEASE
    PopCallStack();
#endif
}
Esempio n. 8
0
// Reduce across depth to get end result C
void SumContributions
( mpi::Comm& depthComm,
  const DistMatrix<double,MC,MR>& APartial,
        DistMatrix<double,MC,MR>& A )
{
    const int rank = mpi::CommRank( mpi::COMM_WORLD );
    const Grid& meshGrid = APartial.Grid();

    A.Empty();
    A.AlignWith( APartial );
    A.ResizeTo( APartial.Height(), APartial.Width() );

    if( APartial.LocalHeight() != APartial.LocalLDim() )
        throw std::logic_error
        ("APartial did not have matching local height/ldim");
    if( A.LocalHeight() != A.LocalLDim() )
        throw std::logic_error("A did not have matching local height/ldim");

    const int dataSize = APartial.LocalHeight()*APartial.LocalWidth();
    mpi::AllReduce
    ( APartial.LockedLocalBuffer(), A.LocalBuffer(), dataSize, 
      mpi::SUM, depthComm );
}
Esempio n. 9
0
// Have the top layer initialize the distributed matrix, B
void InitB( DistMatrix<double,MC,MR>& B )
{
    const int rank = mpi::CommRank(mpi::COMM_WORLD);
    const Grid& g = B.Grid();
    const int meshSize = g.Size();
    const int depthRank = rank / meshSize;

    if( depthRank == 0 )
    {
        if( B.LocalHeight() != B.LocalLDim() )
            throw std::logic_error("Local ldim of B was too large");

        double* localBuffer = B.LocalBuffer();
        const int localSize = B.LocalHeight()*B.LocalWidth();
        for( int iLocal=0; iLocal<localSize; ++iLocal )
            localBuffer[iLocal] = iLocal*meshSize + rank;

        B.Print("B");
    }
}
Esempio n. 10
0
inline const DistMatrix<T,MD,STAR,Int>&
DistMatrix<T,MD,STAR,Int>::operator=( const DistMatrix<T,STAR,STAR,Int>& A )
{
#ifndef RELEASE
    PushCallStack("[MD,* ] = [* ,* ]");
    this->AssertNotLockedView();
    this->AssertSameGrid( A );
    if( this->Viewing() )
        this->AssertSameSize( A );
#endif
    if( !this->Viewing() )
        this->ResizeTo( A.Height(), A.Width() );

    if( this->Participating() )
    {
        const Int lcm = this->grid_->LCM();
        const Int colShift = this->ColShift();

        const Int width = this->Width();
        const Int localHeight = this->LocalHeight();

        const T* ALocalBuffer = A.LockedLocalBuffer();
        const Int ALDim = A.LocalLDim();
        T* thisLocalBuffer = this->LocalBuffer();
        const Int thisLDim = this->LocalLDim();
#ifdef HAVE_OPENMP
        #pragma omp parallel for 
#endif
        for( Int j=0; j<width; ++j )
        {
            T* destCol = &thisLocalBuffer[j*thisLDim];
            const T* sourceCol = &ALocalBuffer[colShift+j*ALDim];
            for( Int iLocal=0; iLocal<localHeight; ++iLocal )
                destCol[iLocal] = sourceCol[iLocal*lcm];
        }
    }
#ifndef RELEASE
    PopCallStack();
#endif
    return *this;
}
Esempio n. 11
0
inline void
PanelLU
( DistMatrix<F,  STAR,STAR>& A, 
  DistMatrix<F,  MC,  STAR>& B, 
  DistMatrix<int,STAR,STAR>& p, 
  int pivotOffset )
{
#ifndef RELEASE
    PushCallStack("internal::PanelLU");
    if( A.Grid() != p.Grid() || p.Grid() != B.Grid() )
        throw std::logic_error
        ("Matrices must be distributed over the same grid");
    if( A.Width() != B.Width() )
        throw std::logic_error("A and B must be the same width");
    if( A.Height() != p.Height() || p.Width() != 1 )
        throw std::logic_error("p must be a vector that conforms with A");
#endif
    const Grid& g = A.Grid();
    const int r = g.Height();
    const int colShift = B.ColShift();
    const int colAlignment = B.ColAlignment();

    // Matrix views
    DistMatrix<F,STAR,STAR> 
        ATL(g), ATR(g),  A00(g), a01(g),     A02(g),  
        ABL(g), ABR(g),  a10(g), alpha11(g), a12(g),  
                         A20(g), a21(g),     A22(g);

    DistMatrix<F,MC,STAR>
        BL(g), BR(g),
        B0(g), b1(g), B2(g);

    const int width = A.Width();
    const int numBytes = (width+1)*sizeof(F)+sizeof(int);
    std::vector<byte> sendData(numBytes);
    std::vector<byte> recvData(numBytes);

    // Extract pointers to send and recv data
    // TODO: Think of how to make this safer with respect to alignment issues
    F* sendBufFloat = (F*)&sendData[0];
    F* recvBufFloat = (F*)&recvData[0];
    int* sendBufInt = (int*)&sendData[(width+1)*sizeof(F)];
    int* recvBufInt = (int*)&recvData[(width+1)*sizeof(F)];

    // Start the algorithm
    PushBlocksizeStack( 1 );
    PartitionDownDiagonal
    ( A, ATL, ATR,
         ABL, ABR, 0 );
    PartitionRight( B, BL, BR, 0 );
    while( ATL.Height() < A.Height() )
    {
        RepartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, /**/ a01,     A02,
         /*************/ /**********************/
               /**/       a10, /**/ alpha11, a12,
          ABL, /**/ ABR,  A20, /**/ a21,     A22 );

        RepartitionRight
        ( BL, /**/ BR,  
          B0, /**/ b1, B2 );

        //--------------------------------------------------------------------//
        const int currentRow = a01.Height();
        
        // Store the index/value of the pivot candidate in A
        F pivot = alpha11.GetLocal(0,0);
        int pivotRow = currentRow;
        for( int i=0; i<a21.Height(); ++i )
        {
            F value = a21.GetLocal(i,0);
            if( FastAbs(value) > FastAbs(pivot) )
            {
                pivot = value;
                pivotRow = currentRow + i + 1;
            }
        }

        // Update the pivot candidate to include local data from B
        for( int i=0; i<B.LocalHeight(); ++i )
        {
            F value = b1.GetLocal(i,0);
            if( FastAbs(value) > FastAbs(pivot) )
            {
                pivot = value;
                pivotRow = A.Height() + colShift + i*r;
            }
        }

        // Fill the send buffer with:
        // [ pivotValue | pivot row data | pivotRow ]
        if( pivotRow < A.Height() )
        {
            sendBufFloat[0] = A.GetLocal(pivotRow,a10.Width());

            const int ALDim = A.LocalLDim();
            const F* ABuffer = A.LocalBuffer(pivotRow,0);
            for( int j=0; j<width; ++j )
                sendBufFloat[j+1] = ABuffer[j*ALDim];
        }
        else
        {
            const int localRow = ((pivotRow-A.Height())-colShift)/r;
            sendBufFloat[0] = b1.GetLocal(localRow,0);

            const int BLDim = B.LocalLDim();
            const F* BBuffer = B.LocalBuffer(localRow,0);
            for( int j=0; j<width; ++j )
                sendBufFloat[j+1] = BBuffer[j*BLDim];
        }
        *sendBufInt = pivotRow;

        // Communicate to establish the pivot information
        mpi::AllReduce
        ( &sendData[0], &recvData[0], numBytes, PivotOp<F>(), g.ColComm() );

        // Update the pivot vector
        pivotRow = *recvBufInt;
        p.SetLocal(currentRow,0,pivotRow+pivotOffset);

        // Copy the current row into the pivot row
        if( pivotRow < A.Height() )
        {
            const int ALDim = A.LocalLDim();
            F* ASetBuffer = A.LocalBuffer(pivotRow,0);
            const F* AGetBuffer = A.LocalBuffer(currentRow,0);
            for( int j=0; j<width; ++j )
                ASetBuffer[j*ALDim] = AGetBuffer[j*ALDim];
        }
        else
        {
            const int ownerRank = (colAlignment+(pivotRow-A.Height())) % r;
            if( g.Row() == ownerRank )
            {
                const int localRow = ((pivotRow-A.Height())-colShift) / r;

                const int ALDim = A.LocalLDim();
                const int BLDim = B.LocalLDim();
                F* BBuffer = B.LocalBuffer(localRow,0);
                const F* ABuffer = A.LocalBuffer(currentRow,0);
                for( int j=0; j<width; ++j )
                    BBuffer[j*BLDim] = ABuffer[j*ALDim];
            }
        }

        // Copy the pivot row into the current row
        {
            F* ABuffer = A.LocalBuffer(currentRow,0);
            const int ALDim = A.LocalLDim();
            for( int j=0; j<width; ++j )
                ABuffer[j*ALDim] = recvBufFloat[j+1];
        }

        // Now we can perform the update of the current panel
        const F alpha = alpha11.GetLocal(0,0);
        if( alpha == F(0) )
            throw SingularMatrixException();
        const F alpha11Inv = F(1) / alpha;
        Scale( alpha11Inv, a21.LocalMatrix() );
        Scale( alpha11Inv, b1.LocalMatrix()  );
        Geru( F(-1), a21.LocalMatrix(), a12.LocalMatrix(), A22.LocalMatrix() );
        Geru( F(-1), b1.LocalMatrix(), a12.LocalMatrix(), B2.LocalMatrix() );
        //--------------------------------------------------------------------//

        SlidePartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, a01,     /**/ A02,
               /**/       a10, alpha11, /**/ a12,
         /*************/ /**********************/
          ABL, /**/ ABR,  A20, a21,     /**/ A22 );

        SlidePartitionRight
        ( BL,     /**/ BR,  
          B0, b1, /**/ B2 );
    }
    PopBlocksizeStack();

#ifndef RELEASE
    PopCallStack();
#endif
}
Esempio n. 12
0
inline void
MakeTrapezoidal
( LeftOrRight side, UpperOrLower uplo, int offset,
  DistMatrix<T,U,V>& A )
{
#ifndef RELEASE
    PushCallStack("MakeTrapezoidal");
#endif
    const int height = A.Height();
    const int width = A.Width();
    const int localHeight = A.LocalHeight();
    const int localWidth = A.LocalWidth();
    const int colShift = A.ColShift();
    const int rowShift = A.RowShift();
    const int colStride = A.ColStride();
    const int rowStride = A.RowStride();

    T* localBuffer = A.LocalBuffer();
    const int ldim = A.LocalLDim();

    if( uplo == LOWER )
    {

#ifdef HAVE_OPENMP
        #pragma omp parallel for
#endif
        for( int jLocal=0; jLocal<localWidth; ++jLocal )
        {
            const int j = rowShift + jLocal*rowStride;
            const int lastZeroRow =
                ( side==LEFT ? j-offset-1
                  : j-offset+height-width-1 );
            if( lastZeroRow >= 0 )
            {
                const int boundary = std::min( lastZeroRow+1, height );
                const int numZeroRows =
                    RawLocalLength( boundary, colShift, colStride );
                MemZero( &localBuffer[jLocal*ldim], numZeroRows );
            }
        }
    }
    else
    {
#ifdef HAVE_OPENMP
        #pragma omp parallel for
#endif
        for( int jLocal=0; jLocal<localWidth; ++jLocal )
        {
            const int j = rowShift + jLocal*rowStride;
            const int firstZeroRow =
                ( side==LEFT ? std::max(j-offset+1,0)
                  : std::max(j-offset+height-width+1,0) );
            const int numNonzeroRows =
                RawLocalLength(firstZeroRow,colShift,colStride);
            if( numNonzeroRows < localHeight )
            {
                T* col = &localBuffer[numNonzeroRows+jLocal*ldim];
                MemZero( col, localHeight-numNonzeroRows );
            }
        }
    }
#ifndef RELEASE
    PopCallStack();
#endif
}
Esempio n. 13
0
inline void
ApplyRowPivots
( DistMatrix<F>& A, 
  const std::vector<int>& image,
  const std::vector<int>& preimage )
{
    const int b = image.size();
#ifndef RELEASE
    PushCallStack("ApplyRowPivots");
    if( A.Height() < b || b != (int)preimage.size() )
        throw std::logic_error
        ("image and preimage must be vectors of equal length that are not "
         "taller than A.");
#endif
    const int localWidth = A.LocalWidth();
    if( A.Height() == 0 || A.Width() == 0 )
    {
#ifndef RELEASE
        PopCallStack();
#endif
        return;
    }
    
    // Extract the relevant process grid information
    const Grid& g = A.Grid();
    const int r = g.Height();
    const int colAlignment = A.ColAlignment();
    const int colShift = A.ColShift();
    const int myRow = g.Row();

    // Extract the send and recv counts from the image and preimage.
    // This process's sends may be logically partitioned into two sets:
    //   (a) sends from rows [0,...,b-1]
    //   (b) sends from rows [b,...]
    // The latter is analyzed with image, the former deduced with preimage.
    std::vector<int> sendCounts(r,0), recvCounts(r,0);
    for( int i=colShift; i<b; i+=r )
    {
        const int sendRow = preimage[i];         
        const int sendTo = (colAlignment+sendRow) % r; 
        sendCounts[sendTo] += localWidth;

        const int recvRow = image[i];
        const int recvFrom = (colAlignment+recvRow) % r;
        recvCounts[recvFrom] += localWidth;
    }
    for( int i=0; i<b; ++i )
    {
        const int sendRow = preimage[i];
        if( sendRow >= b )
        {
            const int sendTo = (colAlignment+sendRow) % r;
            if( sendTo == myRow )
            {
                const int sendFrom = (colAlignment+i) % r;
                recvCounts[sendFrom] += localWidth;
            }
        }

        const int recvRow = image[i];
        if( recvRow >= b )
        {
            const int recvFrom = (colAlignment+recvRow) % r;
            if( recvFrom == myRow )
            {
                const int recvTo = (colAlignment+i) % r;
                sendCounts[recvTo] += localWidth;
            }
        }
    }

    // Construct the send and recv displacements from the counts
    std::vector<int> sendDispls(r), recvDispls(r);
    int totalSend=0, totalRecv=0;
    for( int i=0; i<r; ++i )
    {
        sendDispls[i] = totalSend;
        recvDispls[i] = totalRecv;
        totalSend += sendCounts[i];
        totalRecv += recvCounts[i];
    }
#ifndef RELEASE
    if( totalSend != totalRecv )
    {
        std::ostringstream msg;
        msg << "Send and recv counts do not match: (send,recv)=" 
             << totalSend << "," << totalRecv;
        throw std::logic_error( msg.str().c_str() );
    }
#endif

    // Fill vectors with the send data
    const int ALDim = A.LocalLDim();
    std::vector<F> sendData(std::max(1,totalSend));
    std::vector<int> offsets(r,0);
    const int localHeight = LocalLength( b, colShift, r );
    for( int iLocal=0; iLocal<localHeight; ++iLocal )
    {
        const int sendRow = preimage[colShift+iLocal*r];
        const int sendTo = (colAlignment+sendRow) % r;
        const int offset = sendDispls[sendTo]+offsets[sendTo];
        const F* ABuffer = A.LocalBuffer(iLocal,0);
        for( int jLocal=0; jLocal<localWidth; ++jLocal )
            sendData[offset+jLocal] = ABuffer[jLocal*ALDim];
        offsets[sendTo] += localWidth;
    }
    for( int i=0; i<b; ++i )
    {
        const int recvRow = image[i];
        if( recvRow >= b )
        {
            const int recvFrom = (colAlignment+recvRow) % r; 
            if( recvFrom == myRow )
            {
                const int recvTo = (colAlignment+i) % r;
                const int iLocal = (recvRow-colShift) / r;
                const int offset = sendDispls[recvTo]+offsets[recvTo];
                const F* ABuffer = A.LocalBuffer(iLocal,0);
                for( int jLocal=0; jLocal<localWidth; ++jLocal )
                    sendData[offset+jLocal] = ABuffer[jLocal*ALDim];
                offsets[recvTo] += localWidth;
            }
        }
    }

    // Communicate all pivot rows
    std::vector<F> recvData(std::max(1,totalRecv));
    mpi::AllToAll
    ( &sendData[0], &sendCounts[0], &sendDispls[0],
      &recvData[0], &recvCounts[0], &recvDispls[0], g.ColComm() );

    // Unpack the recv data
    for( int k=0; k<r; ++k )
    {
        offsets[k] = 0;
        int thisColShift = Shift( k, colAlignment, r );
        for( int i=thisColShift; i<b; i+=r )
        {
            const int sendRow = preimage[i];
            const int sendTo = (colAlignment+sendRow) % r;
            if( sendTo == myRow )
            {
                const int offset = recvDispls[k]+offsets[k];
                const int iLocal = (sendRow-colShift) / r;
                F* ABuffer = A.LocalBuffer(iLocal,0);
                for( int jLocal=0; jLocal<localWidth; ++jLocal )
                    ABuffer[jLocal*ALDim] = recvData[offset+jLocal];
                offsets[k] += localWidth;
            }
        }
    }
    for( int i=0; i<b; ++i )
    {
        const int recvRow = image[i];
        if( recvRow >= b )
        {
            const int recvTo = (colAlignment+i) % r;
            if( recvTo == myRow )
            {
                const int recvFrom = (colAlignment+recvRow) % r; 
                const int iLocal = (i-colShift) / r;
                const int offset = recvDispls[recvFrom]+offsets[recvFrom];
                F* ABuffer = A.LocalBuffer(iLocal,0);
                for( int jLocal=0; jLocal<localWidth; ++jLocal )
                    ABuffer[jLocal*ALDim] = recvData[offset+jLocal];
                offsets[recvFrom] += localWidth;
            }
        }
    }
#ifndef RELEASE
    PopCallStack();
#endif
}