示例#1
0
// Clears the locally stored entries of A that fall outside the requested
// triangle. For uplo == LOWER the leading entries of each local column (rows
// above the diagonal) are zeroed; otherwise the trailing entries (rows below
// the diagonal) are zeroed.
inline void
MakeTriangular( UpperOrLower uplo, DistMatrix<T,U,V>& A )
{
#ifndef RELEASE
    PushCallStack("MakeTriangular");
#endif
    const int m = A.Height();
    const int mLocal = A.LocalHeight();
    const int nLocal = A.LocalWidth();
    const int cShift = A.ColShift();
    const int rShift = A.RowShift();
    const int cStride = A.ColStride();
    const int rStride = A.RowStride();

    T* data = A.Buffer();
    const int lda = A.LDim();

    if( uplo == LOWER )
    {

#ifdef HAVE_OPENMP
        #pragma omp parallel for
#endif
        for( int jLoc=0; jLoc<nLocal; ++jLoc )
        {
            // Global column index of this local column.
            const int j = rShift + jLoc*rStride;
            // Column 0 has nothing above its diagonal entry.
            if( j-1 < 0 )
                continue;

            // Rows [0,j) must vanish, but never more than the matrix holds.
            const int cutoff = std::min( j, m );
            // Length_ presumably counts the locally owned rows below the
            // cutoff; that many leading entries of the column are cleared.
            const int numCleared = Length_( cutoff, cShift, cStride );
            MemZero( &data[jLoc*lda], numCleared );
        }
    }
    else
    {
#ifdef HAVE_OPENMP
        #pragma omp parallel for
#endif
        for( int jLoc=0; jLoc<nLocal; ++jLoc )
        {
            const int j = rShift + jLoc*rStride;
            // Rows [0,j] are kept; the local count of those rows tells us
            // where the zeroed tail of the column starts.
            const int numKept = Length_( j+1, cShift, cStride );
            const int numCleared = mLocal - numKept;
            if( numCleared > 0 )
                MemZero( &data[numKept+jLoc*lda], numCleared );
        }
    }
#ifndef RELEASE
    PopCallStack();
#endif
}
示例#2
0
文件: Str.cpp 项目: 0rel/TexTxtEd
// Searches backwards for pcStr, starting at index uiStart.
//
// uiStart  index where the backward search begins; it is clamped to
//          m_uiSize - 1 - uiLen, the highest start index this routine tries.
// pcStr    pattern to look for; its length is taken from Length_().
//
// Returns the index of the first character of the match, or -1 when the
// pattern is empty, is too long for the stored data, or is not found.
int CStr::FindRev( unsigned int uiStart, const char * pcStr ) const
{
	const unsigned int uiLen = Length_( pcStr );
	
	if( !uiLen )
		return -1;
	// One-character patterns are delegated to the character overload. This has
	// to be the *reverse* overload: the forward Find() would scan away from
	// uiStart in the wrong direction.
	if( uiLen == 1 )
		return FindRev( uiStart, pcStr[0] );
	
	// Without this check the unsigned subtraction below wraps around to a huge
	// value and uiStart would never be clamped.
	if( uiLen >= m_uiSize )
		return -1;
	
	const unsigned int uiMax = m_uiSize - 1 - uiLen;
	if( uiStart > uiMax )
		uiStart = uiMax;
	unsigned int i = uiStart + 1;
	
	while( i )
	{
		--i;
		// Locate the next candidate by its first character only.
		const int iFound = FindRev( i, pcStr[0] );
		if( iFound < 0 )
		{
			break;
		}
		// The first character is known to match; Match_ is handed the
		// remainder of the candidate and the remainder of the pattern.
		if( Match_( m_pcData + iFound + 1, pcStr + 1 ) )
		{
			return iFound;
		}
		// Every index between iFound and i would yield the same failed
		// candidate again, so continue directly below it. Only ever move
		// downwards so the loop is guaranteed to terminate.
		if( static_cast<unsigned int>( iFound ) < i )
			i = static_cast<unsigned int>( iFound );
	}
	return -1;
}
示例#3
0
文件: Str.cpp 项目: 0rel/TexTxtEd
// Replaces every occurrence of pcOld in this string with pcNew.
//
// pcOld  text to search for; an empty pattern replaces nothing.
// pcNew  text appended to the result in place of each occurrence.
//
// Returns the number of replacements performed. The result is assembled in a
// temporary CStr and assigned to *this at the end.
int CStr::Replace( const char * pcOld, const char * pcNew )
{
	const unsigned int uiLenOld = Length_( pcOld );
	// An empty pattern would advance pcD by zero characters after a match, so
	// the scan below could spin forever (appending pcNew each time) as soon as
	// it reaches a '\0' byte. Nothing can sensibly be replaced in that case.
	if( !uiLenOld )
		return 0;
	
	int iReplaced = 0;
	CStr oTmp;
	const char cFirst = *pcOld;
	char * pcEnd = m_pcData + m_uiSize;
	char * pcD = m_pcData;
	while( pcD < pcEnd )
	{
		// Cheap first-character test before the full comparison.
		if( *pcD == cFirst )
		{
			if( Match_( pcD, pcOld ) )
			{
				// Emit the replacement and skip over the matched text.
				oTmp += pcNew;
				pcD += uiLenOld;
				++iReplaced;
				continue;
			}
		}
		// No match here: carry the current character over unchanged.
		oTmp += *pcD;
		++pcD;
	}
	*this = oTmp;
	return iReplaced;
}
示例#4
0
文件: Str.cpp 项目: 0rel/TexTxtEd
// Constructs the string as a copy of the C string pcStr.
CStr::CStr( const char * pcStr )
{
	const unsigned int uiLen = Length_( pcStr );
	// Init() provides uiLen + 1 slots.
	Init( uiLen );
	// The terminator does not need to be copied; it is written explicitly.
	Copy_( m_pcData, pcStr, uiLen );
	m_pcData[uiLen] = '\0';
}
示例#5
0
文件: Str.cpp 项目: 0rel/TexTxtEd
// Searches forwards for pcStr, starting at index uiStart.
//
// uiStart  index where the search begins.
// pcStr    pattern to look for; its length is taken from Length_().
//
// Returns the index of the first character of the match, or -1 when the
// pattern is empty, is too long for the stored data, or is not found.
int CStr::Find( unsigned int uiStart, const char * pcStr ) const
{
	const unsigned int uiLen = Length_( pcStr );
	
	if( !uiLen )
		return -1;
	// One-character patterns are delegated to the character overload.
	if( uiLen == 1 )
		return Find( uiStart, pcStr[0] );
	
	// Without this check the unsigned subtraction below wraps around and the
	// loop bound becomes meaningless. (For uiLen == m_uiSize the bound is 0
	// and the loop would not run anyway.)
	if( uiLen >= m_uiSize )
		return -1;
	
	const unsigned int uiMax = m_uiSize - uiLen; // CAUTION: modified, used to subtract one more!
	unsigned int i = uiStart;
	
	while( i < uiMax )
	{
		// Locate the next candidate by its first character only.
		const int iFound = Find( i, pcStr[0] );
		if( iFound < 0 )
		{
			break;
		}
		// The first character is known to match; Match_ is handed the
		// remainder of the candidate and the remainder of the pattern.
		if( Match_( m_pcData + iFound + 1, pcStr + 1 ) )
		{
			return iFound;
		}
		// Every index up to iFound would yield the same failed candidate
		// again, so resume right behind it. Always advance by at least one
		// so the loop is guaranteed to terminate.
		const unsigned int uiNext = static_cast<unsigned int>( iFound ) + 1;
		i = ( uiNext > i ) ? uiNext : i + 1;
	}
	return -1;
}
示例#6
0
// Assigns a [VR,* ] matrix to this [* ,* ] matrix. Each process packs its
// local part of A column by column, the parts are exchanged with an AllGather
// over the grid's VR communicator, and every received part is written into
// this matrix's buffer with a row stride equal to the number of processes.
const DistMatrix<T,STAR,STAR>&
DistMatrix<T,STAR,STAR>::operator=( const DistMatrix<T,VR,STAR>& A )
{
#ifndef RELEASE
    CallStackEntry entry("[* ,* ] = [VR,* ]");
    this->AssertNotLocked();
    this->AssertSameGrid( A.Grid() );
#endif
    const elem::Grid& grid = this->Grid();
    this->ResizeTo( A.Height(), A.Width() );
    if( !this->Participating() )
        return *this;

    const Int numProcs = grid.Size();
    const Int m = this->Height();
    const Int n = this->Width();
    const Int mLocalA = A.LocalHeight();

    // Every process contributes one equally sized (padded) slot. The
    // workspace holds the outgoing slot followed by numProcs incoming ones.
    const Int slotSize = mpi::Pad( MaxLength(m,numProcs)*n );
    T* const workspace = this->auxMemory_.Require( (numProcs+1)*slotSize );
    T* const outgoing = workspace;
    T* const incoming = workspace + slotSize;

    // Pack: store the local columns of A back to back.
    const Int ldimA = A.LDim();
    const T* const localA = A.LockedBuffer();
    PARALLEL_FOR
    for( Int j=0; j<n; ++j )
        MemCopy( &outgoing[j*mLocalA], &localA[j*ldimA], mLocalA );

    // Communicate
    mpi::AllGather
    ( outgoing, slotSize,
      incoming, slotSize, grid.VRComm() );

    // Unpack: spread each process's rows over this matrix.
    T* const dest = this->Buffer();
    const Int destLDim = this->LDim();
    const Int alignOfA = A.ColAlignment();
    OUTER_PARALLEL_FOR
    for( Int rank=0; rank<numProcs; ++rank )
    {
        const T* const piece = &incoming[rank*slotSize];
        // Shift_/Length_ presumably give the first row held by 'rank' and
        // the number of rows it holds.
        const Int firstRow = Shift_( rank, alignOfA, numProcs );
        const Int pieceHeight = Length_( m, firstRow, numProcs );
        INNER_PARALLEL_FOR
        for( Int j=0; j<n; ++j )
        {
            const T* const src = &piece[j*pieceHeight];
            T* const dst = &dest[firstRow+j*destLDim];
            for( Int i=0; i<pieceHeight; ++i )
                dst[i*numProcs] = src[i];
        }
    }
    this->auxMemory_.Release();
    return *this;
}
示例#7
0
// Assigns a [* ,VR] matrix to this [* ,* ] matrix. Each process packs its
// local columns of A, the parts are exchanged with an AllGather over the
// grid's VR communicator, and every received column is copied to its
// position in this matrix's buffer.
const DistMatrix<T,STAR,STAR>&
DistMatrix<T,STAR,STAR>::operator=( const DistMatrix<T,STAR,VR>& A )
{
#ifndef RELEASE
    CallStackEntry entry("[* ,* ] = [* ,VR]");
    this->AssertNotLocked();
    this->AssertSameGrid( A.Grid() );
#endif
    const elem::Grid& grid = this->Grid();
    this->ResizeTo( A.Height(), A.Width() );
    if( !this->Participating() )
        return *this;

    const Int numProcs = grid.Size();
    const Int m = this->Height();
    const Int n = this->Width();
    const Int nLocalA = A.LocalWidth();

    // Every process contributes one equally sized (padded) slot. The
    // workspace holds the outgoing slot followed by numProcs incoming ones.
    const Int slotSize = mpi::Pad( m*MaxLength(n,numProcs) );
    T* const workspace = this->auxMemory_.Require( (numProcs+1)*slotSize );
    T* const outgoing = workspace;
    T* const incoming = workspace + slotSize;

    // Pack: store the local columns of A back to back.
    const Int ldimA = A.LDim();
    const T* const localA = A.LockedBuffer();
    PARALLEL_FOR
    for( Int jLoc=0; jLoc<nLocalA; ++jLoc )
        MemCopy( &outgoing[jLoc*m], &localA[jLoc*ldimA], m );

    // Communicate
    mpi::AllGather
    ( outgoing, slotSize,
      incoming, slotSize, grid.VRComm() );

    // Unpack: place each process's columns at stride numProcs.
    T* const dest = this->Buffer();
    const Int destLDim = this->LDim();
    const Int alignOfA = A.RowAlignment();
    OUTER_PARALLEL_FOR
    for( Int rank=0; rank<numProcs; ++rank )
    {
        const T* const piece = &incoming[rank*slotSize];
        // Shift_/Length_ presumably give the first column held by 'rank'
        // and the number of columns it holds.
        const Int firstCol = Shift_( rank, alignOfA, numProcs );
        const Int pieceWidth = Length_( n, firstCol, numProcs );
        INNER_PARALLEL_FOR
        for( Int jLoc=0; jLoc<pieceWidth; ++jLoc )
        {
            const Int j = firstCol + jLoc*numProcs;
            MemCopy( &dest[j*destLDim], &piece[jLoc*m], m );
        }
    }
    this->auxMemory_.Release();
    return *this;
}
示例#8
0
// Copies colStride*rowStride packed portions from APortions into B.
// Portion (k,l) starts at offset (k + l*colStride)*portionSize, is read as a
// contiguous localHeight x localWidth block, and is written into B starting
// at (colShift, rowShift) with element strides colStride and rowStride*BLDim.
void StridedUnpack
( Int height, Int width,
  Int colAlign, Int colStride,
  Int rowAlign, Int rowStride,
  const T* APortions, Int portionSize,
        T* B,         Int BLDim )
{
    for( Int rowRank=0; rowRank<rowStride; ++rowRank )
    {
        const Int firstCol = Shift_( rowRank, rowAlign, rowStride );
        const Int numCols = Length_( width, firstCol, rowStride );
        for( Int colRank=0; colRank<colStride; ++colRank )
        {
            const Int firstRow = Shift_( colRank, colAlign, colStride );
            const Int numRows = Length_( height, firstRow, colStride );

            const T* const src =
                APortions + (colRank+rowRank*colStride)*portionSize;
            T* const dst = B + (firstRow+firstCol*BLDim);
            InterleaveMatrix
            ( numRows, numCols,
              src, 1,         numRows,
              dst, colStride, rowStride*BLDim );
        }
    }
}
示例#9
0
// Splits the column A into colStride portions inside BPortions. Portion k
// starts at k*portionSize and receives, stored contiguously, the entries
// A[colShift], A[colShift+colStride], ... belonging to rank k.
void ColStridedColumnPack
( Int height, 
  Int colAlign, Int colStride,
  const T* A,
        T* BPortions, Int portionSize )
{
    for( Int rank=0; rank<colStride; ++rank )
    {
        const Int firstRow = Shift_( rank, colAlign, colStride );
        const Int numRows = Length_( height, firstRow, colStride );

        // Destination comes first: unit stride in the portion, colStride in A.
        T* const dst = BPortions + rank*portionSize;
        const T* const src = A + firstRow;
        StridedMemCopy( dst, 1, src, colStride, numRows );
    }
}
示例#10
0
// Copies rowStride packed portions from APortions into B. Portion k starts
// at k*portionSize, is read with leading dimension 'height', and its columns
// are written into B starting at column rowShift, rowStride columns apart.
void RowStridedUnpack
( Int height, Int width,
  Int rowAlign, Int rowStride,
  const T* APortions, Int portionSize,
        T* B,         Int BLDim )
{
    for( Int rank=0; rank<rowStride; ++rank )
    {
        const Int firstCol = Shift_( rank, rowAlign, rowStride );
        const Int numCols = Length_( width, firstCol, rowStride );

        const T* const src = APortions + rank*portionSize;
        T* const dst = B + firstCol*BLDim;
        lapack::Copy
        ( 'F', height, numCols,
          src, height,
          dst, rowStride*BLDim );
    }
}
示例#11
0
// Splits A into rowStride portions inside BPortions. Portion k starts at
// k*portionSize and receives, with leading dimension 'height', the columns
// of A beginning at column rowShift and lying rowStride columns apart.
void RowStridedPack
( Int height, Int width,
  Int rowAlign, Int rowStride,
  const T* A,         Int ALDim,
        T* BPortions, Int portionSize )
{
    for( Int rank=0; rank<rowStride; ++rank )
    {
        const Int firstCol = Shift_( rank, rowAlign, rowStride );
        const Int numCols = Length_( width, firstCol, rowStride );

        const T* const src = A + firstCol*ALDim;
        T* const dst = BPortions + rank*portionSize;
        lapack::Copy
        ( 'F', height, numCols,
          src, rowStride*ALDim,
          dst, height );
    }
}
示例#12
0
// Splits A into colStride portions inside BPortions. Portion k starts at
// k*portionSize and receives, as a contiguous localHeight x width block, the
// rows of A beginning at row colShift and lying colStride rows apart.
void ColStridedPack
( Int height, Int width,
  Int colAlign, Int colStride,
  const T* A,         Int ALDim,
        T* BPortions, Int portionSize )
{
    for( Int rank=0; rank<colStride; ++rank )
    {
        const Int firstRow = Shift_( rank, colAlign, colStride );
        const Int numRows = Length_( height, firstRow, colStride );

        const T* const src = A + firstRow;
        T* const dst = BPortions + rank*portionSize;
        InterleaveMatrix
        ( numRows, width,
          src, colStride, ALDim,
          dst, 1,         numRows );
    }
}
示例#13
0
// Copies colStride packed portions from APortions into B. Portion k starts
// at k*portionSize, is read as a contiguous localHeight x width block, and
// its rows are written into B beginning at row colShift, colStride rows
// apart.
void ColStridedUnpack
( Int height, Int width,
  Int colAlign, Int colStride,
  const T* APortions, Int portionSize,
        T* B,         Int BLDim )
{
    for( Int rank=0; rank<colStride; ++rank )
    {
        const Int firstRow = Shift_( rank, colAlign, colStride );
        const Int numRows = Length_( height, firstRow, colStride );

        const T* const src = APortions + rank*portionSize;
        T* const dst = B + firstRow;
        InterleaveMatrix
        ( numRows, width,
          src, 1,         numRows,
          dst, colStride, BLDim );
    }
}
示例#14
0
文件: Str.cpp 项目: 0rel/TexTxtEd
int CStr::FindVecRev( unsigned int uiStart, const char * pcStr ) const
{
	const unsigned int uiLen = Length_( pcStr );
	
	unsigned int i = uiStart + 1;
	while( i )
	{
		--i;
		for( unsigned int c=0; c<uiLen; ++c )
		{
			if ( m_pcData[i] == pcStr[c] )
				return i;
		}
	}
	return -1;
}
示例#15
0
文件: Str.cpp 项目: 0rel/TexTxtEd
// Searches forwards from uiStart for the first character that is contained
// in the character set pcStr. Returns its index, or -1 if no index below
// m_uiSize holds such a character.
int CStr::FindVec( unsigned int uiStart, const char * pcStr ) const
{
	const unsigned int uiSetLen = Length_( pcStr );
	
	for( unsigned int uiPos = uiStart; uiPos < m_uiSize; ++uiPos )
	{
		// Compare the current character against every member of the set.
		unsigned int uiMember = 0;
		while( uiMember != uiSetLen )
		{
			if( pcStr[uiMember] == m_pcData[uiPos] )
				return uiPos;
			++uiMember;
		}
	}
	return -1;
}
示例#16
0
// Copies colStrideUnion packed portions from APortions into the column B.
// Portion k starts at k*portionSize and is read contiguously; its entries
// are written into B starting at colOffset, colStrideUnion entries apart.
void PartialColStridedColumnUnpack
( Int height, 
  Int colAlign, Int colStride,
  Int colStrideUnion, Int colStridePart, Int colRankPart,
  Int colShiftB,
  const T* APortions, Int portionSize,
        T* B )
{
    for( Int part=0; part<colStrideUnion; ++part )
    {
        const Int rank = colRankPart + part*colStridePart;
        const Int shift = Shift_( rank, colAlign, colStride );
        // Position of this portion's first entry relative to B's own shift.
        const Int firstLocal = (shift-colShiftB) / colStridePart;
        const Int numRows = Length_( height, shift, colStride );

        T* const dst = B + firstLocal;
        const T* const src = APortions + part*portionSize;
        StridedMemCopy( dst, colStrideUnion, src, 1, numRows );
    }
}
示例#17
0
// Splits A into colStrideUnion portions inside BPortions. Portion k starts
// at k*portionSize and receives, as a contiguous localHeight x width block,
// the rows of A beginning at colOffset and lying colStrideUnion rows apart.
void PartialColStridedPack
( Int height, Int width,
  Int colAlign, Int colStride,
  Int colStrideUnion, Int colStridePart, Int colRankPart,
  Int colShiftA,
  const T* A,         Int ALDim,
        T* BPortions, Int portionSize )
{
    for( Int part=0; part<colStrideUnion; ++part )
    {
        const Int rank = colRankPart + part*colStridePart;
        const Int shift = Shift_( rank, colAlign, colStride );
        // Position of this portion's first row relative to A's own shift.
        const Int firstLocal = (shift-colShiftA) / colStridePart;
        const Int numRows = Length_( height, shift, colStride );

        const T* const src = A + firstLocal;
        T* const dst = BPortions + part*portionSize;
        InterleaveMatrix
        ( numRows, width,
          src, colStrideUnion, ALDim,
          dst, 1,              numRows );
    }
}
示例#18
0
文件: Str.cpp 项目: 0rel/TexTxtEd
int CStr::FindVecInvRev( unsigned int uiStart, const char * pcStr ) const 
{
	const unsigned int uiLen = Length_( pcStr );
	
	unsigned int i = uiStart + 1;
	while( i )
	{
		--i;
		unsigned int c = 0;
		while( c < uiLen )
		{
			if ( m_pcData[i] == pcStr[c] )
				break;
			++c;
		}
		if( c == uiLen )
			return i;
	}
	return -1;
}
示例#19
0
// Copies colStrideUnion packed portions from APortions into B. Portion k
// starts at k*portionSize, is read as a contiguous localHeight x width block,
// and its rows are written into B beginning at colOffset, colStrideUnion
// rows apart.
void PartialColStridedUnpack
( Int height, Int width,
  Int colAlign, Int colStride,
  Int colStrideUnion, Int colStridePart, Int colRankPart,
  Int colShiftB,
  const T* APortions, Int portionSize,
        T* B,         Int BLDim )
{
    for( Int part=0; part<colStrideUnion; ++part )
    {
        const Int rank = colRankPart + part*colStridePart;
        const Int shift = Shift_( rank, colAlign, colStride );
        // Position of this portion's first row relative to B's own shift.
        const Int firstLocal = (shift-colShiftB) / colStridePart;
        const Int numRows = Length_( height, shift, colStride );

        const T* const src = APortions + part*portionSize;
        T* const dst = B + firstLocal;
        InterleaveMatrix
        ( numRows, width,
          src, 1,              numRows,
          dst, colStrideUnion, BLDim );
    }
}
示例#20
0
// Splits A into rowStrideUnion portions inside BPortions. Portion k starts
// at k*portionSize and receives, with leading dimension 'height', the
// columns of A beginning at rowOffset and lying rowStrideUnion columns apart.
void PartialRowStridedPack
( Int height, Int width,
  Int rowAlign, Int rowStride,
  Int rowStrideUnion, Int rowStridePart, Int rowRankPart,
  Int rowShiftA,
  const T* A,         Int ALDim,
        T* BPortions, Int portionSize )
{
    for( Int part=0; part<rowStrideUnion; ++part )
    {
        const Int rank = rowRankPart + part*rowStridePart;
        const Int shift = Shift_( rank, rowAlign, rowStride );
        // Position of this portion's first column relative to A's own shift.
        const Int firstLocal = (shift-rowShiftA) / rowStridePart;
        const Int numCols = Length_( width, shift, rowStride );

        const T* const src = A + firstLocal*ALDim;
        T* const dst = BPortions + part*portionSize;
        lapack::Copy
        ( 'F', height, numCols,
          src, rowStrideUnion*ALDim,
          dst, height );
    }
}
示例#21
0
// Copies rowStrideUnion packed portions from APortions into B. Portion k
// starts at k*portionSize, is read with leading dimension 'height', and its
// columns are written into B beginning at rowOffset, rowStrideUnion columns
// apart.
void PartialRowStridedUnpack
( Int height, Int width,
  Int rowAlign, Int rowStride,
  Int rowStrideUnion, Int rowStridePart, Int rowRankPart,
  Int rowShiftB,
  const T* APortions, Int portionSize,
        T* B,         Int BLDim )
{
    for( Int part=0; part<rowStrideUnion; ++part )
    {
        const Int rank = rowRankPart + part*rowStridePart;
        const Int shift = Shift_( rank, rowAlign, rowStride );
        // Position of this portion's first column relative to B's own shift.
        const Int firstLocal = (shift-rowShiftB) / rowStridePart;
        const Int numCols = Length_( width, shift, rowStride );

        const T* const src = APortions + part*portionSize;
        T* const dst = B + firstLocal*BLDim;
        lapack::Copy
        ( 'F', height, numCols,
          src, height,
          dst, rowStrideUnion*BLDim );
    }
}
示例#22
0
// Clears the locally stored entries of A that fall outside the requested
// trapezoid. The boundary diagonal is shifted by 'offset' and, for
// side != LEFT, additionally by height-width. For uplo == LOWER the leading
// entries of each local column are zeroed, otherwise the trailing ones.
inline void
MakeTrapezoidal
( LeftOrRight side, UpperOrLower uplo, int offset,
  DistMatrix<T,U,V>& A )
{
#ifndef RELEASE
    PushCallStack("MakeTrapezoidal");
#endif
    const int m = A.Height();
    const int n = A.Width();
    const int mLocal = A.LocalHeight();
    const int nLocal = A.LocalWidth();
    const int cShift = A.ColShift();
    const int rShift = A.RowShift();
    const int cStride = A.ColStride();
    const int rStride = A.RowStride();

    T* data = A.Buffer();
    const int lda = A.LDim();

    const bool fromLeft = ( side == LEFT );

    if( uplo == LOWER )
    {

#ifdef HAVE_OPENMP
        #pragma omp parallel for
#endif
        for( int jLoc=0; jLoc<nLocal; ++jLoc )
        {
            // Global column index of this local column.
            const int j = rShift + jLoc*rStride;

            int lastCleared;
            if( fromLeft )
                lastCleared = j-offset-1;
            else
                lastCleared = j-offset+m-n-1;
            // Nothing lies above the boundary in this column.
            if( lastCleared < 0 )
                continue;

            // Never clear more rows than the matrix holds.
            const int cutoff = std::min( lastCleared+1, m );
            // Length_ presumably counts the locally owned rows below the
            // cutoff; that many leading entries of the column are cleared.
            const int numCleared = Length_( cutoff, cShift, cStride );
            MemZero( &data[jLoc*lda], numCleared );
        }
    }
    else
    {
#ifdef HAVE_OPENMP
        #pragma omp parallel for
#endif
        for( int jLoc=0; jLoc<nLocal; ++jLoc )
        {
            const int j = rShift + jLoc*rStride;

            const int boundaryRow =
                ( fromLeft ? j-offset+1 : j-offset+m-n+1 );
            // A boundary above the matrix means the whole column is cleared.
            const int firstCleared = std::max( boundaryRow, 0 );

            // The local count of kept rows tells us where the zeroed tail
            // of the column starts.
            const int numKept = Length_( firstCleared, cShift, cStride );
            const int numCleared = mLocal - numKept;
            if( numCleared > 0 )
                MemZero( &data[numKept+jLoc*lda], numCleared );
        }
    }
#ifndef RELEASE
    PopCallStack();
#endif
}
示例#23
0
// Assigns a [* ,MD] matrix to this [* ,* ] matrix. Processes participating
// in A pack their local columns, all slots are exchanged with an AllGather
// over the grid's VC communicator, and only the slots of processes on A's
// diagonal path are unpacked into this matrix's buffer.
const DistMatrix<T,STAR,STAR>&
DistMatrix<T,STAR,STAR>::operator=( const DistMatrix<T,STAR,MD>& A )
{ 
#ifndef RELEASE
    CallStackEntry entry("[* ,* ] = [* ,MD]");
    this->AssertNotLocked();
    this->AssertSameGrid( A.Grid() );
#endif
    const elem::Grid& grid = this->Grid();
    this->ResizeTo( A.Height(), A.Width() );
    if( !this->Participating() )
        return *this;

    const Int numProcs = grid.Size();
    const Int lcm = grid.LCM();
    const Int pathOfA = A.diagPath_;
    const Int alignOfA = A.rowAlignment_;

    const Int m = this->Height();
    const Int n = this->Width();
    const Int nLocalA = A.LocalWidth();
    const Int slotSize = mpi::Pad( m*MaxLength( n, lcm ) );

    // No MD communicator is implemented, so the suboptimal route is taken:
    // everyone's contribution is 'rounded up' to a full slot and gathered
    // over the VC communicator. The workspace holds the outgoing slot
    // followed by numProcs incoming ones.
    T* const workspace = this->auxMemory_.Require( (numProcs+1)*slotSize );
    T* const outgoing = workspace;
    T* const incoming = workspace + slotSize;

    // Pack: only processes that participate in A have columns to send.
    if( A.Participating() )
    {
        const Int ldimA = A.LDim();
        const T* const localA = A.LockedBuffer();
        PARALLEL_FOR
        for( Int jLoc=0; jLoc<nLocalA; ++jLoc )
            MemCopy( &outgoing[jLoc*m], &localA[jLoc*ldimA], m );
    }

    // Communicate
    mpi::AllGather
    ( outgoing, slotSize,
      incoming, slotSize, grid.VCComm() );

    // Unpack
    T* const dest = this->Buffer();
    const Int destLDim = this->LDim();
    OUTER_PARALLEL_FOR
    for( Int rank=0; rank<numProcs; ++rank )
    {
        // Slots from processes on other diagonal paths are ignored.
        if( grid.DiagPath( rank ) != pathOfA )
            continue;

        const T* const piece = &incoming[rank*slotSize];
        const Int pathRank = grid.DiagPathRank( rank );
        // Shift_/Length_ presumably give the first column held by this
        // process and the number of columns it holds.
        const Int firstCol = Shift_( pathRank, alignOfA, lcm );
        const Int pieceWidth = Length_( n, firstCol, lcm );
        INNER_PARALLEL_FOR
        for( Int jLoc=0; jLoc<pieceWidth; ++jLoc )
        {
            const Int j = firstCol + jLoc*lcm;
            MemCopy( &dest[j*destLDim], &piece[jLoc*m], m );
        }
    }
    this->auxMemory_.Release();
    return *this;
}
示例#24
0
void TransposeDist( const DistMatrix<T,U,V>& A, DistMatrix<T,V,U>& B ) 
{
    DEBUG_ONLY(CSE cse("copy::TransposeDist"))
    AssertSameGrids( A, B );

    const Grid& g = B.Grid();
    B.Resize( A.Height(), A.Width() );
    if( !B.Participating() )
        return;

    const Int colStrideA = A.ColStride();
    const Int rowStrideA = A.RowStride();
    const Int distSize = A.DistSize();

    if( A.DistSize() == 1 && B.DistSize() == 1 ) 
    {
        Copy( A.LockedMatrix(), B.Matrix() );
    }
    else if( A.Width() == 1 )
    {
        const Int height = A.Height();
        const Int maxLocalHeight = MaxLength(height,distSize);
        const Int portionSize = mpi::Pad( maxLocalHeight );

        const Int colDiff = Shift(A.DistRank(),A.ColAlign(),distSize) - 
                            Shift(B.DistRank(),B.ColAlign(),distSize);
        const Int sendRankB = Mod( B.DistRank()+colDiff, distSize );
        const Int recvRankA = Mod( A.DistRank()-colDiff, distSize );
        const Int recvRankB = 
            (recvRankA/colStrideA)+rowStrideA*(recvRankA%colStrideA);

        vector<T> buffer;
        FastResize( buffer, (colStrideA+rowStrideA)*portionSize );
        T* sendBuf = &buffer[0];
        T* recvBuf = &buffer[colStrideA*portionSize];

        if( A.RowRank() == A.RowAlign() )
        {
            // Pack
            // TODO: Use kernel from copy::util
            const Int AColShift = A.ColShift();
            const T* ABuf = A.LockedBuffer();
            EL_PARALLEL_FOR
            for( Int k=0; k<rowStrideA; ++k )
            {
                T* data = &recvBuf[k*portionSize];

                const Int shift = 
                  Shift_(A.ColRank()+colStrideA*k,A.ColAlign(),distSize);
                const Int offset = (shift-AColShift) / colStrideA;
                const Int thisLocalHeight = Length_(height,shift,distSize);

                for( Int iLoc=0; iLoc<thisLocalHeight; ++iLoc )
                    data[iLoc] = ABuf[offset+iLoc*rowStrideA];
            }
        }

        // (e.g., A[VC,STAR] <- A[MC,MR])
        mpi::Scatter
        ( recvBuf, portionSize,
          sendBuf, portionSize, A.RowAlign(), A.RowComm() );

        // (e.g., A[VR,STAR] <- A[VC,STAR])
        mpi::SendRecv
        ( sendBuf, portionSize, sendRankB,
          recvBuf, portionSize, recvRankB, B.DistComm() );

        // (e.g., A[MR,MC] <- A[VR,STAR])
        mpi::Gather
        ( recvBuf, portionSize,
          sendBuf, portionSize, B.RowAlign(), B.RowComm() );

        if( B.RowRank() == B.RowAlign() )
        {
            // Unpack
            // TODO: Use kernel from copy::util
            T* bufB = B.Buffer();
            EL_PARALLEL_FOR
            for( Int k=0; k<colStrideA; ++k )
            {
                const T* data = &sendBuf[k*portionSize];

                const Int shift = 
                  Shift_(B.ColRank()+rowStrideA*k,B.ColAlign(),distSize);
                const Int offset = (shift-B.ColShift()) / rowStrideA;
                const Int thisLocalHeight = Length_(height,shift,distSize);

                for( Int iLoc=0; iLoc<thisLocalHeight; ++iLoc )
                    bufB[offset+iLoc*colStrideA] = data[iLoc];
            }
        }
    }