inline void MakeTriangular( UpperOrLower uplo, DistMatrix<T,U,V>& A ) { #ifndef RELEASE PushCallStack("MakeTriangular"); #endif const int height = A.Height(); const int localHeight = A.LocalHeight(); const int localWidth = A.LocalWidth(); const int colShift = A.ColShift(); const int rowShift = A.RowShift(); const int colStride = A.ColStride(); const int rowStride = A.RowStride(); T* buffer = A.Buffer(); const int ldim = A.LDim(); if( uplo == LOWER ) { #ifdef HAVE_OPENMP #pragma omp parallel for #endif for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*rowStride; const int lastZeroRow = j-1; if( lastZeroRow >= 0 ) { const int boundary = std::min( lastZeroRow+1, height ); const int numZeroRows = Length_( boundary, colShift, colStride ); MemZero( &buffer[jLocal*ldim], numZeroRows ); } } } else { #ifdef HAVE_OPENMP #pragma omp parallel for #endif for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*rowStride; const int firstZeroRow = j+1; const int numNonzeroRows = Length_(firstZeroRow,colShift,colStride); if( numNonzeroRows < localHeight ) { T* col = &buffer[numNonzeroRows+jLocal*ldim]; MemZero( col, localHeight-numNonzeroRows ); } } } #ifndef RELEASE PopCallStack(); #endif }
// Searches backwards for pcStr, considering only matches that begin at or
// before index uiStart.
//
// Parameters:
//   uiStart - highest index at which a match may begin; values past the last
//             feasible start position are clamped.
//   pcStr   - zero-terminated pattern.
// Returns the index of the first character of the match, or -1 when the
// pattern is empty, longer than the string, or not found.
//
// Changes relative to the previous version:
//   * a one-character pattern is forwarded to the reverse character search
//     (it used to call the forward Find());
//   * the last feasible start is m_uiSize - uiLen, the same bound that
//     Find( unsigned int, const char* ) uses, so a match ending exactly at
//     the end of the string is no longer skipped;
//   * patterns longer than the string return early instead of letting the
//     unsigned subtraction wrap around.
// m_uiSize is treated as the character count without the terminator, as the
// other members do -- confirm against Init().
int CStr::FindRev( unsigned int uiStart, const char * pcStr ) const
{
    const unsigned int uiLen = Length_( pcStr );
    if( !uiLen )
        return -1;
    if( uiLen == 1 )
        return FindRev( uiStart, pcStr[0] );
    if( uiLen > m_uiSize )
        return -1;

    // Last index at which the whole pattern still fits.
    const unsigned int uiMax = m_uiSize - uiLen;
    if( uiStart > uiMax )
        uiStart = uiMax;

    unsigned int i = uiStart + 1;
    while( i )
    {
        --i;
        // Candidate: nearest occurrence of the first pattern character.
        const int iFound = FindRev( i, pcStr[0] );
        if( iFound < 0 )
            break;
        // The first character already matches; compare the remainder.
        if( Match_( m_pcData + iFound + 1, pcStr + 1 ) )
            return iFound;
        // Resume just below the rejected candidate instead of re-finding it
        // on every step. The guard keeps i strictly decreasing.
        if( static_cast<unsigned int>( iFound ) < i )
            i = static_cast<unsigned int>( iFound );
    }
    return -1;
}
int CStr::Replace( const char * pcOld, const char * pcNew ) { int iReplaced = 0; CStr oTmp; const char cFirst = *pcOld; const unsigned int uiLenOld = Length_( pcOld ); char * pcEnd = m_pcData + m_uiSize; char * pcD = m_pcData; while( pcD < pcEnd ) { if( *pcD == cFirst ) { if( Match_( pcD, pcOld ) ) { oTmp += pcNew; pcD += uiLenOld; ++iReplaced; continue; } } oTmp += *pcD; ++pcD; } *this = oTmp; return iReplaced; }
// Constructs the string from the zero-terminated C string pcStr.
CStr::CStr( const char * pcStr )
{
    const unsigned int uiLen = Length_( pcStr );

    // Per the original author's note, Init( n ) provides n + 1 slots.
    Init( uiLen );

    // Only the payload is copied; the terminator does not need to be copied
    // because it is written explicitly afterwards.
    Copy_( m_pcData, pcStr, uiLen );
    *( m_pcData + uiLen ) = '\0';
}
int CStr::Find( unsigned int uiStart, const char * pcStr ) const { const unsigned int uiLen = Length_( pcStr ); if( !uiLen ) return -1; if( uiLen == 1 ) return Find( uiStart, pcStr[0] ); const unsigned int uiMax = m_uiSize - uiLen; // ACHTUNG: Mod: vorher -1! unsigned int i = uiStart; while( i < uiMax ) { const int iFound = Find( i, pcStr[0] ); if( iFound < 0 ) { break; } if( Match_( m_pcData + iFound + 1, pcStr + 1 ) ) { return iFound; } ++i; } return -1; }
// Assigns a [VR,* ]-distributed matrix A to this [* ,* ] matrix.
//
// Steps, as implemented below:
//   1. pack the local part of A column by column into the send buffer,
//   2. AllGather equally sized (padded) portions over the grid's VR
//      communicator,
//   3. unpack portion k into rows colShift, colShift+p, colShift+2p, ... of
//      the local buffer, with colShift = Shift_( k, A.ColAlignment(), p ).
// Processes that do not participate return right after the resize.
const DistMatrix<T,STAR,STAR>&
DistMatrix<T,STAR,STAR>::operator=( const DistMatrix<T,VR,STAR>& A )
{
#ifndef RELEASE
    CallStackEntry entry("[* ,* ] = [VR,* ]");
    this->AssertNotLocked();
    this->AssertSameGrid( A.Grid() );
#endif
    const elem::Grid& grid = this->Grid();
    this->ResizeTo( A.Height(), A.Width() );
    if( !this->Participating() )
        return *this;

    const Int numProcs = grid.Size();
    const Int m = this->Height();
    const Int n = this->Width();
    const Int mLocalA = A.LocalHeight();

    // Every process contributes a portion of identical, padded size.
    const Int blockSize = mpi::Pad( MaxLength(m,numProcs)*n );

    // One portion for sending followed by numProcs portions for receiving.
    T* const scratch = this->auxMemory_.Require( (numProcs+1)*blockSize );
    T* const outgoing = scratch;
    T* const incoming = scratch + blockSize;

    // Pack
    const Int ldA = A.LDim();
    const T* const srcA = A.LockedBuffer();
    PARALLEL_FOR
    for( Int j=0; j<n; ++j )
        MemCopy( outgoing + j*mLocalA, srcA + j*ldA, mLocalA );

    // Communicate
    mpi::AllGather
    ( outgoing, blockSize, incoming, blockSize, grid.VRComm() );

    // Unpack
    T* const dst = this->Buffer();
    const Int ldThis = this->LDim();
    const Int alignA = A.ColAlignment();
    OUTER_PARALLEL_FOR
    for( Int q=0; q<numProcs; ++q )
    {
        const T* const block = incoming + q*blockSize;
        const Int firstRow = Shift_( q, alignA, numProcs );
        const Int numRows = Length_( m, firstRow, numProcs );
        INNER_PARALLEL_FOR
        for( Int j=0; j<n; ++j )
        {
            const T* const src = block + j*numRows;
            T* const out = dst + firstRow + j*ldThis;
            // Consecutive source entries land numProcs rows apart.
            for( Int r=0; r<numRows; ++r )
                out[r*numProcs] = src[r];
        }
    }
    this->auxMemory_.Release();
    return *this;
}
// Assigns a [* ,VR]-distributed matrix A to this [* ,* ] matrix.
//
// Steps, as implemented below:
//   1. pack the local columns of A contiguously into the send buffer,
//   2. AllGather equally sized (padded) portions over the grid's VR
//      communicator,
//   3. copy column jLoc of portion k into local column rowShift + jLoc*p,
//      with rowShift = Shift_( k, A.RowAlignment(), p ).
// Processes that do not participate return right after the resize.
const DistMatrix<T,STAR,STAR>&
DistMatrix<T,STAR,STAR>::operator=( const DistMatrix<T,STAR,VR>& A )
{
#ifndef RELEASE
    CallStackEntry entry("[* ,* ] = [* ,VR]");
    this->AssertNotLocked();
    this->AssertSameGrid( A.Grid() );
#endif
    const elem::Grid& grid = this->Grid();
    this->ResizeTo( A.Height(), A.Width() );
    if( !this->Participating() )
        return *this;

    const Int numProcs = grid.Size();
    const Int m = this->Height();
    const Int n = this->Width();
    const Int nLocalA = A.LocalWidth();

    // Every process contributes a portion of identical, padded size.
    const Int blockSize = mpi::Pad( m*MaxLength(n,numProcs) );

    // One portion for sending followed by numProcs portions for receiving.
    T* const scratch = this->auxMemory_.Require( (numProcs+1)*blockSize );
    T* const outgoing = scratch;
    T* const incoming = scratch + blockSize;

    // Pack
    const Int ldA = A.LDim();
    const T* const srcA = A.LockedBuffer();
    PARALLEL_FOR
    for( Int jLoc=0; jLoc<nLocalA; ++jLoc )
        MemCopy( outgoing + jLoc*m, srcA + jLoc*ldA, m );

    // Communicate
    mpi::AllGather
    ( outgoing, blockSize, incoming, blockSize, grid.VRComm() );

    // Unpack
    T* const dst = this->Buffer();
    const Int ldThis = this->LDim();
    const Int alignA = A.RowAlignment();
    OUTER_PARALLEL_FOR
    for( Int q=0; q<numProcs; ++q )
    {
        const T* const block = incoming + q*blockSize;
        const Int firstCol = Shift_( q, alignA, numProcs );
        const Int numCols = Length_( n, firstCol, numProcs );
        INNER_PARALLEL_FOR
        for( Int jLoc=0; jLoc<numCols; ++jLoc )
            MemCopy
            ( dst + (firstCol+jLoc*numProcs)*ldThis, block + jLoc*m, m );
    }
    this->auxMemory_.Release();
    return *this;
}
void StridedUnpack ( Int height, Int width, Int colAlign, Int colStride, Int rowAlign, Int rowStride, const T* APortions, Int portionSize, T* B, Int BLDim ) { for( Int l=0; l<rowStride; ++l ) { const Int rowShift = Shift_( l, rowAlign, rowStride ); const Int localWidth = Length_( width, rowShift, rowStride ); for( Int k=0; k<colStride; ++k ) { const Int colShift = Shift_( k, colAlign, colStride ); const Int localHeight = Length_( height, colShift, colStride ); InterleaveMatrix ( localHeight, localWidth, &APortions[(k+l*colStride)*portionSize], 1, localHeight, &B[colShift+rowShift*BLDim], colStride, rowStride*BLDim ); } } }
// Splits the column vector A into colStride contiguous portions.
//
// Portion k is written at BPortions[k*portionSize] with unit stride and
// receives the localHeight entries of A that start at index colShift and lie
// colStride apart, where colShift = Shift_( k, colAlign, colStride ) and
// localHeight = Length_( height, colShift, colStride ).
void ColStridedColumnPack
( Int height,
  Int colAlign, Int colStride,
  const T* A,
  T* BPortions, Int portionSize )
{
    for( Int rank=0; rank<colStride; ++rank )
    {
        const Int iShift = Shift_( rank, colAlign, colStride );
        const Int mLocal = Length_( height, iShift, colStride );

        T* const dst = BPortions + rank*portionSize;
        const T* const src = A + iShift;

        StridedMemCopy( dst, 1, src, colStride, mLocal );
    }
}
// Unpacks rowStride contiguous portions into the columns of B.
//
// Portion k starts at APortions[k*portionSize] and is read as a
// height x localWidth block with leading dimension height. Its columns are
// written to B starting at column rowShift and advancing rowStride columns
// at a time (leading dimension rowStride*BLDim), with
// rowShift = Shift_( k, rowAlign, rowStride ) and
// localWidth = Length_( width, rowShift, rowStride ).
void RowStridedUnpack
( Int height, Int width,
  Int rowAlign, Int rowStride,
  const T* APortions, Int portionSize,
  T* B, Int BLDim )
{
    for( Int rank=0; rank<rowStride; ++rank )
    {
        const Int jShift = Shift_( rank, rowAlign, rowStride );
        const Int nLocal = Length_( width, jShift, rowStride );

        const T* const src = APortions + rank*portionSize;
        T* const dst = B + jShift*BLDim;

        // 'F' is passed through unchanged; presumably it selects a full
        // (non-triangular) copy -- TODO confirm against lapack::Copy.
        lapack::Copy
        ( 'F', height, nLocal,
          src, height,
          dst, rowStride*BLDim );
    }
}
// Packs the columns of A into rowStride contiguous portions.
//
// Portion k is written at BPortions[k*portionSize] as a height x localWidth
// block with leading dimension height. It receives the columns of A that
// start at column rowShift and lie rowStride columns apart (leading
// dimension rowStride*ALDim), with
// rowShift = Shift_( k, rowAlign, rowStride ) and
// localWidth = Length_( width, rowShift, rowStride ).
void RowStridedPack
( Int height, Int width,
  Int rowAlign, Int rowStride,
  const T* A, Int ALDim,
  T* BPortions, Int portionSize )
{
    for( Int rank=0; rank<rowStride; ++rank )
    {
        const Int jShift = Shift_( rank, rowAlign, rowStride );
        const Int nLocal = Length_( width, jShift, rowStride );

        const T* const src = A + jShift*ALDim;
        T* const dst = BPortions + rank*portionSize;

        // 'F' is passed through unchanged; presumably it selects a full
        // (non-triangular) copy -- TODO confirm against lapack::Copy.
        lapack::Copy
        ( 'F', height, nLocal,
          src, rowStride*ALDim,
          dst, height );
    }
}
// Packs the rows of A into colStride contiguous portions.
//
// Portion k is written at BPortions[k*portionSize] as a localHeight x width
// block with unit row stride and leading dimension localHeight. It receives
// the rows of A that start at row colShift and lie colStride rows apart
// (column stride ALDim), with colShift = Shift_( k, colAlign, colStride )
// and localHeight = Length_( height, colShift, colStride ).
void ColStridedPack
( Int height, Int width,
  Int colAlign, Int colStride,
  const T* A, Int ALDim,
  T* BPortions, Int portionSize )
{
    for( Int rank=0; rank<colStride; ++rank )
    {
        const Int iShift = Shift_( rank, colAlign, colStride );
        const Int mLocal = Length_( height, iShift, colStride );

        const T* const src = A + iShift;
        T* const dst = BPortions + rank*portionSize;

        InterleaveMatrix
        ( mLocal, width,
          src, colStride, ALDim,
          dst, 1, mLocal );
    }
}
// Unpacks colStride contiguous portions into the rows of B.
//
// Portion k starts at APortions[k*portionSize] and is read as a
// localHeight x width block with unit row stride and leading dimension
// localHeight. Its rows are written to B starting at row colShift and
// advancing colStride rows at a time (column stride BLDim), with
// colShift = Shift_( k, colAlign, colStride ) and
// localHeight = Length_( height, colShift, colStride ).
void ColStridedUnpack
( Int height, Int width,
  Int colAlign, Int colStride,
  const T* APortions, Int portionSize,
  T* B, Int BLDim )
{
    for( Int rank=0; rank<colStride; ++rank )
    {
        const Int iShift = Shift_( rank, colAlign, colStride );
        const Int mLocal = Length_( height, iShift, colStride );

        const T* const src = APortions + rank*portionSize;
        T* const dst = B + iShift;

        InterleaveMatrix
        ( mLocal, width,
          src, 1, mLocal,
          dst, colStride, BLDim );
    }
}
int CStr::FindVecRev( unsigned int uiStart, const char * pcStr ) const { const unsigned int uiLen = Length_( pcStr ); unsigned int i = uiStart + 1; while( i ) { --i; for( unsigned int c=0; c<uiLen; ++c ) { if ( m_pcData[i] == pcStr[c] ) return i; } } return -1; }
// Searches forwards from uiStart for the first character that is contained
// in the set pcStr.
//
// Parameters:
//   uiStart - index at which the scan begins.
//   pcStr   - zero-terminated list of characters to look for.
// Returns the index of the hit, or -1 if none is found before m_uiSize.
int CStr::FindVec( unsigned int uiStart, const char * pcStr ) const
{
    const unsigned int uiSetLen = Length_( pcStr );

    for( unsigned int uiPos = uiStart; uiPos < m_uiSize; ++uiPos )
    {
        // Walk the set until the current character is found or the set ends.
        unsigned int k = 0;
        while( k < uiSetLen && m_pcData[uiPos] != pcStr[k] )
            ++k;
        if( k < uiSetLen )
            return uiPos;
    }
    return -1;
}
void PartialColStridedColumnUnpack ( Int height, Int colAlign, Int colStride, Int colStrideUnion, Int colStridePart, Int colRankPart, Int colShiftB, const T* APortions, Int portionSize, T* B ) { for( Int k=0; k<colStrideUnion; ++k ) { const Int colShift = Shift_( colRankPart+k*colStridePart, colAlign, colStride ); const Int colOffset = (colShift-colShiftB) / colStridePart; const Int localHeight = Length_( height, colShift, colStride ); StridedMemCopy ( &B[colOffset], colStrideUnion, &APortions[k*portionSize], 1, localHeight ); } }
void PartialColStridedPack ( Int height, Int width, Int colAlign, Int colStride, Int colStrideUnion, Int colStridePart, Int colRankPart, Int colShiftA, const T* A, Int ALDim, T* BPortions, Int portionSize ) { for( Int k=0; k<colStrideUnion; ++k ) { const Int colShift = Shift_( colRankPart+k*colStridePart, colAlign, colStride ); const Int colOffset = (colShift-colShiftA) / colStridePart; const Int localHeight = Length_( height, colShift, colStride ); InterleaveMatrix ( localHeight, width, &A[colOffset], colStrideUnion, ALDim, &BPortions[k*portionSize], 1, localHeight ); } }
int CStr::FindVecInvRev( unsigned int uiStart, const char * pcStr ) const { const unsigned int uiLen = Length_( pcStr ); unsigned int i = uiStart + 1; while( i ) { --i; unsigned int c = 0; while( c < uiLen ) { if ( m_pcData[i] == pcStr[c] ) break; ++c; } if( c == uiLen ) return i; } return -1; }
void PartialColStridedUnpack ( Int height, Int width, Int colAlign, Int colStride, Int colStrideUnion, Int colStridePart, Int colRankPart, Int colShiftB, const T* APortions, Int portionSize, T* B, Int BLDim ) { for( Int k=0; k<colStrideUnion; ++k ) { const Int colShift = Shift_( colRankPart+k*colStridePart, colAlign, colStride ); const Int colOffset = (colShift-colShiftB) / colStridePart; const Int localHeight = Length_( height, colShift, colStride ); InterleaveMatrix ( localHeight, width, &APortions[k*portionSize], 1, localHeight, &B[colOffset], colStrideUnion, BLDim ); } }
void PartialRowStridedPack ( Int height, Int width, Int rowAlign, Int rowStride, Int rowStrideUnion, Int rowStridePart, Int rowRankPart, Int rowShiftA, const T* A, Int ALDim, T* BPortions, Int portionSize ) { for( Int k=0; k<rowStrideUnion; ++k ) { const Int rowShift = Shift_( rowRankPart+k*rowStridePart, rowAlign, rowStride ); const Int rowOffset = (rowShift-rowShiftA) / rowStridePart; const Int localWidth = Length_( width, rowShift, rowStride ); lapack::Copy ( 'F', height, localWidth, &A[rowOffset*ALDim], rowStrideUnion*ALDim, &BPortions[k*portionSize], height ); } }
void PartialRowStridedUnpack ( Int height, Int width, Int rowAlign, Int rowStride, Int rowStrideUnion, Int rowStridePart, Int rowRankPart, Int rowShiftB, const T* APortions, Int portionSize, T* B, Int BLDim ) { for( Int k=0; k<rowStrideUnion; ++k ) { const Int rowShift = Shift_( rowRankPart+k*rowStridePart, rowAlign, rowStride ); const Int rowOffset = (rowShift-rowShiftB) / rowStridePart; const Int localWidth = Length_( width, rowShift, rowStride ); lapack::Copy ( 'F', height, localWidth, &APortions[k*portionSize], height, &B[rowOffset*BLDim], rowStrideUnion*BLDim ); } }
inline void MakeTrapezoidal ( LeftOrRight side, UpperOrLower uplo, int offset, DistMatrix<T,U,V>& A ) { #ifndef RELEASE PushCallStack("MakeTrapezoidal"); #endif const int height = A.Height(); const int width = A.Width(); const int localHeight = A.LocalHeight(); const int localWidth = A.LocalWidth(); const int colShift = A.ColShift(); const int rowShift = A.RowShift(); const int colStride = A.ColStride(); const int rowStride = A.RowStride(); T* buffer = A.Buffer(); const int ldim = A.LDim(); if( uplo == LOWER ) { #ifdef HAVE_OPENMP #pragma omp parallel for #endif for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*rowStride; const int lastZeroRow = ( side==LEFT ? j-offset-1 : j-offset+height-width-1 ); if( lastZeroRow >= 0 ) { const int boundary = std::min( lastZeroRow+1, height ); const int numZeroRows = Length_( boundary, colShift, colStride ); MemZero( &buffer[jLocal*ldim], numZeroRows ); } } } else { #ifdef HAVE_OPENMP #pragma omp parallel for #endif for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*rowStride; const int firstZeroRow = ( side==LEFT ? std::max(j-offset+1,0) : std::max(j-offset+height-width+1,0) ); const int numNonzeroRows = Length_(firstZeroRow,colShift,colStride); if( numNonzeroRows < localHeight ) { T* col = &buffer[numNonzeroRows+jLocal*ldim]; MemZero( col, localHeight-numNonzeroRows ); } } } #ifndef RELEASE PopCallStack(); #endif }
// Assigns a [* ,MD]-distributed matrix A to this [* ,* ] matrix.
//
// Steps, as implemented below:
//   1. if A participates, pack its local columns contiguously into the send
//      buffer,
//   2. AllGather equally sized (padded) portions over the grid's VC
//      communicator,
//   3. for every rank k whose diagonal path equals A.diagPath_, copy column
//      jLoc of portion k into local column rowShift + jLoc*lcm, with
//      rowShift = Shift_( g.DiagPathRank(k), A.rowAlignment_, lcm ).
// Processes that do not participate return right after the resize.
const DistMatrix<T,STAR,STAR>&
DistMatrix<T,STAR,STAR>::operator=( const DistMatrix<T,STAR,MD>& A )
{
#ifndef RELEASE
    CallStackEntry entry("[* ,* ] = [* ,MD]");
    this->AssertNotLocked();
    this->AssertSameGrid( A.Grid() );
#endif
    const elem::Grid& grid = this->Grid();
    this->ResizeTo( A.Height(), A.Width() );
    if( !this->Participating() )
        return *this;

    const Int numProcs = grid.Size();
    const Int lcm = grid.LCM();
    const Int pathOfA = A.diagPath_;
    const Int pathRankOfA = A.rowAlignment_;

    const Int m = this->Height();
    const Int n = this->Width();
    const Int nLocalA = A.LocalWidth();
    const Int blockSize = mpi::Pad( m*MaxLength( n, lcm ) );

    // Since an MD communicator has not been implemented, everyone's
    // contribution is 'rounded up' and exchanged over the VC communicator,
    // which is the suboptimal route.
    T* const scratch = this->auxMemory_.Require( (numProcs+1)*blockSize );
    T* const outgoing = scratch;
    T* const incoming = scratch + blockSize;

    // Pack
    if( A.Participating() )
    {
        const Int ldA = A.LDim();
        const T* const srcA = A.LockedBuffer();
        PARALLEL_FOR
        for( Int jLoc=0; jLoc<nLocalA; ++jLoc )
            MemCopy( outgoing + jLoc*m, srcA + jLoc*ldA, m );
    }

    // Communicate
    mpi::AllGather
    ( outgoing, blockSize, incoming, blockSize, grid.VCComm() );

    // Unpack
    T* const dst = this->Buffer();
    const Int ldThis = this->LDim();
    OUTER_PARALLEL_FOR
    for( Int q=0; q<numProcs; ++q )
    {
        // Portions from ranks on a different diagonal path are ignored.
        if( grid.DiagPath( q ) != pathOfA )
            continue;

        const T* const block = incoming + q*blockSize;
        const Int pathRank = grid.DiagPathRank( q );
        const Int firstCol = Shift_( pathRank, pathRankOfA, lcm );
        const Int numCols = Length_( n, firstCol, lcm );
        INNER_PARALLEL_FOR
        for( Int jLoc=0; jLoc<numCols; ++jLoc )
            MemCopy
            ( dst + (firstCol+jLoc*lcm)*ldThis, block + jLoc*m, m );
    }
    this->auxMemory_.Release();
    return *this;
}
void TransposeDist( const DistMatrix<T,U,V>& A, DistMatrix<T,V,U>& B ) { DEBUG_ONLY(CSE cse("copy::TransposeDist")) AssertSameGrids( A, B ); const Grid& g = B.Grid(); B.Resize( A.Height(), A.Width() ); if( !B.Participating() ) return; const Int colStrideA = A.ColStride(); const Int rowStrideA = A.RowStride(); const Int distSize = A.DistSize(); if( A.DistSize() == 1 && B.DistSize() == 1 ) { Copy( A.LockedMatrix(), B.Matrix() ); } else if( A.Width() == 1 ) { const Int height = A.Height(); const Int maxLocalHeight = MaxLength(height,distSize); const Int portionSize = mpi::Pad( maxLocalHeight ); const Int colDiff = Shift(A.DistRank(),A.ColAlign(),distSize) - Shift(B.DistRank(),B.ColAlign(),distSize); const Int sendRankB = Mod( B.DistRank()+colDiff, distSize ); const Int recvRankA = Mod( A.DistRank()-colDiff, distSize ); const Int recvRankB = (recvRankA/colStrideA)+rowStrideA*(recvRankA%colStrideA); vector<T> buffer; FastResize( buffer, (colStrideA+rowStrideA)*portionSize ); T* sendBuf = &buffer[0]; T* recvBuf = &buffer[colStrideA*portionSize]; if( A.RowRank() == A.RowAlign() ) { // Pack // TODO: Use kernel from copy::util const Int AColShift = A.ColShift(); const T* ABuf = A.LockedBuffer(); EL_PARALLEL_FOR for( Int k=0; k<rowStrideA; ++k ) { T* data = &recvBuf[k*portionSize]; const Int shift = Shift_(A.ColRank()+colStrideA*k,A.ColAlign(),distSize); const Int offset = (shift-AColShift) / colStrideA; const Int thisLocalHeight = Length_(height,shift,distSize); for( Int iLoc=0; iLoc<thisLocalHeight; ++iLoc ) data[iLoc] = ABuf[offset+iLoc*rowStrideA]; } } // (e.g., A[VC,STAR] <- A[MC,MR]) mpi::Scatter ( recvBuf, portionSize, sendBuf, portionSize, A.RowAlign(), A.RowComm() ); // (e.g., A[VR,STAR] <- A[VC,STAR]) mpi::SendRecv ( sendBuf, portionSize, sendRankB, recvBuf, portionSize, recvRankB, B.DistComm() ); // (e.g., A[MR,MC] <- A[VR,STAR]) mpi::Gather ( recvBuf, portionSize, sendBuf, portionSize, B.RowAlign(), B.RowComm() ); if( B.RowRank() == B.RowAlign() ) { // Unpack // 
TODO: Use kernel from copy::util T* bufB = B.Buffer(); EL_PARALLEL_FOR for( Int k=0; k<colStrideA; ++k ) { const T* data = &sendBuf[k*portionSize]; const Int shift = Shift_(B.ColRank()+rowStrideA*k,B.ColAlign(),distSize); const Int offset = (shift-B.ColShift()) / rowStrideA; const Int thisLocalHeight = Length_(height,shift,distSize); for( Int iLoc=0; iLoc<thisLocalHeight; ++iLoc ) bufB[offset+iLoc*colStrideA] = data[iLoc]; } } }