// Sanity-checks the operands of a LocalTrrk update in which A[MC,* ] and
// B[* ,MR] are combined into C[MC,MR].
//
// Throws std::logic_error if
//   * the three matrices are not distributed over the same grid,
//   * the global dimensions do not conform (the test also requires
//     A.Height() == B.Width(), i.e. C has to be square), or
//   * A's column alignment or B's row alignment disagrees with that of C.
inline void CheckInput
( const DistMatrix<T,MC,  STAR>& A,
  const DistMatrix<T,STAR,MR  >& B,
  const DistMatrix<T,MC,  MR  >& C )
{
    const bool gridsDiffer = ( A.Grid() != B.Grid() || B.Grid() != C.Grid() );
    if( gridsDiffer )
        throw std::logic_error
        ("A, B, and C must be distributed over the same grid");

    const bool conformal =
        A.Height() == C.Height() && B.Width()  == C.Width() &&
        A.Width()  == B.Height() && A.Height() == B.Width();
    if( !conformal )
    {
        std::ostringstream report;
        report << "Nonconformal LocalTrrk: \n";
        report << " A[MC,* ] ~ " << A.Height() << " x " << A.Width() << "\n";
        report << " B[* ,MR] ~ " << B.Height() << " x " << B.Width() << "\n";
        report << " C[MC,MR] ~ " << C.Height() << " x " << C.Width() << "\n";
        throw std::logic_error( report.str().c_str() );
    }

    const bool aligned =
        A.ColAlignment() == C.ColAlignment() &&
        B.RowAlignment() == C.RowAlignment();
    if( !aligned )
    {
        std::ostringstream report;
        report << "Misaligned LocalTrrk: \n";
        report << " A[MC,* ] ~ " << A.ColAlignment() << "\n";
        report << " B[* ,MR] ~ " << B.RowAlignment() << "\n";
        report << " C[MC,MR] ~ " << C.ColAlignment() << " , "
               << C.RowAlignment() << "\n";
        throw std::logic_error( report.str().c_str() );
    }
}
// Distributed driver for forming a reflector from the scalar chi and the
// vector x; returns the resulting tau on every process that takes part in the
// broadcast below.
//
// When x is a column vector whose row alignment matches chi's, the process
// column owning x calls reflector::Col and tau is broadcast along the row
// communicator. In every other case the process row given by x's column
// alignment calls reflector::Row and tau is broadcast along the column
// communicator.
inline F Reflector( DistMatrix<F>& chi, DistMatrix<F>& x )
{
#ifndef RELEASE
    CallStackEntry entry("Reflector");
    if( chi.Grid() != x.Grid() )
        LogicError("chi and x must be distributed over the same grid");
    if( chi.Height() != 1 || chi.Width() != 1 )
        LogicError("chi must be a scalar");
    if( x.Height() != 1 && x.Width() != 1 )
        LogicError("x must be a vector");
#endif
    const Grid& grid = x.Grid();

    // Only the owning process row/column assigns tau before the broadcast;
    // everyone else receives it.
    F tau;

    if( x.Width() != 1 || x.RowAlignment() != chi.RowAlignment() )
    {
        // Row-oriented case
        if( grid.Row() == x.ColAlignment() )
            tau = reflector::Row( chi, x );
        mpi::Broadcast( tau, x.ColAlignment(), grid.ColComm() );
        return tau;
    }

    // Column-oriented case
    if( grid.Col() == x.RowAlignment() )
        tau = reflector::Col( chi, x );
    mpi::Broadcast( tau, x.RowAlignment(), grid.RowComm() );
    return tau;
}
// [* ,* ] <- [VR,* ]
//
// Resizes this matrix to A's global dimensions and fills it from A in three
// steps: each process packs its local rows of A contiguously, the packed
// portions are exchanged with an AllGather over the VR communicator, and the
// received portions are scattered back into place with a row stride equal to
// the number of processes. Processes that do not participate return right
// after the resize.
const DistMatrix<T,STAR,STAR>&
DistMatrix<T,STAR,STAR>::operator=( const DistMatrix<T,VR,STAR>& A )
{
#ifndef RELEASE
    CallStackEntry entry("[* ,* ] = [VR,* ]");
    this->AssertNotLocked();
    this->AssertSameGrid( A.Grid() );
#endif
    const elem::Grid& grid = this->Grid();
    this->ResizeTo( A.Height(), A.Width() );
    if( this->Participating() )
    {
        const Int numProcs = grid.Size();
        const Int m = this->Height();
        const Int n = this->Width();
        const Int mLocalA = A.LocalHeight();
        const Int mLocalMax = MaxLength(m,numProcs);
        const Int chunk = mpi::Pad( mLocalMax*n );

        // Layout of the scratch space: one send chunk followed by one
        // receive chunk per process.
        T* const sendBuf = this->auxMemory_.Require( (numProcs+1)*chunk );
        T* const recvBuf = sendBuf + chunk;

        // Pack: drop A's leading dimension so the columns are contiguous
        const Int srcLDim = A.LDim();
        const T* const srcBuf = A.LockedBuffer();
        PARALLEL_FOR
        for( Int j=0; j<n; ++j )
            MemCopy( sendBuf + j*mLocalA, srcBuf + j*srcLDim, mLocalA );

        // Communicate
        mpi::AllGather( sendBuf, chunk, recvBuf, chunk, grid.VRComm() );

        // Unpack: process k's rows start at its column shift and recur every
        // numProcs rows
        T* const dstBuf = this->Buffer();
        const Int dstLDim = this->LDim();
        const Int colAlignA = A.ColAlignment();
        OUTER_PARALLEL_FOR
        for( Int k=0; k<numProcs; ++k )
        {
            const T* const portion = recvBuf + k*chunk;
            const Int shift = Shift_( k, colAlignA, numProcs );
            const Int mLocal = Length_( m, shift, numProcs );
            INNER_PARALLEL_FOR
            for( Int j=0; j<n; ++j )
            {
                T* const dstCol = dstBuf + (shift+j*dstLDim);
                const T* const srcCol = portion + j*mLocal;
                for( Int iLoc=0; iLoc<mLocal; ++iLoc )
                    dstCol[iLoc*numProcs] = srcCol[iLoc];
            }
        }
        this->auxMemory_.Release();
    }
    return *this;
}
inline void DiagonalScale ( LeftOrRight side, Orientation orientation, const DistMatrix<typename Base<T>::type,U,V>& d, DistMatrix<T,W,Z>& X ) { #ifndef RELEASE PushCallStack("DiagonalScale"); #endif typedef typename Base<T>::type R; if( side == LEFT ) { if( U == W && V == STAR && d.ColAlignment() == X.ColAlignment() ) { DiagonalScale( LEFT, orientation, d.LockedMatrix(), X.Matrix() ); } else { DistMatrix<R,W,STAR> d_W_STAR( X.Grid() ); d_W_STAR = d; DiagonalScale ( LEFT, orientation, d_W_STAR.LockedMatrix(), X.Matrix() ); } } else { if( U == Z && V == STAR && d.ColAlignment() == X.RowAlignment() ) { DiagonalScale( RIGHT, orientation, d.LockedMatrix(), X.Matrix() ); } else { DistMatrix<R,Z,STAR> d_Z_STAR( X.Grid() ); d_Z_STAR = d; DiagonalScale ( RIGHT, orientation, d_Z_STAR.LockedMatrix(), X.Matrix() ); } } #ifndef RELEASE PopCallStack(); #endif }
inline void CheckInput ( Orientation orientationOfA, Orientation orientationOfB, const DistMatrix<T,STAR,MC >& A, const DistMatrix<T,MR, STAR>& B, const DistMatrix<T,MC, MR >& C ) { if( orientationOfA == NORMAL ) throw std::logic_error("A[* ,MC] must be (Conjugate)Transpose'd"); if( orientationOfB == NORMAL ) throw std::logic_error("B[MR,* ] must be (Conjugate)Transpose'd"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) throw std::logic_error ("A, B, and C must be distributed over the same grid"); if( A.Width() != C.Height() || B.Height() != C.Width() || A.Height() != B.Width() || A.Width() != B.Height() ) { std::ostringstream msg; msg << "Nonconformal LocalTrrk: \n" << " A[* ,MC] ~ " << A.Height() << " x " << A.Width() << "\n" << " B[MR,* ] ~ " << B.Height() << " x " << B.Width() << "\n" << " C[MC,MR] ~ " << C.Height() << " x " << C.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } if( A.RowAlignment() != C.ColAlignment() || B.ColAlignment() != C.RowAlignment() ) { std::ostringstream msg; msg << "Misaligned LocalTrrk: \n" << " A[* ,MC] ~ " << A.RowAlignment() << "\n" << " B[MR,* ] ~ " << B.ColAlignment() << "\n" << " C[MC,MR] ~ " << C.ColAlignment() << " , " << C.RowAlignment() << "\n"; throw std::logic_error( msg.str().c_str() ); } }
// Aligns this [MD,* ] matrix with the diagonal of the [MR,MC] matrix A that
// starts `offset` columns to the right of (offset >= 0) or `-offset` rows
// below (offset < 0) the top-left entry. The matrix is emptied first, and the
// column alignment is marked as constrained afterwards.
//
// The owner of the first diagonal entry determines the diagonal path and the
// alignment. For an [MR,MC] matrix the column alignment is reduced modulo the
// grid width c and the row alignment modulo the grid height r (this is also
// how AlignedWithDiagonal([MR,MC]) interprets them), so:
//   * offset >= 0: the first entry is (0,offset); its column index moves the
//     owner along the process rows: ownerRow = (rowAlignment+offset) % r.
//   * offset <  0: the first entry is (-offset,0); its row index moves the
//     owner along the process columns: ownerCol = (colAlignment-offset) % c.
// The previous code applied the offset to the opposite alignment, which
// disagreed with AlignedWithDiagonal for any nonzero offset.
inline void DistMatrix<T,MD,STAR,Int>::AlignWithDiagonal
( const DistMatrix<S,MR,MC,N>& A, Int offset )
{
#ifndef RELEASE
    PushCallStack("[MD,* ]::AlignWithDiagonal([MR,MC])");
    this->AssertFreeColAlignment();
    this->AssertSameGrid( A );
#endif
    const elem::Grid& g = this->Grid();
    const Int r = g.Height();
    const Int c = g.Width();
    const Int lcm = g.LCM();
    const Int colAlignment = A.ColAlignment();
    const Int rowAlignment = A.RowAlignment();

    this->Empty();

    // Grid coordinates of the process owning the first diagonal entry
    Int ownerRow, ownerCol;
    if( offset >= 0 )
    {
        ownerRow = (rowAlignment + offset) % r;
        ownerCol = colAlignment;
    }
    else
    {
        ownerRow = rowAlignment;
        ownerCol = (colAlignment - offset) % c;
    }
    const Int owner = ownerRow + r*ownerCol;

    this->diagPath_ = g.DiagPath(owner);
    this->colAlignment_ = g.DiagPathRank(owner);
    this->constrainedColAlignment_ = true;
    if( this->Participating() )
        this->colShift_ = (g.DiagPathRank()+lcm-this->colAlignment_) % lcm;
    else
        this->colShift_ = 0;
#ifndef RELEASE
    PopCallStack();
#endif
}
// Reports whether this [MD,* ] matrix is aligned with the diagonal of the
// [MR,MC] matrix A selected by `offset`.
//
// The grid coordinates implied by this matrix's diagonal path and column
// alignment are compared against the owner of the first entry of A's
// diagonal: entry (0,offset) for offset >= 0, entry (-offset,0) otherwise.
inline bool DistMatrix<T,MD,STAR,Int>::AlignedWithDiagonal
( const DistMatrix<S,MR,MC,N>& A, Int offset ) const
{
#ifndef RELEASE
    PushCallStack("[MD,* ]::AlignedWithDiagonal([MR,MC])");
    this->AssertSameGrid( A );
#endif
    const elem::Grid& grid = this->Grid();
    const Int gridHeight = grid.Height();
    const Int gridWidth = grid.Width();

    // Where our own first entry lives: the diagonal path starts in process
    // row 0 and process column diagPath_, then advances by the alignment.
    const Int pathStartRow = 0;
    const Int pathStartCol = this->diagPath_;
    const Int ourRow = (pathStartRow+this->ColAlignment()) % gridHeight;
    const Int ourCol = (pathStartCol+this->ColAlignment()) % gridWidth;

    // Where the first entry of A's diagonal lives
    const Int AColAlign = A.ColAlignment();
    const Int ARowAlign = A.RowAlignment();
    Int ownerRow, ownerCol;
    if( offset < 0 )
    {
        ownerCol = (AColAlign-offset) % gridWidth;
        ownerRow = ARowAlign;
    }
    else
    {
        ownerCol = AColAlign;
        ownerRow = (ARowAlign + offset) % gridHeight;
    }

    const bool aligned = ( ownerRow==ourRow && ownerCol==ourCol );
#ifndef RELEASE
    PopCallStack();
#endif
    return aligned;
}
inline void internal::CholeskyUVar3Square( DistMatrix<F,MC,MR>& A ) { #ifndef RELEASE PushCallStack("internal::CholeskyUVar3Square"); if( A.Height() != A.Width() ) throw std::logic_error ("Can only compute Cholesky factor of square matrices."); if( A.Grid().Height() != A.Grid().Width() ) throw std::logic_error ("CholeskyUVar3Square assumes a square process grid."); #endif const Grid& g = A.Grid(); // Find the process holding our transposed data const int r = g.Height(); int transposeRank; { const int colAlignment = A.ColAlignment(); const int rowAlignment = A.RowAlignment(); const int colShift = A.ColShift(); const int rowShift = A.RowShift(); const int transposeRow = (colAlignment+rowShift) % r; const int transposeCol = (rowAlignment+colShift) % r; transposeRank = transposeRow + r*transposeCol; } const bool onDiagonal = ( transposeRank == g.VCRank() ); // Matrix views DistMatrix<F,MC,MR> ATL(g), ATR(g), A00(g), A01(g), A02(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); // Temporary matrix distributions DistMatrix<F,STAR,STAR> A11_STAR_STAR(g); DistMatrix<F,STAR,VR > A12_STAR_VR(g); DistMatrix<F,STAR,MC > A12_STAR_MC(g); DistMatrix<F,STAR,MR > A12_STAR_MR(g); // Start the algorithm PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); while( ABR.Height() > 0 ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); A12_STAR_MC.AlignWith( A22 ); A12_STAR_MR.AlignWith( A22 ); A12_STAR_VR.AlignWith( A22 ); //--------------------------------------------------------------------// A11_STAR_STAR = A11; internal::LocalCholesky( UPPER, A11_STAR_STAR ); A11 = A11_STAR_STAR; A12_STAR_VR = A12; internal::LocalTrsm ( LEFT, UPPER, ADJOINT, NON_UNIT, (F)1, A11_STAR_STAR, A12_STAR_VR ); A12_STAR_MR = A12_STAR_VR; // SendRecv to form A12[* ,MC] from A12[* ,MR] A12_STAR_MC.ResizeTo( A12.Height(), A12.Width() ); { if( onDiagonal ) { const int size = 
A11.Height()*A22.LocalWidth(); MemCopy ( A12_STAR_MC.LocalBuffer(), A12_STAR_MR.LocalBuffer(), size ); } else { const int sendSize = A11.Height()*A22.LocalWidth(); const int recvSize = A11.Width()*A22.LocalHeight(); // We know that the ldim is the height since we have manually // created both temporary matrices. mpi::SendRecv ( A12_STAR_MR.LocalBuffer(), sendSize, transposeRank, 0, A12_STAR_MC.LocalBuffer(), recvSize, transposeRank, 0, g.VCComm() ); } } internal::LocalTrrk ( UPPER, ADJOINT, (F)-1, A12_STAR_MC, A12_STAR_MR, (F)1, A22 ); A12 = A12_STAR_MR; //--------------------------------------------------------------------// A12_STAR_MC.FreeAlignments(); A12_STAR_MR.FreeAlignments(); A12_STAR_VR.FreeAlignments(); SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void PanelLU ( DistMatrix<F, STAR,STAR>& A, DistMatrix<F, MC, STAR>& B, DistMatrix<int,STAR,STAR>& p, int pivotOffset ) { #ifndef RELEASE PushCallStack("internal::PanelLU"); if( A.Grid() != p.Grid() || p.Grid() != B.Grid() ) throw std::logic_error ("Matrices must be distributed over the same grid"); if( A.Width() != B.Width() ) throw std::logic_error("A and B must be the same width"); if( A.Height() != p.Height() || p.Width() != 1 ) throw std::logic_error("p must be a vector that conforms with A"); #endif const Grid& g = A.Grid(); const int r = g.Height(); const int colShift = B.ColShift(); const int colAlignment = B.ColAlignment(); // Matrix views DistMatrix<F,STAR,STAR> ATL(g), ATR(g), A00(g), a01(g), A02(g), ABL(g), ABR(g), a10(g), alpha11(g), a12(g), A20(g), a21(g), A22(g); DistMatrix<F,MC,STAR> BL(g), BR(g), B0(g), b1(g), B2(g); const int width = A.Width(); const int numBytes = (width+1)*sizeof(F)+sizeof(int); std::vector<byte> sendData(numBytes); std::vector<byte> recvData(numBytes); // Extract pointers to send and recv data // TODO: Think of how to make this safer with respect to alignment issues F* sendBufFloat = (F*)&sendData[0]; F* recvBufFloat = (F*)&recvData[0]; int* sendBufInt = (int*)&sendData[(width+1)*sizeof(F)]; int* recvBufInt = (int*)&recvData[(width+1)*sizeof(F)]; // Start the algorithm PushBlocksizeStack( 1 ); PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); PartitionRight( B, BL, BR, 0 ); while( ATL.Height() < A.Height() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ a01, A02, /*************/ /**********************/ /**/ a10, /**/ alpha11, a12, ABL, /**/ ABR, A20, /**/ a21, A22 ); RepartitionRight ( BL, /**/ BR, B0, /**/ b1, B2 ); //--------------------------------------------------------------------// const int currentRow = a01.Height(); // Store the index/value of the pivot candidate in A F pivot = alpha11.GetLocal(0,0); int pivotRow = currentRow; for( int i=0; i<a21.Height(); ++i ) { F value = a21.GetLocal(i,0); if( 
FastAbs(value) > FastAbs(pivot) ) { pivot = value; pivotRow = currentRow + i + 1; } } // Update the pivot candidate to include local data from B for( int i=0; i<B.LocalHeight(); ++i ) { F value = b1.GetLocal(i,0); if( FastAbs(value) > FastAbs(pivot) ) { pivot = value; pivotRow = A.Height() + colShift + i*r; } } // Fill the send buffer with: // [ pivotValue | pivot row data | pivotRow ] if( pivotRow < A.Height() ) { sendBufFloat[0] = A.GetLocal(pivotRow,a10.Width()); const int ALDim = A.LocalLDim(); const F* ABuffer = A.LocalBuffer(pivotRow,0); for( int j=0; j<width; ++j ) sendBufFloat[j+1] = ABuffer[j*ALDim]; } else { const int localRow = ((pivotRow-A.Height())-colShift)/r; sendBufFloat[0] = b1.GetLocal(localRow,0); const int BLDim = B.LocalLDim(); const F* BBuffer = B.LocalBuffer(localRow,0); for( int j=0; j<width; ++j ) sendBufFloat[j+1] = BBuffer[j*BLDim]; } *sendBufInt = pivotRow; // Communicate to establish the pivot information mpi::AllReduce ( &sendData[0], &recvData[0], numBytes, PivotOp<F>(), g.ColComm() ); // Update the pivot vector pivotRow = *recvBufInt; p.SetLocal(currentRow,0,pivotRow+pivotOffset); // Copy the current row into the pivot row if( pivotRow < A.Height() ) { const int ALDim = A.LocalLDim(); F* ASetBuffer = A.LocalBuffer(pivotRow,0); const F* AGetBuffer = A.LocalBuffer(currentRow,0); for( int j=0; j<width; ++j ) ASetBuffer[j*ALDim] = AGetBuffer[j*ALDim]; } else { const int ownerRank = (colAlignment+(pivotRow-A.Height())) % r; if( g.Row() == ownerRank ) { const int localRow = ((pivotRow-A.Height())-colShift) / r; const int ALDim = A.LocalLDim(); const int BLDim = B.LocalLDim(); F* BBuffer = B.LocalBuffer(localRow,0); const F* ABuffer = A.LocalBuffer(currentRow,0); for( int j=0; j<width; ++j ) BBuffer[j*BLDim] = ABuffer[j*ALDim]; } } // Copy the pivot row into the current row { F* ABuffer = A.LocalBuffer(currentRow,0); const int ALDim = A.LocalLDim(); for( int j=0; j<width; ++j ) ABuffer[j*ALDim] = recvBufFloat[j+1]; } // Now we can perform 
the update of the current panel const F alpha = alpha11.GetLocal(0,0); if( alpha == F(0) ) throw SingularMatrixException(); const F alpha11Inv = F(1) / alpha; Scale( alpha11Inv, a21.LocalMatrix() ); Scale( alpha11Inv, b1.LocalMatrix() ); Geru( F(-1), a21.LocalMatrix(), a12.LocalMatrix(), A22.LocalMatrix() ); Geru( F(-1), b1.LocalMatrix(), a12.LocalMatrix(), B2.LocalMatrix() ); //--------------------------------------------------------------------// SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, a01, /**/ A02, /**/ a10, alpha11, /**/ a12, /*************/ /**********************/ ABL, /**/ ABR, A20, a21, /**/ A22 ); SlidePartitionRight ( BL, /**/ BR, B0, b1, /**/ B2 ); } PopBlocksizeStack(); #ifndef RELEASE PopCallStack(); #endif }
inline void Var3( Orientation orientation, DistMatrix<F>& A, DistMatrix<F,MC,STAR>& d ) { #ifndef RELEASE PushCallStack("ldl::Var3"); if( orientation == NORMAL ) throw std::logic_error("Can only perform LDL^T and LDL^H"); if( A.Height() != A.Width() ) throw std::logic_error("A must be square"); if( A.Grid() != d.Grid() ) throw std::logic_error("A and d must use the same grid"); if( d.Viewing() && (d.Height() != A.Height() || d.Width() != 1) ) throw std::logic_error ("d must be a column vector of the same height as A"); if( d.Viewing() && d.ColAlignment() != A.ColAlignment() ) throw std::logic_error("d must be aligned with A"); #endif const Grid& g = A.Grid(); if( !d.Viewing() ) { d.AlignWith( A ); d.ResizeTo( A.Height(), 1 ); } // Matrix views DistMatrix<F> ATL(g), ATR(g), A00(g), A01(g), A02(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); DistMatrix<F,MC,STAR> dT(g), d0(g), dB(g), d1(g), d2(g); // Temporary matrices DistMatrix<F,STAR,STAR> A11_STAR_STAR(g); DistMatrix<F,STAR,STAR> d1_STAR_STAR(g); DistMatrix<F,VC, STAR> A21_VC_STAR(g); DistMatrix<F,VR, STAR> A21_VR_STAR(g); DistMatrix<F,STAR,MC > S21Trans_STAR_MC(g); DistMatrix<F,STAR,MR > A21AdjOrTrans_STAR_MR(g); const bool conjugate = ( orientation == ADJOINT ); // Start the algorithm PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); PartitionDown ( d, dT, dB, 0 ); while( ABR.Height() > 0 ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); RepartitionDown ( dT, d0, /**/ /**/ d1, dB, d2 ); A21_VC_STAR.AlignWith( A22 ); A21_VR_STAR.AlignWith( A22 ); S21Trans_STAR_MC.AlignWith( A22 ); A21AdjOrTrans_STAR_MR.AlignWith( A22 ); //--------------------------------------------------------------------// A11_STAR_STAR = A11; LocalLDL( orientation, A11_STAR_STAR, d1_STAR_STAR ); A11 = A11_STAR_STAR; d1 = d1_STAR_STAR; A21_VC_STAR = A21; LocalTrsm ( RIGHT, LOWER, orientation, UNIT, F(1), 
A11_STAR_STAR, A21_VC_STAR ); S21Trans_STAR_MC.TransposeFrom( A21_VC_STAR ); DiagonalSolve( RIGHT, NORMAL, d1_STAR_STAR, A21_VC_STAR ); A21_VR_STAR = A21_VC_STAR; A21AdjOrTrans_STAR_MR.TransposeFrom( A21_VR_STAR, conjugate ); LocalTrrk ( LOWER, TRANSPOSE, F(-1), S21Trans_STAR_MC, A21AdjOrTrans_STAR_MR, F(1), A22 ); A21 = A21_VC_STAR; //--------------------------------------------------------------------// A21_VC_STAR.FreeAlignments(); A21_VR_STAR.FreeAlignments(); S21Trans_STAR_MC.FreeAlignments(); A21AdjOrTrans_STAR_MR.FreeAlignments(); SlidePartitionDown ( dT, d0, d1, /**/ /**/ dB, d2 ); SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void LocalTrmmAccumulateRUN ( Orientation orientation, UnitOrNonUnit diag, T alpha, const DistMatrix<T,MC, MR >& U, const DistMatrix<T,STAR,MC >& X_STAR_MC, DistMatrix<T,MR, STAR>& ZTrans_MR_STAR ) { #ifndef RELEASE CallStackEntry entry("internal::LocalTrmmAccumulateRUN"); if( U.Grid() != X_STAR_MC.Grid() || X_STAR_MC.Grid() != ZTrans_MR_STAR.Grid() ) throw std::logic_error ("{U,X,Z} must be distributed over the same grid"); if( U.Height() != U.Width() || U.Height() != X_STAR_MC.Width() || U.Height() != ZTrans_MR_STAR.Height() ) { std::ostringstream msg; msg << "Nonconformal LocalTrmmAccumulateRUN: \n" << " U ~ " << U.Height() << " x " << U.Width() << "\n" << " X[* ,MC] ~ " << X_STAR_MC.Height() << " x " << X_STAR_MC.Width() << "\n" << " Z^H/T[MR,* ] ~ " << ZTrans_MR_STAR.Height() << " x " << ZTrans_MR_STAR.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } if( X_STAR_MC.RowAlignment() != U.ColAlignment() || ZTrans_MR_STAR.ColAlignment() != U.RowAlignment() ) throw std::logic_error("Partial matrix distributions are misaligned"); #endif const Grid& g = U.Grid(); // Matrix views DistMatrix<T> UTL(g), UTR(g), U00(g), U01(g), U02(g), UBL(g), UBR(g), U10(g), U11(g), U12(g), U20(g), U21(g), U22(g); DistMatrix<T> D11(g); DistMatrix<T,STAR,MC> XL_STAR_MC(g), XR_STAR_MC(g), X0_STAR_MC(g), X1_STAR_MC(g), X2_STAR_MC(g); DistMatrix<T,MR,STAR> ZTTrans_MR_STAR(g), Z0Trans_MR_STAR(g), ZBTrans_MR_STAR(g), Z1Trans_MR_STAR(g), Z2Trans_MR_STAR(g); const int ratio = std::max( g.Height(), g.Width() ); PushBlocksizeStack( ratio*Blocksize() ); LockedPartitionDownDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); LockedPartitionRight( X_STAR_MC, XL_STAR_MC, XR_STAR_MC, 0 ); PartitionDown ( ZTrans_MR_STAR, ZTTrans_MR_STAR, ZBTrans_MR_STAR, 0 ); while( UTL.Height() < U.Height() ) { LockedRepartitionDownDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); LockedRepartitionRight ( XL_STAR_MC, 
/**/ XR_STAR_MC, X0_STAR_MC, /**/ X1_STAR_MC, X2_STAR_MC ); RepartitionDown ( ZTTrans_MR_STAR, Z0Trans_MR_STAR, /***************/ /***************/ Z1Trans_MR_STAR, ZBTrans_MR_STAR, Z2Trans_MR_STAR ); D11.AlignWith( U11 ); //--------------------------------------------------------------------// D11 = U11; MakeTriangular( UPPER, D11 ); if( diag == UNIT ) SetDiagonal( D11, T(1) ); LocalGemm ( orientation, orientation, alpha, D11, X1_STAR_MC, T(1), Z1Trans_MR_STAR ); LocalGemm ( orientation, orientation, alpha, U01, X0_STAR_MC, T(1), Z1Trans_MR_STAR ); //--------------------------------------------------------------------// D11.FreeAlignments(); SlideLockedPartitionDownDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); SlideLockedPartitionRight ( XL_STAR_MC, /**/ XR_STAR_MC, X0_STAR_MC, X1_STAR_MC, /**/ X2_STAR_MC ); SlidePartitionDown ( ZTTrans_MR_STAR, Z0Trans_MR_STAR, Z1Trans_MR_STAR, /***************/ /***************/ ZBTrans_MR_STAR, Z2Trans_MR_STAR ); } PopBlocksizeStack(); }
inline void Cannon_NN ( T alpha, const DistMatrix<T>& A, const DistMatrix<T>& B, T beta, DistMatrix<T>& C ) { #ifndef RELEASE CallStackEntry entry("gemm::Cannon_NN"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) LogicError("{A,B,C} must have the same grid"); if( A.Height() != C.Height() || B.Width() != C.Width() || A.Width() != B.Height() ) { std::ostringstream msg; msg << "Nonconformal matrices: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " B ~ " << B.Height() << " x " << B.Width() << "\n" << " C ~ " << C.Height() << " x " << C.Width() << "\n"; LogicError( msg.str() ); } #endif const Grid& g = A.Grid(); if( g.Height() != g.Width() ) LogicError("Process grid must be square for Cannon's"); if( C.ColAlignment() != A.ColAlignment() || C.RowAlignment() != B.RowAlignment() ) LogicError("C is not properly aligned"); const Int row = g.Row(); const Int col = g.Col(); const Int pSqrt = g.Height(); mpi::Comm rowComm = g.RowComm(); mpi::Comm colComm = g.ColComm(); if( A.Width() % pSqrt != 0 ) LogicError("For now, width(A) must be integer multiple of sqrt(p)"); // Begin by scaling our local portion of C Scale( beta, C ); // Load the initial A and B packages (may want to transpose B...) 
const Int localHeightA = A.LocalHeight(); const Int localHeightB = B.LocalHeight(); const Int localWidthA = A.LocalWidth(); const Int localWidthB = B.LocalWidth(); Matrix<T> pkgA(localHeightA,localWidthA,localHeightA), pkgB(localHeightB,localWidthB,localHeightB); for( Int jLoc=0; jLoc<localWidthA; ++jLoc ) MemCopy ( pkgA.Buffer(0,jLoc), A.LockedBuffer(0,jLoc), localHeightA ); for( Int jLoc=0; jLoc<localWidthB; ++jLoc ) MemCopy ( pkgB.Buffer(0,jLoc), B.LockedBuffer(0,jLoc), localHeightB ); // Perform the initial circular shifts so that our A and B packages align const Int rowShiftA = A.RowShift(); const Int colShiftB = B.ColShift(); const Int leftInitA = (col+pSqrt-colShiftB) % pSqrt; const Int rightInitA = (col+colShiftB) % pSqrt; const Int aboveInitB = (row+pSqrt-rowShiftA) % pSqrt; const Int belowInitB = (row+rowShiftA) % pSqrt; const Int pkgSizeA = localHeightA*localWidthA; const Int pkgSizeB = localHeightB*localWidthB; mpi::SendRecv( pkgA.Buffer(), pkgSizeA, leftInitA, rightInitA, rowComm ); mpi::SendRecv( pkgB.Buffer(), pkgSizeB, aboveInitB, belowInitB, colComm ); // Now begin the data flow const Int aboveRow = (row+pSqrt-1) % pSqrt; const Int belowRow = (row+1) % pSqrt; const Int leftCol = (col+pSqrt-1) % pSqrt; const Int rightCol = (col+1) % pSqrt; for( Int q=0; q<pSqrt; ++q ) { Gemm( NORMAL, NORMAL, alpha, pkgA, pkgB, T(1), C.Matrix() ); if( q != pSqrt-1 ) { mpi::SendRecv ( pkgA.Buffer(), pkgSizeA, leftCol, rightCol, rowComm ); mpi::SendRecv ( pkgB.Buffer(), pkgSizeB, aboveRow, belowRow, colComm ); } } }
inline void LocalSymvColAccumulateU ( T alpha, const DistMatrix<T>& A, const DistMatrix<T,MC,STAR>& x_MC_STAR, const DistMatrix<T,MR,STAR>& x_MR_STAR, DistMatrix<T,MC,STAR>& z_MC_STAR, DistMatrix<T,MR,STAR>& z_MR_STAR, bool conjugate=false ) { #ifndef RELEASE CallStackEntry entry("internal::LocalSymvColAccumulateU"); if( A.Grid() != x_MC_STAR.Grid() || x_MC_STAR.Grid() != x_MR_STAR.Grid() || x_MR_STAR.Grid() != z_MC_STAR.Grid() || z_MC_STAR.Grid() != z_MR_STAR.Grid() ) LogicError ("{A,x,z} must be distributed over the same grid"); if( x_MC_STAR.Width() != 1 || x_MR_STAR.Width() != 1 || z_MC_STAR.Width() != 1 || z_MR_STAR.Width() != 1 ) LogicError("Expected x and z to be column vectors"); if( A.Height() != A.Width() || A.Height() != x_MC_STAR.Height() || A.Height() != x_MR_STAR.Height() || A.Height() != z_MC_STAR.Height() || A.Height() != z_MR_STAR.Height() ) { std::ostringstream msg; msg << "Nonconformal LocalSymvColAccumulateU: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " x[MC,* ] ~ " << x_MC_STAR.Height() << " x " << x_MC_STAR.Width() << "\n" << " x[MR,* ] ~ " << x_MR_STAR.Height() << " x " << x_MR_STAR.Width() << "\n" << " z[MC,* ] ~ " << z_MC_STAR.Height() << " x " << z_MC_STAR.Width() << "\n" << " z[MR,* ] ~ " << z_MR_STAR.Height() << " x " << z_MR_STAR.Width() << "\n"; LogicError( msg.str() ); } if( x_MC_STAR.ColAlignment() != A.ColAlignment() || x_MR_STAR.ColAlignment() != A.RowAlignment() || z_MC_STAR.ColAlignment() != A.ColAlignment() || z_MR_STAR.ColAlignment() != A.RowAlignment() ) LogicError("Partial matrix distributions are misaligned"); #endif const Grid& g = A.Grid(); const Orientation orientation = ( conjugate ? 
ADJOINT : TRANSPOSE ); // Matrix views DistMatrix<T> A11(g), A12(g); DistMatrix<T> D11(g); DistMatrix<T,MC,STAR> x1_MC_STAR(g); DistMatrix<T,MR,STAR> xT_MR_STAR(g), x0_MR_STAR(g), xB_MR_STAR(g), x1_MR_STAR(g), x2_MR_STAR(g); DistMatrix<T,MC,STAR> z1_MC_STAR(g); DistMatrix<T,MR,STAR> z1_MR_STAR(g), z2_MR_STAR(g); // We want our local gemvs to be of width blocksize, so we will // temporarily change to max(r,c) times the current blocksize const Int ratio = Max( g.Height(), g.Width() ); PushBlocksizeStack( ratio*LocalSymvBlocksize<T>() ); LockedPartitionDown ( x_MR_STAR, xT_MR_STAR, xB_MR_STAR, 0 ); while( xT_MR_STAR.Height() < x_MR_STAR.Height() ) { LockedRepartitionDown ( xT_MR_STAR, x0_MR_STAR, /**********/ /**********/ x1_MR_STAR, xB_MR_STAR, x2_MR_STAR ); const Int n0 = x0_MR_STAR.Height(); const Int n1 = x1_MR_STAR.Height(); const Int n2 = x2_MR_STAR.Height(); LockedView( A11, A, n0, n0, n1, n1 ); LockedView( A12, A, n0, n0+n1, n1, n2 ); LockedView( x1_MC_STAR, x_MC_STAR, n0, 0, n1, 1 ); View( z1_MC_STAR, z_MC_STAR, n0, 0, n1, 1 ); View( z1_MR_STAR, z_MR_STAR, n0, 0, n1, 1 ); View( z2_MR_STAR, z_MR_STAR, n0+n1, 0, n2, 1 ); D11.AlignWith( A11 ); //--------------------------------------------------------------------// // TODO: These diagonal block updates can be greatly improved D11 = A11; MakeTriangular( UPPER, D11 ); LocalGemv( NORMAL, alpha, D11, x1_MR_STAR, T(1), z1_MC_STAR ); SetDiagonal( D11, T(0) ); LocalGemv( orientation, alpha, D11, x1_MC_STAR, T(1), z1_MR_STAR ); LocalGemv( NORMAL, alpha, A12, x2_MR_STAR, T(1), z1_MC_STAR ); LocalGemv( orientation, alpha, A12, x1_MC_STAR, T(1), z2_MR_STAR ); //--------------------------------------------------------------------// SlideLockedPartitionDown ( xT_MR_STAR, x0_MR_STAR, x1_MR_STAR, /**********/ /**********/ xB_MR_STAR, x2_MR_STAR ); } PopBlocksizeStack(); }
// Unconjugated dot product of the distributed vectors x and y, where y is
// [MC,MR] and x has an arbitrary distribution. Each vector may be stored as a
// row or as a column.
//
// x is first copied into a temporary aligned with y: [MC,MR] when both
// vectors have the same orientation, [MR,MC] otherwise. The processes that
// own y form local dot products and sum them with an AllReduce along the
// process column (y a column vector) or process row (y a row vector) holding
// y; the result is then broadcast from that owner across the other
// communicator.
//
// Grid, shape and length checks run only when RELEASE is not defined.
inline T DotuHelper
( const DistMatrix<T,U,V>& x, const DistMatrix<T,MC,MR>& y )
{
#ifndef RELEASE
    PushCallStack("internal::DotuHelper");
    if( x.Grid() != y.Grid() )
        throw std::logic_error("{x,y} must be distributed over the same grid");
    if( (x.Height() != 1 && x.Width() != 1) ||
        (y.Height() != 1 && y.Width() != 1) )
        throw std::logic_error("Dotu requires x and y to be vectors");
    int xLength = ( x.Width() == 1 ? x.Height() : x.Width() );
    int yLength = ( y.Width() == 1 ? y.Height() : y.Width() );
    if( xLength != yLength )
        throw std::logic_error("Dotu requires x and y to be the same length");
#endif
    const Grid& grid = x.Grid();
    const bool xIsColumn = ( x.Width() == 1 );
    const bool yIsColumn = ( y.Width() == 1 );

    // Only assigned by the owners of y before the broadcast
    T result;

    if( xIsColumn && yIsColumn )
    {
        DistMatrix<T,MC,MR> xAligned(grid);
        xAligned.AlignWith( y );
        xAligned = x;

        int rootCol = y.RowAlignment();
        if( grid.Col() == rootCol )
        {
            T partial =
                Dotu( xAligned.LockedLocalMatrix(), y.LockedLocalMatrix() );
            mpi::AllReduce( &partial, &result, 1, mpi::SUM, grid.ColComm() );
        }
        mpi::Broadcast( &result, 1, rootCol, grid.RowComm() );
    }
    else if( xIsColumn )
    {
        // x is a column, y is a row
        DistMatrix<T,MR,MC> xAligned(grid);
        xAligned.AlignWith( y );
        xAligned = x;

        int rootRow = y.ColAlignment();
        if( grid.Row() == rootRow )
        {
            T partial =
                Dotu( xAligned.LockedLocalMatrix(), y.LockedLocalMatrix() );
            mpi::AllReduce( &partial, &result, 1, mpi::SUM, grid.RowComm() );
        }
        mpi::Broadcast( &result, 1, rootRow, grid.ColComm() );
    }
    else if( yIsColumn )
    {
        // x is a row, y is a column
        DistMatrix<T,MR,MC> xAligned(grid);
        xAligned.AlignWith( y );
        xAligned = x;

        int rootCol = y.RowAlignment();
        if( grid.Col() == rootCol )
        {
            T partial =
                Dotu( xAligned.LockedLocalMatrix(), y.LockedLocalMatrix() );
            mpi::AllReduce( &partial, &result, 1, mpi::SUM, grid.ColComm() );
        }
        mpi::Broadcast( &result, 1, rootCol, grid.RowComm() );
    }
    else
    {
        // Both are rows
        DistMatrix<T,MC,MR> xAligned(grid);
        xAligned.AlignWith( y );
        xAligned = x;

        int rootRow = y.ColAlignment();
        if( grid.Row() == rootRow )
        {
            T partial =
                Dotu( xAligned.LockedLocalMatrix(), y.LockedLocalMatrix() );
            mpi::AllReduce( &partial, &result, 1, mpi::SUM, grid.RowComm() );
        }
        mpi::Broadcast( &result, 1, rootRow, grid.ColComm() );
    }
#ifndef RELEASE
    PopCallStack();
#endif
    return result;
}
inline void LocalSymmetricAccumulateLU ( Orientation orientation, T alpha, const DistMatrix<T>& A, const DistMatrix<T,MC, STAR>& B_MC_STAR, const DistMatrix<T,STAR,MR >& BAdjOrTrans_STAR_MR, DistMatrix<T,MC, STAR>& Z_MC_STAR, DistMatrix<T,MR, STAR>& Z_MR_STAR ) { #ifndef RELEASE PushCallStack("internal::LocalSymmetricAccumulateLU"); if( A.Grid() != B_MC_STAR.Grid() || B_MC_STAR.Grid() != BAdjOrTrans_STAR_MR.Grid() || BAdjOrTrans_STAR_MR.Grid() != Z_MC_STAR.Grid() || Z_MC_STAR.Grid() != Z_MR_STAR.Grid() ) throw std::logic_error ("{A,B,Z} must be distributed over the same grid"); if( A.Height() != A.Width() || A.Height() != B_MC_STAR.Height() || A.Height() != BAdjOrTrans_STAR_MR.Width() || A.Height() != Z_MC_STAR.Height() || A.Height() != Z_MR_STAR.Height() || B_MC_STAR.Width() != BAdjOrTrans_STAR_MR.Height() || BAdjOrTrans_STAR_MR.Height() != Z_MC_STAR.Width() || Z_MC_STAR.Width() != Z_MR_STAR.Width() ) { std::ostringstream msg; msg << "Nonconformal LocalSymmetricAccumulateLU: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " B[MC,* ] ~ " << B_MC_STAR.Height() << " x " << B_MC_STAR.Width() << "\n" << " B^H/T[* ,MR] ~ " << BAdjOrTrans_STAR_MR.Height() << " x " << BAdjOrTrans_STAR_MR.Width() << "\n" << " Z[MC,* ] ~ " << Z_MC_STAR.Height() << " x " << Z_MC_STAR.Width() << "\n" << " Z[MR,* ] ` " << Z_MR_STAR.Height() << " x " << Z_MR_STAR.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } if( B_MC_STAR.ColAlignment() != A.ColAlignment() || BAdjOrTrans_STAR_MR.RowAlignment() != A.RowAlignment() || Z_MC_STAR.ColAlignment() != A.ColAlignment() || Z_MR_STAR.ColAlignment() != A.RowAlignment() ) throw std::logic_error("Partial matrix distributions are misaligned"); #endif const Grid& g = A.Grid(); DistMatrix<T> ATL(g), ATR(g), A00(g), A01(g), A02(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); DistMatrix<T> D11(g); DistMatrix<T,MC,STAR> BT_MC_STAR(g), B0_MC_STAR(g), BB_MC_STAR(g), B1_MC_STAR(g), B2_MC_STAR(g); 
DistMatrix<T,STAR,MR> BLAdjOrTrans_STAR_MR(g), BRAdjOrTrans_STAR_MR(g), B0AdjOrTrans_STAR_MR(g), B1AdjOrTrans_STAR_MR(g), B2AdjOrTrans_STAR_MR(g); DistMatrix<T,MC,STAR> ZT_MC_STAR(g), Z0_MC_STAR(g), ZB_MC_STAR(g), Z1_MC_STAR(g), Z2_MC_STAR(g); DistMatrix<T,MR,STAR> ZT_MR_STAR(g), Z0_MR_STAR(g), ZB_MR_STAR(g), Z1_MR_STAR(g), Z2_MR_STAR(g); const int ratio = std::max( g.Height(), g.Width() ); PushBlocksizeStack( ratio*Blocksize() ); LockedPartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionDown ( B_MC_STAR, BT_MC_STAR, BB_MC_STAR, 0 ); LockedPartitionRight ( BAdjOrTrans_STAR_MR, BLAdjOrTrans_STAR_MR, BRAdjOrTrans_STAR_MR, 0 ); PartitionDown ( Z_MC_STAR, ZT_MC_STAR, ZB_MC_STAR, 0 ); PartitionDown ( Z_MR_STAR, ZT_MR_STAR, ZB_MR_STAR, 0 ); while( ATL.Height() < A.Height() ) { LockedRepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionDown ( BT_MC_STAR, B0_MC_STAR, /**********/ /**********/ B1_MC_STAR, BB_MC_STAR, B2_MC_STAR ); LockedRepartitionRight ( BLAdjOrTrans_STAR_MR, /**/ BRAdjOrTrans_STAR_MR, B0AdjOrTrans_STAR_MR, /**/ B1AdjOrTrans_STAR_MR, B2AdjOrTrans_STAR_MR ); RepartitionDown ( ZT_MC_STAR, Z0_MC_STAR, /**********/ /**********/ Z1_MC_STAR, ZB_MC_STAR, Z2_MC_STAR ); RepartitionDown ( ZT_MR_STAR, Z0_MR_STAR, /**********/ /**********/ Z1_MR_STAR, ZB_MR_STAR, Z2_MR_STAR ); D11.AlignWith( A11 ); //--------------------------------------------------------------------// D11 = A11; MakeTrapezoidal( LEFT, UPPER, 0, D11 ); LocalGemm ( NORMAL, orientation, alpha, D11, B1AdjOrTrans_STAR_MR, T(1), Z1_MC_STAR ); MakeTrapezoidal( LEFT, UPPER, 1, D11 ); LocalGemm ( orientation, NORMAL, alpha, D11, B1_MC_STAR, T(1), Z1_MR_STAR ); LocalGemm ( NORMAL, orientation, alpha, A12, B2AdjOrTrans_STAR_MR, T(1), Z1_MC_STAR ); LocalGemm ( orientation, NORMAL, alpha, A12, B1_MC_STAR, T(1), Z2_MR_STAR ); 
//--------------------------------------------------------------------// D11.FreeAlignments(); SlideLockedPartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlideLockedPartitionDown ( BT_MC_STAR, B0_MC_STAR, B1_MC_STAR, /**********/ /**********/ BB_MC_STAR, B2_MC_STAR ); SlideLockedPartitionRight ( BLAdjOrTrans_STAR_MR, /**/ BRAdjOrTrans_STAR_MR, B0AdjOrTrans_STAR_MR, B1AdjOrTrans_STAR_MR, /**/ B2AdjOrTrans_STAR_MR ); SlidePartitionDown ( ZT_MC_STAR, Z0_MC_STAR, Z1_MC_STAR, /**********/ /**********/ ZB_MC_STAR, Z2_MC_STAR ); SlidePartitionDown ( ZT_MR_STAR, Z0_MR_STAR, Z1_MR_STAR, /**********/ /**********/ ZB_MR_STAR, Z2_MR_STAR ); } PopBlocksizeStack(); #ifndef RELEASE PopCallStack(); #endif }
inline void TrsmLUNSmall ( UnitOrNonUnit diag, F alpha, const DistMatrix<F,VC,STAR>& U, DistMatrix<F,VC,STAR>& X, bool checkIfSingular ) { #ifndef RELEASE PushCallStack("internal::TrsmLUNSmall"); if( U.Grid() != X.Grid() ) throw std::logic_error ("U and X must be distributed over the same grid"); if( U.Height() != U.Width() || U.Width() != X.Height() ) { std::ostringstream msg; msg << "Nonconformal TrsmLUN: \n" << " U ~ " << U.Height() << " x " << U.Width() << "\n" << " X ~ " << X.Height() << " x " << X.Width() << "\n"; throw std::logic_error( msg.str() ); } if( U.ColAlignment() != X.ColAlignment() ) throw std::logic_error("U and X are assumed to be aligned"); #endif const Grid& g = U.Grid(); // Matrix views DistMatrix<F,VC,STAR> UTL(g), UTR(g), U00(g), U01(g), U02(g), UBL(g), UBR(g), U10(g), U11(g), U12(g), U20(g), U21(g), U22(g); DistMatrix<F,VC,STAR> XT(g), X0(g), XB(g), X1(g), X2(g); // Temporary distributions DistMatrix<F,STAR,STAR> U11_STAR_STAR(g); DistMatrix<F,STAR,STAR> X1_STAR_STAR(g); // Start the algorithm Scale( alpha, X ); LockedPartitionUpDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); PartitionUp ( X, XT, XB, 0 ); while( XT.Height() > 0 ) { LockedRepartitionUpDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); RepartitionUp ( XT, X0, X1, /**/ /**/ XB, X2 ); //--------------------------------------------------------------------// U11_STAR_STAR = U11; // U11[* ,* ] <- U11[VC,* ] X1_STAR_STAR = X1; // X1[* ,* ] <- X1[VC,* ] // X1[* ,* ] := U11^-1[* ,* ] X1[* ,* ] LocalTrsm ( LEFT, UPPER, NORMAL, diag, F(1), U11_STAR_STAR, X1_STAR_STAR, checkIfSingular ); X1 = X1_STAR_STAR; // X0[VC,* ] -= U01[VC,* ] X1[* ,* ] LocalGemm( NORMAL, NORMAL, F(-1), U01, X1_STAR_STAR, F(1), X0 ); //--------------------------------------------------------------------// SlideLockedPartitionUpDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, 
U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); SlidePartitionUp ( XT, X0, /**/ /**/ X1, XB, X2 ); } #ifndef RELEASE PopCallStack(); #endif }
// Permutes rows of the distributed matrix A in place with a single
// AllToAll over the grid's column communicator.
//
// With b = image.size():
//   * for every i < b, the data held in row i is delivered to row preimage[i];
//   * for every i < b with image[i] >= b, row i is overwritten with the data
//     held in row image[i].
// (Rows i < b whose source image[i] is also < b are covered by the first
// rule, applied to the source row.)
//
// Parameters:
//   A         matrix whose rows are permuted (modified in place)
//   image     length-b vector; image[i] is the row whose data lands in row i
//   preimage  length-b vector; preimage[i] is the row that receives row i
//
// In non-RELEASE builds a std::logic_error is thrown if the two vectors have
// different lengths or are longer than A is tall, and if the computed total
// send and receive counts disagree.
inline void
ApplyRowPivots
( DistMatrix<F>& A,
  const std::vector<int>& image,
  const std::vector<int>& preimage )
{
    const int b = image.size();
#ifndef RELEASE
    PushCallStack("ApplyRowPivots");
    if( A.Height() < b || b != (int)preimage.size() )
        throw std::logic_error
        ("image and preimage must be vectors of equal length that are not "
         "taller than A.");
#endif
    const int localWidth = A.LocalWidth();
    // Nothing to move for an empty matrix
    if( A.Height() == 0 || A.Width() == 0 )
    {
#ifndef RELEASE
        PopCallStack();
#endif
        return;
    }

    // Extract the relevant process grid information
    const Grid& g = A.Grid();
    const int r = g.Height();
    const int colAlignment = A.ColAlignment();
    const int colShift = A.ColShift();
    const int myRow = g.Row();

    // Extract the send and recv counts from the image and preimage.
    // This process's sends may be logically partitioned into two sets:
    //   (a) sends from rows [0,...,b-1]
    //   (b) sends from rows [b,...]
    // The latter is analyzed with image, the former deduced with preimage.
    std::vector<int> sendCounts(r,0), recvCounts(r,0);
    // Rows i < b that this process owns (i = colShift, colShift+r, ...):
    // each is sent to the owner of preimage[i] and refilled from the owner
    // of image[i]. The owner of global row j is (colAlignment+j) % r.
    for( int i=colShift; i<b; i+=r )
    {
        const int sendRow = preimage[i];
        const int sendTo = (colAlignment+sendRow) % r;
        sendCounts[sendTo] += localWidth;

        const int recvRow = image[i];
        const int recvFrom = (colAlignment+recvRow) % r;
        recvCounts[recvFrom] += localWidth;
    }
    // Rows >= b that this process owns: they appear only as targets
    // (preimage[i] >= b) or sources (image[i] >= b) of some row i < b, so
    // every i < b has to be inspected.
    for( int i=0; i<b; ++i )
    {
        const int sendRow = preimage[i];
        if( sendRow >= b )
        {
            const int sendTo = (colAlignment+sendRow) % r;
            if( sendTo == myRow )
            {
                const int sendFrom = (colAlignment+i) % r;
                recvCounts[sendFrom] += localWidth;
            }
        }

        const int recvRow = image[i];
        if( recvRow >= b )
        {
            const int recvFrom = (colAlignment+recvRow) % r;
            if( recvFrom == myRow )
            {
                const int recvTo = (colAlignment+i) % r;
                sendCounts[recvTo] += localWidth;
            }
        }
    }

    // Construct the send and recv displacements from the counts
    // (exclusive prefix sums; the totals size the buffers below)
    std::vector<int> sendDispls(r), recvDispls(r);
    int totalSend=0, totalRecv=0;
    for( int i=0; i<r; ++i )
    {
        sendDispls[i] = totalSend;
        recvDispls[i] = totalRecv;
        totalSend += sendCounts[i];
        totalRecv += recvCounts[i];
    }
#ifndef RELEASE
    if( totalSend != totalRecv )
    {
        std::ostringstream msg;
        msg << "Send and recv counts do not match: (send,recv)="
            << totalSend << "," << totalRecv;
        throw std::logic_error( msg.str().c_str() );
    }
#endif

    // Fill vectors with the send data
    // NOTE: the packing order here (first the local rows < b, then the local
    // rows >= b in increasing i) must mirror the unpacking order below.
    const int ALDim = A.LocalLDim();
    // std::max(1,...) keeps &sendData[0] valid when nothing is sent
    std::vector<F> sendData(std::max(1,totalSend));
    // offsets[k]: number of entries already packed for process row k
    std::vector<int> offsets(r,0);
    const int localHeight = LocalLength( b, colShift, r );
    for( int iLocal=0; iLocal<localHeight; ++iLocal )
    {
        const int sendRow = preimage[colShift+iLocal*r];
        const int sendTo = (colAlignment+sendRow) % r;
        const int offset = sendDispls[sendTo]+offsets[sendTo];
        // Walk along local row iLocal; consecutive columns are ALDim apart
        const F* ABuffer = A.LocalBuffer(iLocal,0);
        for( int jLocal=0; jLocal<localWidth; ++jLocal )
            sendData[offset+jLocal] = ABuffer[jLocal*ALDim];
        offsets[sendTo] += localWidth;
    }
    for( int i=0; i<b; ++i )
    {
        const int recvRow = image[i];
        if( recvRow >= b )
        {
            const int recvFrom = (colAlignment+recvRow) % r;
            if( recvFrom == myRow )
            {
                // This process owns source row image[i] >= b; ship it to
                // the owner of row i.
                const int recvTo = (colAlignment+i) % r;
                const int iLocal = (recvRow-colShift) / r;
                const int offset = sendDispls[recvTo]+offsets[recvTo];
                const F* ABuffer = A.LocalBuffer(iLocal,0);
                for( int jLocal=0; jLocal<localWidth; ++jLocal )
                    sendData[offset+jLocal] = ABuffer[jLocal*ALDim];
                offsets[recvTo] += localWidth;
            }
        }
    }

    // Communicate all pivot rows
    std::vector<F> recvData(std::max(1,totalRecv));
    mpi::AllToAll
    ( &sendData[0], &sendCounts[0], &sendDispls[0],
      &recvData[0], &recvCounts[0], &recvDispls[0], g.ColComm() );

    // Unpack the recv data
    // First pass: for each sending process row k, replay the rows i < b it
    // owns and pick out those whose destination preimage[i] is local.
    for( int k=0; k<r; ++k )
    {
        // Reuse offsets[] as the per-source read position
        offsets[k] = 0;
        int thisColShift = Shift( k, colAlignment, r );
        for( int i=thisColShift; i<b; i+=r )
        {
            const int sendRow = preimage[i];
            const int sendTo = (colAlignment+sendRow) % r;
            if( sendTo == myRow )
            {
                const int offset = recvDispls[k]+offsets[k];
                const int iLocal = (sendRow-colShift) / r;
                F* ABuffer = A.LocalBuffer(iLocal,0);
                for( int jLocal=0; jLocal<localWidth; ++jLocal )
                    ABuffer[jLocal*ALDim] = recvData[offset+jLocal];
                offsets[k] += localWidth;
            }
        }
    }
    // Second pass: local rows i < b whose source image[i] is >= b; their
    // data follows the first-pass data in each source's segment.
    for( int i=0; i<b; ++i )
    {
        const int recvRow = image[i];
        if( recvRow >= b )
        {
            const int recvTo = (colAlignment+i) % r;
            if( recvTo == myRow )
            {
                const int recvFrom = (colAlignment+recvRow) % r;
                const int iLocal = (i-colShift) / r;
                const int offset = recvDispls[recvFrom]+offsets[recvFrom];
                F* ABuffer = A.LocalBuffer(iLocal,0);
                for( int jLocal=0; jLocal<localWidth; ++jLocal )
                    ABuffer[jLocal*ALDim] = recvData[offset+jLocal];
                offsets[recvFrom] += localWidth;
            }
        }
    }
#ifndef RELEASE
    PopCallStack();
#endif
}
inline void internal::LocalTrmmAccumulateLUN ( Orientation orientation, UnitOrNonUnit diag, T alpha, const DistMatrix<T,MC, MR >& U, const DistMatrix<T,STAR,MR >& XAdjOrTrans_STAR_MR, DistMatrix<T,MC, STAR>& Z_MC_STAR ) { #ifndef RELEASE PushCallStack("internal::LocalTrmmAccumulateLUN"); if( U.Grid() != XAdjOrTrans_STAR_MR.Grid() || XAdjOrTrans_STAR_MR.Grid() != Z_MC_STAR.Grid() ) throw std::logic_error ("{U,X,Z} must be distributed over the same grid"); if( U.Height() != U.Width() || U.Height() != XAdjOrTrans_STAR_MR.Width() || U.Height() != Z_MC_STAR.Height() || XAdjOrTrans_STAR_MR.Height() != Z_MC_STAR.Width() ) { std::ostringstream msg; msg << "Nonconformal LocalTrmmAccumulateLUN: \n" << " U ~ " << U.Height() << " x " << U.Width() << "\n" << " X^H/T[* ,MR] ~ " << XAdjOrTrans_STAR_MR.Height() << " x " << XAdjOrTrans_STAR_MR.Width() << "\n" << " Z[MC,* ] ~ " << Z_MC_STAR.Height() << " x " << Z_MC_STAR.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } if( XAdjOrTrans_STAR_MR.RowAlignment() != U.RowAlignment() || Z_MC_STAR.ColAlignment() != U.ColAlignment() ) throw std::logic_error("Partial matrix distributions are misaligned"); #endif const Grid& g = U.Grid(); // Matrix views DistMatrix<T,MC,MR> UTL(g), UTR(g), U00(g), U01(g), U02(g), UBL(g), UBR(g), U10(g), U11(g), U12(g), U20(g), U21(g), U22(g); DistMatrix<T,MC,MR> D11(g); DistMatrix<T,STAR,MR> XLAdjOrTrans_STAR_MR(g), XRAdjOrTrans_STAR_MR(g), X0AdjOrTrans_STAR_MR(g), X1AdjOrTrans_STAR_MR(g), X2AdjOrTrans_STAR_MR(g); DistMatrix<T,MC,STAR> ZT_MC_STAR(g), Z0_MC_STAR(g), ZB_MC_STAR(g), Z1_MC_STAR(g), Z2_MC_STAR(g); const int ratio = std::max( g.Height(), g.Width() ); PushBlocksizeStack( ratio*Blocksize() ); LockedPartitionDownDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); LockedPartitionRight ( XAdjOrTrans_STAR_MR, XLAdjOrTrans_STAR_MR, XRAdjOrTrans_STAR_MR, 0 ); PartitionDown ( Z_MC_STAR, ZT_MC_STAR, ZB_MC_STAR, 0 ); while( UTL.Height() < U.Height() ) { LockedRepartitionDownDiagonal ( UTL, /**/ UTR, U00, 
/**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); LockedRepartitionRight ( XLAdjOrTrans_STAR_MR, /**/ XRAdjOrTrans_STAR_MR, X0AdjOrTrans_STAR_MR, /**/ X1AdjOrTrans_STAR_MR, X2AdjOrTrans_STAR_MR ); RepartitionDown ( ZT_MC_STAR, Z0_MC_STAR, /**********/ /**********/ Z1_MC_STAR, ZB_MC_STAR, Z2_MC_STAR ); D11.AlignWith( U11 ); //--------------------------------------------------------------------// D11 = U11; MakeTrapezoidal( LEFT, UPPER, 0, D11 ); if( diag == UNIT ) SetDiagonalToOne( D11 ); internal::LocalGemm ( NORMAL, orientation, alpha, D11, X1AdjOrTrans_STAR_MR, (T)1, Z1_MC_STAR ); internal::LocalGemm ( NORMAL, orientation, alpha, U01, X1AdjOrTrans_STAR_MR, (T)1, Z0_MC_STAR ); //--------------------------------------------------------------------// D11.FreeAlignments(); SlideLockedPartitionDownDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); SlideLockedPartitionRight ( XLAdjOrTrans_STAR_MR, /**/ XRAdjOrTrans_STAR_MR, X0AdjOrTrans_STAR_MR, X1AdjOrTrans_STAR_MR, /**/ X2AdjOrTrans_STAR_MR ); SlidePartitionDown ( ZT_MC_STAR, Z0_MC_STAR, Z1_MC_STAR, /**********/ /**********/ ZB_MC_STAR, Z2_MC_STAR ); } PopBlocksizeStack(); #ifndef RELEASE PopCallStack(); #endif }
inline void LocalSymvRowAccumulateU ( T alpha, const DistMatrix<T>& A, const DistMatrix<T,STAR,MC>& x_STAR_MC, const DistMatrix<T,STAR,MR>& x_STAR_MR, DistMatrix<T,STAR,MC>& z_STAR_MC, DistMatrix<T,STAR,MR>& z_STAR_MR ) { #ifndef RELEASE PushCallStack("internal::LocalSymvRowAccumulateU"); if( A.Grid() != x_STAR_MC.Grid() || x_STAR_MC.Grid() != x_STAR_MR.Grid() || x_STAR_MR.Grid() != z_STAR_MC.Grid() || z_STAR_MC.Grid() != z_STAR_MR.Grid() ) throw std::logic_error ("{A,x,z} must be distributed over the same grid"); if( x_STAR_MC.Height() != 1 || x_STAR_MR.Height() != 1 || z_STAR_MC.Height() != 1 || z_STAR_MR.Height() != 1 ) throw std::logic_error("Expected x and z to be row vectors"); if( A.Height() != A.Width() || A.Height() != x_STAR_MC.Width() || A.Height() != x_STAR_MR.Width() || A.Height() != z_STAR_MC.Width() || A.Height() != z_STAR_MR.Width() ) { std::ostringstream msg; msg << "Nonconformal LocalSymvRowAccumulateU: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " x[* ,MC] ~ " << x_STAR_MC.Height() << " x " << x_STAR_MC.Width() << "\n" << " x[* ,MR] ~ " << x_STAR_MR.Height() << " x " << x_STAR_MR.Width() << "\n" << " z[* ,MC] ~ " << z_STAR_MC.Height() << " x " << z_STAR_MC.Width() << "\n" << " z[* ,MR] ~ " << z_STAR_MR.Height() << " x " << z_STAR_MR.Width() << "\n"; throw std::logic_error( msg.str() ); } if( x_STAR_MC.RowAlignment() != A.ColAlignment() || x_STAR_MR.RowAlignment() != A.RowAlignment() || z_STAR_MC.RowAlignment() != A.ColAlignment() || z_STAR_MR.RowAlignment() != A.RowAlignment() ) throw std::logic_error("Partial matrix distributions are misaligned"); #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<T> A11(g), A12(g); DistMatrix<T> D11(g); DistMatrix<T,STAR,MC> x1_STAR_MC(g); DistMatrix<T,STAR,MR> xL_STAR_MR(g), xR_STAR_MR(g), x0_STAR_MR(g), x1_STAR_MR(g), x2_STAR_MR(g); DistMatrix<T,STAR,MC> z1_STAR_MC(g); DistMatrix<T,STAR,MR> z1_STAR_MR(g), z2_STAR_MR(g); // We want our local gemvs to be of width blocksize, so we will 
// temporarily change to max(r,c) times the current blocksize const int ratio = std::max( g.Height(), g.Width() ); PushBlocksizeStack( ratio*LocalSymvBlocksize<T>() ); LockedPartitionRight( x_STAR_MR, xL_STAR_MR, xR_STAR_MR, 0 ); while( xL_STAR_MR.Width() < x_STAR_MR.Width() ) { LockedRepartitionRight ( xL_STAR_MR, /**/ xR_STAR_MR, x0_STAR_MR, /**/ x1_STAR_MR, x2_STAR_MR ); const int n0 = x0_STAR_MR.Width(); const int n1 = x1_STAR_MR.Width(); const int n2 = x2_STAR_MR.Width(); LockedView( A11, A, n0, n0, n1, n1 ); LockedView( A12, A, n0, n0+n1, n1, n2 ); LockedView( x1_STAR_MC, x_STAR_MC, 0, n0, 1, n1 ); View( z1_STAR_MC, z_STAR_MC, 0, n0, 1, n1 ); View( z1_STAR_MR, z_STAR_MR, 0, n0, 1, n1 ); View( z2_STAR_MR, z_STAR_MR, 0, n0+n1, 1, n2 ); D11.AlignWith( A11 ); //--------------------------------------------------------------------// // TODO: These diagonal block updates can be greatly improved D11 = A11; MakeTrapezoidal( LEFT, UPPER, 0, D11 ); Gemv ( NORMAL, alpha, D11.LockedLocalMatrix(), x1_STAR_MR.LockedLocalMatrix(), T(1), z1_STAR_MC.LocalMatrix() ); MakeTrapezoidal( LEFT, UPPER, 1, D11 ); Gemv ( TRANSPOSE, alpha, D11.LockedLocalMatrix(), x1_STAR_MC.LockedLocalMatrix(), T(1), z1_STAR_MR.LocalMatrix() ); Gemv ( NORMAL, alpha, A12.LockedLocalMatrix(), x2_STAR_MR.LockedLocalMatrix(), T(1), z1_STAR_MC.LocalMatrix() ); Gemv ( TRANSPOSE, alpha, A12.LockedLocalMatrix(), x1_STAR_MC.LockedLocalMatrix(), T(1), z2_STAR_MR.LocalMatrix() ); //--------------------------------------------------------------------// D11.FreeAlignments(); SlideLockedPartitionRight ( xL_STAR_MR, /**/ xR_STAR_MR, x0_STAR_MR, x1_STAR_MR, /**/ x2_STAR_MR ); } PopBlocksizeStack(); #ifndef RELEASE PopCallStack(); #endif }
inline void TrsmLLTSmall ( Orientation orientation, UnitOrNonUnit diag, F alpha, const DistMatrix<F,STAR,VR>& L, DistMatrix<F,VR,STAR>& X, bool checkIfSingular ) { #ifndef RELEASE PushCallStack("internal::TrsmLLTSmall"); if( L.Grid() != X.Grid() ) throw std::logic_error ("L and X must be distributed over the same grid"); if( orientation == NORMAL ) throw std::logic_error("TrsmLLT expects a (Conjugate)Transpose option"); if( L.Height() != L.Width() || L.Height() != X.Height() ) { std::ostringstream msg; msg << "Nonconformal TrsmLLT: \n" << " L ~ " << L.Height() << " x " << L.Width() << "\n" << " X ~ " << X.Height() << " x " << X.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } if( L.RowAlignment() != X.ColAlignment() ) throw std::logic_error("L and X must be aligned"); #endif const Grid& g = L.Grid(); // Matrix views DistMatrix<F,STAR,VR> LTL(g), LTR(g), L00(g), L01(g), L02(g), LBL(g), LBR(g), L10(g), L11(g), L12(g), L20(g), L21(g), L22(g); DistMatrix<F,VR,STAR> XT(g), X0(g), XB(g), X1(g), X2(g); // Temporary distributions DistMatrix<F,STAR,STAR> L11_STAR_STAR(g); DistMatrix<F,STAR,STAR> X1_STAR_STAR(g); // Start the algorithm Scale( alpha, X ); LockedPartitionUpDiagonal ( L, LTL, LTR, LBL, LBR, 0 ); PartitionUp ( X, XT, XB, 0 ); while( XT.Height() > 0 ) { LockedRepartitionUpDiagonal ( LTL, /**/ LTR, L00, L01, /**/ L02, /**/ L10, L11, /**/ L12, /*************/ /******************/ LBL, /**/ LBR, L20, L21, /**/ L22 ); RepartitionUp ( XT, X0, X1, /**/ /**/ XB, X2 ); //--------------------------------------------------------------------// L11_STAR_STAR = L11; // L11[* ,* ] <- L11[* ,VR] X1_STAR_STAR = X1; // X1[* ,* ] <- X1[VR,* ] // X1[* ,* ] := L11^-[T/H][* ,* ] X1[* ,* ] LocalTrsm ( LEFT, LOWER, orientation, diag, F(1), L11_STAR_STAR, X1_STAR_STAR, checkIfSingular ); X1 = X1_STAR_STAR; // X0[VR,* ] -= L10[* ,VR]^(T/H) X1[* ,* ] LocalGemm( orientation, NORMAL, F(-1), L10, X1_STAR_STAR, F(1), X0 ); 
//--------------------------------------------------------------------// SlideLockedPartitionUpDiagonal ( LTL, /**/ LTR, L00, /**/ L01, L02, /*************/ /******************/ /**/ L10, /**/ L11, L12, LBL, /**/ LBR, L20, /**/ L21, L22 ); SlidePartitionUp ( XT, X0, /**/ /**/ X1, XB, X2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline R Row( DistMatrix<R>& chi, DistMatrix<R>& x ) { #ifndef RELEASE PushCallStack("reflector::Row"); if( chi.Grid() != x.Grid() ) throw std::logic_error ("chi and x must be distributed over the same grid"); if( chi.Height() != 1 || chi.Width() != 1 ) throw std::logic_error("chi must be a scalar"); if( x.Height() != 1 ) throw std::logic_error("x must be a row vector"); if( chi.Grid().Row() != chi.ColAlignment() ) throw std::logic_error("Reflecting with incorrect row of processes"); if( x.Grid().Row() != x.ColAlignment() ) throw std::logic_error("Reflecting with incorrect row of processes"); #endif const Grid& grid = x.Grid(); mpi::Comm rowComm = grid.RowComm(); const int gridCol = grid.Col(); const int gridWidth = grid.Width(); const int rowAlignment = chi.RowAlignment(); std::vector<R> localNorms(gridWidth); R localNorm = Nrm2( x.LockedMatrix() ); mpi::AllGather( &localNorm, 1, &localNorms[0], 1, rowComm ); R norm = blas::Nrm2( gridWidth, &localNorms[0], 1 ); if( norm == 0 ) { if( gridCol == rowAlignment ) chi.SetLocal(0,0,-chi.GetLocal(0,0)); #ifndef RELEASE PopCallStack(); #endif return R(2); } R alpha; if( gridCol == rowAlignment ) alpha = chi.GetLocal(0,0); mpi::Broadcast( &alpha, 1, rowAlignment, rowComm ); R beta; if( alpha <= 0 ) beta = lapack::SafeNorm( alpha, norm ); else beta = -lapack::SafeNorm( alpha, norm ); const R one = 1; const R safeMin = lapack::MachineSafeMin<R>(); const R epsilon = lapack::MachineEpsilon<R>(); const R safeInv = safeMin/epsilon; int count = 0; if( Abs(beta) < safeInv ) { R invOfSafeInv = one/safeInv; do { ++count; Scale( invOfSafeInv, x ); alpha *= invOfSafeInv; beta *= invOfSafeInv; } while( Abs(beta) < safeInv ); localNorm = Nrm2( x.LockedMatrix() ); mpi::AllGather( &localNorm, 1, &localNorms[0], 1, rowComm ); norm = blas::Nrm2( gridWidth, &localNorms[0], 1 ); if( alpha <= 0 ) beta = lapack::SafeNorm( alpha, norm ); else beta = -lapack::SafeNorm( alpha, norm ); } R tau = (beta-alpha)/beta; Scale( one/(alpha-beta), x ); for( 
int j=0; j<count; ++j ) beta *= safeInv; if( gridCol == rowAlignment ) chi.SetLocal(0,0,beta); #ifndef RELEASE PopCallStack(); #endif return tau; }
inline void LocalTrmmAccumulateLLT ( Orientation orientation, UnitOrNonUnit diag, T alpha, const DistMatrix<T>& L, const DistMatrix<T,MC,STAR>& X_MC_STAR, DistMatrix<T,MR,STAR>& Z_MR_STAR ) { #ifndef RELEASE PushCallStack("internal::LocalTrmmAccumulateLLT"); if( L.Grid() != X_MC_STAR.Grid() || X_MC_STAR.Grid() != Z_MR_STAR.Grid() ) throw std::logic_error ("{L,X,Z} must be distributed over the same grid"); if( L.Height() != L.Width() || L.Height() != X_MC_STAR.Height() || L.Height() != Z_MR_STAR.Height() ) { std::ostringstream msg; msg << "Nonconformal LocalTrmmAccumulateLLT: " << "\n" << " L ~ " << L.Height() << " x " << L.Width() << "\n" << " X[MC,* ] ~ " << X_MC_STAR.Height() << " x " << X_MC_STAR.Width() << "\n" << " Z[MR,* ] ` " << Z_MR_STAR.Height() << " x " << Z_MR_STAR.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } if( X_MC_STAR.ColAlignment() != L.ColAlignment() || Z_MR_STAR.ColAlignment() != L.RowAlignment() ) throw std::logic_error("Partial matrix distributions are misaligned"); #endif const Grid& g = L.Grid(); // Matrix views DistMatrix<T> LTL(g), LTR(g), L00(g), L01(g), L02(g), LBL(g), LBR(g), L10(g), L11(g), L12(g), L20(g), L21(g), L22(g); DistMatrix<T> D11(g); DistMatrix<T,MC,STAR> XT_MC_STAR(g), X0_MC_STAR(g), XB_MC_STAR(g), X1_MC_STAR(g), X2_MC_STAR(g); DistMatrix<T,MR,STAR> ZT_MR_STAR(g), Z0_MR_STAR(g), ZB_MR_STAR(g), Z1_MR_STAR(g), Z2_MR_STAR(g); const int ratio = std::max( g.Height(), g.Width() ); PushBlocksizeStack( ratio*Blocksize() ); LockedPartitionDownDiagonal ( L, LTL, LTR, LBL, LBR, 0 ); LockedPartitionDown ( X_MC_STAR, XT_MC_STAR, XB_MC_STAR, 0 ); PartitionDown ( Z_MR_STAR, ZT_MR_STAR, ZB_MR_STAR, 0 ); while( LTL.Height() < L.Height() ) { LockedRepartitionDownDiagonal ( LTL, /**/ LTR, L00, /**/ L01, L02, /*************/ /******************/ /**/ L10, /**/ L11, L12, LBL, /**/ LBR, L20, /**/ L21, L22 ); LockedRepartitionDown ( XT_MC_STAR, X0_MC_STAR, /**********/ /**********/ X1_MC_STAR, XB_MC_STAR, X2_MC_STAR ); 
RepartitionDown ( ZT_MR_STAR, Z0_MR_STAR, /**********/ /**********/ Z1_MR_STAR, ZB_MR_STAR, Z2_MR_STAR ); D11.AlignWith( L11 ); //--------------------------------------------------------------------// D11 = L11; MakeTrapezoidal( LEFT, LOWER, 0, D11 ); if( diag == UNIT ) SetDiagonalToOne( D11 ); LocalGemm ( orientation, NORMAL, alpha, D11, X1_MC_STAR, T(1), Z1_MR_STAR ); LocalGemm ( orientation, NORMAL, alpha, L21, X2_MC_STAR, T(1), Z1_MR_STAR ); //--------------------------------------------------------------------// D11.FreeAlignments(); SlideLockedPartitionDownDiagonal ( LTL, /**/ LTR, L00, L01, /**/ L02, /**/ L10, L11, /**/ L12, /*************/ /******************/ LBL, /**/ LBR, L20, L21, /**/ L22 ); SlideLockedPartitionDown ( XT_MC_STAR, X0_MC_STAR, X1_MC_STAR, /**********/ /**********/ XB_MC_STAR, X2_MC_STAR ); SlidePartitionDown ( ZT_MR_STAR, Z0_MR_STAR, Z1_MR_STAR, /**********/ /**********/ ZB_MR_STAR, Z2_MR_STAR ); } PopBlocksizeStack(); #ifndef RELEASE PopCallStack(); #endif }
inline void LocalSymmetricAccumulateRU ( Orientation orientation, T alpha, const DistMatrix<T,MC, MR >& A, const DistMatrix<T,STAR,MC >& B_STAR_MC, const DistMatrix<T,MR, STAR>& BTrans_MR_STAR, DistMatrix<T,MC, STAR>& ZTrans_MC_STAR, DistMatrix<T,MR, STAR>& ZTrans_MR_STAR ) { #ifndef RELEASE PushCallStack("internal::LocalSymmetricAccumulateRU"); if( A.Grid() != B_STAR_MC.Grid() || B_STAR_MC.Grid() != BTrans_MR_STAR.Grid() || BTrans_MR_STAR.Grid() != ZTrans_MC_STAR.Grid() || ZTrans_MC_STAR.Grid() != ZTrans_MR_STAR.Grid() ) throw std::logic_error ("{A,B,C} must be distributed over the same grid"); if( A.Height() != A.Width() || A.Height() != B_STAR_MC.Width() || A.Height() != BTrans_MR_STAR.Height() || A.Height() != ZTrans_MC_STAR.Height() || A.Height() != ZTrans_MR_STAR.Height() || B_STAR_MC.Height() != BTrans_MR_STAR.Width() || BTrans_MR_STAR.Width() != ZTrans_MC_STAR.Width() || ZTrans_MC_STAR.Width() != ZTrans_MR_STAR.Width() ) { std::ostringstream msg; msg << "Nonconformal LocalSymmetricAccumulateRU: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " B[* ,MC] ~ " << B_STAR_MC.Height() << " x " << B_STAR_MC.Width() << "\n" << " B^H/T[MR,* ] ~ " << BTrans_MR_STAR.Height() << " x " << BTrans_MR_STAR.Width() << "\n" << " Z^H/T[MC,* ] ~ " << ZTrans_MC_STAR.Height() << " x " << ZTrans_MC_STAR.Width() << "\n" << " Z^H/T[MR,* ] ~ " << ZTrans_MR_STAR.Height() << " x " << ZTrans_MR_STAR.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } if( B_STAR_MC.RowAlignment() != A.ColAlignment() || BTrans_MR_STAR.ColAlignment() != A.RowAlignment() || ZTrans_MC_STAR.ColAlignment() != A.ColAlignment() || ZTrans_MR_STAR.ColAlignment() != A.RowAlignment() ) throw std::logic_error("Partial matrix distributions are misaligned"); #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<T> ATL(g), ATR(g), A00(g), A01(g), A02(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); DistMatrix<T> D11(g); DistMatrix<T,STAR,MC> BL_STAR_MC(g), 
BR_STAR_MC(g), B0_STAR_MC(g), B1_STAR_MC(g), B2_STAR_MC(g); DistMatrix<T,MR,STAR> BTTrans_MR_STAR(g), B0Trans_MR_STAR(g), BBTrans_MR_STAR(g), B1Trans_MR_STAR(g), B2Trans_MR_STAR(g); DistMatrix<T,MC,STAR> ZTTrans_MC_STAR(g), Z0Trans_MC_STAR(g), ZBTrans_MC_STAR(g), Z1Trans_MC_STAR(g), Z2Trans_MC_STAR(g); DistMatrix<T,MR,STAR> ZBTrans_MR_STAR(g), Z0Trans_MR_STAR(g), ZTTrans_MR_STAR(g), Z1Trans_MR_STAR(g), Z2Trans_MR_STAR(g); const int ratio = std::max( g.Height(), g.Width() ); PushBlocksizeStack( ratio*Blocksize() ); LockedPartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionRight( B_STAR_MC, BL_STAR_MC, BR_STAR_MC, 0 ); LockedPartitionDown ( BTrans_MR_STAR, BTTrans_MR_STAR, BBTrans_MR_STAR, 0 ); PartitionDown ( ZTrans_MC_STAR, ZTTrans_MC_STAR, ZBTrans_MC_STAR, 0 ); PartitionDown ( ZTrans_MR_STAR, ZTTrans_MR_STAR, ZBTrans_MR_STAR, 0 ); while( ATL.Height() < A.Height() ) { LockedRepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionRight ( BL_STAR_MC, /**/ BR_STAR_MC, B0_STAR_MC, /**/ B1_STAR_MC, B2_STAR_MC ); LockedRepartitionDown ( BTTrans_MR_STAR, B0Trans_MR_STAR, /***************/ /***************/ B1Trans_MR_STAR, BBTrans_MR_STAR, B2Trans_MR_STAR ); RepartitionDown ( ZTTrans_MC_STAR, Z0Trans_MC_STAR, /***************/ /***************/ Z1Trans_MC_STAR, ZBTrans_MC_STAR, Z2Trans_MC_STAR ); RepartitionDown ( ZTTrans_MR_STAR, Z0Trans_MR_STAR, /***************/ /***************/ Z1Trans_MR_STAR, ZBTrans_MR_STAR, Z2Trans_MR_STAR ); D11.AlignWith( A11 ); //--------------------------------------------------------------------// D11 = A11; MakeTriangular( UPPER, D11 ); LocalGemm ( orientation, orientation, alpha, D11, B1_STAR_MC, T(1), Z1Trans_MR_STAR ); SetDiagonal( D11, T(0) ); LocalGemm ( NORMAL, NORMAL, alpha, D11, B1Trans_MR_STAR, T(1), Z1Trans_MC_STAR ); LocalGemm ( orientation, orientation, alpha, A12, B1_STAR_MC, T(1), 
Z2Trans_MR_STAR ); LocalGemm ( NORMAL, NORMAL, alpha, A12, B2Trans_MR_STAR, T(1), Z1Trans_MC_STAR ); //--------------------------------------------------------------------// D11.FreeAlignments(); SlideLockedPartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlideLockedPartitionRight ( BL_STAR_MC, /**/ BR_STAR_MC, B0_STAR_MC, B1_STAR_MC, /**/ B2_STAR_MC ); SlideLockedPartitionDown ( BTTrans_MR_STAR, B0Trans_MR_STAR, B1Trans_MR_STAR, /***************/ /***************/ BBTrans_MR_STAR, B2Trans_MR_STAR ); SlidePartitionDown ( ZTTrans_MC_STAR, Z0Trans_MC_STAR, Z1Trans_MC_STAR, /***************/ /***************/ ZBTrans_MC_STAR, Z2Trans_MC_STAR ); SlidePartitionDown ( ZTTrans_MR_STAR, Z0Trans_MR_STAR, Z1Trans_MR_STAR, /***************/ /***************/ ZBTrans_MR_STAR, Z2Trans_MR_STAR ); } PopBlocksizeStack(); #ifndef RELEASE PopCallStack(); #endif }