// Upper-triangular, untransposed update that keeps C stationary.
//
// The columns of A are visited in panels of at most Blocksize() columns.
// Each panel is redistributed to [MC,STAR] and [VR,STAR], its transpose
// (conjugated when `conjugate` is set) is formed in [STAR,MR], and the pair
// is handed to LocalTrrk( UPPER, ... ) together with C.
//
//   alpha     - scalar forwarded to LocalTrrk
//   APre      - input matrix, read through an [MC,MR] proxy
//   CPre      - matrix updated in place, accessed through an [MC,MR] proxy
//   conjugate - forwarded to Transpose when building the [STAR,MR] panel
//
// NOTE(review): T(1) is passed as the scalar preceding C, presumably so the
// existing entries of C are kept and the panel product is added -- confirm
// against LocalTrrk's contract.
void UN_C
( T alpha,
  const AbstractDistMatrix<T>& APre,
        AbstractDistMatrix<T>& CPre,
  bool conjugate=false )
{
    EL_DEBUG_CSE
    const Grid& grid = APre.Grid();
    const Int numCols = APre.Width();
    const Int panelMax = Blocksize();

    DistMatrixReadProxy<T,T,MC,MR> AProx( APre );
    DistMatrixReadWriteProxy<T,T,MC,MR> CProx( CPre );
    auto& srcA = AProx.GetLocked();
    auto& dstC = CProx.Get();

    // Work buffers, each aligned with C
    DistMatrix<T,MC,STAR> panel_MC_STAR(grid);
    panel_MC_STAR.AlignWith( dstC );
    DistMatrix<T,VR,STAR> panel_VR_STAR(grid);
    panel_VR_STAR.AlignWith( dstC );
    DistMatrix<T,STAR,MR> panelTrans_STAR_MR(grid);
    panelTrans_STAR_MR.AlignWith( dstC );

    for( Int colBeg=0; colBeg<numCols; colBeg+=panelMax )
    {
        const Int panelWidth = Min(panelMax,numCols-colBeg);
        const auto panelCols = IR(colBeg,colBeg+panelWidth);
        auto panel = srcA( ALL, panelCols );

        // [MC,MR] -> [MC,STAR] -> [VR,STAR] -> transposed [STAR,MR]
        panel_MC_STAR = panel;
        panel_VR_STAR = panel_MC_STAR;
        Transpose( panel_VR_STAR, panelTrans_STAR_MR, conjugate );

        LocalTrrk
        ( UPPER, alpha, panel_MC_STAR, panelTrans_STAR_MR, T(1), dstC );
    }
}
void LT_Dot ( T alpha, const AbstractDistMatrix<T>& APre, AbstractDistMatrix<T>& CPre, const bool conjugate, Int blockSize=2000 ) { EL_DEBUG_CSE const Int n = CPre.Height(); const Grid& g = APre.Grid(); const Orientation orient = ( conjugate ? ADJOINT : TRANSPOSE ); DistMatrixReadProxy<T,T,VC,STAR> AProx( APre ); auto& A = AProx.GetLocked(); DistMatrixReadWriteProxy<T,T,MC,MR> CProx( CPre ); auto& C = CProx.Get(); DistMatrix<T,STAR,STAR> Z( blockSize, blockSize, g ); Zero( Z ); for( Int kOuter=0; kOuter<n; kOuter+=blockSize ) { const Int nbOuter = Min(blockSize,n-kOuter); const Range<Int> indOuter( kOuter, kOuter+nbOuter ); auto A1 = A( ALL, indOuter ); auto C11 = C( indOuter, indOuter ); Z.Resize( nbOuter, nbOuter ); Syrk( LOWER, TRANSPOSE, alpha, A1.Matrix(), Z.Matrix(), conjugate ); AxpyContract( T(1), Z, C11 ); for( Int kInner=kOuter+nbOuter; kInner<n; kInner+=blockSize ) { const Int nbInner = Min(blockSize,n-kInner); const Range<Int> indInner( kInner, kInner+nbInner ); auto A2 = A( ALL, indInner ); auto C21 = C( indInner, indOuter ); LocalGemm( orient, NORMAL, alpha, A1, A2, Z ); AxpyContract( T(1), Z, C21 ); } } }
void SUMMA_NTDot ( Orientation orientB, T alpha, const AbstractDistMatrix<T>& APre, const AbstractDistMatrix<T>& BPre, AbstractDistMatrix<T>& CPre, Int blockSize=2000 ) { EL_DEBUG_CSE const Int m = CPre.Height(); const Int n = CPre.Width(); const Grid& g = APre.Grid(); DistMatrixReadProxy<T,T,STAR,VC> AProx( APre ); auto& A = AProx.GetLocked(); ElementalProxyCtrl BCtrl; BCtrl.rowConstrain = true; BCtrl.rowAlign = A.RowAlign(); DistMatrixReadProxy<T,T,STAR,VC> BProx( BPre, BCtrl ); auto& B = BProx.GetLocked(); DistMatrixReadWriteProxy<T,T,MC,MR> CProx( CPre ); auto& C = CProx.Get(); DistMatrix<T,STAR,STAR> C11_STAR_STAR(g); for( Int kOuter=0; kOuter<m; kOuter+=blockSize ) { const Int nbOuter = Min(blockSize,m-kOuter); const Range<Int> indOuter( kOuter, kOuter+nbOuter ); auto A1 = A( indOuter, ALL ); for( Int kInner=0; kInner<n; kInner+=blockSize ) { const Int nbInner = Min(blockSize,n-kInner); const Range<Int> indInner( kInner, kInner+nbInner ); auto B1 = B( indInner, ALL ); auto C11 = C( indOuter, indInner ); LocalGemm( NORMAL, orientB, alpha, A1, B1, C11_STAR_STAR ); AxpyContract( T(1), C11_STAR_STAR, C11 ); } } }
void SUMMA_NTC ( Orientation orientB, T alpha, const AbstractDistMatrix<T>& APre, const AbstractDistMatrix<T>& BPre, AbstractDistMatrix<T>& CPre ) { EL_DEBUG_CSE const Int sumDim = APre.Width(); const Int bsize = Blocksize(); const Grid& g = APre.Grid(); const bool conjugate = ( orientB == ADJOINT ); DistMatrixReadProxy<T,T,MC,MR> AProx( APre ); DistMatrixReadProxy<T,T,MC,MR> BProx( BPre ); DistMatrixReadWriteProxy<T,T,MC,MR> CProx( CPre ); auto& A = AProx.GetLocked(); auto& B = BProx.GetLocked(); auto& C = CProx.Get(); // Temporary distributions DistMatrix<T,MC,STAR> A1_MC_STAR(g); DistMatrix<T,VR,STAR> B1_VR_STAR(g); DistMatrix<T,STAR,MR> B1Trans_STAR_MR(g); A1_MC_STAR.AlignWith( C ); B1_VR_STAR.AlignWith( C ); B1Trans_STAR_MR.AlignWith( C ); for( Int k=0; k<sumDim; k+=bsize ) { const Int nb = Min(bsize,sumDim-k); auto A1 = A( ALL, IR(k,k+nb) ); auto B1 = B( ALL, IR(k,k+nb) ); A1_MC_STAR = A1; B1_VR_STAR = B1; Transpose( B1_VR_STAR, B1Trans_STAR_MR, conjugate ); // C[MC,MR] += alpha A1[MC,*] (B1[MR,*])^T LocalGemm ( NORMAL, NORMAL, alpha, A1_MC_STAR, B1Trans_STAR_MR, T(1), C ); } }
void L1DistanceMatrix(direction_t dirA, direction_t dirB, T alpha, const El::ElementalMatrix<T> &APre, const El::ElementalMatrix<T> &BPre, T beta, El::ElementalMatrix<T> &CPre) { if (dirA == base::COLUMNS && dirB == base::COLUMNS) { // Use a SUMMA-like routine, with C as stationary // Basically an adaptation of Elementals TN case for stationary C. const El::Int m = CPre.Height(); const El::Int n = CPre.Width(); const El::Int sumDim = BPre.Height(); const El::Int bsize = El::Blocksize(); const El::Grid& g = APre.Grid(); El::DistMatrixReadProxy<T, T, El::MC, El::MR> AProx(APre); El::DistMatrixReadProxy<T, T, El::MC, El::MR> BProx(BPre); El::DistMatrixReadWriteProxy<T, T, El::MC, El::MR> CProx(CPre); auto& A = AProx.GetLocked(); auto& B = BProx.GetLocked(); auto& C = CProx.Get(); // Temporary distributions El::DistMatrix<T, El::STAR, El::MC> A1_STAR_MC(g); El::DistMatrix<T, El::STAR, El::MR> B1_STAR_MR(g); A1_STAR_MC.AlignWith(C); B1_STAR_MR.AlignWith(C); El::Scale(beta, C); for(El::Int k = 0; k < sumDim; k += bsize) { const El::Int nb = std::min(bsize,sumDim-k); auto A1 = A(El::IR(k,k+nb), El::IR(0,m)); auto B1 = B(El::IR(k,k+nb), El::IR(0,n)); A1_STAR_MC = A1; B1_STAR_MR = B1; L1DistanceMatrix(base::COLUMNS, base::COLUMNS, alpha, A1_STAR_MC.LockedMatrix(), B1_STAR_MR.LockedMatrix(), T(1.0), C.Matrix()); } } // TODO the rest of the cases. }
void SUMMA_NTB ( Orientation orientB, T alpha, const AbstractDistMatrix<T>& APre, const AbstractDistMatrix<T>& BPre, AbstractDistMatrix<T>& CPre ) { EL_DEBUG_CSE const Int m = CPre.Height(); const Int bsize = Blocksize(); const Grid& g = APre.Grid(); DistMatrixReadProxy<T,T,MC,MR> AProx( APre ); DistMatrixReadProxy<T,T,MC,MR> BProx( BPre ); DistMatrixReadWriteProxy<T,T,MC,MR> CProx( CPre ); auto& A = AProx.GetLocked(); auto& B = BProx.GetLocked(); auto& C = CProx.Get(); // Temporary distributions DistMatrix<T,MR,STAR> A1Trans_MR_STAR(g); DistMatrix<T,STAR,MC> D1_STAR_MC(g); DistMatrix<T,MR,MC> D1_MR_MC(g); A1Trans_MR_STAR.AlignWith( B ); D1_STAR_MC.AlignWith( B ); for( Int k=0; k<m; k+=bsize ) { const Int nb = Min(bsize,m-k); auto A1 = A( IR(k,k+nb), ALL ); auto C1 = C( IR(k,k+nb), ALL ); // D1[*,MC] := alpha A1[*,MR] (B[MC,MR])^T // = alpha (A1^T)[MR,*] (B^T)[MR,MC] Transpose( A1, A1Trans_MR_STAR ); LocalGemm( TRANSPOSE, orientB, alpha, A1Trans_MR_STAR, B, D1_STAR_MC ); // C1[MC,MR] += scattered & transposed D1[*,MC] summed over grid rows Contract( D1_STAR_MC, D1_MR_MC ); Axpy( T(1), D1_MR_MC, C1 ); } }
void SUMMA_TNA ( Orientation orientA, T alpha, const AbstractDistMatrix<T>& APre, const AbstractDistMatrix<T>& BPre, AbstractDistMatrix<T>& CPre ) { DEBUG_CSE const Int n = CPre.Width(); const Int bsize = Blocksize(); const Grid& g = APre.Grid(); DistMatrixReadProxy<T,T,MC,MR> AProx( APre ); DistMatrixReadProxy<T,T,MC,MR> BProx( BPre ); DistMatrixReadWriteProxy<T,T,MC,MR> CProx( CPre ); auto& A = AProx.GetLocked(); auto& B = BProx.GetLocked(); auto& C = CProx.Get(); // Temporary distributions DistMatrix<T,MC,STAR> B1_MC_STAR(g); DistMatrix<T,MR,STAR> D1_MR_STAR(g); DistMatrix<T,MR,MC > D1_MR_MC(g); B1_MC_STAR.AlignWith( A ); D1_MR_STAR.AlignWith( A ); for( Int k=0; k<n; k+=bsize ) { const Int nb = Min(bsize,n-k); auto B1 = B( ALL, IR(k,k+nb) ); auto C1 = C( ALL, IR(k,k+nb) ); // D1[MR,*] := alpha (A1[MC,MR])^T B1[MC,*] // = alpha (A1^T)[MR,MC] B1[MC,*] B1_MC_STAR = B1; LocalGemm( orientA, NORMAL, alpha, A, B1_MC_STAR, D1_MR_STAR ); // C1[MC,MR] += scattered & transposed D1[MR,*] summed over grid cols Contract( D1_MR_STAR, D1_MR_MC ); Axpy( T(1), D1_MR_MC, C1 ); } }
void SUMMA_NTA ( Orientation orientB, T alpha, const AbstractDistMatrix<T>& APre, const AbstractDistMatrix<T>& BPre, AbstractDistMatrix<T>& CPre ) { EL_DEBUG_CSE const Int n = CPre.Width(); const Int bsize = Blocksize(); const Grid& g = APre.Grid(); const bool conjugate = ( orientB == ADJOINT ); DistMatrixReadProxy<T,T,MC,MR> AProx( APre ); DistMatrixReadProxy<T,T,MC,MR> BProx( BPre ); DistMatrixReadWriteProxy<T,T,MC,MR> CProx( CPre ); auto& A = AProx.GetLocked(); auto& B = BProx.GetLocked(); auto& C = CProx.Get(); // Temporary distributions DistMatrix<T,MR,STAR> B1Trans_MR_STAR(g); DistMatrix<T,MC,STAR> D1_MC_STAR(g); B1Trans_MR_STAR.AlignWith( A ); D1_MC_STAR.AlignWith( A ); for( Int k=0; k<n; k+=bsize ) { const Int nb = Min(bsize,n-k); auto B1 = B( IR(k,k+nb), ALL ); auto C1 = C( ALL, IR(k,k+nb) ); // C1[MC,*] := alpha A[MC,MR] (B1^[T/H])[MR,*] Transpose( B1, B1Trans_MR_STAR, conjugate ); LocalGemm( NORMAL, NORMAL, alpha, A, B1Trans_MR_STAR, D1_MC_STAR ); // C1[MC,MR] += scattered result of D1[MC,*] summed over grid rows AxpyContract( T(1), D1_MC_STAR, C1 ); } }
void SUMMA_TNB ( Orientation orientA, T alpha, const AbstractDistMatrix<T>& APre, const AbstractDistMatrix<T>& BPre, AbstractDistMatrix<T>& CPre ) { DEBUG_CSE const Int m = CPre.Height(); const Int bsize = Blocksize(); const Grid& g = APre.Grid(); const bool conjugate = ( orientA == ADJOINT ); DistMatrixReadProxy<T,T,MC,MR> AProx( APre ); DistMatrixReadProxy<T,T,MC,MR> BProx( BPre ); DistMatrixReadWriteProxy<T,T,MC,MR> CProx( CPre ); auto& A = AProx.GetLocked(); auto& B = BProx.GetLocked(); auto& C = CProx.Get(); // Temporary distributions DistMatrix<T,MC,STAR> A1_MC_STAR(g); DistMatrix<T,MR,STAR> D1Trans_MR_STAR(g); A1_MC_STAR.AlignWith( B ); D1Trans_MR_STAR.AlignWith( B ); for( Int k=0; k<m; k+=bsize ) { const Int nb = Min(bsize,m-k); auto A1 = A( ALL, IR(k,k+nb) ); auto C1 = C( IR(k,k+nb), ALL ); // D1[*,MR] := alpha (A1[MC,*])^[T/H] B[MC,MR] // = alpha (A1^[T/H])[*,MC] B[MC,MR] A1_MC_STAR = A1; // A1[MC,*] <- A1[MC,MR] LocalGemm( orientA, NORMAL, T(1), B, A1_MC_STAR, D1Trans_MR_STAR ); TransposeAxpyContract( alpha, D1Trans_MR_STAR, C1, conjugate ); } }
void SUMMA_TNC ( Orientation orientA, T alpha, const AbstractDistMatrix<T>& APre, const AbstractDistMatrix<T>& BPre, AbstractDistMatrix<T>& CPre ) { DEBUG_CSE const Int sumDim = BPre.Height(); const Int bsize = Blocksize(); const Grid& g = APre.Grid(); DistMatrixReadProxy<T,T,MC,MR> AProx( APre ); DistMatrixReadProxy<T,T,MC,MR> BProx( BPre ); DistMatrixReadWriteProxy<T,T,MC,MR> CProx( CPre ); auto& A = AProx.GetLocked(); auto& B = BProx.GetLocked(); auto& C = CProx.Get(); // Temporary distributions DistMatrix<T,STAR,MC> A1_STAR_MC(g); DistMatrix<T,MR,STAR> B1Trans_MR_STAR(g); A1_STAR_MC.AlignWith( C ); B1Trans_MR_STAR.AlignWith( C ); for( Int k=0; k<sumDim; k+=bsize ) { const Int nb = Min(bsize,sumDim-k); auto A1 = A( IR(k,k+nb), ALL ); auto B1 = B( IR(k,k+nb), ALL ); // C[MC,MR] += alpha (A1[*,MC])^T B1[*,MR] // = alpha (A1^T)[MC,*] B1[*,MR] A1_STAR_MC = A1; Transpose( B1, B1Trans_MR_STAR ); LocalGemm ( orientA, TRANSPOSE, alpha, A1_STAR_MC, B1Trans_MR_STAR, T(1), C ); } }
// Lower-triangular, transposed update that keeps C stationary.
//
// The rows of A are visited in panels of at most Blocksize() rows. Each
// panel is transposed into [MR,STAR], transposed back into [STAR,VR], and
// copied to [STAR,MC]; the [STAR,MC] and [MR,STAR] copies are then handed to
// LocalTrrk( LOWER, orientation, TRANSPOSE, ... ) together with C, where
// `orientation` is ADJOINT when `conjugate` is set and TRANSPOSE otherwise.
//
//   alpha     - scalar forwarded to LocalTrrk
//   APre      - input matrix, read through an [MC,MR] proxy
//   CPre      - matrix updated in place through an [MC,MR] proxy
//   conjugate - selects ADJOINT over TRANSPOSE for the first LocalTrrk operand
//
// NOTE(review): T(1) is passed as the scalar preceding C, presumably so the
// existing entries of C are kept and the panel product is added -- confirm
// against LocalTrrk's contract.
void LT_C
( T alpha,
  const AbstractDistMatrix<T>& APre,
        AbstractDistMatrix<T>& CPre,
  bool conjugate=false )
{
    EL_DEBUG_CSE
    const Grid& grid = APre.Grid();
    const Int numRows = APre.Height();
    const Int panelMax = Blocksize();
    const Orientation firstOrient = ( conjugate ? ADJOINT : TRANSPOSE );

    DistMatrixReadProxy<T,T,MC,MR> AProx( APre );
    DistMatrixReadWriteProxy<T,T,MC,MR> CProx( CPre );
    auto& srcA = AProx.GetLocked();
    auto& dstC = CProx.Get();

    // Work buffers; the [MR,STAR] and [STAR,MC] ones are aligned with C
    DistMatrix<T,MR,STAR> panelTrans_MR_STAR(grid);
    panelTrans_MR_STAR.AlignWith( dstC );
    DistMatrix<T,STAR,VR> panel_STAR_VR(grid);
    DistMatrix<T,STAR,MC> panel_STAR_MC(grid);
    panel_STAR_MC.AlignWith( dstC );

    for( Int rowBeg=0; rowBeg<numRows; rowBeg+=panelMax )
    {
        const Int panelHeight = Min(panelMax,numRows-rowBeg);
        const auto panelRows = IR(rowBeg,rowBeg+panelHeight);
        auto panel = srcA( panelRows, ALL );

        // [MC,MR] -> transposed [MR,STAR] -> [STAR,VR] -> [STAR,MC]
        Transpose( panel, panelTrans_MR_STAR );
        Transpose( panelTrans_MR_STAR, panel_STAR_VR );
        panel_STAR_MC = panel_STAR_VR;

        LocalTrrk
        ( LOWER, firstOrient, TRANSPOSE,
          alpha, panel_STAR_MC, panelTrans_MR_STAR, T(1), dstC );
    }
}
void SymmetricL1DistanceMatrix(El::UpperOrLower uplo, direction_t dir, T alpha, const El::ElementalMatrix<T> &APre, T beta, El::ElementalMatrix<T> &CPre) { if (dir == base::COLUMNS) { const El::Int r = APre.Height(); const El::Int bsize = El::Blocksize(); const El::Grid& g = APre.Grid(); El::DistMatrixReadProxy<T, T, El::MC, El::MR> AProx(APre); El::DistMatrixReadWriteProxy<T, T, El::MC, El::MR> CProx(CPre); auto& A = AProx.GetLocked(); auto& C = CProx.Get(); // Temporary distributions El::DistMatrix<T, El::STAR, El::MR> A1_STAR_MR(g); El::DistMatrix<T, El::STAR, El::MC> A1_STAR_MC(g); A1_STAR_MC.AlignWith(C); A1_STAR_MR.AlignWith(C); El::ScaleTrapezoid(beta, uplo, C); for(El::Int k = 0; k < r; k += bsize) { const El::Int nb = std::min(bsize, r - k); auto A1 = A(El::IR(k, k + nb), El::ALL); A1_STAR_MC = A1; A1_STAR_MR = A1; internal::L1DistanceMatrixTU(uplo, base::COLUMNS, base::COLUMNS, alpha, A1_STAR_MC, A1_STAR_MR, T(1.0), C); } } // TODO the rest of the cases. }