/// \brief Gather each process' R factor onto Proc 0, stacked by rank.
///
/// On Proc 0, R_stack (at least (nprocs*ncols) x ncols, column-major with
/// leading dimension R_stack.lda()) receives one ncols x ncols block per
/// process: Proc 0's own R_local on top, then each other process' R factor
/// in rank order.  On every other process, R_local is sent to Proc 0 and
/// R_stack is not read or written.  All processes synchronize at the end.
///
/// \param R_stack [out] Proc 0 only: the stacked R factors.
/// \param R_local [in] This process' local R factor.
/// \param messenger [in/out] Wrapper around interprocess communication.
void gatherStack (MatrixViewType& R_stack,
                  ConstMatrixViewType& R_local,
                  const Teuchos::RCP<MessengerBase<typename MatrixViewType::scalar_type> >& messenger)
{
  typedef typename MatrixViewType::ordinal_type ordinal_type;
  typedef typename MatrixViewType::scalar_type scalar_type;
  typedef MatView<ordinal_type, scalar_type> mat_view_type;

  const int nprocs = messenger->size();
  const int my_rank = messenger->rank();

  if (my_rank == 0) {
    const ordinal_type ncols = R_stack.ncols();

    // Copy data from R_local into top ncols x ncols block of R_stack.
    mat_view_type R_stack_view_first (ncols, ncols, R_stack.get(), R_stack.lda());
    deep_copy (R_stack_view_first, R_local);

    // Loop through all other processors, fetching their matrix data.
    RMessenger< ordinal_type, scalar_type > receiver (messenger);
    for (int srcProc = 1; srcProc < nprocs; ++srcProc) {
      // FIX: the pointee must be non-const.  R_stack_view_cur is a
      // writable view (we fill() it and recv() into it below), and
      // mat_view_type wraps a mutable scalar_type*; a pointer-to-const
      // does not convert to that.
      scalar_type* const R_ptr = R_stack.get() + srcProc*ncols;
      mat_view_type R_stack_view_cur (ncols, ncols, R_ptr, R_stack.lda());
      // Fill (the lower triangle) with zeros, since
      // RMessenger::recv() only writes to the upper triangle.
      R_stack_view_cur.fill (scalar_type (0));
      receiver.recv (R_stack_view_cur, srcProc);
    }
  } else {
    // We only read R_stack on Proc 0, not on this proc.
    // Send data from R_local to Proc 0.
    const int destProc = 0;
    RMessenger<ordinal_type, scalar_type> sender (messenger);
    sender.send (R_local, destProc);
  }
  messenger->barrier ();
}
Matrix (const MatrixViewType& in) : nrows_ (in.nrows()), ncols_ (in.ncols()), A_ (verified_alloc_size (in.nrows(), in.ncols())) { if (A_.size() != 0) copy_matrix (nrows(), ncols(), get(), lda(), in.get(), in.lda()); }
/// \brief Copy the packed upper triangle in buffer_ into R.
///
/// buffer_ stores R's upper triangle column by column: column j
/// contributes its first j+1 entries, laid out contiguously.  Only the
/// upper triangle of R is written.
void unpack (MatrixViewType& R)
{
  typedef typename MatrixViewType::ordinal_type view_ordinal_type;

  const view_ordinal_type numCols = R.ncols();
  typename std::vector< Scalar >::const_iterator src = buffer_.begin();
  for (view_ordinal_type col = 0; col < numCols; ++col) {
    // Column 'col' of the upper triangle holds col+1 entries.
    const view_ordinal_type colLength = col + 1;
    std::copy (src, src + colLength, &R(0,col));
    src += colLength;
  }
}
/// \brief Set every entry of A to zero, one cache block at a time.
///
/// Note: if the cache blocks are stored contiguously, A.lda() is not the
/// true leading dimension of A, but that does not matter here: we only
/// ever touch the per-block views, and split_top_block() sets each
/// block's leading dimension correctly.
void fill_with_zeros (MatrixViewType A, const bool contiguous_cache_blocks) const
{
  // A is passed by value, so consuming the view block by block below
  // leaves the caller's view untouched.
  while (true) {
    if (A.empty())
      break;
    // split_top_block() shrinks A and returns the block it peeled off.
    MatrixViewType topBlock = split_top_block (A, contiguous_cache_blocks);
    topBlock.fill (Scalar(0));
  }
}
/// \brief View-based overload: forward to the raw-pointer overload,
///   extracting the dimensions and stride from the matrix view Q.
void implicit_Q (MatrixViewType& Q, typename MatrixViewType::scalar_type tau[])
{
  typedef typename MatrixViewType::ordinal_type ordinal_type;
  const ordinal_type numRows = Q.nrows();
  const ordinal_type numCols = Q.ncols();
  implicit_Q (numRows, numCols, Q.get(), Q.lda(), tau);
}
static void printMatrix (std::ostream& out, const MatrixViewType& A) { print_local_matrix (out, A.nrows(), A.ncols(), A.get(), A.lda()); }
/// \brief Fill this process' block of a distributed random test matrix.
///
/// Collectively (over all processes reachable through the messengers)
/// builds a global matrix A = Q*R with the given singular values, where
/// R is a random upper triangular factor (identical on all processes,
/// generated on Proc 0 and broadcast) and each process' block of Q is a
/// random orthogonal matrix generated on Proc 0 and sent to its owner.
/// On return, A_local holds this process' nrowsLocal x ncols block.
///
/// \param pGenerator [in/out] Pseudorandom number generator (used only
///   on Proc 0, where all random data is generated).
/// \param A_local [out] This process' block of the global matrix.
/// \param singular_values [in] Desired singular values of the global
///   matrix; passed to fill_random_R on Proc 0.
/// \param ordinalMessenger [in/out] Communicator for ordinal data
///   (row counts).
/// \param scalarMessenger [in/out] Communicator for scalar (matrix) data.
void randomGlobalMatrix (Generator* const pGenerator,
                         MatrixViewType& A_local,
                         const typename Teuchos::ScalarTraits< typename MatrixViewType::scalar_type >::magnitudeType singular_values[],
                         MessengerBase< typename MatrixViewType::ordinal_type >* const ordinalMessenger,
                         MessengerBase< typename MatrixViewType::scalar_type >* const scalarMessenger)
{
  using Teuchos::NO_TRANS;
  using std::vector;
  typedef typename MatrixViewType::ordinal_type ordinal_type;
  typedef typename MatrixViewType::scalar_type scalar_type;

  const bool b_local_debug = false;

  const int rootProc = 0;
  const int nprocs = ordinalMessenger->size();
  const int myRank = ordinalMessenger->rank();
  Teuchos::BLAS<ordinal_type, scalar_type> blas;

  const ordinal_type nrowsLocal = A_local.nrows();
  const ordinal_type ncols = A_local.ncols();

  // Theory: Suppose there are P processors.  Proc q wants an m_q by n
  // component of the matrix A, which we write as A_q.  On Proc 0, we
  // generate random m_q by n orthogonal matrices Q_q (in explicit
  // form), and send Q_q to Proc q.  The m by n matrix [Q_0; Q_1; ...;
  // Q_{P-1}] is not itself orthogonal.  However, the m by n matrix
  // Q = [Q_0 / P; Q_1 / P; ...; Q_{P-1} / P] is orthogonal:
  //
  // \sum_{q = 0}^{P-1} (Q_q^T * Q_q) / P = I.
  if (myRank == rootProc) {
    typedef Random::MatrixGenerator< ordinal_type, scalar_type, Generator > matgen_type;
    matgen_type matGen (*pGenerator);

    // Generate a random ncols by ncols upper triangular matrix
    // R with the given singular values.
    Matrix< ordinal_type, scalar_type > R (ncols, ncols, scalar_type(0));
    matGen.fill_random_R (ncols, R.get(), R.lda(), singular_values);

    // Broadcast R to all the processors.
    scalarMessenger->broadcast (R.get(), ncols*ncols, rootProc);

    // Generate (for myself) a random nrowsLocal x ncols
    // orthogonal matrix, stored in explicit form.
    Matrix< ordinal_type, scalar_type > Q_local (nrowsLocal, ncols);
    matGen.explicit_Q (nrowsLocal, ncols, Q_local.get(), Q_local.lda());

    // Scale the (local) orthogonal matrix by the number of
    // processors P, to make the columns of the global matrix Q
    // orthogonal.  (Otherwise the norm of each column will be P
    // instead of 1.)
    const scalar_type P = static_cast< scalar_type > (nprocs);
    // Do overflow check.  If casting P back to scalar_type
    // doesn't produce the same value as nprocs, the cast
    // overflowed.  We take the real part, because scalar_type
    // might be complex.
    if (nprocs != static_cast<int> (Teuchos::ScalarTraits<scalar_type>::real (P)))
      throw std::runtime_error ("Casting nprocs to Scalar failed");
    // NOTE(review): scaleMatrix's semantics (multiply vs. divide by P,
    // or by sqrt(P)) are not visible from this file; the orthogonality
    // argument in the header comment suggests a 1/sqrt(P) factor is
    // what makes the stacked Q orthonormal -- confirm against
    // scaleMatrix's definition.
    scaleMatrix (Q_local, P);

    // A_local := Q_local * R
    blas.GEMM (NO_TRANS, NO_TRANS, nrowsLocal, ncols, ncols,
               scalar_type(1), Q_local.get(), Q_local.lda(),
               R.get(), R.lda(),
               scalar_type(0), A_local.get(), A_local.lda());

    // Serve each other process in rank order: learn its row count,
    // generate its random orthogonal block, and send the block over.
    for (int recvProc = 1; recvProc < nprocs; ++recvProc) {
      // Ask the receiving processor how big (i.e., how many rows)
      // its local component of the matrix is.
      ordinal_type nrowsRemote = 0;
      ordinalMessenger->recv (&nrowsRemote, 1, recvProc, 0);

      if (b_local_debug) {
        std::ostringstream os;
        os << "For Proc " << recvProc << ": local block is "
           << nrowsRemote << " by " << ncols << std::endl;
        std::cerr << os.str();
      }

      // Make sure Q_local is big enough to hold the data for
      // the current receiver proc.
      Q_local.reshape (nrowsRemote, ncols);

      // Compute a random nrowsRemote * ncols orthogonal
      // matrix Q_local, for the current receiving processor.
      matGen.explicit_Q (nrowsRemote, ncols, Q_local.get(), Q_local.lda());

      // Send Q_local to the current receiving processor.
      scalarMessenger->send (Q_local.get(), nrowsRemote*ncols, recvProc, 0);
    }
  } else {
    // Receive the R factor from Proc 0.  There's only 1 R
    // factor for all the processes.
    Matrix< ordinal_type, scalar_type > R (ncols, ncols, scalar_type (0));
    scalarMessenger->broadcast (R.get(), ncols*ncols, rootProc);

    // Q_local (nrows_local by ncols, random orthogonal matrix)
    // will be received from Proc 0, where it was generated.
    const ordinal_type recvSize = nrowsLocal * ncols;
    Matrix< ordinal_type, scalar_type > Q_local (nrowsLocal, ncols);

    // Tell Proc 0 how many rows there are in the random orthogonal
    // matrix I want to receive from Proc 0.
    ordinalMessenger->send (&nrowsLocal, 1, rootProc, 0);

    // Receive the orthogonal matrix from Proc 0.
    scalarMessenger->recv (Q_local.get(), recvSize, rootProc, 0);

    // Scale the (local) orthogonal matrix by the number of
    // processors, to make the global matrix Q orthogonal.
    const scalar_type P = static_cast< scalar_type > (nprocs);
    // Do overflow check.  If casting P back to scalar_type
    // doesn't produce the same value as nprocs, the cast
    // overflowed.  We take the real part, because scalar_type
    // might be complex.
    if (nprocs != static_cast<int> (Teuchos::ScalarTraits<scalar_type>::real (P)))
      throw std::runtime_error ("Casting nprocs to Scalar failed");
    scaleMatrix (Q_local, P);

    // A_local := Q_local * R
    blas.GEMM (NO_TRANS, NO_TRANS, nrowsLocal, ncols, ncols,
               scalar_type(1), Q_local.get(), Q_local.lda(),
               R.get(), R.lda(),
               scalar_type(0), A_local.get(), A_local.lda());
  }
}