MatrixViewType
get_cache_block (MatrixViewType A,
                 const typename MatrixViewType::ordinal_type cache_block_index,
                 const bool contiguous_cache_blocks) const
{
  typedef typename MatrixViewType::ordinal_type ordinal_type;
  typedef typename MatrixViewType::scalar_type scalar_type;

  // Total number of cache blocks.
  const ordinal_type num_cache_blocks =
    strategy_.num_cache_blocks (A.nrows(), A.ncols(), nrows_cache_block());

  if (cache_block_index >= num_cache_blocks)
    return MatrixViewType (0, 0, NULL, 0); // empty

  // result[0] = starting row index of the cache block
  // result[1] = number of rows in the cache block
  // result[2] = pointer offset (A.get() + result[2])
  // result[3] = leading dimension (a.k.a. stride) of the cache block
  std::vector<ordinal_type> result =
    strategy_.cache_block_details (cache_block_index, A.nrows(), A.ncols(),
                                   A.lda(), nrows_cache_block(),
                                   contiguous_cache_blocks);
  if (result[1] == 0)
    // For some reason, the cache block is empty.
    return MatrixViewType (0, 0, NULL, 0);

  // result[2] has signed type (ordinal_type), so cast it to size_t
  // before the pointer arithmetic to avoid signed/unsigned compiler
  // warnings.
  return MatrixViewType (result[1], A.ncols(),
                         A.get() + static_cast<size_t> (result[2]),
                         result[3]);
}
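// A minimal, standalone sketch (not TSQR's actual implementation) of the
// row-wise cache blocking that get_cache_block() exposes: for a column-major
// m-by-n matrix, block b covers rows [b*block_nrows, min((b+1)*block_nrows, m)),
// and its first entry lives at row offset b*block_nrows within each column.
// Here block_nrows, cache_block_info, and BlockInfo are hypothetical names
// standing in for nrows_cache_block() and the strategy_ object above.
#include <algorithm>
#include <cstdio>

struct BlockInfo {
  int start_row;   // first row of the cache block
  int nrows;       // number of rows in the cache block
  long offset;     // offset of the block's (0,0) entry from A's (0,0) entry
};

// Non-contiguous ("in place") cache blocks: each block is a view into the
// original column-major matrix, so it keeps the original leading dimension.
BlockInfo cache_block_info (int block_index, int m, int block_nrows)
{
  BlockInfo info;
  info.start_row = block_index * block_nrows;
  info.nrows = std::max (0, std::min (m - info.start_row, block_nrows));
  info.offset = static_cast<long> (info.start_row); // + j*lda for column j
  return info;
}

int main ()
{
  const int m = 1000, block_nrows = 256;
  for (int b = 0; ; ++b) {
    const BlockInfo info = cache_block_info (b, m, block_nrows);
    if (info.nrows == 0)
      break; // past the last block, analogous to the empty MatrixViewType above
    std::printf ("block %d: rows [%d, %d)\n", b, info.start_row,
                 info.start_row + info.nrows);
  }
  return 0;
}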
Matrix (const MatrixViewType& in) :
  nrows_ (in.nrows()),
  ncols_ (in.ncols()),
  A_ (verified_alloc_size (in.nrows(), in.ncols()))
{
  if (A_.size() != 0)
    copy_matrix (nrows(), ncols(), get(), lda(), in.get(), in.lda());
}
static void
scaleMatrix (MatrixViewType& A,
             const typename MatrixViewType::scalar_type& denom)
{
  typedef typename MatrixViewType::ordinal_type ordinal_type;
  typedef typename MatrixViewType::scalar_type scalar_type;

  const ordinal_type nrows = A.nrows();
  const ordinal_type ncols = A.ncols();
  const ordinal_type lda = A.lda();

  if (nrows == lda) {
    // A is stored contiguously.
    const ordinal_type nelts = nrows * ncols;
    scalar_type* const A_ptr = A.get ();
    for (ordinal_type k = 0; k < nelts; ++k)
      A_ptr[k] /= denom;
  }
  else {
    // Each column of A is stored contiguously.
    for (ordinal_type j = 0; j < ncols; ++j) {
      scalar_type* const A_j = &A(0,j);
      for (ordinal_type i = 0; i < nrows; ++i)
        A_j[i] /= denom;
    }
  }
}
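// Standalone illustration (a sketch under assumptions, not TSQR code) of the
// two code paths in scaleMatrix(): when nrows == lda the whole column-major
// matrix is one contiguous array and a single loop suffices; otherwise each
// column starts at A + j*lda and must be scaled separately.  scale_columns
// is a hypothetical helper introduced only for this example.
#include <cassert>
#include <vector>

template<class Scalar>
void scale_columns (Scalar* A, int nrows, int ncols, int lda, Scalar denom)
{
  assert (lda >= nrows);
  if (nrows == lda) {
    // Contiguous storage: one pass over nrows*ncols entries.
    for (long k = 0; k < static_cast<long> (nrows) * ncols; ++k)
      A[k] /= denom;
  } else {
    // Strided storage: scale column by column, skipping the padding rows.
    for (int j = 0; j < ncols; ++j) {
      Scalar* const A_j = A + static_cast<long> (j) * lda;
      for (int i = 0; i < nrows; ++i)
        A_j[i] /= denom;
    }
  }
}

int main ()
{
  const int nrows = 3, ncols = 2, lda = 5;   // lda > nrows: padded columns
  std::vector<double> A (static_cast<size_t> (lda) * ncols, 2.0);
  scale_columns (A.data(), nrows, ncols, lda, 2.0);
  return 0;
}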
bool operator== (const MatrixViewType& B) const {
  return get() == B.get() && nrows() == B.nrows() &&
    ncols() == B.ncols() && lda() == B.lda();
}
static std::vector<typename Teuchos::ScalarTraits<typename MatrixViewType::scalar_type>::magnitudeType>
localVerify (const MatrixViewType& A,
             const MatrixViewType& Q,
             const MatrixViewType& R)
{
  return local_verify (A.nrows(), A.ncols(), A.get(), A.lda(),
                       Q.get(), Q.lda(), R.get(), R.lda());
}
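// local_verify() itself is not shown above.  The following standalone sketch
// (an assumption about what a QR verification routine typically measures,
// not a transcription of local_verify) computes the two classic residuals
// ||A - Q*R||_F and ||I - Q^T*Q||_F for column-major A (m x n), Q (m x n),
// and R (n x n, upper triangular).  frobenius_residuals is a hypothetical
// name introduced only for this example.
#include <cmath>

double frobenius_residuals (const double* A, const double* Q, const double* R,
                            int m, int n, int lda, int ldq, int ldr,
                            double* orthogonality)
{
  // ||A - Q*R||_F: factorization residual.
  double factor_err = 0.0;
  for (int j = 0; j < n; ++j)
    for (int i = 0; i < m; ++i) {
      double qr_ij = 0.0;
      for (int k = 0; k <= j; ++k)              // R is upper triangular
        qr_ij += Q[i + k*ldq] * R[k + j*ldr];
      const double d = A[i + j*lda] - qr_ij;
      factor_err += d * d;
    }
  // ||I - Q^T*Q||_F: deviation of Q's columns from orthonormality.
  double ortho_err = 0.0;
  for (int j = 0; j < n; ++j)
    for (int i = 0; i < n; ++i) {
      double qtq_ij = 0.0;
      for (int k = 0; k < m; ++k)
        qtq_ij += Q[k + i*ldq] * Q[k + j*ldq];
      const double d = (i == j ? 1.0 : 0.0) - qtq_ij;
      ortho_err += d * d;
    }
  *orthogonality = std::sqrt (ortho_err);
  return std::sqrt (factor_err);
}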
MatrixViewType
split_top_block (MatrixViewType& A, const bool contiguous_cache_blocks) const
{
  typedef typename MatrixViewType::ordinal_type ordinal_type;
  const ordinal_type nrows_top =
    strategy_.top_block_split_nrows (A.nrows(), ncols(), nrows_cache_block());
  // split_top() sets A to A_rest, and returns A_top.
  return A.split_top (nrows_top, contiguous_cache_blocks);
}
MatrixViewType
split_bottom_block (MatrixViewType& A, const bool contiguous_cache_blocks) const
{
  typedef typename MatrixViewType::ordinal_type ordinal_type;
  // Ignore the number of columns in A, since we want to block all
  // matrices using the same cache blocking strategy.
  const ordinal_type nrows_bottom =
    strategy_.bottom_block_split_nrows (A.nrows(), ncols(), nrows_cache_block());
  // split_bottom() sets A to A_rest, and returns A_bot.
  return A.split_bottom (nrows_bottom, contiguous_cache_blocks);
}
MatrixViewType
top_block (const MatrixViewType& A, const bool contiguous_cache_blocks) const
{
  typedef typename MatrixViewType::ordinal_type ordinal_type;
  // Ignore the number of columns in A, since we want to block all
  // matrices using the same cache blocking strategy.
  const ordinal_type nrows_top =
    strategy_.top_block_split_nrows (A.nrows(), ncols(), nrows_cache_block());
  // Split a copy of A, so that the input view is left unmodified.
  MatrixViewType A_copy (A);
  return A_copy.split_top (nrows_top, contiguous_cache_blocks);
}
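// Sketch of the pointer arithmetic behind split_top()/split_bottom() for a
// non-contiguously blocked, column-major view.  RawView and this split_top
// are hypothetical stand-ins; the real MatrixViewType interface is not
// reproduced here.  The top block reuses A's pointer and leading dimension;
// the remainder simply starts nrows_top rows further down each column.
struct RawView {
  int nrows, ncols, lda;
  double* data;
};

// Returns the top nrows_top rows as a view and shrinks A to the remainder,
// mirroring "split_top() sets A to A_rest, and returns A_top" above.
RawView split_top (RawView& A, int nrows_top)
{
  RawView top = { nrows_top, A.ncols, A.lda, A.data };
  A.data += nrows_top;   // column j of the remainder is data + nrows_top + j*lda
  A.nrows -= nrows_top;
  return top;
}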
void
implicit_Q (MatrixViewType& Q, typename MatrixViewType::scalar_type tau[])
{
  implicit_Q (Q.nrows(), Q.ncols(), Q.get(), Q.lda(), tau);
}
static void
printMatrix (std::ostream& out, const MatrixViewType& A)
{
  print_local_matrix (out, A.nrows(), A.ncols(), A.get(), A.lda());
}
void
randomGlobalMatrix (Generator* const pGenerator,
                    MatrixViewType& A_local,
                    const typename Teuchos::ScalarTraits<typename MatrixViewType::scalar_type>::magnitudeType singular_values[],
                    MessengerBase<typename MatrixViewType::ordinal_type>* const ordinalMessenger,
                    MessengerBase<typename MatrixViewType::scalar_type>* const scalarMessenger)
{
  using Teuchos::NO_TRANS;
  using std::vector;
  typedef typename MatrixViewType::ordinal_type ordinal_type;
  typedef typename MatrixViewType::scalar_type scalar_type;

  const bool b_local_debug = false;
  const int rootProc = 0;
  const int nprocs = ordinalMessenger->size();
  const int myRank = ordinalMessenger->rank();
  Teuchos::BLAS<ordinal_type, scalar_type> blas;

  const ordinal_type nrowsLocal = A_local.nrows();
  const ordinal_type ncols = A_local.ncols();

  // Theory: Suppose there are P processors.  Proc q wants an m_q by n
  // component of the matrix A, which we write as A_q.  On Proc 0, we
  // generate random m_q by n orthogonal matrices Q_q (in explicit
  // form), and send Q_q to Proc q.  The m by n matrix
  // [Q_0; Q_1; ...; Q_{P-1}] is not itself orthogonal.  However, the
  // m by n matrix Q = [Q_0 / P; Q_1 / P; ...; Q_{P-1} / P] is
  // orthogonal:
  //
  //   \sum_{q = 0}^{P-1} (Q_q^T * Q_q) / P = I.

  if (myRank == rootProc) {
    typedef Random::MatrixGenerator<ordinal_type, scalar_type, Generator> matgen_type;
    matgen_type matGen (*pGenerator);

    // Generate a random ncols by ncols upper triangular matrix R
    // with the given singular values.
    Matrix<ordinal_type, scalar_type> R (ncols, ncols, scalar_type(0));
    matGen.fill_random_R (ncols, R.get(), R.lda(), singular_values);

    // Broadcast R to all the processors.
    scalarMessenger->broadcast (R.get(), ncols*ncols, rootProc);

    // Generate (for myself) a random nrowsLocal by ncols orthogonal
    // matrix, stored in explicit form.
    Matrix<ordinal_type, scalar_type> Q_local (nrowsLocal, ncols);
    matGen.explicit_Q (nrowsLocal, ncols, Q_local.get(), Q_local.lda());

    // Scale the (local) orthogonal matrix by the number of
    // processors P, to make the columns of the global matrix Q
    // orthogonal.  (Otherwise the norm of each column will be P
    // instead of 1.)
    const scalar_type P = static_cast<scalar_type> (nprocs);
    // Overflow check: if casting nprocs to scalar_type and back does
    // not reproduce the same value, the cast overflowed.  We take the
    // real part, because scalar_type might be complex.
    if (nprocs != static_cast<int> (Teuchos::ScalarTraits<scalar_type>::real (P)))
      throw std::runtime_error ("Casting nprocs to Scalar failed");

    scaleMatrix (Q_local, P);

    // A_local := Q_local * R
    blas.GEMM (NO_TRANS, NO_TRANS, nrowsLocal, ncols, ncols,
               scalar_type(1), Q_local.get(), Q_local.lda(),
               R.get(), R.lda(),
               scalar_type(0), A_local.get(), A_local.lda());

    for (int recvProc = 1; recvProc < nprocs; ++recvProc) {
      // Ask the receiving processor how big (i.e., how many rows)
      // its local component of the matrix is.
      ordinal_type nrowsRemote = 0;
      ordinalMessenger->recv (&nrowsRemote, 1, recvProc, 0);

      if (b_local_debug) {
        std::ostringstream os;
        os << "For Proc " << recvProc << ": local block is "
           << nrowsRemote << " by " << ncols << std::endl;
        std::cerr << os.str();
      }

      // Make sure Q_local is big enough to hold the data for the
      // current receiving processor.
      Q_local.reshape (nrowsRemote, ncols);

      // Compute a random nrowsRemote by ncols orthogonal matrix
      // Q_local for the current receiving processor.
      matGen.explicit_Q (nrowsRemote, ncols, Q_local.get(), Q_local.lda());

      // Send Q_local to the current receiving processor.
      scalarMessenger->send (Q_local.get(), nrowsRemote*ncols, recvProc, 0);
    }
  }
  else {
    // Receive the R factor from Proc 0.  There's only one R factor
    // for all the processes.
    Matrix<ordinal_type, scalar_type> R (ncols, ncols, scalar_type(0));
    scalarMessenger->broadcast (R.get(), ncols*ncols, rootProc);

    // Q_local (nrowsLocal by ncols, random orthogonal matrix) will be
    // received from Proc 0, where it was generated.
    const ordinal_type recvSize = nrowsLocal * ncols;
    Matrix<ordinal_type, scalar_type> Q_local (nrowsLocal, ncols);

    // Tell Proc 0 how many rows there are in the random orthogonal
    // matrix I want to receive from Proc 0.
    ordinalMessenger->send (&nrowsLocal, 1, rootProc, 0);

    // Receive the orthogonal matrix from Proc 0.
    scalarMessenger->recv (Q_local.get(), recvSize, rootProc, 0);

    // Scale the (local) orthogonal matrix by the number of
    // processors, to make the global matrix Q orthogonal.
    const scalar_type P = static_cast<scalar_type> (nprocs);
    // Overflow check: if casting nprocs to scalar_type and back does
    // not reproduce the same value, the cast overflowed.  We take the
    // real part, because scalar_type might be complex.
    if (nprocs != static_cast<int> (Teuchos::ScalarTraits<scalar_type>::real (P)))
      throw std::runtime_error ("Casting nprocs to Scalar failed");

    scaleMatrix (Q_local, P);

    // A_local := Q_local * R
    blas.GEMM (NO_TRANS, NO_TRANS, nrowsLocal, ncols, ncols,
               scalar_type(1), Q_local.get(), Q_local.lda(),
               R.get(), R.lda(),
               scalar_type(0), A_local.get(), A_local.lda());
  }
}
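// A rough MPI analogue (an assumption, since MessengerBase's backend is not
// shown here) of the communication pattern above: Proc 0 broadcasts the
// shared R factor, then every other rank reports its local row count and
// receives its own random orthogonal block from Proc 0.  distribute_blocks
// is a hypothetical helper; the random generation itself is omitted.
#include <mpi.h>
#include <vector>

void distribute_blocks (int nrowsLocal, int ncols, std::vector<double>& R,
                        std::vector<double>& Q_local, MPI_Comm comm)
{
  int rank = 0, nprocs = 1;
  MPI_Comm_rank (comm, &rank);
  MPI_Comm_size (comm, &nprocs);

  // Everyone gets the same ncols-by-ncols R factor from Proc 0.
  MPI_Bcast (R.data(), ncols * ncols, MPI_DOUBLE, 0, comm);

  if (rank == 0) {
    for (int p = 1; p < nprocs; ++p) {
      int nrowsRemote = 0;
      // Learn how many rows rank p needs ...
      MPI_Recv (&nrowsRemote, 1, MPI_INT, p, 0, comm, MPI_STATUS_IGNORE);
      // ... generate an nrowsRemote-by-ncols orthogonal block here (omitted),
      // then send it to rank p.
      std::vector<double> Q_remote (static_cast<size_t> (nrowsRemote) * ncols);
      MPI_Send (Q_remote.data(), nrowsRemote * ncols, MPI_DOUBLE, p, 0, comm);
    }
  } else {
    // Report my row count, then receive my block (Q_local must already be
    // sized to hold nrowsLocal * ncols entries).
    MPI_Send (&nrowsLocal, 1, MPI_INT, 0, 0, comm);
    MPI_Recv (Q_local.data(), nrowsLocal * ncols, MPI_DOUBLE, 0, 0, comm,
              MPI_STATUS_IGNORE);
  }
}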