Example #1
    // Return a view of one cache block of the matrix view A.
    //
    // A: matrix view to partition; its nrows(), ncols(), lda() and
    //   get() are read.
    // cache_block_index: index of the requested cache block.  An
    //   index >= the total number of cache blocks yields an empty
    //   view (0, 0, NULL, 0).
    // contiguous_cache_blocks: not referenced in the lines visible
    //   here.  NOTE(review): presumably forwarded to
    //   cache_block_details() in the part of that call which is cut
    //   off in this excerpt -- confirm against the full source.
    //
    // Returns an empty view if the index is out of range or the block
    // has zero rows; otherwise a view with result[1] rows and
    // A.ncols() columns starting at A.get() + result[2].
    get_cache_block (MatrixViewType A,
		     const typename MatrixViewType::ordinal_type cache_block_index,
		     const bool contiguous_cache_blocks) const
      typedef typename MatrixViewType::ordinal_type ordinal_type;
      // NOTE(review): scalar_type is not used in the visible lines.
      typedef typename MatrixViewType::scalar_type scalar_type;

      // Total number of cache blocks.
      const ordinal_type num_cache_blocks = 
	strategy_.num_cache_blocks (A.nrows(), A.ncols(), nrows_cache_block());

      // Out-of-range index: hand back an empty view instead of failing.
      if (cache_block_index >= num_cache_blocks)
	return MatrixViewType (0, 0, NULL, 0); // empty

      // result[0] = starting row index of the cache block
      // result[1] = number of rows in the cache block
      // result[2] = pointer offset (A.get() + result[2])
      // result[3] = leading dimension (a.k.a. stride) of the cache block
      //
      // NOTE(review): the vector's element type is spelled Ordinal,
      // not the local typedef ordinal_type; Ordinal must come from the
      // enclosing scope, which is not visible here.
      // NOTE(review): the argument list of the call below is
      // truncated in this excerpt.
      std::vector<Ordinal> result = 
	strategy_.cache_block_details (cache_block_index, A.nrows(), A.ncols(),
				       A.lda(), nrows_cache_block(), 
      if (result[1] == 0)
	// The cache block has no rows, so return an empty view.
	return MatrixViewType (0, 0, NULL, 0);

      // We expect that ordinal_type is signed, so adding a signed
      // offset (ordinal_type) to a pointer may raise compiler
      // warnings; hence the cast of the offset to size_t.
      // NOTE(review): the remaining constructor argument(s) of this
      // return statement are truncated in this excerpt.
      return MatrixViewType (result[1], A.ncols(), 
			     A.get() + static_cast<size_t>(result[2]), 
 /// \brief Construct a Matrix holding a copy of the view \c in.
 ///
 /// The row and column counts are taken from in.nrows() and
 /// in.ncols(), and the storage A_ is sized by
 /// verified_alloc_size (in.nrows(), in.ncols()).
 ///
 /// \param in [in] Source view; its nrows(), ncols(), get() and lda()
 ///   are read.
 Matrix (const MatrixViewType& in) :
     nrows_ (in.nrows()),
     ncols_ (in.ncols()),
     A_ (verified_alloc_size (in.nrows(), in.ncols()))
     // Copy the entries only when storage was actually allocated, so
     // copy_matrix is never invoked on an empty matrix.
     if (A_.size() != 0)
         copy_matrix (nrows(), ncols(), get(), lda(), in.get(), in.lda());
    /// \brief Divide every entry of the matrix view \c A by \c denom, in place.
    ///
    /// \param A [in/out] View to scale.  Its nrows(), ncols(), lda(),
    ///   get() and operator()(i,j) are used.
    /// \param denom [in] Divisor applied to each entry.
    static void
    scaleMatrix (MatrixViewType& A,
                 const typename MatrixViewType::scalar_type& denom)
    {
      typedef typename MatrixViewType::ordinal_type index_type;
      typedef typename MatrixViewType::scalar_type value_type;

      const index_type numRows = A.nrows();
      const index_type numCols = A.ncols();
      const index_type stride = A.lda();

      if (numRows != stride) {
        // The leading dimension differs from the row count, so only
        // each individual column is contiguous: scale column by column.
        for (index_type col = 0; col < numCols; ++col) {
          value_type* const colPtr = &A(0,col);
          for (index_type row = 0; row < numRows; ++row) {
            colPtr[row] /= denom;
          }
        }
        return;
      }

      // Leading dimension equals the row count: the whole matrix is
      // one contiguous array, so sweep it with a single flat loop.
      const index_type numEntries = numRows * numCols;
      value_type* const flat = A.get ();
      for (index_type pos = 0; pos < numEntries; ++pos) {
        flat[pos] /= denom;
      }
    }
 bool operator== (const MatrixViewType& B) const
     if (get() != B.get() || nrows() != B.nrows() || ncols() != B.ncols() || lda() != B.lda()) {
         return false;
     } else {
         return true;
Example #5
 /// \brief View-based convenience wrapper around local_verify().
 ///
 /// Forwards the dimensions, data pointer and leading dimension of
 /// \c A, plus the data pointers and leading dimensions of \c Q and
 /// \c R, to the pointer-based local_verify().  Only A's dimensions
 /// are passed along; the dimensions of Q and R are not forwarded.
 ///
 /// \return Whatever local_verify() returns: a vector of magnitudes.
 ///   NOTE(review): the meaning of its entries is defined by
 ///   local_verify(), which is not visible here.
 std::vector<typename Teuchos::ScalarTraits<typename MatrixViewType::scalar_type>::magnitudeType>
 localVerify (const MatrixViewType& A,
              const MatrixViewType& Q,
              const MatrixViewType& R)
   return local_verify (A.nrows(), A.ncols(), A.get(), A.lda(),
                        Q.get(), Q.lda(), R.get(), R.lda());
Example #6
 /// \brief Receive a packed matrix from process \c srcProc and unpack
 ///   it into \c R.
 ///
 /// The internal buffer is resized to buffer_length (R.ncols())
 /// entries, filled by the messenger (message tag 0), and then
 /// unpacked into R.
 ///
 /// \param R [out] View that receives the unpacked data.
 /// \param srcProc [in] Rank of the sending process.
 recv (MatrixViewType& R, const int srcProc)
 {
   const typename MatrixViewType::ordinal_type numCols = R.ncols();
   const Ordinal numEntries = buffer_length (numCols);

   // Make room for the incoming message before handing the raw
   // pointer to the messenger.
   buffer_.resize (numEntries);
   messenger_->recv (&buffer_[0], numEntries, srcProc, 0);

   unpack (R);
 }
Example #7
 /// \brief Broadcast the matrix \c R from process \c rootProc to all
 ///   processes.
 ///
 /// On the root process, R is packed into the internal buffer; on
 /// every other process the buffer is first sized to hold the
 /// incoming data, and R is filled from the buffer after the
 /// broadcast completes.
 ///
 /// \param R [in/out] Source matrix on the root process, destination
 ///   on all other processes.  Its ncols() determines the message
 ///   length, so it must be the same on all processes.
 /// \param rootProc [in] Rank of the process that owns the data.
 broadcast (MatrixViewType& R, const int rootProc)
 {
   const int myRank = messenger_->rank();
   if (myRank == rootProc) {
     // pack() is expected to size and fill buffer_ on the root.
     pack (R);
   }
   else {
     // Receiving processes never call pack(), so the buffer must be
     // sized here (as recv() does).  Otherwise the broadcast would
     // write past the end of a buffer that may still be empty.
     buffer_.resize (buffer_length (R.ncols()));
   }
   messenger_->broadcast (&buffer_[0], buffer_length (R.ncols()), rootProc);
   if (myRank != rootProc) {
     unpack (R);
   }
 }
Example #8
    /// \brief Copy the packed contents of the internal buffer into \c R.
    ///
    /// The buffer is read sequentially: for column j (zero-based), the
    /// next j+1 buffer entries are copied to rows 0..j of that column.
    /// Entries of R below those rows are not written.
    ///
    /// \param R [out] View to fill; R.ncols() columns are unpacked.
    unpack (MatrixViewType& R)
    {
      typedef typename MatrixViewType::ordinal_type index_type;
      typedef typename std::vector< Scalar >::const_iterator buffer_iter_type;

      const index_type numCols = R.ncols();
      buffer_iter_type colBegin = buffer_.begin();
      index_type col = 0;
      while (col < numCols) {
        // Column col occupies the next col+1 entries of the buffer.
        const buffer_iter_type colEnd = colBegin + (col+1);
        std::copy (colBegin, colEnd, &R(0,col));
        colBegin = colEnd;
        ++col;
      }
    }
Example #9
  /// \brief Gather each process' local R matrix into a stack on Proc 0.
  ///
  /// On Proc 0, R_local is copied into the top ncols x ncols block of
  /// R_stack (ncols = R_stack.ncols()), and the matrix received from
  /// process p >= 1 is stored in the ncols x ncols block that starts
  /// at row p*ncols of R_stack.  All other processes send their
  /// R_local to Proc 0 and do not touch R_stack.  Every process
  /// finishes with a barrier on \c messenger.
  ///
  /// \param R_stack [out] Destination stack; only accessed on Proc 0.
  /// \param R_local [in] This process' matrix.
  /// \param messenger [in] Communicator wrapper used for rank, size,
  ///   point-to-point transfers and the final barrier.
  gatherStack (MatrixViewType& R_stack,
               ConstMatrixViewType& R_local,
               const Teuchos::RCP<MessengerBase<typename MatrixViewType::scalar_type> >& messenger)
  {
    typedef typename MatrixViewType::ordinal_type index_type;
    typedef typename MatrixViewType::scalar_type value_type;
    typedef MatView<index_type, value_type> view_type;

    const int numProcs = messenger->size();
    const int rank = messenger->rank();

    if (rank != 0) {
      // Non-root processes only ship their local matrix to Proc 0;
      // R_stack is never read or written here.
      const int destProc = 0;
      RMessenger<index_type, value_type> sender (messenger);
      sender.send (R_local, destProc);
    }
    else {
      const index_type numCols = R_stack.ncols();

      // Proc 0's own contribution goes into the topmost block.
      view_type topBlock (numCols, numCols, R_stack.get(), R_stack.lda());
      deep_copy (topBlock, R_local);

      // Collect the remaining blocks in rank order.
      RMessenger< index_type, value_type > receiver (messenger);
      int srcProc = 1;
      while (srcProc < numProcs) {
        const value_type* const blockPtr = R_stack.get() + srcProc*numCols;
        view_type curBlock (numCols, numCols, blockPtr, R_stack.lda());
        // Zero the block first: RMessenger::recv() only writes to the
        // upper triangle, so the lower triangle would otherwise keep
        // whatever was there before.
        curBlock.fill (value_type (0));
        receiver.recv (curBlock, srcProc);
        ++srcProc;
      }
    }
    messenger->barrier ();
  }
      /// \brief View-based convenience wrapper around the pointer-based
      ///   implicit_Q() overload.
      ///
      /// Forwards Q's dimensions, data pointer and leading dimension,
      /// together with \c tau, to that overload.
      ///
      /// \param Q [in/out] Matrix view handed to the overload as
      ///   (nrows, ncols, pointer, lda).
      /// \param tau [in/out] Array passed through unchanged.
      ///   NOTE(review): its required length and whether it is read or
      ///   written are defined by the overload, which is not visible
      ///   here.
      implicit_Q (MatrixViewType& Q, 
		  typename MatrixViewType::scalar_type tau[])
	implicit_Q (Q.nrows(), Q.ncols(), Q.get(), Q.lda(), tau);
Example #11
 /// \brief Print the matrix view \c A to the stream \c out.
 ///
 /// Forwards A's dimensions, data pointer and leading dimension to
 /// print_local_matrix(), which determines the output format.
 ///
 /// \param out [out] Destination stream.
 /// \param A [in] Matrix view to print.
 static void
 printMatrix (std::ostream& out,
              const MatrixViewType& A)
   print_local_matrix (out, A.nrows(), A.ncols(), A.get(), A.lda());
    /// \brief Fill A_local with this process' block of a random
    ///   distributed test matrix A = Q * R.
    ///
    /// Proc 0 generates an ncols x ncols upper triangular factor R from
    /// \c singular_values and broadcasts it.  Proc 0 also generates,
    /// for every process, a random block with orthonormal columns and
    /// sends it to that process.  Each process scales its block by
    /// 1/sqrt(P), where P is the number of processes, so that the
    /// stacked global Q has orthonormal columns, and then computes
    /// A_local := Q_local * R.
    ///
    /// This is a collective operation: every process in the two
    /// messengers must call it.
    ///
    /// \param pGenerator [in/out] Pseudorandom number generator; only
    ///   dereferenced on Proc 0.
    /// \param A_local [out] This process' block of rows of A.  Its
    ///   dimensions determine the local row count and the global
    ///   column count.
    /// \param singular_values [in] Singular values passed to
    ///   fill_random_R(); only read on Proc 0.
    /// \param ordinalMessenger [in] Messenger for row counts; also
    ///   supplies the rank and the number of processes.
    /// \param scalarMessenger [in] Messenger for matrix data.
    ///
    /// \throws std::runtime_error if the number of processes cannot be
    ///   represented in scalar_type.
    randomGlobalMatrix (Generator* const pGenerator,
                        MatrixViewType& A_local,
                        const typename Teuchos::ScalarTraits< typename MatrixViewType::scalar_type >::magnitudeType singular_values[],
                        MessengerBase< typename MatrixViewType::ordinal_type >* const ordinalMessenger,
                        MessengerBase< typename MatrixViewType::scalar_type >* const scalarMessenger)
    {
      using Teuchos::NO_TRANS;
      typedef typename MatrixViewType::ordinal_type ordinal_type;
      typedef typename MatrixViewType::scalar_type scalar_type;
      typedef Teuchos::ScalarTraits<scalar_type> STS;

      const bool b_local_debug = false;

      const int rootProc = 0;
      const int nprocs = ordinalMessenger->size();
      const int myRank = ordinalMessenger->rank();
      Teuchos::BLAS<ordinal_type, scalar_type> blas;

      const ordinal_type nrowsLocal = A_local.nrows();
      const ordinal_type ncols = A_local.ncols();

      // Theory: Suppose there are P processors.  Proc q wants an m_q by
      // n component of the matrix A, which we write as A_q.  On Proc 0,
      // we generate random m_q by n matrices Q_q with orthonormal
      // columns (in explicit form), and send Q_q to Proc q.  The m by n
      // matrix [Q_0; Q_1; ...; Q_{P-1}] does not itself have
      // orthonormal columns, because
      //   \sum_{q = 0}^{P-1} Q_q^T * Q_q = P * I.
      // Dividing every block by sqrt(P) fixes that:
      //   Q = [Q_0; Q_1; ...; Q_{P-1}] / sqrt(P)  ==>  Q^T * Q = I.
      // (Dividing by P instead would leave Q^T * Q = I / P, so the
      // singular values of A = Q*R would be off by 1/sqrt(P).)

      // Convert the process count to scalar_type, with an overflow
      // check: if casting P back to int doesn't reproduce nprocs, the
      // cast overflowed.  We take the real part, because scalar_type
      // might be complex.  Every process performs the same check before
      // any communication, so either all of them throw or none does.
      const scalar_type P = static_cast< scalar_type > (nprocs);
      if (nprocs != static_cast<int> (STS::real (P)))
        throw std::runtime_error ("Casting nprocs to Scalar failed");
      const scalar_type scaleFactor = STS::squareroot (P);

      if (myRank == rootProc)
        {
          typedef Random::MatrixGenerator< ordinal_type, scalar_type, Generator > matgen_type;
          matgen_type matGen (*pGenerator);

          // Generate a random ncols by ncols upper triangular matrix
          // R with the given singular values.
          Matrix< ordinal_type, scalar_type > R (ncols, ncols, scalar_type(0));
          matGen.fill_random_R (ncols, R.get(), R.lda(), singular_values);

          // Broadcast R to all the processors.
          scalarMessenger->broadcast (R.get(), ncols*ncols, rootProc);

          // Generate (for myself) a random nrowsLocal x ncols matrix
          // with orthonormal columns, stored in explicit form.
          Matrix< ordinal_type, scalar_type > Q_local (nrowsLocal, ncols);
          matGen.explicit_Q (nrowsLocal, ncols, Q_local.get(), Q_local.lda());

          // Scale the local block by 1/sqrt(P) so that the columns of
          // the global matrix Q are orthonormal.  (Otherwise the norm
          // of each global column would be sqrt(P) instead of 1.)
          scaleMatrix (Q_local, scaleFactor);

          // A_local := Q_local * R.  This must happen before the loop
          // below, which reuses Q_local as scratch space for the other
          // processes' blocks.
          blas.GEMM (NO_TRANS, NO_TRANS, nrowsLocal, ncols, ncols,
                     scalar_type(1), Q_local.get(), Q_local.lda(),
                     R.get(), R.lda(),
                     scalar_type(0), A_local.get(), A_local.lda());

          for (int recvProc = 1; recvProc < nprocs; ++recvProc)
            {
              // Ask the receiving processor how big (i.e., how many rows)
              // its local component of the matrix is.
              ordinal_type nrowsRemote = 0;
              ordinalMessenger->recv (&nrowsRemote, 1, recvProc, 0);

              if (b_local_debug)
                {
                  std::ostringstream os;
                  os << "For Proc " << recvProc << ": local block is "
                     << nrowsRemote << " by " << ncols << std::endl;
                  std::cerr << os.str();
                }

              // Make sure Q_local is big enough to hold the data for
              // the current receiver proc.
              Q_local.reshape (nrowsRemote, ncols);

              // Compute a random nrowsRemote by ncols matrix with
              // orthonormal columns for the current receiving
              // processor.  It is sent unscaled; the receiver applies
              // the 1/sqrt(P) scaling itself.
              matGen.explicit_Q (nrowsRemote, ncols, Q_local.get(), Q_local.lda());

              // Send Q_local to the current receiving processor.
              scalarMessenger->send (Q_local.get(), nrowsRemote*ncols, recvProc, 0);
            }
        }
      else
        {
          // Receive the R factor from Proc 0.  There's only 1 R
          // factor for all the processes.
          Matrix< ordinal_type, scalar_type > R (ncols, ncols, scalar_type (0));
          scalarMessenger->broadcast (R.get(), ncols*ncols, rootProc);

          // Q_local (nrowsLocal by ncols, random with orthonormal
          // columns) will be received from Proc 0, where it was
          // generated.
          const ordinal_type recvSize = nrowsLocal * ncols;
          Matrix< ordinal_type, scalar_type > Q_local (nrowsLocal, ncols);

          // Tell Proc 0 how many rows there are in the random block
          // I want to receive from Proc 0.
          ordinalMessenger->send (&nrowsLocal, 1, rootProc, 0);

          // Receive the block from Proc 0.
          scalarMessenger->recv (Q_local.get(), recvSize, rootProc, 0);

          // Scale the local block by 1/sqrt(P), to make the columns
          // of the global matrix Q orthonormal.
          scaleMatrix (Q_local, scaleFactor);

          // A_local := Q_local * R
          blas.GEMM (NO_TRANS, NO_TRANS, nrowsLocal, ncols, ncols,
                     scalar_type(1), Q_local.get(), Q_local.lda(),
                     R.get(), R.lda(),
                     scalar_type(0), A_local.get(), A_local.lda());
        }
    }