/// \brief Reveal the rank of the R factor, and update Q if R is rank deficient.
///
/// Calls reveal_R_rank() on R.  If the returned rank is less than the
/// number of columns of Q, then reveal_R_rank() has already computed
/// the SVD \f$R = U \Sigma V^*\f$ of the input R and overwritten R
/// with \f$\Sigma V^*\f$; this method then computes \f$Q := Q \cdot
/// U\f$ via Q_times_B().
///
/// \param Q [in/out] Multivector holding the Q factor.  Only modified
///   if the computed rank is less than its number of columns.
/// \param R [in/out] The R factor.  It is handed to reveal_R_rank()
///   as an ncols by ncols matrix, where ncols is the number of
///   columns of Q; no dimension check is performed here.
/// \param tol [in] Tolerance forwarded unchanged to reveal_R_rank().
/// \param contiguousCacheBlocks [in] Forwarded to Q_times_B(), which
///   uses it to respect the cache block layout of Q.
///
/// \return The rank reported by reveal_R_rank(), or 0 if Q has no
///   columns.
LocalOrdinal
revealRank (Kokkos::MultiVector<Scalar, NodeType>& Q,
            Teuchos::SerialDenseMatrix<LocalOrdinal, Scalar>& R,
            const magnitude_type& tol,
            const bool contiguousCacheBlocks = false) const
{
  const LocalOrdinal nrows = static_cast<LocalOrdinal> (Q.getNumRows());
  const LocalOrdinal ncols = static_cast<LocalOrdinal> (Q.getNumCols());
  const LocalOrdinal ldq = static_cast<LocalOrdinal> (Q.getStride());

  // Take the easy exit if available.  Do this before asking Q for a
  // nonconst view of its data, just as factorExplicit() does.
  if (ncols == 0)
    return 0;

  Teuchos::ArrayRCP<Scalar> Q_ptr = Q.getValuesNonConst();

  //
  // FIXME (mfh 16 Jul 2010) We _should_ compute the SVD of R (as
  // the copy B) on Proc 0 only.  This would ensure that all
  // processors get the same SVD and rank (esp. in a heterogeneous
  // computing environment).  For now, we just do this computation
  // redundantly, and hope that all the returned rank values are
  // the same.
  //
  matrix_type U (ncols, ncols, STS::zero());
  const ordinal_type rank =
    reveal_R_rank (ncols, R.values(), R.stride(),
                   U.get(), U.lda(), tol);

  if (rank < ncols)
    {
      // If R is not full rank: reveal_R_rank() already computed
      // the SVD \f$R = U \Sigma V^*\f$ of (the input) R, and
      // overwrote R with \f$\Sigma V^*\f$.  Now, we compute \f$Q
      // := Q \cdot U\f$, respecting cache blocks of Q.
      Q_times_B (nrows, ncols, Q_ptr.getRawPtr(), ldq,
                 U.get(), U.lda(), contiguousCacheBlocks);
    }
  return rank;
}
void factorExplicit (Kokkos::MultiVector<Scalar, NodeType>& A, Kokkos::MultiVector<Scalar, NodeType>& Q, Teuchos::SerialDenseMatrix<LocalOrdinal, Scalar>& R, const bool contiguousCacheBlocks, const bool forceNonnegativeDiagonal=false) { using Teuchos::asSafe; typedef Kokkos::MultiVector<Scalar, NodeType> KMV; // Tsqr currently likes LocalOrdinal ordinals, but // Kokkos::MultiVector has size_t ordinals. Do conversions // here. // // Teuchos::asSafe() can do safe conversion (e.g., checking for // overflow when casting to a narrower integer type), if a // custom specialization is defined for // Teuchos::ValueTypeConversionTraits<size_t, LocalOrdinal>. // Otherwise, this has the same (potentially) unsafe effect as // static_cast<LocalOrdinal>(...) would have. const LocalOrdinal A_numRows = asSafe<LocalOrdinal> (A.getNumRows()); const LocalOrdinal A_numCols = asSafe<LocalOrdinal> (A.getNumCols()); const LocalOrdinal A_stride = asSafe<LocalOrdinal> (A.getStride()); const LocalOrdinal Q_numRows = asSafe<LocalOrdinal> (Q.getNumRows()); const LocalOrdinal Q_numCols = asSafe<LocalOrdinal> (Q.getNumCols()); const LocalOrdinal Q_stride = asSafe<LocalOrdinal> (Q.getStride()); // Sanity checks for matrix dimensions if (A_numRows < A_numCols) { std::ostringstream os; os << "In Tsqr::factorExplicit: input matrix A has " << A_numRows << " local rows, and " << A_numCols << " columns. The input " "matrix must have at least as many rows on each processor as " "there are columns."; throw std::invalid_argument(os.str()); } else if (A_numRows != Q_numRows) { std::ostringstream os; os << "In Tsqr::factorExplicit: input matrix A and output matrix Q " "must have the same number of rows. A has " << A_numRows << " rows" " and Q has " << Q_numRows << " rows."; throw std::invalid_argument(os.str()); } else if (R.numRows() < R.numCols()) { std::ostringstream os; os << "In Tsqr::factorExplicit: output matrix R must have at least " "as many rows as columns. 
R has " << R.numRows() << " rows and " << R.numCols() << " columns."; throw std::invalid_argument(os.str()); } else if (A_numCols != R.numCols()) { std::ostringstream os; os << "In Tsqr::factorExplicit: input matrix A and output matrix R " "must have the same number of columns. A has " << A_numCols << " columns and R has " << R.numCols() << " columns."; throw std::invalid_argument(os.str()); } // Check for quick exit, based on matrix dimensions if (Q_numCols == 0) return; // Hold on to nonconst views of A and Q. This will make TSQR // correct (if perhaps inefficient) for all possible Kokkos Node // types, even GPU nodes. Teuchos::ArrayRCP<scalar_type> A_ptr = A.getValuesNonConst(); Teuchos::ArrayRCP<scalar_type> Q_ptr = Q.getValuesNonConst(); R.putScalar (STS::zero()); NodeOutput nodeResults = nodeTsqr_->factor (A_numRows, A_numCols, A_ptr.getRawPtr(), A_stride, R.values(), R.stride(), contiguousCacheBlocks); // FIXME (mfh 19 Oct 2010) Replace actions on raw pointer with // actions on the Kokkos::MultiVector or at least the ArrayRCP. nodeTsqr_->fill_with_zeros (Q_numRows, Q_numCols, Q_ptr.getRawPtr(), Q_stride, contiguousCacheBlocks); matview_type Q_rawView (Q_numRows, Q_numCols, Q_ptr.getRawPtr(), Q_stride); matview_type Q_top_block = nodeTsqr_->top_block (Q_rawView, contiguousCacheBlocks); if (Q_top_block.nrows() < R.numCols()) { std::ostringstream os; os << "The top block of Q has too few rows. This means that the " << "the intranode TSQR implementation has a bug in its top_block" << "() method. 
The top block should have at least " << R.numCols() << " rows, but instead has only " << Q_top_block.ncols() << " rows."; throw std::logic_error (os.str()); } { matview_type Q_top (R.numCols(), Q_numCols, Q_top_block.get(), Q_top_block.lda()); matview_type R_view (R.numRows(), R.numCols(), R.values(), R.stride()); distTsqr_->factorExplicit (R_view, Q_top, forceNonnegativeDiagonal); } nodeTsqr_->apply (ApplyType::NoTranspose, A_numRows, A_numCols, A_ptr.getRawPtr(), A_stride, nodeResults, Q_numCols, Q_ptr.getRawPtr(), Q_stride, contiguousCacheBlocks); // If necessary, force the R factor to have a nonnegative diagonal. if (forceNonnegativeDiagonal && ! QR_produces_R_factor_with_nonnegative_diagonal()) { details::NonnegDiagForcer<LocalOrdinal, Scalar, STS::isComplex> forcer; matview_type Q_mine (Q_numRows, Q_numCols, Q_ptr.getRawPtr(), Q_stride); matview_type R_mine (R.numRows(), R.numCols(), R.values(), R.stride()); forcer.force (Q_mine, R_mine); } // "Commit" the changes to the multivector. A_ptr = Teuchos::null; Q_ptr = Teuchos::null; }