template <typename T>
void DenseMatrix<T>::_matvec_blas(T alpha, T beta,
                                  DenseVector<T>& dest,
                                  const DenseVector<T>& arg,
                                  bool trans) const
{
  // Ensure that dest and arg sizes are compatible
  if (!trans)
    {
      // dest  ~ A     * arg
      // (mx1)   (mxn) * (nx1)
      if ((dest.size() != this->m()) || (arg.size() != this->n()))
        {
          libMesh::out << "Improper input argument sizes!" << std::endl;
          libmesh_error();
        }
    }

  else // trans == true
    {
      // Ensure that dest and arg are properly sized
      // dest  ~ A^T   * arg
      // (nx1)   (nxm) * (mx1)
      if ((dest.size() != this->n()) || (arg.size() != this->m()))
        {
          libMesh::out << "Improper input argument sizes!" << std::endl;
          libmesh_error();
        }
    }

  // Calling sequence for dgemv:
  //
  // dgemv(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)

  //   TRANS  - CHARACTER*1, 't' for transpose, 'n' for non-transpose multiply
  // We store everything in row-major order, so pass the transpose flag for
  // non-transposed matvecs and the 'n' flag for transposed matvecs
  char TRANS[] = "t";
  if (trans)
    TRANS[0] = 'n';

  //   M      - INTEGER.
  //            On entry, M specifies the number of rows of the matrix A.
  // In C/C++, pass the number of *cols* of A
  int M = this->n();

  //   N      - INTEGER.
  //            On entry, N specifies the number of columns of the matrix A.
  // In C/C++, pass the number of *rows* of A
  int N = this->m();
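
  // A small worked example of the row/column-major swap above (an
  // illustration, not part of the BLAS documentation): the 2x3 row-major
  // matrix
  //
  //   A = [ 1 2 3 ]   is stored as  {1, 2, 3, 4, 5, 6},
  //       [ 4 5 6 ]
  //
  // which is exactly what Fortran reads as the 3x2 column-major matrix A^T.
  // Requesting "A^T * x" from dgemv with M=3 (cols of A) and N=2 (rows of A)
  // therefore computes the row-major product A * x without copying any data.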

  //   ALPHA  - DOUBLE PRECISION.
  // The scalar constant passed to this function

  //   A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
  //            Before entry, the leading m by n part of the array A must
  //            contain the matrix of coefficients.
  // The matrix, *this.  Note that _matvec_blas is called from
  // a const function, vector_mult(), and so we have made this function const
  // as well.  Since BLAS knows nothing about const, we have to cast it away
  // now.
  DenseMatrix<T>& a_ref = const_cast< DenseMatrix<T>& > ( *this );
  std::vector<T>& a = a_ref.get_values();

  //   LDA    - INTEGER.
  //            On entry, LDA specifies the first dimension of A as declared
  //            in the calling (sub) program. LDA must be at least
  //            max( 1, m ).
  int LDA = M;

  //   X      - DOUBLE PRECISION array of DIMENSION at least
  //            ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'
  //            and at least
  //            ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
  //            Before entry, the incremented array X must contain the
  //            vector x.
  // Here, we must cast away the const-ness of "arg" since BLAS knows
  // nothing about const
  DenseVector<T>& x_ref = const_cast< DenseVector<T>& > ( arg );
  std::vector<T>& x = x_ref.get_values();

  //   INCX   - INTEGER.
  //            On entry, INCX specifies the increment for the elements of
  //            X. INCX must not be zero.
  int INCX = 1;

  //   BETA   - DOUBLE PRECISION.
  //            On entry, BETA specifies the scalar beta. When BETA is
  //            supplied as zero then Y need not be set on input.
  // The second scalar constant passed to this function

  //   Y      - DOUBLE PRECISION array of DIMENSION at least
  //            ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
  //            and at least
  //            ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
  //            Before entry with BETA non-zero, the incremented array Y
  //            must contain the vector y. On exit, Y is overwritten by the
  //            updated vector y.
  // The input vector "dest"
  std::vector<T>& y = dest.get_values();

  //   INCY   - INTEGER.
  //            On entry, INCY specifies the increment for the elements of
  //            Y. INCY must not be zero.
  int INCY = 1;

  // Finally, ready to call the BLAS function
  BLASgemv_(TRANS, &M, &N, &alpha, &(a[0]), &LDA, &(x[0]), &INCX, &beta, &(y[0]), &INCY);
}
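
// A minimal usage sketch (assuming the public vector_mult() wrapper
// mentioned above forwards to _matvec_blas when BLAS support is enabled;
// "Real" is the usual libMesh scalar typedef):
//
//   DenseMatrix<Real> A(3, 4);
//   DenseVector<Real> v(4), w(3);
//   // ... fill A and v ...
//   A.vector_mult(w, v);  // w = A * v, roughly _matvec_blas(1, 0, w, v, /*trans=*/false)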

template <typename T>
void DenseMatrix<T>::_evd_lapack (DenseVector<T> & lambda_real,
                                  DenseVector<T> & lambda_imag)
{
  // The calling sequence for dgeevx is:
  // DGEEVX( BALANC, JOBVL, JOBVR, SENSE, N, A, LDA, WR, WI, VL, LDVL, VR,
  //         LDVR, ILO, IHI, SCALE, ABNRM, RCONDE, RCONDV, WORK, LWORK, IWORK, INFO )


  //  BALANC  (input) CHARACTER*1
  //          Indicates how the input matrix should be diagonally scaled
  //          and/or permuted to improve the conditioning of its
  //          eigenvalues.
  //          = 'N': Do not diagonally scale or permute;
  char BALANC = 'N';

  //  JOBVL   (input) CHARACTER*1
  //          = 'N': left eigenvectors of A are not computed;
  //          = 'V': left eigenvectors of A are computed.
  char JOBVL = 'N';

  //  JOBVR   (input) CHARACTER*1
  //          = 'N': right eigenvectors of A are not computed;
  //          = 'V': right eigenvectors of A are computed.
  char JOBVR = 'N';

  //  SENSE   (input) CHARACTER*1
  //          Determines which reciprocal condition numbers are computed.
  //          = 'N': None are computed;
  //          = 'E': Computed for eigenvalues only;
  //          = 'V': Computed for right eigenvectors only;
  //          = 'B': Computed for eigenvalues and right eigenvectors.
  char SENSE = 'N';

  //    N       (input) int *
  //            The number of rows/cols of the matrix A.  N >= 0.
  libmesh_assert( this->m() == this->n() );
  int N = this->m();

  //  A       (input/output) DOUBLE PRECISION array, dimension (LDA,N)
  //          On entry, the N-by-N matrix A.
  //          On exit, A has been overwritten.
  // Here, we pass &(_val[0]).

  //    LDA     (input) int *
  //            The leading dimension of the array A.  LDA >= max(1,N).
  int LDA = N;

  //  WR      (output) DOUBLE PRECISION array, dimension (N)
  //  WI      (output) DOUBLE PRECISION array, dimension (N)
  //          WR and WI contain the real and imaginary parts,
  //          respectively, of the computed eigenvalues.  Complex
  //          conjugate pairs of eigenvalues appear consecutively
  //          with the eigenvalue having the positive imaginary part
  //          first.
  lambda_real.resize(N);
  lambda_imag.resize(N);

  //  VL      (output) DOUBLE PRECISION array, dimension (LDVL,N)
  //          If JOBVL = 'V', the left eigenvectors u(j) are stored one
  //          after another in the columns of VL, in the same order
  //          as their eigenvalues.
  //          If JOBVL = 'N', VL is not referenced.
  //          If the j-th eigenvalue is real, then u(j) = VL(:,j),
  //          the j-th column of VL.
  //          If the j-th and (j+1)-st eigenvalues form a complex
  //          conjugate pair, then u(j) = VL(:,j) + i*VL(:,j+1) and
  //          u(j+1) = VL(:,j) - i*VL(:,j+1).
  // Just set to NULL here.

  //  LDVL    (input) INTEGER
  //          The leading dimension of the array VL.  LDVL >= 1; if
  //          JOBVL = 'V', LDVL >= N.
  int LDVL = 1;

  //  VR      (output) DOUBLE PRECISION array, dimension (LDVR,N)
  //          If JOBVR = 'V', the right eigenvectors v(j) are stored one
  //          after another in the columns of VR, in the same order
  //          as their eigenvalues.
  //          If JOBVR = 'N', VR is not referenced.
  //          If the j-th eigenvalue is real, then v(j) = VR(:,j),
  //          the j-th column of VR.
  //          If the j-th and (j+1)-st eigenvalues form a complex
  //          conjugate pair, then v(j) = VR(:,j) + i*VR(:,j+1) and
  //          v(j+1) = VR(:,j) - i*VR(:,j+1).
  // Just set to NULL here.

  //  LDVR    (input) INTEGER
  //          The leading dimension of the array VR.  LDVR >= 1; if
  //          JOBVR = 'V', LDVR >= N.
  int LDVR = 1;

  // Outputs (unused)
  int ILO = 0;
  int IHI = 0;
  std::vector<T> SCALE(N);
  T ABNRM;
  std::vector<T> RCONDE(N);
  std::vector<T> RCONDV(N);

  //  WORK    (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
  //          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
  //
  //  LWORK   (input) INTEGER
  //          The dimension of the array WORK.
  int LWORK = 3*N;
  std::vector<T> WORK( LWORK );

  //  IWORK   (workspace) INTEGER array, dimension (2*N-2)
  //          If SENSE = 'N' or 'E', not referenced.
  // Just set to NULL


  //  INFO    (output) INTEGER
  //          = 0:  successful exit
  //          < 0:  if INFO = -i, the i-th argument had an illegal value.
  //          > 0:  if INFO = i, the QR algorithm failed to compute all the
  //                eigenvalues, and no eigenvectors or condition numbers
  //                have been computed; elements 1:ILO-1 and i+1:N of WR
  //                and WI contain eigenvalues which have converged.
  int INFO = 0;

  // Get references to raw data
  std::vector<T> & lambda_real_val = lambda_real.get_values();
  std::vector<T> & lambda_imag_val = lambda_imag.get_values();

  // Ready to call the actual eigenvalue routine through PETSc's BLAS/LAPACK interface
  LAPACKgeevx_( &BALANC, &JOBVL, &JOBVR, &SENSE, &N, &(_val[0]), &LDA, &lambda_real_val[0],
                &lambda_imag_val[0], libmesh_nullptr, &LDVL, libmesh_nullptr, &LDVR, &ILO, &IHI, &SCALE[0], &ABNRM,
                &RCONDE[0], &RCONDV[0], &WORK[0], &LWORK, libmesh_nullptr, &INFO );

  // Check return value for errors
  if (INFO != 0)
    libmesh_error_msg("INFO=" << INFO << ", Error during Lapack eigenvalue calculation!");
}
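
// A minimal usage sketch (assuming a public evd()-style wrapper dispatches
// to _evd_lapack; the wrapper name follows the DenseMatrix convention and
// is an assumption here, not defined above):
//
//   DenseMatrix<Real> A(5, 5);
//   DenseVector<Real> lr, li;
//   // ... fill A ...
//   A.evd(lr, li);  // (lr(i), li(i)) are the eigenvalues of A; note that
//                   // A's entries are overwritten, as documented for dgeevx.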

template <typename T>
void DenseMatrix<T>::_lu_back_substitute_lapack (const DenseVector<T>& b,
                                                 DenseVector<T>& x)
{
  // The calling sequence for getrs is:
  // dgetrs(TRANS, N, NRHS, A, LDA, IPIV, B, LDB, INFO)

  //    TRANS   (input) char*
  //            'n' for no transpose, 't' for transpose
  // We pass 't' here because the matrix was stored (and factored) in
  // row-major order, i.e. Lapack factored A^T, so we ask getrs for the
  // transposed solve to recover the solution of A*x = b.
  char TRANS[] = "t";

  //    N       (input) int*
  //            The order of the matrix A.  N >= 0.
  int N = this->m();


  //    NRHS    (input) int*
  //            The number of right hand sides, i.e., the number of columns
  //            of the matrix B.  NRHS >= 0.
  int NRHS = 1;

  //    A       (input) DOUBLE PRECISION array, dimension (LDA,N)
  //            The factors L and U from the factorization A = P*L*U
  //            as computed by dgetrf.
  // Here, we pass &(_val[0])

  //    LDA     (input) int*
  //            The leading dimension of the array A.  LDA >= max(1,N).
  int LDA = N;

  //    ipiv    (input) int array, dimension (N)
  //            The pivot indices from DGETRF; for 1<=i<=N, row i of the
  //            matrix was interchanged with row IPIV(i).
  // Here, we pass &(_pivots[0]) which was computed in _lu_decompose_lapack

  //    B       (input/output) DOUBLE PRECISION array, dimension (LDB,NRHS)
  //            On entry, the right hand side matrix B.
  //            On exit, the solution matrix X.
  // Here, we pass a copy of the rhs vector's data array in x, so that the
  // passed right-hand side b is unmodified.  I don't see a way around this
  // copy if we want to maintain an unmodified rhs in LibMesh.
  x = b;
  std::vector<T>& x_vec = x.get_values();

  // We can avoid the copy if we don't care about overwriting the RHS: just
  // pass b to the Lapack routine and then swap with x before exiting
  // std::vector<T>& x_vec = b.get_values();

  //    LDB     (input) int*
  //            The leading dimension of the array B.  LDB >= max(1,N).
  int LDB = N;

  //    INFO    (output) int*
  //            = 0:  successful exit
  //            < 0:  if INFO = -i, the i-th argument had an illegal value
  int INFO = 0;

  // Finally, ready to call the Lapack getrs function
  LAPACKgetrs_(TRANS, &N, &NRHS, &(_val[0]), &LDA, &(_pivots[0]), &(x_vec[0]), &LDB, &INFO);

  // Check return value for errors
  if (INFO != 0)
    {
      libMesh::out << "INFO="
                   << INFO
                   << ", Error during Lapack LU solve!" << std::endl;
      libmesh_error();
    }

  // If we had passed b's storage directly to Lapack above (i.e. skipped the
  // copy), we would instead swap b and x here so the solution ends up in x
  // and whatever was originally in x, maybe garbage, maybe nothing, ends up
  // in b:
  // b.swap(x);
  //
  // FIXME: Rewrite the LU and Cholesky solves to just take one input, and overwrite
  // the input.  This *should* make user code simpler, as they don't have to create
  // an extra vector just to pass it in to the solve function!
}
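
// A minimal usage sketch (assuming a public lu_solve()-style wrapper first
// calls _lu_decompose_lapack() and then this routine; the wrapper name
// follows the DenseMatrix convention and is an assumption here):
//
//   DenseMatrix<Real> A(4, 4);
//   DenseVector<Real> b(4), x;
//   // ... fill A and b ...
//   A.lu_solve(b, x);  // x holds the solution of A*x = b; A now holds its LU factors.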

template <typename T>
void DenseMatrix<T>::_svd_solve_lapack(const DenseVector<T> & rhs,
                                       DenseVector<T> & x,
                                       Real rcond) const
{
  // Since BLAS is expecting column-major storage, we first need to
  // make a transposed copy of *this, then pass it to the gelss
  // routine instead of the original.  This extra copy is kind of a
  // bummer, it might be better if we could use the full SVD to
  // compute the least-squares solution instead...  Note that it isn't
  // completely terrible either, since A_trans gets overwritten by
  // Lapack, and we usually would end up making a copy of A outside
  // the function call anyway.
  DenseMatrix<T> A_trans;
  this->get_transpose(A_trans);

  // M is INTEGER
  // The number of rows of the input matrix. M >= 0.
  // This is actually the number of *columns* of A_trans.
  int M = A_trans.n();

  // N is INTEGER
  // The number of columns of the matrix A. N >= 0.
  // This is actually the number of *rows* of A_trans.
  int N = A_trans.m();

  // We'll use the min and max of (M,N) several times below.
  int max_MN = std::max(M,N);
  int min_MN = std::min(M,N);

  // NRHS is INTEGER
  // The number of right hand sides, i.e., the number of columns
  // of the matrices B and X. NRHS >= 0.
  // This could later be generalized to solve for multiple right-hand
  // sides...
  int NRHS = 1;

  // A is DOUBLE PRECISION array, dimension (LDA,N)
  // On entry, the M-by-N matrix A.
  // On exit, the first min(m,n) rows of A are overwritten with
  // its right singular vectors, stored rowwise.
  //
  // The data vector that will be passed to Lapack.
  std::vector<T> & A_trans_vals = A_trans.get_values();

  // LDA is INTEGER
  // The leading dimension of the array A.  LDA >= max(1,M).
  int LDA = M;

  // B is DOUBLE PRECISION array, dimension (LDB,NRHS)
  // On entry, the M-by-NRHS right hand side matrix B.
  // On exit, B is overwritten by the N-by-NRHS solution
  // matrix X.  If m >= n and RANK = n, the residual
  // sum-of-squares for the solution in the i-th column is given
  // by the sum of squares of elements n+1:m in that column.
  //
  // Since we don't want the user's rhs vector to be overwritten by
  // the solution, we copy the rhs values into the solution vector "x"
  // now.  x needs to be long enough to hold both the (Nx1) solution
  // vector or the (Mx1) rhs, so size it to the max of those.
  x.resize(max_MN);
  for (unsigned i=0; i<rhs.size(); ++i)
    x(i) = rhs(i);

  // Make the syntax below simpler by grabbing a reference to this array.
  std::vector<T> & B = x.get_values();

  // LDB is INTEGER
  // The leading dimension of the array B. LDB >= max(1,max(M,N)).
  int LDB = x.size();

  // S is DOUBLE PRECISION array, dimension (min(M,N))
  // The singular values of A in decreasing order.
  // The condition number of A in the 2-norm = S(1)/S(min(m,n)).
  std::vector<T> S(min_MN);

  // RCOND is DOUBLE PRECISION
  // RCOND is used to determine the effective rank of A.
  // Singular values S(i) <= RCOND*S(1) are treated as zero.
  // If RCOND < 0, machine precision is used instead.
  Real RCOND = rcond;

  // RANK is INTEGER
  // The effective rank of A, i.e., the number of singular values
  // which are greater than RCOND*S(1).
  int RANK = 0;

  // LWORK is INTEGER
  // The dimension of the array WORK. LWORK >= 1, and also:
  // LWORK >= 3*min(M,N) + max( 2*min(M,N), max(M,N), NRHS )
  // For good performance, LWORK should generally be larger.
  //
  // If LWORK = -1, then a workspace query is assumed; the routine
  // only calculates the optimal size of the WORK array, returns
  // this value as the first entry of the WORK array, and no error
  // message related to LWORK is issued by XERBLA.
  //
  // The factor of 1.5 is arbitrary and is used to satisfy the "should
  // generally be larger" clause.
  int LWORK = 1.5 * (3*min_MN + std::max(2*min_MN, std::max(max_MN, NRHS)));
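
  // If desired, the "should generally be larger" clause could instead be
  // handled with a workspace query (a sketch only; this routine does not
  // currently do this):
  //
  //   int LWORK_query = -1;
  //   T   work_size;
  //   LAPACKgelss_(&M, &N, &NRHS, &A_trans_vals[0], &LDA, &B[0], &LDB,
  //                &S[0], &RCOND, &RANK, &work_size, &LWORK_query, &INFO);
  //   int LWORK = static_cast<int>(work_size);  // optimal size returned in WORK(1)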

  // WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
  // On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
  std::vector<T> WORK(LWORK);

  // INFO is INTEGER
  // = 0:  successful exit
  // < 0:  if INFO = -i, the i-th argument had an illegal value.
  // > 0:  the algorithm for computing the SVD failed to converge;
  //       if INFO = i, i off-diagonal elements of an intermediate
  //       bidiagonal form did not converge to zero.
  int INFO = 0;

  // LAPACKgelss_(const PetscBLASInt *, // M
  //              const PetscBLASInt *, // N
  //              const PetscBLASInt *, // NRHS
  //              PetscScalar *,        // A
  //              const PetscBLASInt *, // LDA
  //              PetscScalar *,        // B
  //              const PetscBLASInt *, // LDB
  //              PetscReal *,          // S(out) = singular values of A in decreasing order
  //              const PetscReal *,    // RCOND = tolerance for singular values
  //              PetscBLASInt *,       // RANK(out) = number of "non-zero" singular values
  //              PetscScalar *,        // WORK
  //              const PetscBLASInt *, // LWORK
  //              PetscBLASInt *);      // INFO
  LAPACKgelss_(&M, &N, &NRHS, &A_trans_vals[0], &LDA, &B[0], &LDB, &S[0], &RCOND, &RANK, &WORK[0], &LWORK, &INFO);

  // Check for errors in the Lapack call
  if (INFO < 0)
    libmesh_error_msg("Error, argument " << -INFO << " to LAPACKgelss_ had an illegal value.");
  if (INFO > 0)
    libmesh_error_msg("The algorithm for computing the SVD failed to converge!");

  // Debugging: print singular values and information about condition number:
  // libMesh::err << "RCOND=" << RCOND << std::endl;
  // libMesh::err << "Singular values: " << std::endl;
  // for (unsigned i=0; i<S.size(); ++i)
  //   libMesh::err << S[i] << std::endl;
  // libMesh::err << "The condition number of A is approximately: " << S[0]/S.back() << std::endl;

  // Lapack has already written the solution into B, but it will be
  // the wrong size for non-square problems, so we need to resize it
  // correctly.  The size of the solution vector should be the number
  // of columns of the original A matrix.  Unfortunately, resizing a
  // DenseVector currently also zeros it out (unlike a std::vector) so
  // we'll resize the underlying storage directly (the size is not
  // stored independently elsewhere).
  x.get_values().resize(this->n());
}
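
// A minimal usage sketch (assuming a public svd_solve()-style wrapper calls
// this routine; the wrapper name follows the DenseMatrix convention and is
// an assumption here).  A may be square or rectangular; the result is the
// minimum-norm least-squares solution:
//
//   DenseMatrix<Real> A(6, 3);
//   DenseVector<Real> b(6), x;
//   // ... fill A and b ...
//   A.svd_solve(b, x);  // x (size 3) minimizes || A*x - b ||_2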