void DenseMatrix<T>::_multiply_blas(const DenseMatrixBase<T>& other, _BLAS_Multiply_Flag flag) { int result_size = 0; // For each case, determine the size of the final result make sure // that the inner dimensions match switch (flag) { case LEFT_MULTIPLY: { result_size = other.m() * this->n(); if (other.n() == this->m()) break; } case RIGHT_MULTIPLY: { result_size = other.n() * this->m(); if (other.m() == this->n()) break; } case LEFT_MULTIPLY_TRANSPOSE: { result_size = other.n() * this->n(); if (other.m() == this->m()) break; } case RIGHT_MULTIPLY_TRANSPOSE: { result_size = other.m() * this->m(); if (other.n() == this->n()) break; } default: { libMesh::out << "Unknown flag selected or matrices are "; libMesh::out << "incompatible for multiplication." << std::endl; libmesh_error(); } } // For this to work, the passed arg. must actually be a DenseMatrix<T> const DenseMatrix<T>* const_that = libmesh_cast_ptr< const DenseMatrix<T>* >(&other); // Also, although 'that' is logically const in this BLAS routine, // the PETSc BLAS interface does not specify that any of the inputs are // const. To use it, I must cast away const-ness. DenseMatrix<T>* that = const_cast< DenseMatrix<T>* > (const_that); // Initialize A, B pointers for LEFT_MULTIPLY* cases DenseMatrix<T> *A = this, *B = that; // For RIGHT_MULTIPLY* cases, swap the meaning of A and B. // Here is a full table of combinations we can pass to BLASgemm, and what the answer is when finished: // pass A B -> (Fortran) -> A^T B^T -> (C++) -> (A^T B^T)^T -> (identity) -> B A "lt multiply" // pass B A -> (Fortran) -> B^T A^T -> (C++) -> (B^T A^T)^T -> (identity) -> A B "rt multiply" // pass A B^T -> (Fortran) -> A^T B -> (C++) -> (A^T B)^T -> (identity) -> B^T A "lt multiply t" // pass B^T A -> (Fortran) -> B A^T -> (C++) -> (B A^T)^T -> (identity) -> A B^T "rt multiply t" if (flag==RIGHT_MULTIPLY || flag==RIGHT_MULTIPLY_TRANSPOSE) std::swap(A,B); // transa, transb values to pass to blas char transa[] = "n", transb[] = "n"; // Integer values to pass to BLAS: // // M // In Fortran, the number of rows of op(A), // In the BLAS documentation, typically known as 'M'. // // In C/C++, we set: // M = n_cols(A) if (transa='n') // n_rows(A) if (transa='t') int M = static_cast<int>( A->n() ); // N // In Fortran, the number of cols of op(B), and also the number of cols of C. // In the BLAS documentation, typically known as 'N'. // // In C/C++, we set: // N = n_rows(B) if (transb='n') // n_cols(B) if (transb='t') int N = static_cast<int>( B->m() ); // K // In Fortran, the number of cols of op(A), and also // the number of rows of op(B). In the BLAS documentation, // typically known as 'K'. // // In C/C++, we set: // K = n_rows(A) if (transa='n') // n_cols(A) if (transa='t') int K = static_cast<int>( A->m() ); // LDA (leading dimension of A). In our cases, // LDA is always the number of columns of A. int LDA = static_cast<int>( A->n() ); // LDB (leading dimension of B). In our cases, // LDB is always the number of columns of B. int LDB = static_cast<int>( B->n() ); if (flag == LEFT_MULTIPLY_TRANSPOSE) { transb[0] = 't'; N = static_cast<int>( B->n() ); } else if (flag == RIGHT_MULTIPLY_TRANSPOSE) { transa[0] = 't'; std::swap(M,K); } // LDC (leading dimension of C). LDC is the // number of columns in the solution matrix. int LDC = M; // Scalar values to pass to BLAS // // scalar multiplying the whole product AB T alpha = 1.; // scalar multiplying C, which is the original matrix. T beta = 0.; // Storage for the result std::vector<T> result (result_size); // Finally ready to call the BLAS BLASgemm_(transa, transb, &M, &N, &K, &alpha, &(A->_val[0]), &LDA, &(B->_val[0]), &LDB, &beta, &result[0], &LDC); // Update the relevant dimension for this matrix. switch (flag) { case LEFT_MULTIPLY: { this->_m = other.m(); break; } case RIGHT_MULTIPLY: { this->_n = other.n(); break; } case LEFT_MULTIPLY_TRANSPOSE: { this->_m = other.n(); break; } case RIGHT_MULTIPLY_TRANSPOSE: { this->_n = other.m(); break; } default: { libMesh::out << "Unknown flag selected." << std::endl; libmesh_error(); } } // Swap my data vector with the result this->_val.swap(result); }
PetscErrorCode BDC_dlaed3m_(const char *jobz,const char *defl,PetscBLASInt k,PetscBLASInt n, PetscBLASInt n1,PetscReal *d,PetscReal *q,PetscBLASInt ldq, PetscReal rho,PetscReal *dlamda,PetscReal *q2,PetscBLASInt *indx, PetscBLASInt *ctot,PetscReal *w,PetscReal *s,PetscBLASInt *info, PetscBLASInt jobz_len,PetscBLASInt defl_len) { /* -- Routine written in LAPACK version 3.0 style -- */ /* *************************************************** */ /* Written by */ /* Michael Moldaschl and Wilfried Gansterer */ /* University of Vienna */ /* last modification: March 16, 2014 */ /* Small adaptations of original code written by */ /* Wilfried Gansterer and Bob Ward, */ /* Department of Computer Science, University of Tennessee */ /* see http://dx.doi.org/10.1137/S1064827501399432 */ /* *************************************************** */ /* Purpose */ /* ======= */ /* DLAED3M finds the roots of the secular equation, as defined by the */ /* values in D, W, and RHO, between 1 and K. It makes the */ /* appropriate calls to DLAED4 and then updates the eigenvectors by */ /* multiplying the matrix of eigenvectors of the pair of eigensystems */ /* being combined by the matrix of eigenvectors of the K-by-K system */ /* which is solved here. */ /* This code makes very mild assumptions about floating point */ /* arithmetic. It will work on machines with a guard digit in */ /* add/subtract, or on those binary machines without guard digits */ /* which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. */ /* It could conceivably fail on hexadecimal or decimal machines */ /* without guard digits, but we know of none. */ /* Arguments */ /* ========= */ /* JOBZ (input) CHARACTER*1 */ /* = 'N': Do not accumulate eigenvectors (not implemented); */ /* = 'D': Do accumulate eigenvectors in the divide-and-conquer */ /* process. */ /* DEFL (input) CHARACTER*1 */ /* = '0': No deflation happened in DSRTDF */ /* = '1': Some deflation happened in DSRTDF (and therefore some */ /* Givens rotations need to be applied to the computed */ /* eigenvector matrix Q) */ /* K (input) INTEGER */ /* The number of terms in the rational function to be solved by */ /* DLAED4. 0 <= K <= N. */ /* N (input) INTEGER */ /* The number of rows and columns in the Q matrix. */ /* N >= K (deflation may result in N>K). */ /* N1 (input) INTEGER */ /* The location of the last eigenvalue in the leading submatrix. */ /* min(1,N) <= N1 <= max(1,N-1). */ /* D (output) DOUBLE PRECISION array, dimension (N) */ /* D(I) contains the updated eigenvalues for */ /* 1 <= I <= K. */ /* Q (output) DOUBLE PRECISION array, dimension (LDQ,N) */ /* Initially the first K columns are used as workspace. */ /* On output the columns 1 to K contain */ /* the updated eigenvectors. */ /* LDQ (input) INTEGER */ /* The leading dimension of the array Q. LDQ >= max(1,N). */ /* RHO (input) DOUBLE PRECISION */ /* The value of the parameter in the rank one update equation. */ /* RHO >= 0 required. */ /* DLAMDA (input/output) DOUBLE PRECISION array, dimension (K) */ /* The first K elements of this array contain the old roots */ /* of the deflated updating problem. These are the poles */ /* of the secular equation. May be changed on output by */ /* having lowest order bit set to zero on Cray X-MP, Cray Y-MP, */ /* Cray-2, or Cray C-90, as described above. */ /* Q2 (input) DOUBLE PRECISION array, dimension (LDQ2, N) */ /* The first K columns of this matrix contain the non-deflated */ /* eigenvectors for the split problem. */ /* INDX (input) INTEGER array, dimension (N) */ /* The permutation used to arrange the columns of the deflated */ /* Q matrix into three groups (see DLAED2). */ /* The rows of the eigenvectors found by DLAED4 must be likewise */ /* permuted before the matrix multiply can take place. */ /* CTOT (input) INTEGER array, dimension (4) */ /* A count of the total number of the various types of columns */ /* in Q, as described in INDX. The fourth column type is any */ /* column which has been deflated. */ /* W (input/output) DOUBLE PRECISION array, dimension (K) */ /* The first K elements of this array contain the components */ /* of the deflation-adjusted updating vector. Destroyed on */ /* output. */ /* S (workspace) DOUBLE PRECISION array, dimension */ /* ( MAX(CTOT(1)+CTOT(2),CTOT(2)+CTOT(3)) + 1 )*K */ /* Will contain parts of the eigenvectors of the repaired matrix */ /* which will be multiplied by the previously accumulated */ /* eigenvectors to update the system. This array is a major */ /* source of workspace requirements ! */ /* INFO (output) INTEGER */ /* = 0: successful exit. */ /* < 0: if INFO = -i, the i-th argument had an illegal value. */ /* > 0: if INFO = i, eigenpair i was not computed successfully */ /* Further Details */ /* =============== */ /* Based on code written by */ /* Wilfried Gansterer and Bob Ward, */ /* Department of Computer Science, University of Tennessee */ /* Based on the design of the LAPACK code DLAED3 with small modifications */ /* (Note that in contrast to the original DLAED3, this routine */ /* DOES NOT require that N1 <= N/2) */ /* Based on contributions by */ /* Jeff Rutter, Computer Science Division, University of California */ /* at Berkeley, USA */ /* Modified by Francoise Tisseur, University of Tennessee. */ /* ===================================================================== */ #if defined(SLEPC_MISSING_LAPACK_LAED4) || defined(SLEPC_MISSING_LAPACK_LACPY) || defined(SLEPC_MISSING_LAPACK_LASET) PetscFunctionBegin; SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"LAED4/LACPY/LASET - Lapack routine is unavailable"); #else PetscReal temp, done = 1.0, dzero = 0.0; PetscBLASInt i, j, n2, n12, ii, n23, iq2, i1, one=1; PetscFunctionBegin; *info = 0; if (k < 0) { *info = -3; } else if (n < k) { *info = -4; } else if (n1 < PetscMin(1,n) || n1 > PetscMax(1,n)) { *info = -5; } else if (ldq < PetscMax(1,n)) { *info = -8; } else if (rho < 0.) { *info = -9; } if (*info) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong argument %d in DLAED3M",-(*info)); /* Quick return if possible */ if (k == 0) PetscFunctionReturn(0); /* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can */ /* be computed with high relative accuracy (barring over/underflow). */ /* This is a problem on machines without a guard digit in */ /* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). */ /* The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), */ /* which on any of these machines zeros out the bottommost */ /* bit of DLAMDA(I) if it is 1; this makes the subsequent */ /* subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation */ /* occurs. On binary machines with a guard digit (almost all */ /* machines) it does not change DLAMDA(I) at all. On hexadecimal */ /* and decimal machines with a guard digit, it slightly */ /* changes the bottommost bits of DLAMDA(I). It does not account */ /* for hexadecimal or decimal machines without guard digits */ /* (we know of none). We use a subroutine call to compute */ /* 2*DLAMBDA(I) to prevent optimizing compilers from eliminating */ /* this code. */ for (i = 0; i < k; ++i) { dlamda[i] = LAPACKlamc3_(&dlamda[i], &dlamda[i]) - dlamda[i]; } for (j = 1; j <= k; ++j) { /* ....calling DLAED4 for eigenpair J.... */ PetscStackCallBLAS("LAPACKlaed4",LAPACKlaed4_(&k, &j, dlamda, w, &q[(j-1)*ldq], &rho, &d[j-1], info)); if (*info) SETERRQ3(PETSC_COMM_SELF,1,"Error in dlaed4, info = %d, failed when computing D(%d)=%g",*info,j,d[j-1]); if (j < k) { /* If the zero finder terminated properly, but the computed */ /* eigenvalues are not ordered, issue an error statement */ /* but continue computation. */ if (dlamda[j-1] >= dlamda[j]) SETERRQ2(PETSC_COMM_SELF,1,"DLAMDA(%d) is greater or equal than DLAMDA(%d)", j, j+1); if (d[j-1] < dlamda[j-1] || d[j-1] > dlamda[j]) SETERRQ6(PETSC_COMM_SELF,1,"DLAMDA(%d) = %g D(%d) = %g DLAMDA(%d) = %g", j, dlamda[j-1], j, d[j-1], j+1, dlamda[j]); } } if (k == 1) goto L110; if (k == 2) { /* permute the components of Q(:,J) (the information returned by DLAED4 */ /* necessary to construct the eigenvectors) according to the permutation */ /* stored in INDX, resulting from deflation */ for (j = 0; j < k; ++j) { w[0] = q[0+j*ldq]; w[1] = q[1+j*ldq]; ii = indx[0]; q[0+j*ldq] = w[ii-1]; ii = indx[1]; q[1+j*ldq] = w[ii-1]; } goto L110; } /* ....K.GE.3.... */ /* Compute updated W (used for computing the eigenvectors corresponding */ /* to the previously computed eigenvalues). */ PetscStackCallBLAS("BLAScopy",BLAScopy_(&k, w, &one, s, &one)); /* Initialize W(I) = Q(I,I) */ i1 = ldq + 1; PetscStackCallBLAS("BLAScopy",BLAScopy_(&k, q, &i1, w, &one)); for (j = 0; j < k; ++j) { for (i = 0; i < j; ++i) { w[i] *= q[i+j*ldq] / (dlamda[i] - dlamda[j]); } for (i = j + 1; i < k; ++i) { w[i] *= q[i+j*ldq] / (dlamda[i] - dlamda[j]); } } for (i = 0; i < k; ++i) { temp = PetscSqrtReal(-w[i]); if (temp<0) temp = -temp; w[i] = (s[i] >= 0) ? temp : -temp; } /* Compute eigenvectors of the modified rank-1 modification (using the */ /* vector W). */ for (j = 0; j < k; ++j) { for (i = 0; i < k; ++i) { s[i] = w[i] / q[i+j*ldq]; } temp = BLASnrm2_(&k, s, &one); for (i = 0; i < k; ++i) { /* apply the permutation resulting from deflation as stored */ /* in INDX */ ii = indx[i]; q[i+j*ldq] = s[ii-1] / temp; } } /* ************************************************************************** */ /* ....updating the eigenvectors.... */ L110: n2 = n - n1; n12 = ctot[0] + ctot[1]; n23 = ctot[1] + ctot[2]; if (*(unsigned char *)jobz == 'D') { /* Compute the updated eigenvectors. (NOTE that every call of */ /* DGEMM requires three DISTINCT arrays) */ /* copy Q( CTOT(1)+1:K,1:K ) to S */ PetscStackCallBLAS("LAPACKlacpy",LAPACKlacpy_("A", &n23, &k, &q[ctot[0]], &ldq, s, &n23)); iq2 = n1 * n12 + 1; if (n23 != 0) { /* multiply the second part of Q2 (the eigenvectors of the */ /* lower block) with S and write the result into the lower part of */ /* Q, i.e., Q( N1+1:N,1:K ) */ PetscStackCallBLAS("BLASgemm",BLASgemm_("N", "N", &n2, &k, &n23, &done, &q2[iq2-1], &n2, s, &n23, &dzero, &q[n1], &ldq)); } else { PetscStackCallBLAS("LAPACKlaset",LAPACKlaset_("A", &n2, &k, &dzero, &dzero, &q[n1], &ldq)); } /* copy Q( 1:CTOT(1)+CTOT(2),1:K ) to S */ PetscStackCallBLAS("LAPACKlacpy",LAPACKlacpy_("A", &n12, &k, q, &ldq, s, &n12)); if (n12 != 0) { /* multiply the first part of Q2 (the eigenvectors of the */ /* upper block) with S and write the result into the upper part of */ /* Q, i.e., Q( 1:N1,1:K ) */ PetscStackCallBLAS("BLASgemm",BLASgemm_("N", "N", &n1, &k, &n12, &done, q2, &n1, s, &n12, &dzero, q, &ldq)); } else { PetscStackCallBLAS("LAPACKlaset",LAPACKlaset_("A", &n1, &k, &dzero, &dzero, q, &ldq)); } } PetscFunctionReturn(0); #endif }
PetscErrorCode DSFunction_EXP_NHEP_PADE(DS ds) { #if defined(PETSC_MISSING_LAPACK_GESV) || defined(SLEPC_MISSING_LAPACK_LANGE) PetscFunctionBegin; SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"GESV/LANGE - Lapack routines are unavailable"); #else PetscErrorCode ierr; PetscBLASInt n,ld,ld2,*ipiv,info,inc=1; PetscInt j,k,odd; const PetscInt p=MAX_PADE; PetscReal c[MAX_PADE+1],s; PetscScalar scale,mone=-1.0,one=1.0,two=2.0,zero=0.0; PetscScalar *A,*A2,*Q,*P,*W,*aux; PetscFunctionBegin; ierr = PetscBLASIntCast(ds->n,&n);CHKERRQ(ierr); ierr = PetscBLASIntCast(ds->ld,&ld);CHKERRQ(ierr); ld2 = ld*ld; ierr = DSAllocateWork_Private(ds,0,ld,ld);CHKERRQ(ierr); ipiv = ds->iwork; if (!ds->mat[DS_MAT_W]) { ierr = DSAllocateMat_Private(ds,DS_MAT_W);CHKERRQ(ierr); } if (!ds->mat[DS_MAT_Z]) { ierr = DSAllocateMat_Private(ds,DS_MAT_Z);CHKERRQ(ierr); } A = ds->mat[DS_MAT_A]; A2 = ds->mat[DS_MAT_Z]; Q = ds->mat[DS_MAT_Q]; P = ds->mat[DS_MAT_F]; W = ds->mat[DS_MAT_W]; /* Pade' coefficients */ c[0] = 1.0; for (k=1;k<=p;k++) { c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k)); } /* Scaling */ s = LAPACKlange_("I",&n,&n,A,&ld,ds->rwork); if (s>0.5) { s = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0)) + 2); scale = PetscPowReal(2.0,(-1)*s); PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&scale,A,&inc)); } /* Horner evaluation */ PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,A,&ld,A,&ld,&zero,A2,&ld)); ierr = PetscMemzero(Q,ld*ld*sizeof(PetscScalar));CHKERRQ(ierr); ierr = PetscMemzero(P,ld*ld*sizeof(PetscScalar));CHKERRQ(ierr); for (j=0;j<n;j++) { Q[j+j*ld] = c[p]; P[j+j*ld] = c[p-1]; } odd = 1; for (k=p-1;k>0;k--) { if (odd==1) { PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,A2,&ld,&zero,W,&ld)); aux = Q; Q = W; W = aux; for (j=0;j<n;j++) Q[j+j*ld] = Q[j+j*ld] + c[k-1]; } else { PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,A2,&ld,&zero,W,&ld)); aux = P; P = W; W = aux; for (j=0;j<n;j++) P[j+j*ld] = P[j+j*ld] + c[k-1]; } odd = 1-odd; } if (odd==1) { PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,A,&ld,&zero,W,&ld)); aux = Q; Q = W; W = aux; PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc)); PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info)); PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc)); for (j=0;j<n;j++) P[j+j*ld] = P[j+j*ld] + 1.0; PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&mone,P,&inc)); } else { PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,A,&ld,&zero,W,&ld)); aux = P; P = W; W = aux; PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc)); PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info)); PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc)); for (j=0;j<n;j++) P[j+j*ld] = P[j+j*ld] + 1.0; } for (k=1;k<=s;k++) { PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,P,&ld,&zero,W,&ld)); ierr = PetscMemcpy(P,W,ld2*sizeof(PetscScalar));CHKERRQ(ierr); } if (P!=ds->mat[DS_MAT_F]) { ierr = PetscMemcpy(ds->mat[DS_MAT_F],P,ld2*sizeof(PetscScalar));CHKERRQ(ierr); } PetscFunctionReturn(0); #endif }