PetscErrorCode DSNormalize_NHEP(DS ds,DSMatType mat,PetscInt col) { PetscErrorCode ierr; PetscInt i,i0,i1; PetscBLASInt ld,n,one = 1; PetscScalar *A = ds->mat[DS_MAT_A],norm,*x; #if !defined(PETSC_USE_COMPLEX) PetscScalar norm0; #endif PetscFunctionBegin; switch (mat) { case DS_MAT_X: case DS_MAT_Y: case DS_MAT_Q: /* Supported matrices */ break; case DS_MAT_U: case DS_MAT_VT: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented yet"); break; default: SETERRQ(PetscObjectComm((PetscObject)ds),PETSC_ERR_ARG_OUTOFRANGE,"Invalid mat parameter"); } ierr = PetscBLASIntCast(ds->n,&n);CHKERRQ(ierr); ierr = PetscBLASIntCast(ds->ld,&ld);CHKERRQ(ierr); ierr = DSGetArray(ds,mat,&x);CHKERRQ(ierr); if (col < 0) { i0 = 0; i1 = ds->n; } else if (col>0 && A[ds->ld*(col-1)+col] != 0.0) { i0 = col-1; i1 = col+1; } else { i0 = col; i1 = col+1; } for (i=i0;i<i1;i++) { #if !defined(PETSC_USE_COMPLEX) if (i<n-1 && A[ds->ld*i+i+1] != 0.0) { norm = BLASnrm2_(&n,&x[ld*i],&one); norm0 = BLASnrm2_(&n,&x[ld*(i+1)],&one); norm = 1.0/SlepcAbsEigenvalue(norm,norm0); PetscStackCallBLAS("BLASscal",BLASscal_(&n,&norm,&x[ld*i],&one)); PetscStackCallBLAS("BLASscal",BLASscal_(&n,&norm,&x[ld*(i+1)],&one)); i++; } else #endif { norm = BLASnrm2_(&n,&x[ld*i],&one); norm = 1.0/norm; PetscStackCallBLAS("BLASscal",BLASscal_(&n,&norm,&x[ld*i],&one)); } } PetscFunctionReturn(0); }
PetscErrorCode VecNorm_MPI(Vec xin,NormType type,PetscReal *z) { Vec_MPI *x = (Vec_MPI*)xin->data; PetscReal sum,work = 0.0; PetscScalar *xx = x->array; PetscErrorCode ierr; PetscInt n = xin->map->n; PetscFunctionBegin; if (type == NORM_2 || type == NORM_FROBENIUS) { #if defined(PETSC_HAVE_SLOW_BLAS_NORM2) #if defined(PETSC_USE_FORTRAN_KERNEL_NORM) fortrannormsqr_(xx,&n,&work); #elif defined(PETSC_USE_UNROLLED_NORM) switch (n & 0x3) { case 3: work += PetscRealPart(xx[0]*PetscConj(xx[0])); xx++; case 2: work += PetscRealPart(xx[0]*PetscConj(xx[0])); xx++; case 1: work += PetscRealPart(xx[0]*PetscConj(xx[0])); xx++; n -= 4; } while (n>0) { work += PetscRealPart(xx[0]*PetscConj(xx[0])+xx[1]*PetscConj(xx[1])+ xx[2]*PetscConj(xx[2])+xx[3]*PetscConj(xx[3])); xx += 4; n -= 4; } #else {PetscInt i; for (i=0; i<n; i++) work += PetscRealPart((xx[i])*(PetscConj(xx[i])));} #endif #else {PetscBLASInt one = 1,bn = PetscBLASIntCast(n); work = BLASnrm2_(&bn,xx,&one); work *= work; } #endif ierr = MPI_Allreduce(&work,&sum,1,MPIU_REAL,MPI_SUM,((PetscObject)xin)->comm);CHKERRQ(ierr); *z = sqrt(sum); ierr = PetscLogFlops(2.0*xin->map->n);CHKERRQ(ierr); } else if (type == NORM_1) { /* Find the local part */ ierr = VecNorm_Seq(xin,NORM_1,&work);CHKERRQ(ierr); /* Find the global max */ ierr = MPI_Allreduce(&work,z,1,MPIU_REAL,MPI_SUM,((PetscObject)xin)->comm);CHKERRQ(ierr); } else if (type == NORM_INFINITY) { /* Find the local max */ ierr = VecNorm_Seq(xin,NORM_INFINITY,&work);CHKERRQ(ierr); /* Find the global max */ ierr = MPI_Allreduce(&work,z,1,MPIU_REAL,MPI_MAX,((PetscObject)xin)->comm);CHKERRQ(ierr); } else if (type == NORM_1_AND_2) { PetscReal temp[2]; ierr = VecNorm_Seq(xin,NORM_1,temp);CHKERRQ(ierr); ierr = VecNorm_Seq(xin,NORM_2,temp+1);CHKERRQ(ierr); temp[1] = temp[1]*temp[1]; ierr = MPI_Allreduce(temp,z,2,MPIU_REAL,MPI_SUM,((PetscObject)xin)->comm);CHKERRQ(ierr); z[1] = sqrt(z[1]); } PetscFunctionReturn(0); }
PetscErrorCode VecNorm_Seq(Vec xin,NormType type,PetscReal *z) { const PetscScalar *xx; PetscErrorCode ierr; PetscInt n = xin->map->n; PetscBLASInt one = 1, bn; PetscFunctionBegin; ierr = PetscBLASIntCast(n,&bn);CHKERRQ(ierr); if (type == NORM_2 || type == NORM_FROBENIUS) { ierr = VecGetArrayRead(xin,&xx);CHKERRQ(ierr); #if defined(PETSC_USE_REAL___FP16) *z = BLASnrm2_(&bn,xx,&one); #else *z = PetscRealPart(BLASdot_(&bn,xx,&one,xx,&one)); *z = PetscSqrtReal(*z); #endif ierr = VecRestoreArrayRead(xin,&xx);CHKERRQ(ierr); ierr = PetscLogFlops(PetscMax(2.0*n-1,0.0));CHKERRQ(ierr); } else if (type == NORM_INFINITY) { PetscInt i; PetscReal max = 0.0,tmp; ierr = VecGetArrayRead(xin,&xx);CHKERRQ(ierr); for (i=0; i<n; i++) { if ((tmp = PetscAbsScalar(*xx)) > max) max = tmp; /* check special case of tmp == NaN */ if (tmp != tmp) {max = tmp; break;} xx++; } ierr = VecRestoreArrayRead(xin,&xx);CHKERRQ(ierr); *z = max; } else if (type == NORM_1) { #if defined(PETSC_USE_COMPLEX) PetscReal tmp = 0.0; PetscInt i; #endif ierr = VecGetArrayRead(xin,&xx);CHKERRQ(ierr); #if defined(PETSC_USE_COMPLEX) /* BLASasum() returns the nonstandard 1 norm of the 1 norm of the complex entries so we provide a custom loop instead */ for (i=0; i<n; i++) { tmp += PetscAbsScalar(xx[i]); } *z = tmp; #else PetscStackCallBLAS("BLASasum",*z = BLASasum_(&bn,xx,&one)); #endif ierr = VecRestoreArrayRead(xin,&xx);CHKERRQ(ierr); ierr = PetscLogFlops(PetscMax(n-1.0,0.0));CHKERRQ(ierr); } else if (type == NORM_1_AND_2) { ierr = VecNorm_Seq(xin,NORM_1,z);CHKERRQ(ierr); ierr = VecNorm_Seq(xin,NORM_2,z+1);CHKERRQ(ierr); } PetscFunctionReturn(0); }
PetscErrorCode BDC_dlaed3m_(const char *jobz,const char *defl,PetscBLASInt k,PetscBLASInt n, PetscBLASInt n1,PetscReal *d,PetscReal *q,PetscBLASInt ldq, PetscReal rho,PetscReal *dlamda,PetscReal *q2,PetscBLASInt *indx, PetscBLASInt *ctot,PetscReal *w,PetscReal *s,PetscBLASInt *info, PetscBLASInt jobz_len,PetscBLASInt defl_len) { /* -- Routine written in LAPACK version 3.0 style -- */ /* *************************************************** */ /* Written by */ /* Michael Moldaschl and Wilfried Gansterer */ /* University of Vienna */ /* last modification: March 16, 2014 */ /* Small adaptations of original code written by */ /* Wilfried Gansterer and Bob Ward, */ /* Department of Computer Science, University of Tennessee */ /* see http://dx.doi.org/10.1137/S1064827501399432 */ /* *************************************************** */ /* Purpose */ /* ======= */ /* DLAED3M finds the roots of the secular equation, as defined by the */ /* values in D, W, and RHO, between 1 and K. It makes the */ /* appropriate calls to DLAED4 and then updates the eigenvectors by */ /* multiplying the matrix of eigenvectors of the pair of eigensystems */ /* being combined by the matrix of eigenvectors of the K-by-K system */ /* which is solved here. */ /* This code makes very mild assumptions about floating point */ /* arithmetic. It will work on machines with a guard digit in */ /* add/subtract, or on those binary machines without guard digits */ /* which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. */ /* It could conceivably fail on hexadecimal or decimal machines */ /* without guard digits, but we know of none. */ /* Arguments */ /* ========= */ /* JOBZ (input) CHARACTER*1 */ /* = 'N': Do not accumulate eigenvectors (not implemented); */ /* = 'D': Do accumulate eigenvectors in the divide-and-conquer */ /* process. */ /* DEFL (input) CHARACTER*1 */ /* = '0': No deflation happened in DSRTDF */ /* = '1': Some deflation happened in DSRTDF (and therefore some */ /* Givens rotations need to be applied to the computed */ /* eigenvector matrix Q) */ /* K (input) INTEGER */ /* The number of terms in the rational function to be solved by */ /* DLAED4. 0 <= K <= N. */ /* N (input) INTEGER */ /* The number of rows and columns in the Q matrix. */ /* N >= K (deflation may result in N>K). */ /* N1 (input) INTEGER */ /* The location of the last eigenvalue in the leading submatrix. */ /* min(1,N) <= N1 <= max(1,N-1). */ /* D (output) DOUBLE PRECISION array, dimension (N) */ /* D(I) contains the updated eigenvalues for */ /* 1 <= I <= K. */ /* Q (output) DOUBLE PRECISION array, dimension (LDQ,N) */ /* Initially the first K columns are used as workspace. */ /* On output the columns 1 to K contain */ /* the updated eigenvectors. */ /* LDQ (input) INTEGER */ /* The leading dimension of the array Q. LDQ >= max(1,N). */ /* RHO (input) DOUBLE PRECISION */ /* The value of the parameter in the rank one update equation. */ /* RHO >= 0 required. */ /* DLAMDA (input/output) DOUBLE PRECISION array, dimension (K) */ /* The first K elements of this array contain the old roots */ /* of the deflated updating problem. These are the poles */ /* of the secular equation. May be changed on output by */ /* having lowest order bit set to zero on Cray X-MP, Cray Y-MP, */ /* Cray-2, or Cray C-90, as described above. */ /* Q2 (input) DOUBLE PRECISION array, dimension (LDQ2, N) */ /* The first K columns of this matrix contain the non-deflated */ /* eigenvectors for the split problem. */ /* INDX (input) INTEGER array, dimension (N) */ /* The permutation used to arrange the columns of the deflated */ /* Q matrix into three groups (see DLAED2). */ /* The rows of the eigenvectors found by DLAED4 must be likewise */ /* permuted before the matrix multiply can take place. */ /* CTOT (input) INTEGER array, dimension (4) */ /* A count of the total number of the various types of columns */ /* in Q, as described in INDX. The fourth column type is any */ /* column which has been deflated. */ /* W (input/output) DOUBLE PRECISION array, dimension (K) */ /* The first K elements of this array contain the components */ /* of the deflation-adjusted updating vector. Destroyed on */ /* output. */ /* S (workspace) DOUBLE PRECISION array, dimension */ /* ( MAX(CTOT(1)+CTOT(2),CTOT(2)+CTOT(3)) + 1 )*K */ /* Will contain parts of the eigenvectors of the repaired matrix */ /* which will be multiplied by the previously accumulated */ /* eigenvectors to update the system. This array is a major */ /* source of workspace requirements ! */ /* INFO (output) INTEGER */ /* = 0: successful exit. */ /* < 0: if INFO = -i, the i-th argument had an illegal value. */ /* > 0: if INFO = i, eigenpair i was not computed successfully */ /* Further Details */ /* =============== */ /* Based on code written by */ /* Wilfried Gansterer and Bob Ward, */ /* Department of Computer Science, University of Tennessee */ /* Based on the design of the LAPACK code DLAED3 with small modifications */ /* (Note that in contrast to the original DLAED3, this routine */ /* DOES NOT require that N1 <= N/2) */ /* Based on contributions by */ /* Jeff Rutter, Computer Science Division, University of California */ /* at Berkeley, USA */ /* Modified by Francoise Tisseur, University of Tennessee. */ /* ===================================================================== */ #if defined(SLEPC_MISSING_LAPACK_LAED4) || defined(SLEPC_MISSING_LAPACK_LACPY) || defined(SLEPC_MISSING_LAPACK_LASET) PetscFunctionBegin; SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"LAED4/LACPY/LASET - Lapack routine is unavailable"); #else PetscReal temp, done = 1.0, dzero = 0.0; PetscBLASInt i, j, n2, n12, ii, n23, iq2, i1, one=1; PetscFunctionBegin; *info = 0; if (k < 0) { *info = -3; } else if (n < k) { *info = -4; } else if (n1 < PetscMin(1,n) || n1 > PetscMax(1,n)) { *info = -5; } else if (ldq < PetscMax(1,n)) { *info = -8; } else if (rho < 0.) { *info = -9; } if (*info) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong argument %d in DLAED3M",-(*info)); /* Quick return if possible */ if (k == 0) PetscFunctionReturn(0); /* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can */ /* be computed with high relative accuracy (barring over/underflow). */ /* This is a problem on machines without a guard digit in */ /* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). */ /* The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), */ /* which on any of these machines zeros out the bottommost */ /* bit of DLAMDA(I) if it is 1; this makes the subsequent */ /* subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation */ /* occurs. On binary machines with a guard digit (almost all */ /* machines) it does not change DLAMDA(I) at all. On hexadecimal */ /* and decimal machines with a guard digit, it slightly */ /* changes the bottommost bits of DLAMDA(I). It does not account */ /* for hexadecimal or decimal machines without guard digits */ /* (we know of none). We use a subroutine call to compute */ /* 2*DLAMBDA(I) to prevent optimizing compilers from eliminating */ /* this code. */ for (i = 0; i < k; ++i) { dlamda[i] = LAPACKlamc3_(&dlamda[i], &dlamda[i]) - dlamda[i]; } for (j = 1; j <= k; ++j) { /* ....calling DLAED4 for eigenpair J.... */ PetscStackCallBLAS("LAPACKlaed4",LAPACKlaed4_(&k, &j, dlamda, w, &q[(j-1)*ldq], &rho, &d[j-1], info)); if (*info) SETERRQ3(PETSC_COMM_SELF,1,"Error in dlaed4, info = %d, failed when computing D(%d)=%g",*info,j,d[j-1]); if (j < k) { /* If the zero finder terminated properly, but the computed */ /* eigenvalues are not ordered, issue an error statement */ /* but continue computation. */ if (dlamda[j-1] >= dlamda[j]) SETERRQ2(PETSC_COMM_SELF,1,"DLAMDA(%d) is greater or equal than DLAMDA(%d)", j, j+1); if (d[j-1] < dlamda[j-1] || d[j-1] > dlamda[j]) SETERRQ6(PETSC_COMM_SELF,1,"DLAMDA(%d) = %g D(%d) = %g DLAMDA(%d) = %g", j, dlamda[j-1], j, d[j-1], j+1, dlamda[j]); } } if (k == 1) goto L110; if (k == 2) { /* permute the components of Q(:,J) (the information returned by DLAED4 */ /* necessary to construct the eigenvectors) according to the permutation */ /* stored in INDX, resulting from deflation */ for (j = 0; j < k; ++j) { w[0] = q[0+j*ldq]; w[1] = q[1+j*ldq]; ii = indx[0]; q[0+j*ldq] = w[ii-1]; ii = indx[1]; q[1+j*ldq] = w[ii-1]; } goto L110; } /* ....K.GE.3.... */ /* Compute updated W (used for computing the eigenvectors corresponding */ /* to the previously computed eigenvalues). */ PetscStackCallBLAS("BLAScopy",BLAScopy_(&k, w, &one, s, &one)); /* Initialize W(I) = Q(I,I) */ i1 = ldq + 1; PetscStackCallBLAS("BLAScopy",BLAScopy_(&k, q, &i1, w, &one)); for (j = 0; j < k; ++j) { for (i = 0; i < j; ++i) { w[i] *= q[i+j*ldq] / (dlamda[i] - dlamda[j]); } for (i = j + 1; i < k; ++i) { w[i] *= q[i+j*ldq] / (dlamda[i] - dlamda[j]); } } for (i = 0; i < k; ++i) { temp = PetscSqrtReal(-w[i]); if (temp<0) temp = -temp; w[i] = (s[i] >= 0) ? temp : -temp; } /* Compute eigenvectors of the modified rank-1 modification (using the */ /* vector W). */ for (j = 0; j < k; ++j) { for (i = 0; i < k; ++i) { s[i] = w[i] / q[i+j*ldq]; } temp = BLASnrm2_(&k, s, &one); for (i = 0; i < k; ++i) { /* apply the permutation resulting from deflation as stored */ /* in INDX */ ii = indx[i]; q[i+j*ldq] = s[ii-1] / temp; } } /* ************************************************************************** */ /* ....updating the eigenvectors.... */ L110: n2 = n - n1; n12 = ctot[0] + ctot[1]; n23 = ctot[1] + ctot[2]; if (*(unsigned char *)jobz == 'D') { /* Compute the updated eigenvectors. (NOTE that every call of */ /* DGEMM requires three DISTINCT arrays) */ /* copy Q( CTOT(1)+1:K,1:K ) to S */ PetscStackCallBLAS("LAPACKlacpy",LAPACKlacpy_("A", &n23, &k, &q[ctot[0]], &ldq, s, &n23)); iq2 = n1 * n12 + 1; if (n23 != 0) { /* multiply the second part of Q2 (the eigenvectors of the */ /* lower block) with S and write the result into the lower part of */ /* Q, i.e., Q( N1+1:N,1:K ) */ PetscStackCallBLAS("BLASgemm",BLASgemm_("N", "N", &n2, &k, &n23, &done, &q2[iq2-1], &n2, s, &n23, &dzero, &q[n1], &ldq)); } else { PetscStackCallBLAS("LAPACKlaset",LAPACKlaset_("A", &n2, &k, &dzero, &dzero, &q[n1], &ldq)); } /* copy Q( 1:CTOT(1)+CTOT(2),1:K ) to S */ PetscStackCallBLAS("LAPACKlacpy",LAPACKlacpy_("A", &n12, &k, q, &ldq, s, &n12)); if (n12 != 0) { /* multiply the first part of Q2 (the eigenvectors of the */ /* upper block) with S and write the result into the upper part of */ /* Q, i.e., Q( 1:N1,1:K ) */ PetscStackCallBLAS("BLASgemm",BLASgemm_("N", "N", &n1, &k, &n12, &done, q2, &n1, s, &n12, &dzero, q, &ldq)); } else { PetscStackCallBLAS("LAPACKlaset",LAPACKlaset_("A", &n1, &k, &dzero, &dzero, q, &ldq)); } } PetscFunctionReturn(0); #endif }
PetscErrorCode DSVectors_NHEP_Eigen_Some(DS ds,PetscInt *k,PetscReal *rnorm,PetscBool left) { #if defined(SLEPC_MISSING_LAPACK_TREVC) PetscFunctionBegin; SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"TREVC - Lapack routine is unavailable"); #else PetscErrorCode ierr; PetscInt i; PetscBLASInt mm=1,mout,info,ld,n,inc = 1; PetscScalar tmp,done=1.0,zero=0.0; PetscReal norm; PetscBool iscomplex = PETSC_FALSE; PetscBLASInt *select; PetscScalar *A = ds->mat[DS_MAT_A]; PetscScalar *Q = ds->mat[DS_MAT_Q]; PetscScalar *X = ds->mat[left?DS_MAT_Y:DS_MAT_X]; PetscScalar *Y; PetscFunctionBegin; ierr = PetscBLASIntCast(ds->n,&n);CHKERRQ(ierr); ierr = PetscBLASIntCast(ds->ld,&ld);CHKERRQ(ierr); ierr = DSAllocateWork_Private(ds,0,0,ld);CHKERRQ(ierr); select = ds->iwork; for (i=0;i<n;i++) select[i] = (PetscBLASInt)PETSC_FALSE; /* Compute k-th eigenvector Y of A */ Y = X+(*k)*ld; select[*k] = (PetscBLASInt)PETSC_TRUE; #if !defined(PETSC_USE_COMPLEX) if ((*k)<n-1 && A[(*k)+1+(*k)*ld]!=0.0) iscomplex = PETSC_TRUE; mm = iscomplex? 2: 1; if (iscomplex) select[(*k)+1] = (PetscBLASInt)PETSC_TRUE; ierr = DSAllocateWork_Private(ds,3*ld,0,0);CHKERRQ(ierr); PetscStackCallBLAS("LAPACKtrevc",LAPACKtrevc_(left?"L":"R","S",select,&n,A,&ld,Y,&ld,Y,&ld,&mm,&mout,ds->work,&info)); #else ierr = DSAllocateWork_Private(ds,2*ld,ld,0);CHKERRQ(ierr); PetscStackCallBLAS("LAPACKtrevc",LAPACKtrevc_(left?"L":"R","S",select,&n,A,&ld,Y,&ld,Y,&ld,&mm,&mout,ds->work,ds->rwork,&info)); #endif if (info) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Error in Lapack xTREVC %d",info); if (mout != mm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Inconsistent arguments"); /* accumulate and normalize eigenvectors */ if (ds->state>=DS_STATE_CONDENSED) { ierr = PetscMemcpy(ds->work,Y,mout*ld*sizeof(PetscScalar));CHKERRQ(ierr); PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&n,&n,&done,Q,&ld,ds->work,&inc,&zero,Y,&inc)); #if !defined(PETSC_USE_COMPLEX) if (iscomplex) PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&n,&n,&done,Q,&ld,ds->work+ld,&inc,&zero,Y+ld,&inc)); #endif norm = BLASnrm2_(&n,Y,&inc); #if !defined(PETSC_USE_COMPLEX) if (iscomplex) { tmp = BLASnrm2_(&n,Y+ld,&inc); norm = SlepcAbsEigenvalue(norm,tmp); } #endif tmp = 1.0 / norm; PetscStackCallBLAS("BLASscal",BLASscal_(&n,&tmp,Y,&inc)); #if !defined(PETSC_USE_COMPLEX) if (iscomplex) PetscStackCallBLAS("BLASscal",BLASscal_(&n,&tmp,Y+ld,&inc)); #endif } /* set output arguments */ if (iscomplex) (*k)++; if (rnorm) { if (iscomplex) *rnorm = SlepcAbsEigenvalue(Y[n-1],Y[n-1+ld]); else *rnorm = PetscAbsScalar(Y[n-1]); } PetscFunctionReturn(0); #endif }
static PetscErrorCode estsv(PetscInt n, PetscReal *r, PetscInt ldr, PetscReal *svmin, PetscReal *z) { PetscBLASInt blas1=1, blasn=n, blasnmi, blasj, blasldr = ldr; PetscInt i,j; PetscReal e,temp,w,wm,ynorm,znorm,s,sm; PetscFunctionBegin; for (i=0;i<n;i++) { z[i]=0.0; } e = PetscAbs(r[0]); if (e == 0.0) { *svmin = 0.0; z[0] = 1.0; } else { /* Solve R'*y = e */ for (i=0;i<n;i++) { /* Scale y. The scaling factor (0.01) reduces the number of scalings */ if (z[i] >= 0.0) e =-PetscAbs(e); else e = PetscAbs(e); if (PetscAbs(e - z[i]) > PetscAbs(r[i + ldr*i])) { temp = PetscMin(0.01,PetscAbs(r[i + ldr*i]))/PetscAbs(e-z[i]); PetscStackCallBLAS("BLASscal",BLASscal_(&blasn, &temp, z, &blas1)); e = temp*e; } /* Determine the two possible choices of y[i] */ if (r[i + ldr*i] == 0.0) { w = wm = 1.0; } else { w = (e - z[i]) / r[i + ldr*i]; wm = - (e + z[i]) / r[i + ldr*i]; } /* Chose y[i] based on the predicted value of y[j] for j>i */ s = PetscAbs(e - z[i]); sm = PetscAbs(e + z[i]); for (j=i+1;j<n;j++) { sm += PetscAbs(z[j] + wm * r[i + ldr*j]); } if (i < n-1) { blasnmi = n-i-1; PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&blasnmi, &w, &r[i + ldr*(i+1)], &blasldr, &z[i+1], &blas1)); s += BLASasum_(&blasnmi, &z[i+1], &blas1); } if (s < sm) { temp = wm - w; w = wm; if (i < n-1) { PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&blasnmi, &temp, &r[i + ldr*(i+1)], &blasldr, &z[i+1], &blas1)); } } z[i] = w; } ynorm = BLASnrm2_(&blasn, z, &blas1); /* Solve R*z = y */ for (j=n-1; j>=0; j--) { /* Scale z */ if (PetscAbs(z[j]) > PetscAbs(r[j + ldr*j])) { temp = PetscMin(0.01, PetscAbs(r[j + ldr*j] / z[j])); PetscStackCallBLAS("BLASscal",BLASscal_(&blasn, &temp, z, &blas1)); ynorm *=temp; } if (r[j + ldr*j] == 0) { z[j] = 1.0; } else { z[j] = z[j] / r[j + ldr*j]; } temp = -z[j]; blasj=j; PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&blasj,&temp,&r[0+ldr*j],&blas1,z,&blas1)); } /* Compute svmin and normalize z */ znorm = 1.0 / BLASnrm2_(&blasn, z, &blas1); *svmin = ynorm*znorm; PetscStackCallBLAS("BLASscal",BLASscal_(&blasn, &znorm, z, &blas1)); } PetscFunctionReturn(0); }
/* c *********** c c Subroutine dgqt c c Given an n by n symmetric matrix A, an n-vector b, and a c positive number delta, this subroutine determines a vector c x which approximately minimizes the quadratic function c c f(x) = (1/2)*x'*A*x + b'*x c c subject to the Euclidean norm constraint c c norm(x) <= delta. c c This subroutine computes an approximation x and a Lagrange c multiplier par such that either par is zero and c c norm(x) <= (1+rtol)*delta, c c or par is positive and c c abs(norm(x) - delta) <= rtol*delta. c c If xsol is the solution to the problem, the approximation x c satisfies c c f(x) <= ((1 - rtol)**2)*f(xsol) c c The subroutine statement is c c subroutine dgqt(n,a,lda,b,delta,rtol,atol,itmax, c par,f,x,info,z,wa1,wa2) c c where c c n is an integer variable. c On entry n is the order of A. c On exit n is unchanged. c c a is a double precision array of dimension (lda,n). c On entry the full upper triangle of a must contain the c full upper triangle of the symmetric matrix A. c On exit the array contains the matrix A. c c lda is an integer variable. c On entry lda is the leading dimension of the array a. c On exit lda is unchanged. c c b is an double precision array of dimension n. c On entry b specifies the linear term in the quadratic. c On exit b is unchanged. c c delta is a double precision variable. c On entry delta is a bound on the Euclidean norm of x. c On exit delta is unchanged. c c rtol is a double precision variable. c On entry rtol is the relative accuracy desired in the c solution. Convergence occurs if c c f(x) <= ((1 - rtol)**2)*f(xsol) c c On exit rtol is unchanged. c c atol is a double precision variable. c On entry atol is the absolute accuracy desired in the c solution. Convergence occurs when c c norm(x) <= (1 + rtol)*delta c c max(-f(x),-f(xsol)) <= atol c c On exit atol is unchanged. c c itmax is an integer variable. c On entry itmax specifies the maximum number of iterations. c On exit itmax is unchanged. c c par is a double precision variable. c On entry par is an initial estimate of the Lagrange c multiplier for the constraint norm(x) <= delta. c On exit par contains the final estimate of the multiplier. c c f is a double precision variable. c On entry f need not be specified. c On exit f is set to f(x) at the output x. c c x is a double precision array of dimension n. c On entry x need not be specified. c On exit x is set to the final estimate of the solution. c c info is an integer variable. c On entry info need not be specified. c On exit info is set as follows: c c info = 1 The function value f(x) has the relative c accuracy specified by rtol. c c info = 2 The function value f(x) has the absolute c accuracy specified by atol. c c info = 3 Rounding errors prevent further progress. c On exit x is the best available approximation. c c info = 4 Failure to converge after itmax iterations. c On exit x is the best available approximation. c c z is a double precision work array of dimension n. c c wa1 is a double precision work array of dimension n. c c wa2 is a double precision work array of dimension n. c c Subprograms called c c MINPACK-2 ...... destsv c c LAPACK ......... dpotrf c c Level 1 BLAS ... daxpy, dcopy, ddot, dnrm2, dscal c c Level 2 BLAS ... dtrmv, dtrsv c c MINPACK-2 Project. October 1993. c Argonne National Laboratory and University of Minnesota. c Brett M. Averick, Richard Carter, and Jorge J. More' c c *********** */ PetscErrorCode gqt(PetscInt n, PetscReal *a, PetscInt lda, PetscReal *b, PetscReal delta, PetscReal rtol, PetscReal atol, PetscInt itmax, PetscReal *retpar, PetscReal *retf, PetscReal *x, PetscInt *retinfo, PetscInt *retits, PetscReal *z, PetscReal *wa1, PetscReal *wa2) { PetscErrorCode ierr; PetscReal f=0.0,p001=0.001,p5=0.5,minusone=-1,delta2=delta*delta; PetscInt iter, j, rednc,info; PetscBLASInt indef; PetscBLASInt blas1=1, blasn=n, iblas, blaslda = lda,blasldap1=lda+1,blasinfo; PetscReal alpha, anorm, bnorm, parc, parf, parl, pars, par=*retpar,paru, prod, rxnorm, rznorm=0.0, temp, xnorm; PetscFunctionBegin; parf = 0.0; xnorm = 0.0; rxnorm = 0.0; rednc = 0; for (j=0; j<n; j++) { x[j] = 0.0; z[j] = 0.0; } /* Copy the diagonal and save A in its lower triangle */ PetscStackCallBLAS("BLAScopy",BLAScopy_(&blasn,a,&blasldap1, wa1, &blas1)); for (j=0;j<n-1;j++) { iblas = n - j - 1; PetscStackCallBLAS("BLAScopy",BLAScopy_(&iblas,&a[j + lda*(j+1)], &blaslda, &a[j+1 + lda*j], &blas1)); } /* Calculate the l1-norm of A, the Gershgorin row sums, and the l2-norm of b */ anorm = 0.0; for (j=0;j<n;j++) { wa2[j] = BLASasum_(&blasn, &a[0 + lda*j], &blas1); CHKMEMQ; anorm = PetscMax(anorm,wa2[j]); } for (j=0;j<n;j++) { wa2[j] = wa2[j] - PetscAbs(wa1[j]); } bnorm = BLASnrm2_(&blasn,b,&blas1); CHKMEMQ; /* Calculate a lower bound, pars, for the domain of the problem. Also calculate an upper bound, paru, and a lower bound, parl, for the Lagrange multiplier. */ pars = parl = paru = -anorm; for (j=0;j<n;j++) { pars = PetscMax(pars, -wa1[j]); parl = PetscMax(parl, wa1[j] + wa2[j]); paru = PetscMax(paru, -wa1[j] + wa2[j]); } parl = PetscMax(bnorm/delta - parl,pars); parl = PetscMax(0.0,parl); paru = PetscMax(0.0, bnorm/delta + paru); /* If the input par lies outside of the interval (parl, paru), set par to the closer endpoint. */ par = PetscMax(par,parl); par = PetscMin(par,paru); /* Special case: parl == paru */ paru = PetscMax(paru, (1.0 + rtol)*parl); /* Beginning of an iteration */ info = 0; for (iter=1;iter<=itmax;iter++) { /* Safeguard par */ if (par <= pars && paru > 0) { par = PetscMax(p001, PetscSqrtScalar(parl/paru)) * paru; } /* Copy the lower triangle of A into its upper triangle and compute A + par*I */ for (j=0;j<n-1;j++) { iblas = n - j - 1; PetscStackCallBLAS("BLAScopy",BLAScopy_(&iblas,&a[j+1 + j*lda], &blas1,&a[j + (j+1)*lda], &blaslda)); } for (j=0;j<n;j++) { a[j + j*lda] = wa1[j] + par; } /* Attempt the Cholesky factorization of A without referencing the lower triangular part. */ PetscStackCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&blasn,a,&blaslda,&indef)); /* Case 1: A + par*I is pos. def. */ if (indef == 0) { /* Compute an approximate solution x and save the last value of par with A + par*I pos. def. */ parf = par; PetscStackCallBLAS("BLAScopy",BLAScopy_(&blasn, b, &blas1, wa2, &blas1)); PetscStackCallBLAS("LAPACKtrtrs",LAPACKtrtrs_("U","T","N",&blasn,&blas1,a,&blaslda,wa2,&blasn,&blasinfo)); rxnorm = BLASnrm2_(&blasn, wa2, &blas1); PetscStackCallBLAS("LAPACKtrtrs",LAPACKtrtrs_("U","N","N",&blasn,&blas1,a,&blaslda,wa2,&blasn,&blasinfo)); PetscStackCallBLAS("BLAScopy",BLAScopy_(&blasn, wa2, &blas1, x, &blas1)); PetscStackCallBLAS("BLASscal",BLASscal_(&blasn, &minusone, x, &blas1)); xnorm = BLASnrm2_(&blasn, x, &blas1); CHKMEMQ; /* Test for convergence */ if (PetscAbs(xnorm - delta) <= rtol*delta || (par == 0 && xnorm <= (1.0+rtol)*delta)) { info = 1; } /* Compute a direction of negative curvature and use this information to improve pars. */ iblas=blasn*blasn; ierr = estsv(n,a,lda,&rznorm,z);CHKERRQ(ierr); CHKMEMQ; pars = PetscMax(pars, par-rznorm*rznorm); /* Compute a negative curvature solution of the form x + alpha*z, where norm(x+alpha*z)==delta */ rednc = 0; if (xnorm < delta) { /* Compute alpha */ prod = BLASdot_(&blasn, z, &blas1, x, &blas1) / delta; temp = (delta - xnorm)*((delta + xnorm)/delta); alpha = temp/(PetscAbs(prod) + PetscSqrtScalar(prod*prod + temp/delta)); if (prod >= 0) alpha = PetscAbs(alpha); else alpha =-PetscAbs(alpha); /* Test to decide if the negative curvature step produces a larger reduction than with z=0 */ rznorm = PetscAbs(alpha) * rznorm; if ((rznorm*rznorm + par*xnorm*xnorm)/(delta2) <= par) { rednc = 1; } /* Test for convergence */ if (p5 * rznorm*rznorm / delta2 <= rtol*(1.0-p5*rtol)*(par + rxnorm*rxnorm/delta2)) { info = 1; } else if (info == 0 && (p5*(par + rxnorm*rxnorm/delta2) <= atol/delta2)) { info = 2; } } /* Compute the Newton correction parc to par. */ if (xnorm == 0) { parc = -par; } else { PetscStackCallBLAS("BLAScopy",BLAScopy_(&blasn, x, &blas1, wa2, &blas1)); temp = 1.0/xnorm; PetscStackCallBLAS("BLASscal",BLASscal_(&blasn, &temp, wa2, &blas1)); PetscStackCallBLAS("LAPACKtrtrs",LAPACKtrtrs_("U","T","N",&blasn, &blas1, a, &blaslda, wa2, &blasn, &blasinfo)); temp = BLASnrm2_(&blasn, wa2, &blas1); parc = (xnorm - delta)/(delta*temp*temp); } /* update parl or paru */ if (xnorm > delta) { parl = PetscMax(parl, par); } else if (xnorm < delta) { paru = PetscMin(paru, par); } } else { /* Case 2: A + par*I is not pos. def. */ /* Use the rank information from the Cholesky decomposition to update par. */ if (indef > 1) { /* Restore column indef to A + par*I. */ iblas = indef - 1; PetscStackCallBLAS("BLAScopy",BLAScopy_(&iblas,&a[indef-1 + 0*lda],&blaslda,&a[0 + (indef-1)*lda],&blas1)); a[indef-1 + (indef-1)*lda] = wa1[indef-1] + par; /* compute parc. */ PetscStackCallBLAS("BLAScopy",BLAScopy_(&iblas,&a[0 + (indef-1)*lda], &blas1, wa2, &blas1)); PetscStackCallBLAS("LAPACKtrtrs",LAPACKtrtrs_("U","T","N",&iblas,&blas1,a,&blaslda,wa2,&blasn,&blasinfo)); PetscStackCallBLAS("BLAScopy",BLAScopy_(&iblas,wa2,&blas1,&a[0 + (indef-1)*lda],&blas1)); temp = BLASnrm2_(&iblas,&a[0 + (indef-1)*lda],&blas1); CHKMEMQ; a[indef-1 + (indef-1)*lda] -= temp*temp; PetscStackCallBLAS("LAPACKtrtr",LAPACKtrtrs_("U","N","N",&iblas,&blas1,a,&blaslda,wa2,&blasn,&blasinfo)); } wa2[indef-1] = -1.0; iblas = indef; temp = BLASnrm2_(&iblas,wa2,&blas1); parc = - a[indef-1 + (indef-1)*lda]/(temp*temp); pars = PetscMax(pars,par+parc); /* If necessary, increase paru slightly. This is needed because in some exceptional situations paru is the optimal value of par. */ paru = PetscMax(paru, (1.0+rtol)*pars); } /* Use pars to update parl */ parl = PetscMax(parl,pars); /* Test for converged. */ if (info == 0) { if (iter == itmax) info=4; if (paru <= (1.0+p5*rtol)*pars) info=3; if (paru == 0.0) info = 2; } /* If exiting, store the best approximation and restore the upper triangle of A. */ if (info != 0) { /* Compute the best current estimates for x and f. */ par = parf; f = -p5 * (rxnorm*rxnorm + par*xnorm*xnorm); if (rednc) { f = -p5 * (rxnorm*rxnorm + par*delta*delta - rznorm*rznorm); PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&blasn, &alpha, z, &blas1, x, &blas1)); } /* Restore the upper triangle of A */ for (j = 0; j<n; j++) { iblas = n - j - 1; PetscStackCallBLAS("BLAScopy",BLAScopy_(&iblas,&a[j+1 + j*lda],&blas1, &a[j + (j+1)*lda],&blaslda)); } iblas = lda+1; PetscStackCallBLAS("BLAScopy",BLAScopy_(&blasn,wa1,&blas1,a,&iblas)); break; } par = PetscMax(parl,par+parc); } *retpar = par; *retf = f; *retinfo = info; *retits = iter; CHKMEMQ; PetscFunctionReturn(0); }
PetscErrorCode KSPAGMRESRoddec(KSP ksp, PetscInt nvec) { KSP_AGMRES *agmres = (KSP_AGMRES*) ksp->data; MPI_Comm comm; PetscScalar *Qloc = agmres->Qloc; PetscScalar *sgn = agmres->sgn; PetscScalar *tloc = agmres->tloc; PetscErrorCode ierr; PetscReal *wbufptr = agmres->wbufptr; PetscMPIInt rank = agmres->rank; PetscMPIInt First = agmres->First; PetscMPIInt Last = agmres->Last; PetscBLASInt nloc,pas,len; PetscInt d, i, j, k; PetscInt pos,tag; PetscReal c, s, rho, Ajj, val, tt, old; PetscScalar *col; MPI_Status status; PetscBLASInt N = MAXKSPSIZE + 1; PetscFunctionBegin; ierr = PetscObjectGetComm((PetscObject)ksp,&comm);CHKERRQ(ierr); tag = 0x666; ierr = PetscLogEventBegin(KSP_AGMRESRoddec,ksp,0,0,0);CHKERRQ(ierr); ierr = PetscMemzero(agmres->Rloc, N*N*sizeof(PetscScalar));CHKERRQ(ierr); /* check input arguments */ if (nvec < 1) SETERRQ(PetscObjectComm((PetscObject)ksp),PETSC_ERR_ARG_OUTOFRANGE, "The number of input vectors shoud be positive"); ierr = VecGetLocalSize(VEC_V(0), &nloc);CHKERRQ(ierr); if (nvec > nloc) SETERRQ(PetscObjectComm((PetscObject)ksp), PETSC_ERR_ARG_WRONG, "In QR factorization, the number of local rows should be greater or equal to the number of columns"); pas = 1; k = 0; /* Copy the vectors of the basis */ for (j = 0; j < nvec; j++) { ierr = VecGetArray(VEC_V(j), &col);CHKERRQ(ierr); PetscStackCallBLAS("BLAScopy",BLAScopy_(&nloc, col, &pas, &Qloc[j*nloc], &pas)); ierr = VecRestoreArray(VEC_V(j), &col);CHKERRQ(ierr); } /* Each process performs a local QR on its own block */ for (j = 0; j < nvec; j++) { len = nloc - j; Ajj = Qloc[j*nloc+j]; rho = -PetscSign(Ajj) * BLASnrm2_(&len, &(Qloc[j*nloc+j]), &pas); if (rho == 0.0) tloc[j] = 0.0; else { tloc[j] = (Ajj - rho) / rho; len = len - 1; val = 1.0 / (Ajj - rho); PetscStackCallBLAS("BLASscal",BLASscal_(&len, &val, &(Qloc[j*nloc+j+1]), &pas)); Qloc[j*nloc+j] = 1.0; len = len + 1; for (k = j + 1; k < nvec; k++) { PetscStackCallBLAS("BLASdot",tt = tloc[j] * BLASdot_(&len, &(Qloc[j*nloc+j]), &pas, &(Qloc[k*nloc+j]), &pas)); PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&len, &tt, &(Qloc[j*nloc+j]), &pas, &(Qloc[k*nloc+j]), &pas)); } Qloc[j*nloc+j] = rho; } } /*annihilate undesirable Rloc, diagonal by diagonal*/ for (d = 0; d < nvec; d++) { len = nvec - d; if (rank == First) { PetscStackCallBLAS("BLAScopy",BLAScopy_(&len, &(Qloc[d*nloc+d]), &nloc, &(wbufptr[d]), &pas)); ierr = MPI_Send(&(wbufptr[d]), len, MPIU_SCALAR, rank + 1, tag, comm);CHKERRQ(ierr); } else { ierr = MPI_Recv(&(wbufptr[d]), len, MPIU_SCALAR, rank - 1, tag, comm, &status);CHKERRQ(ierr); /*Elimination of Rloc(1,d)*/ c = wbufptr[d]; s = Qloc[d*nloc]; ierr = KSPAGMRESRoddecGivens(&c, &s, &rho, 1); /*Apply Givens Rotation*/ for (k = d; k < nvec; k++) { old = wbufptr[k]; wbufptr[k] = c * old - s * Qloc[k*nloc]; Qloc[k*nloc] = s * old + c * Qloc[k*nloc]; } Qloc[d*nloc] = rho; if (rank != Last) { ierr = MPI_Send(& (wbufptr[d]), len, MPIU_SCALAR, rank + 1, tag, comm);CHKERRQ(ierr); } /* zero-out the d-th diagonal of Rloc ...*/ for (j = d + 1; j < nvec; j++) { /* elimination of Rloc[i][j]*/ i = j - d; c = Qloc[j*nloc+i-1]; s = Qloc[j*nloc+i]; ierr = KSPAGMRESRoddecGivens(&c, &s, &rho, 1);CHKERRQ(ierr); for (k = j; k < nvec; k++) { old = Qloc[k*nloc+i-1]; Qloc[k*nloc+i-1] = c * old - s * Qloc[k*nloc+i]; Qloc[k*nloc+i] = s * old + c * Qloc[k*nloc+i]; } Qloc[j*nloc+i] = rho; } if (rank == Last) { PetscStackCallBLAS("BLAScopy",BLAScopy_(&len, &(wbufptr[d]), &pas, RLOC(d,d), &N)); for (k = d + 1; k < nvec; k++) *RLOC(k,d) = 0.0; } } } if (rank == Last) { for (d = 0; d < nvec; d++) { pos = nvec - d; sgn[d] = PetscSign(*RLOC(d,d)); PetscStackCallBLAS("BLASscal",BLASscal_(&pos, &(sgn[d]), RLOC(d,d), &N)); } } /*BroadCast Rloc to all other processes * NWD : should not be needed */ ierr = MPI_Bcast(agmres->Rloc,N*N,MPIU_SCALAR,Last,comm);CHKERRQ(ierr); ierr = PetscLogEventEnd(KSP_AGMRESRoddec,ksp,0,0,0);CHKERRQ(ierr); PetscFunctionReturn(0); }