/** Purpose ------- DLAEX3 finds the roots of the secular equation, as defined by the values in D, W, and RHO, between 1 and K. It makes the appropriate calls to DLAED4 and then updates the eigenvectors by multiplying the matrix of eigenvectors of the pair of eigensystems being combined by the matrix of eigenvectors of the K-by-K system which is solved here. It is used in the last step when only a part of the eigenvectors is required. It compute only the required part of the eigenvectors and the rest is not used. This code makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] k INTEGER The number of terms in the rational function to be solved by DLAED4. K >= 0. @param[in] n INTEGER The number of rows and columns in the Q matrix. N >= K (deflation may result in N > K). @param[in] n1 INTEGER The location of the last eigenvalue in the leading submatrix. min(1,N) <= N1 <= N/2. @param[out] d DOUBLE PRECISION array, dimension (N) D(I) contains the updated eigenvalues for 1 <= I <= K. @param[out] Q DOUBLE PRECISION array, dimension (LDQ,N) Initially the first K columns are used as workspace. On output the columns ??? to ??? contain the updated eigenvectors. @param[in] ldq INTEGER The leading dimension of the array Q. LDQ >= max(1,N). @param[in] rho DOUBLE PRECISION The value of the parameter in the rank one update equation. RHO >= 0 required. @param[in,out] dlamda DOUBLE PRECISION array, dimension (K) The first K elements of this array contain the old roots of the deflated updating problem. These are the poles of the secular equation. May be changed on output by having lowest order bit set to zero on Cray X-MP, Cray Y-MP, Cray-2, or Cray C-90, as described above. @param[in] Q2 DOUBLE PRECISION array, dimension (LDQ2, N) The first K columns of this matrix contain the non-deflated eigenvectors for the split problem. TODO what is LDQ2? @param[in] indx INTEGER array, dimension (N) The permutation used to arrange the columns of the deflated Q matrix into three groups (see DLAED2). The rows of the eigenvectors found by DLAED4 must be likewise permuted before the matrix multiply can take place. @param[in] ctot INTEGER array, dimension (4) A count of the total number of the various types of columns in Q, as described in INDX. The fourth column type is any column which has been deflated. @param[in,out] w DOUBLE PRECISION array, dimension (K) The first K elements of this array contain the components of the deflation-adjusted updating vector. Destroyed on output. @param s (workspace) DOUBLE PRECISION array, dimension (N1 + 1)*K Will contain the eigenvectors of the repaired matrix which will be multiplied by the previously accumulated eigenvectors to update the system. @param[out] indxq INTEGER array, dimension (N) On exit, the permutation which will reintegrate the subproblems back into sorted order, i.e. D( INDXQ( I = 1, N ) ) will be in ascending order. @param dwork (workspace) DOUBLE PRECISION array, dimension (3*N*N/2+3*N) @param[in] range magma_range_t - = MagmaRangeAll: all eigenvalues will be found. - = MagmaRangeV: all eigenvalues in the half-open interval (VL,VU] will be found. - = MagmaRangeI: the IL-th through IU-th eigenvalues will be found. TODO verify range, vl, vu, il, iu -- copied from dlaex1. @param[in] vl DOUBLE PRECISION @param[in] vu DOUBLE PRECISION if RANGE=MagmaRangeV, the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = MagmaRangeAll or MagmaRangeI. @param[in] il INTEGER @param[in] iu INTEGER if RANGE=MagmaRangeI, the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = MagmaRangeAll or MagmaRangeV. @param[out] info INTEGER - = 0: successful exit. - < 0: if INFO = -i, the i-th argument had an illegal value. - > 0: if INFO = 1, an eigenvalue did not converge Further Details --------------- Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified by Francoise Tisseur, University of Tennessee. @ingroup magma_dsyev_aux ********************************************************************/ extern "C" magma_int_t magma_dlaex3(magma_int_t k, magma_int_t n, magma_int_t n1, double* d, double* Q, magma_int_t ldq, double rho, double* dlamda, double* Q2, magma_int_t* indx, magma_int_t* ctot, double* w, double* s, magma_int_t* indxq, double* dwork, magma_range_t range, double vl, double vu, magma_int_t il, magma_int_t iu, magma_int_t* info ) { #define Q(i_,j_) (Q + (i_) + (j_)*ldq) double d_one = 1.; double d_zero = 0.; magma_int_t ione = 1; magma_int_t ineg_one = -1; magma_int_t iil, iiu, rk; double* dq2= dwork; double* ds = dq2 + n*(n/2+1); double* dq = ds + n*(n/2+1); magma_int_t lddq = n/2 + 1; magma_int_t i, iq2, j, n12, n2, n23, tmp, lq2; double temp; magma_int_t alleig, valeig, indeig; alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); *info = 0; if (k < 0) *info=-1; else if (n < k) *info=-2; else if (ldq < max(1,n)) *info=-6; else if (! (alleig || valeig || indeig)) *info = -15; else { if (valeig) { if (n > 0 && vu <= vl) *info = -17; } else if (indeig) { if (il < 1 || il > max(1,n)) *info = -18; else if (iu < min(n,il) || iu > n) *info = -19; } } if (*info != 0) { magma_xerbla(__func__, -(*info)); return *info; } // Quick return if possible if (k == 0) return *info; /* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can be computed with high relative accuracy (barring over/underflow). This is a problem on machines without a guard digit in add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), which on any of these machines zeros out the bottommost bit of DLAMDA(I) if it is 1; this makes the subsequent subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation occurs. On binary machines with a guard digit (almost all machines) it does not change DLAMDA(I) at all. On hexadecimal and decimal machines with a guard digit, it slightly changes the bottommost bits of DLAMDA(I). It does not account for hexadecimal or decimal machines without guard digits (we know of none). We use a subroutine call to compute 2*DLAMBDA(I) to prevent optimizing compilers from eliminating this code.*/ n2 = n - n1; n12 = ctot[0] + ctot[1]; n23 = ctot[1] + ctot[2]; iq2 = n1 * n12; lq2 = iq2 + n2 * n23; magma_dsetvector_async( lq2, Q2, 1, dq2, 1, NULL ); #ifdef _OPENMP ///////////////////////////////////////////////////////////////////////////////// //openmp implementation ///////////////////////////////////////////////////////////////////////////////// magma_timer_t time=0; timer_start( time ); #pragma omp parallel private(i, j, tmp, temp) { magma_int_t id = omp_get_thread_num(); magma_int_t tot = omp_get_num_threads(); magma_int_t ib = ( id * k) / tot; //start index of local loop magma_int_t ie = ((id+1) * k) / tot; //end index of local loop magma_int_t ik = ie - ib; //number of local indices for (i = ib; i < ie; ++i) dlamda[i]=lapackf77_dlamc3(&dlamda[i], &dlamda[i]) - dlamda[i]; for (j = ib; j < ie; ++j) { magma_int_t tmpp=j+1; magma_int_t iinfo = 0; lapackf77_dlaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo); // If the zero finder fails, the computation is terminated. if (iinfo != 0) { #pragma omp critical (info) *info=iinfo; break; } } #pragma omp barrier if (*info == 0) { #pragma omp single { //Prepare the INDXQ sorting permutation. magma_int_t nk = n - k; lapackf77_dlamrg( &k, &nk, d, &ione, &ineg_one, indxq); //compute the lower and upper bound of the non-deflated eigenvectors if (valeig) magma_dvrange(k, d, &iil, &iiu, vl, vu); else if (indeig) magma_dirange(k, indxq, &iil, &iiu, il, iu); else { iil = 1; iiu = k; } rk = iiu - iil + 1; } if (k == 2) { #pragma omp single { for (j = 0; j < k; ++j) { w[0] = *Q(0,j); w[1] = *Q(1,j); i = indx[0] - 1; *Q(0,j) = w[i]; i = indx[1] - 1; *Q(1,j) = w[i]; } } } else if (k != 1) { // Compute updated W. blasf77_dcopy( &ik, &w[ib], &ione, &s[ib], &ione); // Initialize W(I) = Q(I,I) tmp = ldq + 1; blasf77_dcopy( &ik, Q(ib,ib), &tmp, &w[ib], &ione); for (j = 0; j < k; ++j) { magma_int_t i_tmp = min(j, ie); for (i = ib; i < i_tmp; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); i_tmp = max(j+1, ib); for (i = i_tmp; i < ie; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); } for (i = ib; i < ie; ++i) w[i] = copysign( sqrt( -w[i] ), s[i]); #pragma omp barrier //reduce the number of used threads to have enough S workspace tot = min(n1, omp_get_num_threads()); if (id < tot) { ib = ( id * rk) / tot + iil - 1; ie = ((id+1) * rk) / tot + iil - 1; ik = ie - ib; } else { ib = -1; ie = -1; ik = -1; } // Compute eigenvectors of the modified rank-1 modification. for (j = ib; j < ie; ++j) { for (i = 0; i < k; ++i) s[id*k + i] = w[i] / *Q(i,j); temp = magma_cblas_dnrm2( k, s+id*k, 1 ); for (i = 0; i < k; ++i) { magma_int_t iii = indx[i] - 1; *Q(i,j) = s[id*k + iii] / temp; } } } } } if (*info != 0) return *info; timer_stop( time ); timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); #else ///////////////////////////////////////////////////////////////////////////////// // Non openmp implementation ///////////////////////////////////////////////////////////////////////////////// magma_timer_t time=0; timer_start( time ); for (i = 0; i < k; ++i) dlamda[i]=lapackf77_dlamc3(&dlamda[i], &dlamda[i]) - dlamda[i]; for (j = 0; j < k; ++j) { magma_int_t tmpp=j+1; magma_int_t iinfo = 0; lapackf77_dlaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo); // If the zero finder fails, the computation is terminated. if (iinfo != 0) *info=iinfo; } if (*info != 0) return *info; //Prepare the INDXQ sorting permutation. magma_int_t nk = n - k; lapackf77_dlamrg( &k, &nk, d, &ione, &ineg_one, indxq); //compute the lower and upper bound of the non-deflated eigenvectors if (valeig) magma_dvrange(k, d, &iil, &iiu, vl, vu); else if (indeig) magma_dirange(k, indxq, &iil, &iiu, il, iu); else { iil = 1; iiu = k; } rk = iiu - iil + 1; if (k == 2) { for (j = 0; j < k; ++j) { w[0] = *Q(0,j); w[1] = *Q(1,j); i = indx[0] - 1; *Q(0,j) = w[i]; i = indx[1] - 1; *Q(1,j) = w[i]; } } else if (k != 1) { // Compute updated W. blasf77_dcopy( &k, w, &ione, s, &ione); // Initialize W(I) = Q(I,I) tmp = ldq + 1; blasf77_dcopy( &k, Q, &tmp, w, &ione); for (j = 0; j < k; ++j) { for (i = 0; i < j; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); for (i = j+1; i < k; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); } for (i = 0; i < k; ++i) w[i] = copysign( sqrt( -w[i] ), s[i]); // Compute eigenvectors of the modified rank-1 modification. for (j = iil-1; j < iiu; ++j) { for (i = 0; i < k; ++i) s[i] = w[i] / *Q(i,j); temp = magma_cblas_dnrm2( k, s, 1 ); for (i = 0; i < k; ++i) { magma_int_t iii = indx[i] - 1; *Q(i,j) = s[iii] / temp; } } } timer_stop( time ); timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); #endif //_OPENMP // Compute the updated eigenvectors. timer_start( time ); magma_queue_sync( NULL ); if (rk != 0) { if ( n23 != 0 ) { if (rk < magma_get_dlaed3_k()) { lapackf77_dlacpy("A", &n23, &rk, Q(ctot[0],iil-1), &ldq, s, &n23); blasf77_dgemm("N", "N", &n2, &rk, &n23, &d_one, &Q2[iq2], &n2, s, &n23, &d_zero, Q(n1,iil-1), &ldq ); } else { magma_dsetmatrix( n23, rk, Q(ctot[0],iil-1), ldq, ds, n23 ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, n2, rk, n23, d_one, &dq2[iq2], n2, ds, n23, d_zero, dq, lddq); magma_dgetmatrix( n2, rk, dq, lddq, Q(n1,iil-1), ldq ); } } else lapackf77_dlaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq); if ( n12 != 0 ) { if (rk < magma_get_dlaed3_k()) { lapackf77_dlacpy("A", &n12, &rk, Q(0,iil-1), &ldq, s, &n12); blasf77_dgemm("N", "N", &n1, &rk, &n12, &d_one, Q2, &n1, s, &n12, &d_zero, Q(0,iil-1), &ldq); } else { magma_dsetmatrix( n12, rk, Q(0,iil-1), ldq, ds, n12 ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, n1, rk, n12, d_one, dq2, n1, ds, n12, d_zero, dq, lddq); magma_dgetmatrix( n1, rk, dq, lddq, Q(0,iil-1), ldq ); } } else lapackf77_dlaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq); } timer_stop( time ); timer_printf( "gemms = %6.2f\n", time ); return *info; } /* magma_dlaex3 */
extern "C" magma_int_t magma_dgeev( magma_vec_t jobvl, magma_vec_t jobvr, magma_int_t n, double *a, magma_int_t lda, double *WR, double *WI, double *vl, magma_int_t ldvl, double *vr, magma_int_t ldvr, double *work, magma_int_t lwork, magma_queue_t queue, magma_int_t *info) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= DGEEV computes for an N-by-N real nonsymmetric matrix A, the eigenvalues and, optionally, the left and/or right eigenvectors. The right eigenvector v(j) of A satisfies A * v(j) = lambda(j) * v(j) where lambda(j) is its eigenvalue. The left eigenvector u(j) of A satisfies u(j)**T * A = lambda(j) * u(j)**T where u(j)**T denotes the transpose of u(j). The computed eigenvectors are normalized to have Euclidean norm equal to 1 and largest component real. Arguments ========= JOBVL (input) CHARACTER*1 = 'N': left eigenvectors of A are not computed; = 'V': left eigenvectors of are computed. JOBVR (input) CHARACTER*1 = 'N': right eigenvectors of A are not computed; = 'V': right eigenvectors of A are computed. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) DOUBLE PRECISION array, dimension (LDA,N) On entry, the N-by-N matrix A. On exit, A has been overwritten. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). WR (output) DOUBLE PRECISION array, dimension (N) WI (output) DOUBLE PRECISION array, dimension (N) WR and WI contain the real and imaginary parts, respectively, of the computed eigenvalues. Complex conjugate pairs of eigenvalues appear consecutively with the eigenvalue having the positive imaginary part first. VL (output) DOUBLE PRECISION array, dimension (LDVL,N) If JOBVL = 'V', the left eigenvectors u(j) are stored one after another in the columns of VL, in the same order as their eigenvalues. If JOBVL = 'N', VL is not referenced. u(j) = VL(:,j), the j-th column of VL. LDVL (input) INTEGER The leading dimension of the array VL. LDVL >= 1; if JOBVL = 'V', LDVL >= N. VR (output) DOUBLE PRECISION array, dimension (LDVR,N) If JOBVR = 'V', the right eigenvectors v(j) are stored one after another in the columns of VR, in the same order as their eigenvalues. If JOBVR = 'N', VR is not referenced. v(j) = VR(:,j), the j-th column of VR. LDVR (input) INTEGER The leading dimension of the array VR. LDVR >= 1; if JOBVR = 'V', LDVR >= N. WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= (1+nb)*N. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value. > 0: if INFO = i, the QR algorithm failed to compute all the eigenvalues, and no eigenvectors have been computed; elements and i+1:N of W contain eigenvalues which have converged. ===================================================================== */ magma_int_t ione = 1; magma_int_t c__1 = 1; magma_int_t c__0 = 0; magma_int_t c_n1 = -1; magma_int_t a_dim1, a_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1, i__2, i__3; double d__1, d__2; magma_int_t i__, k, ihi, ilo; double r__, cs, sn, scl; double dum[1], eps; magma_int_t ibal; double anrm; magma_int_t ierr, itau, iwrk, nout; magma_int_t scalea; double cscale; double bignum; magma_int_t minwrk; magma_int_t wantvl; double smlnum; magma_int_t lquery, wantvr, select[1]; magma_int_t nb = 0; magmaDouble_ptr dT; //magma_timestr_t start, end; const char* side_ = NULL; *info = 0; lquery = lwork == -1; wantvl = (jobvl == MagmaVec); wantvr = (jobvr == MagmaVec); if (! wantvl && jobvl != MagmaNoVec) { *info = -1; } else if (! wantvr && jobvr != MagmaNoVec) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if ( (ldvl < 1) || (wantvl && (ldvl < n))) { *info = -9; } else if ( (ldvr < 1) || (wantvr && (ldvr < n))) { *info = -11; } /* Compute workspace */ if (*info == 0) { nb = magma_get_dgehrd_nb(n); minwrk = (2+nb)*n; work[0] = (double) minwrk; if (lwork < minwrk && ! lquery) { *info = -13; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } // if eigenvectors are needed #if defined(VERSION3) if (MAGMA_SUCCESS != magma_dmalloc( &dT, nb*n )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } #endif // subtract row and col for 1-based indexing a_dim1 = lda; a_offset = 1 + a_dim1; a -= a_offset; vl_dim1 = ldvl; vl_offset = 1 + vl_dim1; vl -= vl_offset; vr_dim1 = ldvr; vr_offset = 1 + vr_dim1; vr -= vr_offset; --work; /* Get machine constants */ eps = lapackf77_dlamch("P"); smlnum = lapackf77_dlamch("S"); bignum = 1. / smlnum; lapackf77_dlabad(&smlnum, &bignum); smlnum = magma_dsqrt(smlnum) / eps; bignum = 1. / smlnum; /* Scale A if max element outside range [SMLNUM,BIGNUM] */ anrm = lapackf77_dlange("M", &n, &n, &a[a_offset], &lda, dum); scalea = 0; if (anrm > 0. && anrm < smlnum) { scalea = 1; cscale = smlnum; } else if (anrm > bignum) { scalea = 1; cscale = bignum; } if (scalea) { lapackf77_dlascl("G", &c__0, &c__0, &anrm, &cscale, &n, &n, &a[a_offset], &lda, &ierr); } /* Balance the matrix (Workspace: need N) */ ibal = 1; lapackf77_dgebal("B", &n, &a[a_offset], &lda, &ilo, &ihi, &work[ibal], &ierr); /* Reduce to upper Hessenberg form (Workspace: need 3*N, prefer 2*N+N*NB) */ itau = ibal + n; iwrk = itau + n; i__1 = lwork - iwrk + 1; //start = get_current_time(); #if defined(VERSION1) /* * Version 1 - LAPACK */ lapackf77_dgehrd(&n, &ilo, &ihi, &a[a_offset], &lda, &work[itau], &work[iwrk], &i__1, &ierr); #elif defined(VERSION2) /* * Version 2 - LAPACK consistent HRD */ magma_dgehrd2(n, ilo, ihi, &a[a_offset], lda, &work[itau], &work[iwrk], &i__1, &ierr); #elif defined(VERSION3) /* * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored, */ magma_dgehrd(n, ilo, ihi, &a[a_offset], lda, &work[itau], &work[iwrk], i__1, dT, 0, queue, &ierr); #endif //end = get_current_time(); //printf(" Time for dgehrd = %5.2f sec\n", GetTimerValue(start,end)/1000.); if (wantvl) { /* Want left eigenvectors Copy Householder vectors to VL */ side_ = "Left"; lapackf77_dlacpy(MagmaLowerStr, &n, &n, &a[a_offset], &lda, &vl[vl_offset], &ldvl); /* * Generate orthogonal matrix in VL * (Workspace: need 3*N-1, prefer 2*N+(N-1)*NB) */ i__1 = lwork - iwrk + 1; //start = get_current_time(); #if defined(VERSION1) || defined(VERSION2) /* * Version 1 & 2 - LAPACK */ lapackf77_dorghr(&n, &ilo, &ihi, &vl[vl_offset], &ldvl, &work[itau], &work[iwrk], &i__1, &ierr); #elif defined(VERSION3) /* * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored */ magma_dorghr(n, ilo, ihi, &vl[vl_offset], ldvl, &work[itau], dT, 0, nb, queue, &ierr); #endif //end = get_current_time(); //printf(" Time for dorghr = %5.2f sec\n", GetTimerValue(start,end)/1000.); /* * Perform QR iteration, accumulating Schur vectors in VL * (Workspace: need N+1, prefer N+HSWORK (see comments) ) */ iwrk = itau; i__1 = lwork - iwrk + 1; lapackf77_dhseqr("S", "V", &n, &ilo, &ihi, &a[a_offset], &lda, WR, WI, &vl[vl_offset], &ldvl, &work[iwrk], &i__1, info); if (wantvr) { /* Want left and right eigenvectors Copy Schur vectors to VR */ side_ = "Both"; lapackf77_dlacpy("F", &n, &n, &vl[vl_offset], &ldvl, &vr[vr_offset], &ldvr); } } else if (wantvr) { /* Want right eigenvectors Copy Householder vectors to VR */ side_ = "Right"; lapackf77_dlacpy("L", &n, &n, &a[a_offset], &lda, &vr[vr_offset], &ldvr); /* * Generate orthogonal matrix in VR * (Workspace: need 3*N-1, prefer 2*N+(N-1)*NB) */ i__1 = lwork - iwrk + 1; //start = get_current_time(); #if defined(VERSION1) || defined(VERSION2) /* * Version 1 & 2 - LAPACK */ lapackf77_dorghr(&n, &ilo, &ihi, &vr[vr_offset], &ldvr, &work[itau], &work[iwrk], &i__1, &ierr); #elif defined(VERSION3) /* * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored */ magma_dorghr(n, ilo, ihi, &vr[vr_offset], ldvr, &work[itau], dT, 0, nb, queue, &ierr); #endif //end = get_current_time(); //printf(" Time for dorghr = %5.2f sec\n", GetTimerValue(start,end)/1000.); /* * Perform QR iteration, accumulating Schur vectors in VR * (Workspace: need N+1, prefer N+HSWORK (see comments) ) */ iwrk = itau; i__1 = lwork - iwrk + 1; lapackf77_dhseqr("S", "V", &n, &ilo, &ihi, &a[a_offset], &lda, WR, WI, &vr[vr_offset], &ldvr, &work[iwrk], &i__1, info); } else { /* * Compute eigenvalues only * (Workspace: need N+1, prefer N+HSWORK (see comments) ) */ iwrk = itau; i__1 = lwork - iwrk + 1; lapackf77_dhseqr("E", "N", &n, &ilo, &ihi, &a[a_offset], &lda, WR, WI, &vr[vr_offset], &ldvr, &work[iwrk], &i__1, info); } /* If INFO > 0 from DHSEQR, then quit */ if (*info > 0) { fprintf(stderr, "DHSEQR returned with info = %d\n", (int) *info); goto L50; } if (wantvl || wantvr) { /* * Compute left and/or right eigenvectors * (Workspace: need 4*N) */ lapackf77_dtrevc(side_, "B", select, &n, &a[a_offset], &lda, &vl[vl_offset], &ldvl, &vr[vr_offset], &ldvr, &n, &nout, &work[iwrk], &ierr); } if (wantvl) { /* * Undo balancing of left eigenvectors * (Workspace: need N) */ lapackf77_dgebak("B", "L", &n, &ilo, &ihi, &work[ibal], &n, &vl[vl_offset], &ldvl, &ierr); /* Normalize left eigenvectors and make largest component real */ for (i__ = 1; i__ <= n; ++i__) { if ( WI[i__-1] == 0.) { scl = magma_cblas_dnrm2(n, &vl[i__ * vl_dim1 + 1], 1); scl = 1. / scl; blasf77_dscal( &n, &scl, &vl[i__ * vl_dim1 + 1], &ione ); } else if (WI[i__-1] > 0.) { d__1 = magma_cblas_dnrm2(n, &vl[ i__ * vl_dim1 + 1], 1); d__2 = magma_cblas_dnrm2(n, &vl[(i__ + 1) * vl_dim1 + 1], 1); scl = lapackf77_dlapy2(&d__1, &d__2); scl = 1. / scl; blasf77_dscal( &n, &scl, &vl[ i__ * vl_dim1 + 1], &ione ); blasf77_dscal( &n, &scl, &vl[(i__ + 1) * vl_dim1 + 1], &ione ); i__2 = n; for (k = 1; k <= i__2; ++k) { /* Computing 2nd power */ d__1 = vl[k + i__ * vl_dim1]; /* Computing 2nd power */ d__2 = vl[k + (i__ + 1) * vl_dim1]; work[iwrk + k - 1] = d__1 * d__1 + d__2 * d__2; } /* Comment: Fortran BLAS does not have to add 1 C BLAS must add one to cblas_idamax */ k = blasf77_idamax( &n, &work[iwrk], &ione ); //+1; lapackf77_dlartg(&vl[k + i__ * vl_dim1], &vl[k + (i__ + 1) * vl_dim1], &cs, &sn, &r__); blasf77_drot( &n, &vl[ i__ * vl_dim1 + 1], &ione, &vl[(i__ + 1) * vl_dim1 + 1], &ione, &cs, &sn ); vl[k + (i__ + 1) * vl_dim1] = 0.; } } } if (wantvr) { /* * Undo balancing of right eigenvectors * (Workspace: need N) */ lapackf77_dgebak("B", "R", &n, &ilo, &ihi, &work[ibal], &n, &vr[vr_offset], &ldvr, &ierr); /* Normalize right eigenvectors and make largest component real */ for (i__ = 1; i__ <= n; ++i__) { if (WI[i__-1] == 0.) { scl = 1. / magma_cblas_dnrm2(n, &vr[i__ * vr_dim1 + 1], 1); blasf77_dscal( &n, &scl, &vr[i__ * vr_dim1 + 1], &ione ); } else if (WI[i__-1] > 0.) { d__1 = magma_cblas_dnrm2(n, &vr[ i__ * vr_dim1 + 1], 1); d__2 = magma_cblas_dnrm2(n, &vr[(i__ + 1) * vr_dim1 + 1], 1); scl = lapackf77_dlapy2(&d__1, &d__2); scl = 1. / scl; blasf77_dscal( &n, &scl, &vr[ i__ * vr_dim1 + 1], &ione ); blasf77_dscal( &n, &scl, &vr[(i__ + 1) * vr_dim1 + 1], &ione ); i__2 = n; for (k = 1; k <= i__2; ++k) { /* Computing 2nd power */ d__1 = vr[k + i__ * vr_dim1]; /* Computing 2nd power */ d__2 = vr[k + (i__ + 1) * vr_dim1]; work[iwrk + k - 1] = d__1 * d__1 + d__2 * d__2; } /* Comment: Fortran BLAS does not have to add 1 C BLAS must add one to cblas_idamax */ k = blasf77_idamax( &n, &work[iwrk], &ione ); //+1; lapackf77_dlartg(&vr[k + i__ * vr_dim1], &vr[k + (i__ + 1) * vr_dim1], &cs, &sn, &r__); blasf77_drot( &n, &vr[ i__ * vr_dim1 + 1], &ione, &vr[(i__ + 1) * vr_dim1 + 1], &ione, &cs, &sn ); vr[k + (i__ + 1) * vr_dim1] = 0.; } } } /* Undo scaling if necessary */ L50: if (scalea) { i__1 = n - *info; /* Computing MAX */ i__3 = n - *info; i__2 = max(i__3,1); lapackf77_dlascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, WR + (*info), &i__2, &ierr); i__1 = n - *info; /* Computing MAX */ i__3 = n - *info; i__2 = max(i__3,1); lapackf77_dlascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, WI + (*info), &i__2, &ierr); if (*info > 0) { i__1 = ilo - 1; lapackf77_dlascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, WR, &n, &ierr); i__1 = ilo - 1; lapackf77_dlascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, WI, &n, &ierr); } } #if defined(VERSION3) magma_free( dT ); #endif return *info; } /* magma_dgeev */
// ---------------------------------------- int main( int argc, char** argv ) { TESTING_INIT(); //real_Double_t t_m, t_c, t_f; magma_int_t ione = 1; double *A, *B; double diff, error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld; double x2_m, x2_c; // real x for magma, cblas/fortran blas respectively double x_m, x_c; // x for magma, cblas/fortran blas respectively magma_opts opts; parse_opts( argc, argv, &opts ); opts.tolerance = max( 100., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); gTol = tol; printf( "!! Calling these CBLAS and Fortran BLAS sometimes crashes (segfault), which !!\n" "!! is why we use wrappers. It does not necesarily indicate a bug in MAGMA. !!\n" "\n" "Diff compares MAGMA wrapper to CBLAS and BLAS function; should be exactly 0.\n" "Error compares MAGMA implementation to CBLAS and BLAS function; should be ~ machine epsilon.\n" "\n" ); double total_diff = 0.; double total_error = 0.; int inc[] = { 1 }; //{ -2, -1, 1, 2 }; //{ 1 }; //{ -1, 1 }; int ninc = sizeof(inc)/sizeof(*inc); for( int itest = 0; itest < opts.ntest; ++itest ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; for( int iincx = 0; iincx < ninc; ++iincx ) { magma_int_t incx = inc[iincx]; for( int iincy = 0; iincy < ninc; ++iincy ) { magma_int_t incy = inc[iincy]; printf("=========================================================================\n"); printf( "m=%d, n=%d, k=%d, incx = %d, incy = %d\n", (int) m, (int) n, (int) k, (int) incx, (int) incy ); printf( "Function MAGMA CBLAS BLAS Diff Error\n" " msec msec msec\n" ); // allocate matrices // over-allocate so they can be any combination of // {m,n,k} * {abs(incx), abs(incy)} by // {m,n,k} * {abs(incx), abs(incy)} maxn = max( max( m, n ), k ) * max( abs(incx), abs(incy) ); ld = max( 1, maxn ); size = ld*maxn; magma_dmalloc_pinned( &A, size ); assert( A != NULL ); magma_dmalloc_pinned( &B, size ); assert( B != NULL ); // initialize matrices lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_dlarnv( &ione, ISEED, &size, B ); printf( "Level 1 BLAS ----------------------------------------------------------\n" ); // ----- test DASUM // get one-norm of column j of A if ( incx > 0 && incx == incy ) { // positive, no incy diff = 0; error = 0; for( int j = 0; j < k; ++j ) { x_m = magma_cblas_dasum( m, A(0,j), incx ); x_c = cblas_dasum( m, A(0,j), incx ); diff += fabs( x_m - x_c ); x_c = blasf77_dasum( &m, A(0,j), &incx ); error += fabs( (x_m - x_c) / (m*x_c) ); } output( "dasum", diff, error ); total_diff += diff; total_error += error; } // ----- test DNRM2 // get two-norm of column j of A if ( incx > 0 && incx == incy ) { // positive, no incy diff = 0; error = 0; for( int j = 0; j < k; ++j ) { x_m = magma_cblas_dnrm2( m, A(0,j), incx ); x_c = cblas_dnrm2( m, A(0,j), incx ); diff += fabs( x_m - x_c ); x_c = blasf77_dnrm2( &m, A(0,j), &incx ); error += fabs( (x_m - x_c) / (m*x_c) ); } output( "dnrm2", diff, error ); total_diff += diff; total_error += error; } // ----- test DDOT // dot columns, Aj^H Bj diff = 0; error = 0; for( int j = 0; j < k; ++j ) { // MAGMA implementation, not just wrapper x2_m = magma_cblas_ddot( m, A(0,j), incx, B(0,j), incy ); // crashes on MKL 11.1.2, ILP64 #if ! defined( MAGMA_WITH_MKL ) #ifdef COMPLEX cblas_ddot_sub( m, A(0,j), incx, B(0,j), incy, &x2_c ); #else x2_c = cblas_ddot( m, A(0,j), incx, B(0,j), incy ); #endif error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif // crashes on MacOS 10.9 #if ! defined( __APPLE__ ) x2_c = blasf77_ddot( &m, A(0,j), &incx, B(0,j), &incy ); error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif } output( "ddot", diff, error ); total_diff += diff; total_error += error; total_error += error; // ----- test DDOT // dot columns, Aj^T * Bj diff = 0; error = 0; for( int j = 0; j < k; ++j ) { // MAGMA implementation, not just wrapper x2_m = magma_cblas_ddot( m, A(0,j), incx, B(0,j), incy ); // crashes on MKL 11.1.2, ILP64 #if ! defined( MAGMA_WITH_MKL ) #ifdef COMPLEX cblas_ddot_sub( m, A(0,j), incx, B(0,j), incy, &x2_c ); #else x2_c = cblas_ddot( m, A(0,j), incx, B(0,j), incy ); #endif error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif // crashes on MacOS 10.9 #if ! defined( __APPLE__ ) x2_c = blasf77_ddot( &m, A(0,j), &incx, B(0,j), &incy ); error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif } output( "ddot", diff, error ); total_diff += diff; total_error += error; // tell user about disabled functions #if defined( MAGMA_WITH_MKL ) printf( "cblas_ddot and cblas_ddot disabled with MKL (segfaults)\n" ); #endif #if defined( __APPLE__ ) printf( "blasf77_ddot and blasf77_ddot disabled on MacOS (segfaults)\n" ); #endif // cleanup magma_free_pinned( A ); magma_free_pinned( B ); fflush( stdout ); }}} // itest, incx, incy // TODO use average error? printf( "sum diffs = %8.2g, MAGMA wrapper compared to CBLAS and Fortran BLAS; should be exactly 0.\n" "sum errors = %8.2e, MAGMA implementation compared to CBLAS and Fortran BLAS; should be ~ machine epsilon.\n\n", total_diff, total_error ); if ( total_diff != 0. ) { printf( "some tests failed diff == 0.; see above.\n" ); } else { printf( "all tests passed diff == 0.\n" ); } TESTING_FINALIZE(); int status = (total_diff != 0.); return status; }
/** Purpose ------- DGEQP3 computes a QR factorization with column pivoting of a matrix A: A*P = Q*R using Level 3 BLAS. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the upper triangle of the array contains the min(M,N)-by-N upper trapezoidal matrix R; the elements below the diagonal, together with the array TAU, represent the unitary matrix Q as a product of min(M,N) elementary reflectors. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[in,out] jpvt INTEGER array, dimension (N) On entry, if JPVT(J).ne.0, the J-th column of A is permuted to the front of A*P (a leading column); if JPVT(J)=0, the J-th column of A is a free column. On exit, if JPVT(J)=K, then the J-th column of A*P was the the K-th column of A. @param[out] tau DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors. @param[out] work (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO=0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. For [sd]geqp3, LWORK >= (N+1)*NB + 2*N; for [cz]geqp3, LWORK >= (N+1)*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param rwork (workspace, for [cz]geqp3 only) DOUBLE PRECISION array, dimension (2*N) @param[out] info INTEGER - = 0: successful exit. - < 0: if INFO = -i, the i-th argument had an illegal value. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_dgeqp3_comp ********************************************************************/ extern "C" magma_int_t magma_dgeqp3( magma_int_t m, magma_int_t n, double *A, magma_int_t lda, magma_int_t *jpvt, double *tau, double *work, magma_int_t lwork, #ifdef COMPLEX double *rwork, #endif magma_int_t *info ) { #define A(i, j) (A + (i) + (j)*(lda )) #define dA(i, j) (dwork + (i) + (j)*(ldda)) double *dwork, *df; magma_int_t ione = 1; magma_int_t n_j, ldda, ldwork; magma_int_t j, jb, na, nb, sm, sn, fjb, nfxd, minmn; magma_int_t topbmn, sminmn, lwkopt=0, lquery; *info = 0; lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } nb = magma_get_dgeqp3_nb(min(m, n)); minmn = min(m,n); if (*info == 0) { if (minmn == 0) { lwkopt = 1; } else { lwkopt = (n + 1)*nb; #ifdef REAL lwkopt += 2*n; #endif } work[0] = MAGMA_D_MAKE( lwkopt, 0. ); if (lwork < lwkopt && ! lquery) { *info = -8; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } if (minmn == 0) return *info; #ifdef REAL double *rwork = work + (n + 1)*nb; #endif ldda = ((m+31)/32)*32; ldwork = n*ldda + (n+1)*nb; if (MAGMA_SUCCESS != magma_dmalloc( &dwork, ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } df = dwork + n*ldda; // dwork used for dA magma_queue_t stream; magma_queue_create( &stream ); /* Move initial columns up front. * Note jpvt uses 1-based indices for historical compatibility. */ nfxd = 0; for (j = 0; j < n; ++j) { if (jpvt[j] != 0) { if (j != nfxd) { blasf77_dswap(&m, A(0, j), &ione, A(0, nfxd), &ione); jpvt[j] = jpvt[nfxd]; jpvt[nfxd] = j + 1; } else { jpvt[j] = j + 1; } ++nfxd; } else { jpvt[j] = j + 1; } } /* Factorize fixed columns ======================= Compute the QR factorization of fixed columns and update remaining columns. */ if (nfxd > 0) { na = min(m,nfxd); lapackf77_dgeqrf(&m, &na, A, &lda, tau, work, &lwork, info); if (na < n) { n_j = n - na; lapackf77_dormqr( MagmaLeftStr, MagmaConjTransStr, &m, &n_j, &na, A, &lda, tau, A(0, na), &lda, work, &lwork, info ); } } /* Factorize free columns */ if (nfxd < minmn) { sm = m - nfxd; sn = n - nfxd; sminmn = minmn - nfxd; if (nb < sminmn) { j = nfxd; // Set the original matrix to the GPU magma_dsetmatrix_async( m, sn, A (0,j), lda, dA(0,j), ldda, stream ); } /* Initialize partial column norms. */ for (j = nfxd; j < n; ++j) { rwork[j] = magma_cblas_dnrm2( sm, A(nfxd,j), ione ); rwork[n + j] = rwork[j]; } j = nfxd; if (nb < sminmn) { /* Use blocked code initially. */ magma_queue_sync( stream ); /* Compute factorization: while loop. */ topbmn = minmn - nb; while(j < topbmn) { jb = min(nb, topbmn - j); /* Factorize JB columns among columns J:N. */ n_j = n - j; if (j > nfxd) { // Get panel to the CPU magma_dgetmatrix( m-j, jb, dA(j,j), ldda, A (j,j), lda ); // Get the rows magma_dgetmatrix( jb, n_j - jb, dA(j,j + jb), ldda, A (j,j + jb), lda ); } magma_dlaqps( m, n_j, j, jb, &fjb, A (0, j), lda, dA(0, j), ldda, &jpvt[j], &tau[j], &rwork[j], &rwork[n + j], work, &work[jb], n_j, &df[jb], n_j ); j += fjb; /* fjb is actual number of columns factored */ } } /* Use unblocked code to factor the last or only block. */ if (j < minmn) { n_j = n - j; if (j > nfxd) { magma_dgetmatrix( m-j, n_j, dA(j,j), ldda, A (j,j), lda ); } lapackf77_dlaqp2(&m, &n_j, &j, A(0, j), &lda, &jpvt[j], &tau[j], &rwork[j], &rwork[n+j], work ); } } work[0] = MAGMA_D_MAKE( lwkopt, 0. ); magma_free( dwork ); magma_queue_destroy( stream ); return *info; } /* magma_dgeqp3 */
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeev */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time, cpu_time; double *h_A, *h_R, *VL, *VR, *h_work, *w1, *w2; double *w1i, *w2i; magmaDoubleComplex *w1copy, *w2copy; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; double tnrm, result[9]; magma_int_t N, n2, lda, nb, lwork, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double ulp, ulpinv, error; magma_int_t status = 0; ulp = lapackf77_dlamch( "P" ); ulpinv = 1./ulp; magma_opts opts; parse_opts( argc, argv, &opts ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tolulp = opts.tolerance * lapackf77_dlamch("P"); // enable at least some minimal checks, if requested if ( opts.check && !opts.lapack && opts.jobvl == MagmaNoVec && opts.jobvr == MagmaNoVec ) { fprintf( stderr, "NOTE: Some checks require vectors to be computed;\n" " set jobvl=V (option -LV), or jobvr=V (option -RV), or both.\n" " Some checks require running lapack (-l); setting lapack.\n\n"); opts.lapack = true; } printf(" N CPU Time (sec) GPU Time (sec) |W_magma - W_lapack| / |W_lapack|\n"); printf("===========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; nb = magma_get_dgehrd_nb(N); lwork = N*(2 + nb); // generous workspace - required by dget22 lwork = max( lwork, N*(5 + 2*N) ); TESTING_MALLOC_CPU( w1copy, magmaDoubleComplex, N ); TESTING_MALLOC_CPU( w2copy, magmaDoubleComplex, N ); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); TESTING_MALLOC_CPU( w1i, double, N ); TESTING_MALLOC_CPU( w2i, double, N ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_PIN( VL, double, n2 ); TESTING_MALLOC_PIN( VR, double, n2 ); TESTING_MALLOC_PIN( h_work, double, lwork ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dgeev( opts.jobvl, opts.jobvr, N, h_R, lda, w1, w1i, VL, lda, VR, lda, h_work, lwork, opts.queue, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_dgeev returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result =================================================================== */ if ( opts.check ) { /* =================================================================== * Check the result following LAPACK's [zcds]drvev routine. * The following tests are performed: * (1) | A * VR - VR * W | / ( n |A| ) * * Here VR is the matrix of unit right eigenvectors. * W is a diagonal matrix with diagonal entries W(j). * * (2) | |VR(i)| - 1 | and whether largest component real * * VR(i) denotes the i-th column of VR. * * (3) | A**T * VL - VL * W**T | / ( n |A| ) * * Here VL is the matrix of unit left eigenvectors, A**T is the * transpose of A, and W is as above. * * (4) | |VL(i)| - 1 | and whether largest component real * * VL(i) denotes the i-th column of VL. * * (5) W(full) = W(partial, W only) -- currently skipped * (6) W(full) = W(partial, W and VR) * (7) W(full) = W(partial, W and VL) * * W(full) denotes the eigenvalues computed when both VR and VL * are also computed, and W(partial) denotes the eigenvalues * computed when only W, only W and VR, or only W and VL are * computed. * * (8) VR(full) = VR(partial, W and VR) * * VR(full) denotes the right eigenvectors computed when both VR * and VL are computed, and VR(partial) denotes the result * when only VR is computed. * * (9) VL(full) = VL(partial, W and VL) * * VL(full) denotes the left eigenvectors computed when both VR * and VL are also computed, and VL(partial) denotes the result * when only VL is computed. * * (1, 2) only if jobvr = V * (3, 4) only if jobvl = V * (5-9) only if check = 2 (option -c2) ================================================================= */ double vmx, vrmx, vtst; // Initialize result. -1 indicates test was not run. for( int j = 0; j < 9; ++j ) result[j] = -1.; if ( opts.jobvr == MagmaVec ) { // Do test 1: | A * VR - VR * W | / ( n |A| ) // Note this writes result[1] also lapackf77_dget22( MagmaNoTransStr, MagmaNoTransStr, MagmaNoTransStr, &N, h_A, &lda, VR, &lda, w1, w1i, h_work, &result[0] ); result[0] *= ulp; // Do test 2: | |VR(i)| - 1 | and whether largest component real result[1] = -1.; for( int j = 0; j < N; ++j ) { tnrm = 1.; if (w1i[j] == 0.) tnrm = magma_cblas_dnrm2( N, &VR[j*lda], ione ); else if (w1i[j] > 0.) tnrm = magma_dlapy2( magma_cblas_dnrm2( N, &VR[j*lda], ione ), magma_cblas_dnrm2( N, &VR[(j+1)*lda], ione )); result[1] = max( result[1], min( ulpinv, MAGMA_D_ABS(tnrm-1.)/ulp )); if (w1i[j] > 0.) { vmx = vrmx = 0.; for( int jj = 0; jj < N; ++jj ) { vtst = magma_dlapy2( VR[jj+j*lda], VR[jj+(j+1)*lda]); if (vtst > vmx) vmx = vtst; if ( (VR[jj + (j+1)*lda])==0. && MAGMA_D_ABS( VR[jj+j*lda] ) > vrmx) { vrmx = MAGMA_D_ABS( VR[jj+j*lda] ); } } if (vrmx / vmx < 1. - ulp*2.) result[1] = ulpinv; } } result[1] *= ulp; } if ( opts.jobvl == MagmaVec ) { // Do test 3: | A**T * VL - VL * W**T | / ( n |A| ) // Note this writes result[3] also lapackf77_dget22( MagmaTransStr, MagmaNoTransStr, MagmaTransStr, &N, h_A, &lda, VL, &lda, w1, w1i, h_work, &result[2] ); result[2] *= ulp; // Do test 4: | |VL(i)| - 1 | and whether largest component real result[3] = -1.; for( int j = 0; j < N; ++j ) { tnrm = 1.; if (w1i[j] == 0.) tnrm = magma_cblas_dnrm2( N, &VL[j*lda], ione ); else if (w1i[j] > 0.) tnrm = magma_dlapy2( magma_cblas_dnrm2( N, &VL[j*lda], ione ), magma_cblas_dnrm2( N, &VL[(j+1)*lda], ione )); result[3] = max( result[3], min( ulpinv, MAGMA_D_ABS(tnrm-1.)/ulp )); if (w1i[j] > 0.) { vmx = vrmx = 0.; for( int jj = 0; jj < N; ++jj ) { vtst = magma_dlapy2( VL[jj+j*lda], VL[jj+(j+1)*lda]); if (vtst > vmx) vmx = vtst; if ( (VL[jj + (j+1)*lda])==0. && MAGMA_D_ABS( VL[jj+j*lda]) > vrmx) { vrmx = MAGMA_D_ABS( VL[jj+j*lda] ); } } if (vrmx / vmx < 1. - ulp*2.) result[3] = ulpinv; } } result[3] *= ulp; } } if ( opts.check == 2 ) { // more extensive tests // this is really slow because it calls magma_zgeev multiple times double *LRE, DUM; TESTING_MALLOC_PIN( LRE, double, n2 ); lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); // ---------- // Compute eigenvalues, left and right eigenvectors magma_dgeev( MagmaVec, MagmaVec, N, h_R, lda, w1, w1i, VL, lda, VR, lda, h_work, lwork, opts.queue, &info ); if (info != 0) printf("magma_zgeev (case V, V) returned error %d: %s.\n", (int) info, magma_strerror( info )); // ---------- // Compute eigenvalues only // These are not exactly equal, and not in the same order, so skip for now. //lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); //magma_dgeev( MagmaNoVec, MagmaNoVec, // N, h_R, lda, w2, w2i, // &DUM, 1, &DUM, 1, // h_work, lwork, opts.queue, &info ); //if (info != 0) // printf("magma_dgeev (case N, N) returned error %d: %s.\n", // (int) info, magma_strerror( info )); // //// Do test 5: W(full) = W(partial, W only) //result[4] = 1; //for( int j = 0; j < N; ++j ) // if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) // result[4] = 0; // ---------- // Compute eigenvalues and right eigenvectors lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_dgeev( MagmaNoVec, MagmaVec, N, h_R, lda, w2, w2i, &DUM, 1, LRE, lda, h_work, lwork, opts.queue, &info ); if (info != 0) printf("magma_dgeev (case N, V) returned error %d: %s.\n", (int) info, magma_strerror( info )); // Do test 6: W(full) = W(partial, W and VR) result[5] = 1; for( int j = 0; j < N; ++j ) if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) result[5] = 0; // Do test 8: VR(full) = VR(partial, W and VR) result[7] = 1; for( int j = 0; j < N; ++j ) for( int jj = 0; jj < N; ++jj ) if ( ! MAGMA_D_EQUAL( VR[j+jj*lda], LRE[j+jj*lda] )) result[7] = 0; // ---------- // Compute eigenvalues and left eigenvectors lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_dgeev( MagmaVec, MagmaNoVec, N, h_R, lda, w2, w2i, LRE, lda, &DUM, 1, h_work, lwork, opts.queue, &info ); if (info != 0) printf("magma_dgeev (case V, N) returned error %d: %s.\n", (int) info, magma_strerror( info )); // Do test 7: W(full) = W(partial, W and VL) result[6] = 1; for( int j = 0; j < N; ++j ) if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) result[6] = 0; // Do test 9: VL(full) = VL(partial, W and VL) result[8] = 1; for( int j = 0; j < N; ++j ) for( int jj = 0; jj < N; ++jj ) if ( ! MAGMA_D_EQUAL( VL[j+jj*lda], LRE[j+jj*lda] )) result[8] = 0; TESTING_FREE_PIN( LRE ); } /* ===================================================================== Performs operation using LAPACK Do this after checks, because it overwrites VL and VR. =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_dgeev( lapack_vec_const(opts.jobvl), lapack_vec_const(opts.jobvr), &N, h_A, &lda, w2, w2i, VL, &lda, VR, &lda, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_dgeev returned error %d: %s.\n", (int) info, magma_strerror( info )); // check | W_magma - W_lapack | / | W | // need to sort eigenvalues first // copy them into complex vectors for ease for( int j=0; j < N; ++j ) { w1copy[j] = MAGMA_Z_MAKE( w1[j], w1i[j] ); w2copy[j] = MAGMA_Z_MAKE( w2[j], w2i[j] ); } std::sort( w1copy, &w1copy[N], lessthan ); std::sort( w2copy, &w2copy[N], lessthan ); // adjust sorting to deal with numerical inaccuracy // search down w2 for eigenvalue that matches w1's eigenvalue for( int j=0; j < N; ++j ) { for( int j2=j; j2 < N; ++j2 ) { magmaDoubleComplex diff = MAGMA_Z_SUB( w1copy[j], w2copy[j2] ); double diff2 = magma_dzlapy2( diff ) / max( magma_dzlapy2( w1copy[j] ), tol ); if ( diff2 < 100*tol ) { if ( j != j2 ) { std::swap( w2copy[j], w2copy[j2] ); } break; } } } blasf77_zaxpy( &N, &c_neg_one, w2copy, &ione, w1copy, &ione ); error = magma_cblas_dznrm2( N, w1copy, 1 ); error /= magma_cblas_dznrm2( N, w2copy, 1 ); printf("%5d %7.2f %7.2f %8.2e %s\n", (int) N, cpu_time, gpu_time, error, (error < tolulp ? "ok" : "failed")); status += ! (error < tolulp); } else { printf("%5d --- %7.2f\n", (int) N, gpu_time); } if ( opts.check ) { // -1 indicates test was not run if ( result[0] != -1 ) { printf(" | A * VR - VR * W | / ( n |A| ) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } if ( result[1] != -1 ) { printf(" | |VR(i)| - 1 | = %8.2e %s\n", result[1], (result[1] < tol ? "ok" : "failed")); } if ( result[2] != -1 ) { printf(" | A'* VL - VL * W'| / ( n |A| ) = %8.2e %s\n", result[2], (result[2] < tol ? "ok" : "failed")); } if ( result[3] != -1 ) { printf(" | |VL(i)| - 1 | = %8.2e %s\n", result[3], (result[3] < tol ? "ok" : "failed")); } if ( result[4] != -1 ) { printf(" W (full) == W (partial, W only) %s\n", (result[4] == 1. ? "ok" : "failed")); } if ( result[5] != -1 ) { printf(" W (full) == W (partial, W and VR) %s\n", (result[5] == 1. ? "ok" : "failed")); } if ( result[6] != -1 ) { printf(" W (full) == W (partial, W and VL) %s\n", (result[6] == 1. ? "ok" : "failed")); } if ( result[7] != -1 ) { printf(" VR (full) == VR (partial, W and VR) %s\n", (result[7] == 1. ? "ok" : "failed")); } if ( result[8] != -1 ) { printf(" VL (full) == VL (partial, W and VL) %s\n", (result[8] == 1. ? "ok" : "failed")); } int newline = 0; if ( result[0] != -1 ) { status += ! (result[0] < tol); newline = 1; } if ( result[1] != -1 ) { status += ! (result[1] < tol); newline = 1; } if ( result[2] != -1 ) { status += ! (result[2] < tol); newline = 1; } if ( result[3] != -1 ) { status += ! (result[3] < tol); newline = 1; } if ( result[4] != -1 ) { status += ! (result[4] == 1.); newline = 1; } if ( result[5] != -1 ) { status += ! (result[5] == 1.); newline = 1; } if ( result[6] != -1 ) { status += ! (result[6] == 1.); newline = 1; } if ( result[7] != -1 ) { status += ! (result[7] == 1.); newline = 1; } if ( result[8] != -1 ) { status += ! (result[8] == 1.); newline = 1; } if ( newline ) { printf( "\n" ); } } TESTING_FREE_CPU( w1copy ); TESTING_FREE_CPU( w2copy ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( w1i ); TESTING_FREE_CPU( w2i ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( VL ); TESTING_FREE_PIN( VR ); TESTING_FREE_PIN( h_work ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/** Purpose ------- DGEEV computes for an N-by-N real nonsymmetric matrix A, the eigenvalues and, optionally, the left and/or right eigenvectors. The right eigenvector v(j) of A satisfies A * v(j) = lambda(j) * v(j) where lambda(j) is its eigenvalue. The left eigenvector u(j) of A satisfies u(j)**T * A = lambda(j) * u(j)**T where u(j)**T denotes the transpose of u(j). The computed eigenvectors are normalized to have Euclidean norm equal to 1 and largest component real. Arguments --------- @param[in] jobvl magma_vec_t - = MagmaNoVec: left eigenvectors of A are not computed; - = MagmaVec: left eigenvectors of are computed. @param[in] jobvr magma_vec_t - = MagmaNoVec: right eigenvectors of A are not computed; - = MagmaVec: right eigenvectors of A are computed. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A DOUBLE PRECISION array, dimension (LDA,N) On entry, the N-by-N matrix A. On exit, A has been overwritten. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] wr DOUBLE PRECISION array, dimension (N) @param[out] wi DOUBLE PRECISION array, dimension (N) WR and WI contain the real and imaginary parts, respectively, of the computed eigenvalues. Complex conjugate pairs of eigenvalues appear consecutively with the eigenvalue having the positive imaginary part first. @param[out] VL DOUBLE PRECISION array, dimension (LDVL,N) If JOBVL = MagmaVec, the left eigenvectors u(j) are stored one after another in the columns of VL, in the same order as their eigenvalues. If JOBVL = MagmaNoVec, VL is not referenced. u(j) = VL(:,j), the j-th column of VL. @param[in] ldvl INTEGER The leading dimension of the array VL. LDVL >= 1; if JOBVL = MagmaVec, LDVL >= N. @param[out] VR DOUBLE PRECISION array, dimension (LDVR,N) If JOBVR = MagmaVec, the right eigenvectors v(j) are stored one after another in the columns of VR, in the same order as their eigenvalues. If JOBVR = MagmaNoVec, VR is not referenced. v(j) = VR(:,j), the j-th column of VR. @param[in] ldvr INTEGER The leading dimension of the array VR. LDVR >= 1; if JOBVR = MagmaVec, LDVR >= N. @param[out] work (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= (2+nb)*N. For optimal performance, LWORK >= (2+2*nb)*N. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value. - > 0: if INFO = i, the QR algorithm failed to compute all the eigenvalues, and no eigenvectors have been computed; elements and i+1:N of W contain eigenvalues which have converged. @ingroup magma_dgeev_driver ********************************************************************/ extern "C" magma_int_t magma_dgeev_m( magma_vec_t jobvl, magma_vec_t jobvr, magma_int_t n, double *A, magma_int_t lda, double *wr, double *wi, double *VL, magma_int_t ldvl, double *VR, magma_int_t ldvr, double *work, magma_int_t lwork, magma_int_t *info ) { #define VL(i,j) (VL + (i) + (j)*ldvl) #define VR(i,j) (VR + (i) + (j)*ldvr) const magma_int_t ione = 1; const magma_int_t izero = 0; double d__1, d__2; double r, cs, sn, scl; double dum[1], eps; double anrm, cscale, bignum, smlnum; magma_int_t i, k, ilo, ihi; magma_int_t ibal, ierr, itau, iwrk, nout, liwrk, nb; magma_int_t scalea, minwrk, optwrk, lquery, wantvl, wantvr, select[1]; magma_side_t side = MagmaRight; magma_timer_t time_total=0, time_gehrd=0, time_unghr=0, time_hseqr=0, time_trevc=0, time_sum=0; magma_flops_t flop_total=0, flop_gehrd=0, flop_unghr=0, flop_hseqr=0, flop_trevc=0, flop_sum=0; timer_start( time_total ); flops_start( flop_total ); *info = 0; lquery = (lwork == -1); wantvl = (jobvl == MagmaVec); wantvr = (jobvr == MagmaVec); if (! wantvl && jobvl != MagmaNoVec) { *info = -1; } else if (! wantvr && jobvr != MagmaNoVec) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if ( (ldvl < 1) || (wantvl && (ldvl < n))) { *info = -9; } else if ( (ldvr < 1) || (wantvr && (ldvr < n))) { *info = -11; } /* Compute workspace */ nb = magma_get_dgehrd_nb( n ); if (*info == 0) { minwrk = (2 + nb)*n; optwrk = (2 + 2*nb)*n; work[0] = MAGMA_D_MAKE( (double) optwrk, 0. ); if (lwork < minwrk && ! lquery) { *info = -13; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } #if defined(Version3) || defined(Version4) || defined(Version5) double *dT; if (MAGMA_SUCCESS != magma_dmalloc( &dT, nb*n )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } #endif #if defined(Version4) || defined(Version5) double *T; if (MAGMA_SUCCESS != magma_dmalloc_cpu( &T, nb*n )) { magma_free( dT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } #endif /* Get machine constants */ eps = lapackf77_dlamch( "P" ); smlnum = lapackf77_dlamch( "S" ); bignum = 1. / smlnum; lapackf77_dlabad( &smlnum, &bignum ); smlnum = magma_dsqrt( smlnum ) / eps; bignum = 1. / smlnum; /* Scale A if max element outside range [SMLNUM,BIGNUM] */ anrm = lapackf77_dlange( "M", &n, &n, A, &lda, dum ); scalea = 0; if (anrm > 0. && anrm < smlnum) { scalea = 1; cscale = smlnum; } else if (anrm > bignum) { scalea = 1; cscale = bignum; } if (scalea) { lapackf77_dlascl( "G", &izero, &izero, &anrm, &cscale, &n, &n, A, &lda, &ierr ); } /* Balance the matrix * (Workspace: need N) * - this space is reserved until after gebak */ ibal = 0; lapackf77_dgebal( "B", &n, A, &lda, &ilo, &ihi, &work[ibal], &ierr ); /* Reduce to upper Hessenberg form * (Workspace: need 3*N, prefer 2*N + N*NB) * - including N reserved for gebal/gebak, unused by dgehrd */ itau = ibal + n; iwrk = itau + n; liwrk = lwork - iwrk; timer_start( time_gehrd ); flops_start( flop_gehrd ); #if defined(Version1) // Version 1 - LAPACK lapackf77_dgehrd( &n, &ilo, &ihi, A, &lda, &work[itau], &work[iwrk], &liwrk, &ierr ); #elif defined(Version2) // Version 2 - LAPACK consistent HRD magma_dgehrd2( n, ilo, ihi, A, lda, &work[itau], &work[iwrk], liwrk, &ierr ); #elif defined(Version3) // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored, magma_dgehrd( n, ilo, ihi, A, lda, &work[itau], &work[iwrk], liwrk, dT, &ierr ); #elif defined(Version4) || defined(Version5) // Version 4 - Multi-GPU, T on host magma_dgehrd_m( n, ilo, ihi, A, lda, &work[itau], &work[iwrk], liwrk, T, &ierr ); magma_dsetmatrix( nb, n, T, nb, dT, nb ); #endif time_sum += timer_stop( time_gehrd ); flop_sum += flops_stop( flop_gehrd ); if (wantvl) { /* Want left eigenvectors * Copy Householder vectors to VL */ side = MagmaLeft; lapackf77_dlacpy( MagmaLowerStr, &n, &n, A, &lda, VL, &ldvl ); /* Generate orthogonal matrix in VL * (Workspace: need 3*N-1, prefer 2*N + (N-1)*NB) * - including N reserved for gebal/gebak, unused by dorghr */ timer_start( time_unghr ); flops_start( flop_unghr ); #if defined(Version1) || defined(Version2) // Version 1 & 2 - LAPACK lapackf77_dorghr( &n, &ilo, &ihi, VL, &ldvl, &work[itau], &work[iwrk], &liwrk, &ierr ); #elif defined(Version3) || defined(Version4) // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored magma_dorghr( n, ilo, ihi, VL, ldvl, &work[itau], dT, nb, &ierr ); #elif defined(Version5) // Version 5 - Multi-GPU, T on host magma_dorghr_m( n, ilo, ihi, VL, ldvl, &work[itau], T, nb, &ierr ); #endif time_sum += timer_stop( time_unghr ); flop_sum += flops_stop( flop_unghr ); timer_start( time_hseqr ); flops_start( flop_hseqr ); /* Perform QR iteration, accumulating Schur vectors in VL * (Workspace: need N+1, prefer N+HSWORK (see comments) ) * - including N reserved for gebal/gebak, unused by dhseqr */ iwrk = itau; liwrk = lwork - iwrk; lapackf77_dhseqr( "S", "V", &n, &ilo, &ihi, A, &lda, wr, wi, VL, &ldvl, &work[iwrk], &liwrk, info ); time_sum += timer_stop( time_hseqr ); flop_sum += flops_stop( flop_hseqr ); if (wantvr) { /* Want left and right eigenvectors * Copy Schur vectors to VR */ side = MagmaBothSides; lapackf77_dlacpy( "F", &n, &n, VL, &ldvl, VR, &ldvr ); } } else if (wantvr) { /* Want right eigenvectors * Copy Householder vectors to VR */ side = MagmaRight; lapackf77_dlacpy( "L", &n, &n, A, &lda, VR, &ldvr ); /* Generate orthogonal matrix in VR * (Workspace: need 3*N-1, prefer 2*N + (N-1)*NB) * - including N reserved for gebal/gebak, unused by dorghr */ timer_start( time_unghr ); flops_start( flop_unghr ); #if defined(Version1) || defined(Version2) // Version 1 & 2 - LAPACK lapackf77_dorghr( &n, &ilo, &ihi, VR, &ldvr, &work[itau], &work[iwrk], &liwrk, &ierr ); #elif defined(Version3) || defined(Version4) // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored magma_dorghr( n, ilo, ihi, VR, ldvr, &work[itau], dT, nb, &ierr ); #elif defined(Version5) // Version 5 - Multi-GPU, T on host magma_dorghr_m( n, ilo, ihi, VR, ldvr, &work[itau], T, nb, &ierr ); #endif time_sum += timer_stop( time_unghr ); flop_sum += flops_stop( flop_unghr ); /* Perform QR iteration, accumulating Schur vectors in VR * (Workspace: need N+1, prefer N+HSWORK (see comments) ) * - including N reserved for gebal/gebak, unused by dhseqr */ timer_start( time_hseqr ); flops_start( flop_hseqr ); iwrk = itau; liwrk = lwork - iwrk; lapackf77_dhseqr( "S", "V", &n, &ilo, &ihi, A, &lda, wr, wi, VR, &ldvr, &work[iwrk], &liwrk, info ); time_sum += timer_stop( time_hseqr ); flop_sum += flops_stop( flop_hseqr ); } else { /* Compute eigenvalues only * (Workspace: need N+1, prefer N+HSWORK (see comments) ) * - including N reserved for gebal/gebak, unused by dhseqr */ timer_start( time_hseqr ); flops_start( flop_hseqr ); iwrk = itau; liwrk = lwork - iwrk; lapackf77_dhseqr( "E", "N", &n, &ilo, &ihi, A, &lda, wr, wi, VR, &ldvr, &work[iwrk], &liwrk, info ); time_sum += timer_stop( time_hseqr ); flop_sum += flops_stop( flop_hseqr ); } /* If INFO > 0 from DHSEQR, then quit */ if (*info > 0) { goto CLEANUP; } timer_start( time_trevc ); flops_start( flop_trevc ); if (wantvl || wantvr) { /* Compute left and/or right eigenvectors * (Workspace: need 4*N, prefer (2 + 2*nb)*N) * - including N reserved for gebal/gebak, unused by dtrevc */ liwrk = lwork - iwrk; #if TREVC_VERSION == 1 lapackf77_dtrevc( lapack_side_const(side), "B", select, &n, A, &lda, VL, &ldvl, VR, &ldvr, &n, &nout, &work[iwrk], &ierr ); #elif TREVC_VERSION == 2 lapackf77_dtrevc3( lapack_side_const(side), "B", select, &n, A, &lda, VL, &ldvl, VR, &ldvr, &n, &nout, &work[iwrk], &liwrk, &ierr ); #elif TREVC_VERSION == 3 magma_dtrevc3( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl, VR, ldvr, n, &nout, &work[iwrk], liwrk, &ierr ); #elif TREVC_VERSION == 4 magma_dtrevc3_mt( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl, VR, ldvr, n, &nout, &work[iwrk], liwrk, &ierr ); #elif TREVC_VERSION == 5 magma_dtrevc3_mt_gpu( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl, VR, ldvr, n, &nout, &work[iwrk], liwrk, &ierr ); #else #error Unknown TREVC_VERSION #endif } time_sum += timer_stop( time_trevc ); flop_sum += flops_stop( flop_trevc ); if (wantvl) { /* Undo balancing of left eigenvectors * (Workspace: need N) */ lapackf77_dgebak( "B", "L", &n, &ilo, &ihi, &work[ibal], &n, VL, &ldvl, &ierr ); /* Normalize left eigenvectors and make largest component real */ for (i = 0; i < n; ++i) { if ( wi[i] == 0. ) { scl = 1. / magma_cblas_dnrm2( n, VL(0,i), 1 ); blasf77_dscal( &n, &scl, VL(0,i), &ione ); } else if ( wi[i] > 0. ) { d__1 = magma_cblas_dnrm2( n, VL(0,i), 1 ); d__2 = magma_cblas_dnrm2( n, VL(0,i+1), 1 ); scl = 1. / lapackf77_dlapy2( &d__1, &d__2 ); blasf77_dscal( &n, &scl, VL(0,i), &ione ); blasf77_dscal( &n, &scl, VL(0,i+1), &ione ); for (k = 0; k < n; ++k) { /* Computing 2nd power */ d__1 = *VL(k,i); d__2 = *VL(k,i+1); work[iwrk + k] = d__1*d__1 + d__2*d__2; } k = blasf77_idamax( &n, &work[iwrk], &ione ) - 1; // subtract 1; k is 0-based lapackf77_dlartg( VL(k,i), VL(k,i+1), &cs, &sn, &r ); blasf77_drot( &n, VL(0,i), &ione, VL(0,i+1), &ione, &cs, &sn ); *VL(k,i+1) = 0.; } } } if (wantvr) { /* Undo balancing of right eigenvectors * (Workspace: need N) */ lapackf77_dgebak( "B", "R", &n, &ilo, &ihi, &work[ibal], &n, VR, &ldvr, &ierr ); /* Normalize right eigenvectors and make largest component real */ for (i = 0; i < n; ++i) { if ( wi[i] == 0. ) { scl = 1. / magma_cblas_dnrm2( n, VR(0,i), 1 ); blasf77_dscal( &n, &scl, VR(0,i), &ione ); } else if ( wi[i] > 0. ) { d__1 = magma_cblas_dnrm2( n, VR(0,i), 1 ); d__2 = magma_cblas_dnrm2( n, VR(0,i+1), 1 ); scl = 1. / lapackf77_dlapy2( &d__1, &d__2 ); blasf77_dscal( &n, &scl, VR(0,i), &ione ); blasf77_dscal( &n, &scl, VR(0,i+1), &ione ); for (k = 0; k < n; ++k) { /* Computing 2nd power */ d__1 = *VR(k,i); d__2 = *VR(k,i+1); work[iwrk + k] = d__1*d__1 + d__2*d__2; } k = blasf77_idamax( &n, &work[iwrk], &ione ) - 1; // subtract 1; k is 0-based lapackf77_dlartg( VR(k,i), VR(k,i+1), &cs, &sn, &r ); blasf77_drot( &n, VR(0,i), &ione, VR(0,i+1), &ione, &cs, &sn ); *VR(k,i+1) = 0.; } } } CLEANUP: /* Undo scaling if necessary */ if (scalea) { // converged eigenvalues, stored in wr[i+1:n] and wi[i+1:n] for i = INFO magma_int_t nval = n - (*info); magma_int_t ld = max( nval, 1 ); lapackf77_dlascl( "G", &izero, &izero, &cscale, &anrm, &nval, &ione, wr + (*info), &ld, &ierr ); lapackf77_dlascl( "G", &izero, &izero, &cscale, &anrm, &nval, &ione, wi + (*info), &ld, &ierr ); if (*info > 0) { // first ilo columns were already upper triangular, // so the corresponding eigenvalues are also valid. nval = ilo - 1; lapackf77_dlascl( "G", &izero, &izero, &cscale, &anrm, &nval, &ione, wr, &n, &ierr ); lapackf77_dlascl( "G", &izero, &izero, &cscale, &anrm, &nval, &ione, wi, &n, &ierr ); } } #if defined(Version3) || defined(Version4) || defined(Version5) magma_free( dT ); #endif #if defined(Version4) || defined(Version5) magma_free_cpu( T ); #endif timer_stop( time_total ); flops_stop( flop_total ); timer_printf( "dgeev times n %5d, gehrd %7.3f, unghr %7.3f, hseqr %7.3f, trevc %7.3f, total %7.3f, sum %7.3f\n", (int) n, time_gehrd, time_unghr, time_hseqr, time_trevc, time_total, time_sum ); timer_printf( "dgeev flops n %5d, gehrd %7lld, unghr %7lld, hseqr %7lld, trevc %7lld, total %7lld, sum %7lld\n", (int) n, flop_gehrd, flop_unghr, flop_hseqr, flop_trevc, flop_total, flop_sum ); work[0] = MAGMA_D_MAKE( (double) optwrk, 0. ); return *info; } /* magma_dgeev */