extern "C" magma_int_t magma_zheevd(char jobz, char uplo, magma_int_t n, magmaDoubleComplex *a, magma_int_t lda, double *w, magmaDoubleComplex *work, magma_int_t lwork, double *rwork, magma_int_t lrwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= ZHEEVD computes all eigenvalues and, optionally, eigenvectors of a complex Hermitian matrix A. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments ========= JOBZ (input) CHARACTER*1 = 'N': Compute eigenvalues only; = 'V': Compute eigenvalues and eigenvectors. UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA, N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = 'V', then if INFO = 0, A contains the orthonormal eigenvectors of the matrix A. If JOBZ = 'N', then on exit the lower triangle (if UPLO='L') or the upper triangle (if UPLO='U') of A, including the diagonal, is destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). W (output) DOUBLE PRECISION array, dimension (N) If INFO = 0, the eigenvalues in ascending order. 
WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = 'N' and N > 1, LWORK >= N + N*NB. If JOBZ = 'V' and N > 1, LWORK >= max( N + N*NB, 2*N + N**2 ). NB can be obtained through magma_get_zhetrd_nb(N). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. RWORK (workspace/output) DOUBLE PRECISION array, dimension (LRWORK) On exit, if INFO = 0, RWORK[0] returns the optimal LRWORK. LRWORK (input) INTEGER The dimension of the array RWORK. If N <= 1, LRWORK >= 1. If JOBZ = 'N' and N > 1, LRWORK >= N. If JOBZ = 'V' and N > 1, LRWORK >= 1 + 5*N + 2*N**2. If LRWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. LIWORK (input) INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = 'N' and N > 1, LIWORK >= 1. If JOBZ = 'V' and N > 1, LIWORK >= 3 + 5*N. If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. 
INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i and JOBZ = 'N', then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = 'V', then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details =============== Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified description of INFO. Sven, 16 Feb 05. ===================================================================== */ char uplo_[2] = {uplo, 0}; char jobz_[2] = {jobz, 0}; magma_int_t ione = 1; magma_int_t izero = 0; double d_one = 1.; double d__1; double eps; magma_int_t inde; double anrm; magma_int_t imax; double rmin, rmax; double sigma; magma_int_t iinfo, lwmin; magma_int_t lower; magma_int_t llrwk; magma_int_t wantz; magma_int_t indwk2, llwrk2; magma_int_t iscale; double safmin; double bignum; magma_int_t indtau; magma_int_t indrwk, indwrk, liwmin; magma_int_t lrwmin, llwork; double smlnum; magma_int_t lquery; double* dwork; wantz = lapackf77_lsame(jobz_, MagmaVecStr); lower = lapackf77_lsame(uplo_, MagmaLowerStr); lquery = lwork == -1 || lrwork == -1 || liwork == -1; *info = 0; if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVecStr))) { *info = -1; } else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } magma_int_t nb = magma_get_zhetrd_nb( n ); if ( n <= 1 ) { lwmin = 1; lrwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( n + n*nb, 2*n + n*n ); lrwmin = 1 + 5*n + 2*n*n; liwmin = 3 + 5*n; } else { lwmin = n + n*nb; lrwmin = n; liwmin = 1; } // multiply by 1+eps to ensure length gets rounded up, // if it cannot be exactly represented in floating point. work[0] = MAGMA_Z_MAKE( lwmin * (1. 
+ lapackf77_dlamch("Epsilon")), 0.); rwork[0] = lrwmin * (1. + lapackf77_dlamch("Epsilon")); iwork[0] = liwmin; if ((lwork < lwmin) && !lquery) { *info = -8; } else if ((lrwork < lrwmin) && ! lquery) { *info = -10; } else if ((liwork < liwmin) && ! lquery) { *info = -12; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } if (n == 1) { w[0] = MAGMA_Z_REAL(a[0]); if (wantz) { a[0] = MAGMA_Z_ONE; } return *info; } /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128){ #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif lapackf77_zheevd(jobz_, uplo_, &n, a, &lda, w, work, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, info); return *info; } /* Get machine constants. */ safmin = lapackf77_dlamch("Safe minimum"); eps = lapackf77_dlamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_dsqrt(smlnum); rmax = magma_dsqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = lapackf77_zlanhe("M", uplo_, &n, a, &lda, rwork); iscale = 0; if (anrm > 0. && anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { lapackf77_zlascl(uplo_, &izero, &izero, &d_one, &sigma, &n, &n, a, &lda, info); } /* Call ZHETRD to reduce Hermitian matrix to tridiagonal form. 
*/ // zhetrd rwork: e (n) // zstedx rwork: e (n) + llrwk (1 + 4*N + 2*N**2) ==> 1 + 5n + 2n^2 inde = 0; indrwk = inde + n; llrwk = lrwork - indrwk; // zhetrd work: tau (n) + llwork (n*nb) ==> n + n*nb // zstedx work: tau (n) + z (n^2) // zunmtr work: tau (n) + z (n^2) + llwrk2 (n or n*nb) ==> 2n + n^2, or n + n*nb + n^2 indtau = 0; indwrk = indtau + n; indwk2 = indwrk + n*n; llwork = lwork - indwrk; llwrk2 = lwork - indwk2; // #ifdef ENABLE_TIMER magma_timestr_t start, end; start = get_current_time(); #endif magma_zhetrd(uplo_[0], n, a, lda, w, &rwork[inde], &work[indtau], &work[indwrk], llwork, &iinfo); #ifdef ENABLE_TIMER end = get_current_time(); printf("time zhetrd = %6.2f\n", GetTimerValue(start,end)/1000.); #endif /* For eigenvalues only, call DSTERF. For eigenvectors, first call ZSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the tridiagonal matrix, then call ZUNMTR to multiply it to the Householder transformations represented as Householder vectors in A. */ if (! wantz) { lapackf77_dsterf(&n, w, &rwork[inde], info); } else { #ifdef ENABLE_TIMER start = get_current_time(); #endif if (MAGMA_SUCCESS != magma_dmalloc( &dwork, 3*n*(n/2 + 1) )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_zstedx('A', n, 0., 0., 0, 0, w, &rwork[inde], &work[indwrk], n, &rwork[indrwk], llrwk, iwork, liwork, dwork, info); magma_free( dwork ); #ifdef ENABLE_TIMER end = get_current_time(); printf("time zstedx = %6.2f\n", GetTimerValue(start,end)/1000.); start = get_current_time(); #endif magma_zunmtr(MagmaLeft, uplo, MagmaNoTrans, n, n, a, lda, &work[indtau], &work[indwrk], n, &work[indwk2], llwrk2, &iinfo); lapackf77_zlacpy("A", &n, &n, &work[indwrk], &n, a, &lda); #ifdef ENABLE_TIMER end = get_current_time(); printf("time zunmtr + copy = %6.2f\n", GetTimerValue(start,end)/1000.); #endif } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { if (*info == 0) { imax = n; } else { imax = *info - 1; } d__1 = 1. 
/ sigma; blasf77_dscal(&imax, &d__1, w, &ione); } work[0] = MAGMA_Z_MAKE( lwmin * (1. + lapackf77_dlamch("Epsilon")), 0.); // round up rwork[0] = lrwmin * (1. + lapackf77_dlamch("Epsilon")); iwork[0] = liwmin; return *info; } /* magma_zheevd */
extern "C" magma_int_t magma_zheevdx_2stage(char jobz, char range, char uplo, magma_int_t n, magmaDoubleComplex *a, magma_int_t lda, double vl, double vu, magma_int_t il, magma_int_t iu, magma_int_t *m, double *w, magmaDoubleComplex *work, magma_int_t lwork, double *rwork, magma_int_t lrwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZHEEVD_2STAGE computes all eigenvalues and, optionally, eigenvectors of a complex Hermitian matrix A. It uses a two-stage algorithm for the tridiagonalization. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments ========= JOBZ (input) CHARACTER*1 = 'N': Compute eigenvalues only; = 'V': Compute eigenvalues and eigenvectors. RANGE (input) CHARACTER*1 = 'A': all eigenvalues will be found. = 'V': all eigenvalues in the half-open interval (VL,VU] will be found. = 'I': the IL-th through IU-th eigenvalues will be found. UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA, N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. 
On exit, if JOBZ = 'V', then if INFO = 0, the first m columns of A contains the required orthonormal eigenvectors of the matrix A. If JOBZ = 'N', then on exit the lower triangle (if UPLO='L') or the upper triangle (if UPLO='U') of A, including the diagonal, is destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). VL (input) DOUBLE PRECISION VU (input) DOUBLE PRECISION If RANGE='V', the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = 'A' or 'I'. IL (input) INTEGER IU (input) INTEGER If RANGE='I', the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = 'A' or 'V'. M (output) INTEGER The total number of eigenvalues found. 0 <= M <= N. If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. W (output) DOUBLE PRECISION array, dimension (N) If INFO = 0, the required m eigenvalues in ascending order. WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = 'N' and N > 1, LWORK >= LQ2 + N * (NB + 1). If JOBZ = 'V' and N > 1, LWORK >= LQ2 + 2*N + N**2. where LQ2 is the size needed to store the Q2 matrix and is returned by MAGMA_BULGE_GET_LQ2. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. RWORK (workspace/output) DOUBLE PRECISION array, dimension (LRWORK) On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK. LRWORK (input) INTEGER The dimension of the array RWORK. If N <= 1, LRWORK >= 1. If JOBZ = 'N' and N > 1, LRWORK >= N. If JOBZ = 'V' and N > 1, LRWORK >= 1 + 5*N + 2*N**2. 
If LRWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. LIWORK (input) INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = 'N' and N > 1, LIWORK >= 1. If JOBZ = 'V' and N > 1, LIWORK >= 3 + 5*N. If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i and JOBZ = 'N', then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = 'V', then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details =============== Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified description of INFO. Sven, 16 Feb 05. 
===================================================================== */ char uplo_[2] = {uplo, 0}; char jobz_[2] = {jobz, 0}; char range_[2] = {range, 0}; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t ione = 1; magma_int_t izero = 0; double d_one = 1.; double d__1; double eps; double anrm; magma_int_t imax; double rmin, rmax; double sigma; //magma_int_t iinfo; magma_int_t lwmin, lrwmin, liwmin; magma_int_t lower; magma_int_t wantz; magma_int_t iscale; double safmin; double bignum; double smlnum; magma_int_t lquery; magma_int_t alleig, valeig, indeig; double* dwork; /* determine the number of threads */ magma_int_t threads = magma_get_numthreads(); magma_setlapack_numthreads(threads); wantz = lapackf77_lsame(jobz_, MagmaVecStr); lower = lapackf77_lsame(uplo_, MagmaLowerStr); alleig = lapackf77_lsame( range_, "A" ); valeig = lapackf77_lsame( range_, "V" ); indeig = lapackf77_lsame( range_, "I" ); lquery = lwork == -1 || lrwork == -1 || liwork == -1; *info = 0; if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVecStr))) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) { *info = -3; } else if (n < 0) { *info = -4; } else if (lda < max(1,n)) { *info = -6; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -8; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -9; } else if (iu < min(n,il) || iu > n) { *info = -10; } } } magma_int_t nb = magma_get_zbulge_nb(n,threads); magma_int_t Vblksiz = magma_zbulge_get_Vblksiz(n, nb, threads); magma_int_t ldt = Vblksiz; magma_int_t ldv = nb + Vblksiz; magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz); magma_int_t lq2 = magma_zbulge_get_lq2(n, threads); if (wantz) { lwmin = lq2 + 2 * n + n * n; lrwmin = 1 + 5 * n + 2 * n * n; liwmin = 5 * n + 3; } else { lwmin = lq2 + n * (nb + 1); lrwmin = n; liwmin = 1; } work[0] = MAGMA_Z_MAKE( lwmin * (1. + lapackf77_dlamch("Epsilon")), 0.); // round up rwork[0] = lrwmin * (1. 
+ lapackf77_dlamch("Epsilon")); iwork[0] = liwmin; if ((lwork < lwmin) && !lquery) { *info = -14; } else if ((lrwork < lrwmin) && ! lquery) { *info = -16; } else if ((liwork < liwmin) && ! lquery) { *info = -18; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } if (n == 1) { w[0] = MAGMA_Z_REAL(a[0]); if (wantz) { a[0] = MAGMA_Z_ONE; } return *info; } #ifdef ENABLE_TIMER printf("using %d threads\n", threads); #endif /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ magma_int_t ntiles = n/nb; if( ( ntiles < 2 ) || ( n <= 128 ) ){ #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif lapackf77_zheevd(jobz_, &uplo, &n, a, &lda, w, work, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, info); *m = n; return *info; } /* Get machine constants. */ safmin = lapackf77_dlamch("Safe minimum"); eps = lapackf77_dlamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_dsqrt(smlnum); rmax = magma_dsqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = lapackf77_zlanhe("M", uplo_, &n, a, &lda, rwork); iscale = 0; if (anrm > 0. 
&& anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { lapackf77_zlascl(uplo_, &izero, &izero, &d_one, &sigma, &n, &n, a, &lda, info); } magma_int_t indT2 = 0; magma_int_t indTAU2 = indT2 + blkcnt*ldt*Vblksiz; magma_int_t indV2 = indTAU2+ blkcnt*Vblksiz; magma_int_t indtau1 = indV2 + blkcnt*ldv*Vblksiz; magma_int_t indwrk = indtau1+ n; //magma_int_t indwk2 = indwrk + n * n; magma_int_t llwork = lwork - indwrk; //magma_int_t llwrk2 = lwork - indwk2; magma_int_t inde = 0; magma_int_t indrwk = inde + n; magma_int_t llrwk = lrwork - indrwk; #ifdef ENABLE_TIMER magma_timestr_t start, st1, st2, end; start = get_current_time(); #endif magmaDoubleComplex *dT1; if (MAGMA_SUCCESS != magma_zmalloc( &dT1, n*nb)) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_zhetrd_he2hb(uplo, n, nb, a, lda, &work[indtau1], &work[indwrk], llwork, dT1, threads, info); #ifdef ENABLE_TIMER st1 = get_current_time(); printf(" time zhetrd_he2hb = %6.2f\n" , GetTimerValue(start,st1)/1000.); #endif /* copy the input matrix into WORK(INDWRK) with band storage */ /* PAY ATTENTION THAT work[indwrk] should be able to be of size lda2*n which it should be checked in any future modification of lwork.*/ magma_int_t lda2 = 2*nb; //nb+1+(nb-1); magmaDoubleComplex* A2 = &work[indwrk]; memset(A2 , 0, n*lda2*sizeof(magmaDoubleComplex)); for (magma_int_t j = 0; j < n-nb; j++) { cblas_zcopy(nb+1, &a[j*(lda+1)], 1, &A2[j*lda2], 1); memset(&a[j*(lda+1)], 0, (nb+1)*sizeof(magmaDoubleComplex)); a[nb + j*(lda+1)] = c_one; } for (magma_int_t j = 0; j < nb; j++) { cblas_zcopy(nb-j, &a[(j+n-nb)*(lda+1)], 1, &A2[(j+n-nb)*lda2], 1); memset(&a[(j+n-nb)*(lda+1)], 0, (nb-j)*sizeof(magmaDoubleComplex)); } #ifdef ENABLE_TIMER st2 = get_current_time(); printf(" time zhetrd_convert = %6.2f\n" , GetTimerValue(st1,st2)/1000.); #endif magma_zhetrd_hb2st(threads, uplo, n, nb, Vblksiz, A2, lda2, w, &rwork[inde], &work[indV2], ldv, &work[indTAU2], wantz, 
&work[indT2], ldt); #ifdef ENABLE_TIMER end = get_current_time(); printf(" time zhetrd_hb2st = %6.2f\n" , GetTimerValue(st2,end)/1000.); printf(" time zhetrd = %6.2f\n", GetTimerValue(start,end)/1000.); #endif /* For eigenvalues only, call DSTERF. For eigenvectors, first call ZSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the tridiagonal matrix, then call ZUNMTR to multiply it to the Householder transformations represented as Householder vectors in A. */ if (! wantz) { #ifdef ENABLE_TIMER start = get_current_time(); #endif lapackf77_dsterf(&n, w, &rwork[inde], info); magma_dmove_eig(range, n, w, &il, &iu, vl, vu, m); #ifdef ENABLE_TIMER end = get_current_time(); printf(" time dstedc = %6.2f\n", GetTimerValue(start,end)/1000.); #endif } else { #ifdef ENABLE_TIMER start = get_current_time(); #endif if (MAGMA_SUCCESS != magma_dmalloc( &dwork, 3*n*(n/2 + 1) )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_zstedx(range, n, vl, vu, il, iu, w, &rwork[inde], &work[indwrk], n, &rwork[indrwk], llrwk, iwork, liwork, dwork, info); magma_free( dwork ); #ifdef ENABLE_TIMER end = get_current_time(); printf(" time zstedx = %6.2f\n", GetTimerValue(start,end)/1000.); start = get_current_time(); #endif magmaDoubleComplex *dZ; magma_int_t lddz = n; magmaDoubleComplex *da; magma_int_t ldda = n; magma_dmove_eig(range, n, w, &il, &iu, vl, vu, m); if (MAGMA_SUCCESS != magma_zmalloc( &dZ, *m*lddz)) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if (MAGMA_SUCCESS != magma_zmalloc( &da, n*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_zbulge_back(threads, uplo, n, nb, *m, Vblksiz, &work[indwrk + n * (il-1)], n, dZ, lddz, &work[indV2], ldv, &work[indTAU2], &work[indT2], ldt, info); #ifdef ENABLE_TIMER st1 = get_current_time(); printf(" time zbulge_back = %6.2f\n" , GetTimerValue(start,st1)/1000.); #endif magma_zsetmatrix( n, n, a, lda, da, ldda ); magma_zunmqr_gpu_2stages(MagmaLeft, MagmaNoTrans, n-nb, *m, n-nb, da+nb, ldda, dZ+nb, n, dT1, nb, info); 
magma_zgetmatrix( n, *m, dZ, lddz, a, lda ); magma_free(dT1); magma_free(dZ); magma_free(da); #ifdef ENABLE_TIMER end = get_current_time(); printf(" time zunmqr + copy = %6.2f\n", GetTimerValue(st1,end)/1000.); printf(" time eigenvectors backtransf. = %6.2f\n" , GetTimerValue(start,end)/1000.); #endif } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { if (*info == 0) { imax = n; } else { imax = *info - 1; } d__1 = 1. / sigma; blasf77_dscal(&imax, &d__1, w, &ione); } work[0] = MAGMA_Z_MAKE( lwmin * (1. + lapackf77_dlamch("Epsilon")), 0.); // round up rwork[0] = lrwmin * (1. + lapackf77_dlamch("Epsilon")); iwork[0] = liwmin; return *info; } /* magma_zheevdx_2stage */
/**
    Purpose
    -------
    ZHEEVDX computes selected eigenvalues and, optionally, eigenvectors
    of a complex Hermitian matrix A. Eigenvalues and eigenvectors can
    be selected by specifying either a range of values or a range of
    indices for the desired eigenvalues.
    If eigenvectors are desired, it uses a divide and conquer algorithm.

    The divide and conquer algorithm makes very mild assumptions about
    floating point arithmetic. It will work on machines with a guard
    digit in add/subtract, or on those binary machines without guard
    digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
    Cray-2. It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    ---------
    @param[in]
    jobz    magma_vec_t
      -     = MagmaNoVec:  Compute eigenvalues only;
      -     = MagmaVec:    Compute eigenvalues and eigenvectors.

    @param[in]
    range   magma_range_t
      -     = MagmaRangeAll: all eigenvalues will be found.
      -     = MagmaRangeV:   all eigenvalues in the half-open interval (VL,VU]
                   will be found.
      -     = MagmaRangeI:   the IL-th through IU-th eigenvalues will be found.

    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in,out]
    A       COMPLEX_16 array, dimension (LDA, N)
            On entry, the Hermitian matrix A.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of A contains the
            upper triangular part of the matrix A.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of A contains
            the lower triangular part of the matrix A.
            On exit, if JOBZ = MagmaVec, then if INFO = 0, the first m columns
            of A contains the required
            orthonormal eigenvectors of the matrix A.
            If JOBZ = MagmaNoVec, then on exit the lower triangle (if UPLO=MagmaLower)
            or the upper triangle (if UPLO=MagmaUpper) of A, including the
            diagonal, is destroyed.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[in]
    vl      DOUBLE PRECISION
    @param[in]
    vu      DOUBLE PRECISION
            If RANGE=MagmaRangeV, the lower and upper bounds of the interval to
            be searched for eigenvalues. VL < VU.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeI.

    @param[in]
    il      INTEGER
    @param[in]
    iu      INTEGER
            If RANGE=MagmaRangeI, the indices (in ascending order) of the
            smallest and largest eigenvalues to be returned.
            1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeV.

    @param[out]
    m       INTEGER
            The total number of eigenvalues found.  0 <= M <= N.
            If RANGE = MagmaRangeAll, M = N, and if RANGE = MagmaRangeI, M = IU-IL+1.
            On the quick-return paths (N <= 1) and when the problem is
            handed to LAPACK ZHEEVD (N <= 128), all N eigenvalues are
            returned regardless of RANGE and M = N.

    @param[out]
    w       DOUBLE PRECISION array, dimension (N)
            If INFO = 0, the required m eigenvalues in ascending order.

    @param[out]
    work    (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The length of the array WORK.
            If N <= 1,                      LWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LWORK >= N + N*NB.
            If JOBZ = MagmaVec   and N > 1, LWORK >= max( N + N*NB, 2*N + N**2 ).
            NB can be obtained through magma_get_zhetrd_nb(N).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal sizes of the WORK, RWORK and
            IWORK arrays, returns these values as the first entries of
            the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    @param[out]
    rwork   (workspace) DOUBLE PRECISION array, dimension (LRWORK)
            On exit, if INFO = 0, RWORK[0] returns the optimal LRWORK.

    @param[in]
    lrwork  INTEGER
            The dimension of the array RWORK.
            If N <= 1,                      LRWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LRWORK >= N.
            If JOBZ = MagmaVec   and N > 1, LRWORK >= 1 + 5*N + 2*N**2.
    \n
            If LRWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK, RWORK
            and IWORK arrays, returns these values as the first entries
            of the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    @param[out]
    iwork   (workspace) INTEGER array, dimension (MAX(1,LIWORK))
            On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK.

    @param[in]
    liwork  INTEGER
            The dimension of the array IWORK.
            If N <= 1,                      LIWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1.
            If JOBZ = MagmaVec   and N > 1, LIWORK >= 3 + 5*N.
    \n
            If LIWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK, RWORK
            and IWORK arrays, returns these values as the first entries
            of the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value.
                  When several arguments are illegal, the first one is
                  reported.  MAGMA_ERR_DEVICE_ALLOC is returned when the
                  device workspace cannot be allocated, and a nonzero
                  status reported by magma_zhetrd or magma_zunmtr is
                  passed through unchanged.
      -     > 0:  if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed
                  to converge; i off-diagonal elements of an intermediate
                  tridiagonal form did not converge to zero;
                  if INFO = i and JOBZ = MagmaVec, then the algorithm failed
                  to compute an eigenvalue while working on the submatrix
                  lying in rows and columns INFO/(N+1) through
                  mod(INFO,N+1).

    Further Details
    ---------------
    Based on contributions by
       Jeff Rutter, Computer Science Division, University of California
       at Berkeley, USA

    Modified description of INFO. Sven, 16 Feb 05.

    @ingroup magma_zheev_driver
    ********************************************************************/
extern "C" magma_int_t
magma_zheevdx(
    magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo,
    magma_int_t n,
    magmaDoubleComplex *A, magma_int_t lda,
    double vl, double vu, magma_int_t il, magma_int_t iu,
    magma_int_t *m, double *w,
    magmaDoubleComplex *work, magma_int_t lwork,
    #ifdef COMPLEX
    double *rwork, magma_int_t lrwork,
    #endif
    magma_int_t *iwork, magma_int_t liwork,
    magma_int_t *info)
{
    const char* uplo_ = lapack_uplo_const( uplo );
    const char* jobz_ = lapack_vec_const( jobz );

    magma_int_t ione  = 1;
    magma_int_t izero = 0;
    double d_one = 1.;

    double d__1;
    double eps, safmin, smlnum, bignum;
    double anrm, rmin, rmax;
    double sigma = 1.;
    magma_int_t imax, iscale;
    magma_int_t iinfo = 0;
    magma_int_t lwmin, lrwmin, liwmin;
    magma_int_t inde, indrwk, llrwk;
    magma_int_t indtau, indwrk, indwk2, llwork, llwrk2;
    magma_int_t wantz, lower, lquery;
    magma_int_t alleig, valeig, indeig;

    double *dwork;

    wantz  = (jobz == MagmaVec);
    lower  = (uplo == MagmaLower);

    alleig = (range == MagmaRangeAll);
    valeig = (range == MagmaRangeV);
    indeig = (range == MagmaRangeI);

    lquery = (lwork == -1 || lrwork == -1 || liwork == -1);

    /* Test the input parameters. */
    *info = 0;
    if (! (wantz || (jobz == MagmaNoVec))) {
        *info = -1;
    } else if (! (alleig || valeig || indeig)) {
        *info = -2;
    } else if (! (lower || (uplo == MagmaUpper))) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (lda < max(1,n)) {
        *info = -6;
    } else {
        if (valeig) {
            if (n > 0 && vu <= vl) {
                *info = -8;
            }
        } else if (indeig) {
            if (il < 1 || il > max(1,n)) {
                *info = -9;
            } else if (iu < min(n,il) || iu > n) {
                *info = -10;
            }
        }
    }

    /* Minimal workspace sizes. */
    magma_int_t nb = magma_get_zhetrd_nb( n );
    if ( n <= 1 ) {
        lwmin  = 1;
        lrwmin = 1;
        liwmin = 1;
    }
    else if ( wantz ) {
        lwmin  = max( n + n*nb, 2*n + n*n );
        lrwmin = 1 + 5*n + 2*n*n;
        liwmin = 3 + 5*n;
    }
    else {
        lwmin  = n + n*nb;
        lrwmin = n;
        liwmin = 1;
    }

    // multiply by 1+eps (in Double!) to ensure length gets rounded up,
    // if it cannot be exactly represented in floating point.
    real_Double_t one_eps = 1. + lapackf77_dlamch("Epsilon");
    work[0]  = MAGMA_Z_MAKE( lwmin * one_eps, 0.);
    rwork[0] = lrwmin * one_eps;
    iwork[0] = liwmin;

    // Only diagnose a short workspace when no earlier argument was rejected,
    // so that INFO always identifies the first illegal argument.
    if (*info == 0 && ! lquery) {
        if (lwork < lwmin) {
            *info = -14;
        } else if (lrwork < lrwmin) {
            *info = -16;
        } else if (liwork < liwmin) {
            *info = -18;
        }
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (n == 0) {
        *m = 0;
        return *info;
    }

    if (n == 1) {
        w[0] = MAGMA_Z_REAL(A[0]);
        if (wantz) {
            A[0] = MAGMA_Z_ONE;
        }
        *m = 1;
        return *info;
    }

    /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */
    if (n <= 128) {
        #ifdef ENABLE_DEBUG
        printf("--------------------------------------------------------------\n");
        printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb);
        printf("--------------------------------------------------------------\n");
        #endif
        lapackf77_zheevd(jobz_, uplo_,
                         &n, A, &lda,
                         w, work, &lwork,
                         #if defined(PRECISION_z) || defined(PRECISION_c)
                         rwork, &lrwork,
                         #endif
                         iwork, &liwork, info);
        // ZHEEVD has no RANGE argument: every eigenvalue is returned, so the
        // output count must say so instead of being left unset.
        *m = n;
        return *info;
    }

    /* Get machine constants. */
    safmin = lapackf77_dlamch("Safe minimum");
    eps    = lapackf77_dlamch("Precision");
    smlnum = safmin / eps;
    bignum = 1. / smlnum;
    rmin   = magma_dsqrt(smlnum);
    rmax   = magma_dsqrt(bignum);

    /* Scale matrix to allowable range, if necessary. */
    anrm = lapackf77_zlanhe("M", uplo_, &n, A, &lda, rwork);
    iscale = 0;
    if (anrm > 0. && anrm < rmin) {
        iscale = 1;
        sigma = rmin / anrm;
    } else if (anrm > rmax) {
        iscale = 1;
        sigma = rmax / anrm;
    }
    if (iscale == 1) {
        lapackf77_zlascl(uplo_, &izero, &izero, &d_one, &sigma, &n, &n, A,
                         &lda, info);
    }

    /* Call ZHETRD to reduce Hermitian matrix to tridiagonal form. */
    // zhetrd rwork: e (n)
    // zstedx rwork: e (n) + llrwk (1 + 4*N + 2*N**2)  ==>  1 + 5n + 2n^2
    inde   = 0;
    indrwk = inde + n;
    llrwk  = lrwork - indrwk;

    // zhetrd work: tau (n) + llwork (n*nb)  ==>  n + n*nb
    // zstedx work: tau (n) + z (n^2)
    // zunmtr work: tau (n) + z (n^2) + llwrk2 (n or n*nb)  ==>  2n + n^2, or n + n*nb + n^2
    indtau = 0;
    indwrk = indtau + n;
    indwk2 = indwrk + n*n;
    llwork = lwork - indwrk;
    llwrk2 = lwork - indwk2;

    magma_timer_t time=0;
    timer_start( time );

    magma_zhetrd(uplo, n, A, lda, w, &rwork[inde],
                 &work[indtau], &work[indwrk], llwork, &iinfo);

    timer_stop( time );
    timer_printf( "time zhetrd = %6.2f\n", time );

    // Without a valid tridiagonal form nothing below can produce meaningful
    // results, so hand the reduction's status back to the caller.
    if (iinfo != 0) {
        *info = iinfo;
        return *info;
    }

    /* For eigenvalues only, call DSTERF.  For eigenvectors, first call
       MAGMA_ZSTEDX to generate the eigenvector matrix, WORK(INDWRK), of the
       tridiagonal matrix, then call ZUNMTR to multiply it to the Householder
       transformations represented as Householder vectors in A. */
    if (! wantz) {
        lapackf77_dsterf(&n, w, &rwork[inde], info);

        magma_dmove_eig(range, n, w, &il, &iu, vl, vu, m);
    }
    else {
        timer_start( time );

        if (MAGMA_SUCCESS != magma_dmalloc( &dwork, 3*n*(n/2 + 1) )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }

        magma_zstedx(range, n, vl, vu, il, iu, w, &rwork[inde],
                     &work[indwrk], n, &rwork[indrwk],
                     llrwk, iwork, liwork, dwork, info);

        magma_free( dwork );

        timer_stop( time );
        timer_printf( "time zstedx = %6.2f\n", time );
        timer_start( time );

        // Updates il/iu and *m; the selected eigenvectors start at column
        // il-1 of the matrix stored at work[indwrk].
        magma_dmove_eig(range, n, w, &il, &iu, vl, vu, m);

        magma_zunmtr(MagmaLeft, uplo, MagmaNoTrans, n, *m, A, lda, &work[indtau],
                     &work[indwrk + n * (il-1) ], n, &work[indwk2], llwrk2, &iinfo);

        // Report a failed back-transformation unless the eigen-solver has
        // already recorded an error of its own.
        if (*info == 0 && iinfo != 0) {
            *info = iinfo;
            return *info;
        }

        lapackf77_zlacpy("A", &n, m, &work[indwrk + n * (il-1)], &n, A, &lda);

        timer_stop( time );
        timer_printf( "time zunmtr + copy = %6.2f\n", time );
    }

    /* If matrix was scaled, then rescale eigenvalues appropriately. */
    if (iscale == 1) {
        if (*info == 0) {
            imax = n;
        } else {
            imax = *info - 1;
        }
        d__1 = 1. / sigma;
        blasf77_dscal(&imax, &d__1, w, &ione);
    }

    work[0]  = MAGMA_Z_MAKE( lwmin * one_eps, 0.);  // round up
    rwork[0] = lrwmin * one_eps;
    iwork[0] = liwmin;

    return *info;
} /* magma_zheevdx */
/**
    Purpose
    -------
    ZHEEVDX_GPU computes selected eigenvalues and, optionally, eigenvectors
    of a complex Hermitian matrix A. Eigenvalues and eigenvectors can
    be selected by specifying either a range of values or a range of
    indices for the desired eigenvalues.
    If eigenvectors are desired, it uses a divide and conquer algorithm.

    The divide and conquer algorithm makes very mild assumptions about
    floating point arithmetic. It will work on machines with a guard
    digit in add/subtract, or on those binary machines without guard
    digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
    Cray-2. It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Implementation notes
    --------------------
    - If N = 0 the routine returns immediately with MOUT = 0.
    - If N <= 128 the matrix is copied to the host, LAPACK zheevd is
      called on it, and the result is copied back to dA. On that path
      RANGE, VL, VU, IL and IU are not used: all N eigenvalues are
      computed and MOUT = N.

    Arguments
    ---------
    @param[in]
    jobz    magma_vec_t
      -     = MagmaNoVec:  Compute eigenvalues only;
      -     = MagmaVec:    Compute eigenvalues and eigenvectors.

    @param[in]
    range   magma_range_t
      -     = MagmaRangeAll: all eigenvalues will be found.
      -     = MagmaRangeV:   all eigenvalues in the half-open interval (VL,VU]
                             will be found.
      -     = MagmaRangeI:   the IL-th through IU-th eigenvalues will be found.

    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in,out]
    dA      COMPLEX_16 array on the GPU,
            dimension (LDDA, N).
            On entry, the Hermitian matrix A.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of A contains the
            upper triangular part of the matrix A.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of A contains
            the lower triangular part of the matrix A.
            On exit, if JOBZ = MagmaVec, then if INFO = 0, the first mout columns
            of A contains the required
            orthonormal eigenvectors of the matrix A.
            If JOBZ = MagmaNoVec, then on exit the lower triangle (if UPLO=MagmaLower)
            or the upper triangle (if UPLO=MagmaUpper) of A, including the
            diagonal, is destroyed.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array DA.  LDDA >= max(1,N).

    @param[in]
    vl      DOUBLE PRECISION
    @param[in]
    vu      DOUBLE PRECISION
            If RANGE=MagmaRangeV, the lower and upper bounds of the interval to
            be searched for eigenvalues. VL < VU.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeI.

    @param[in]
    il      INTEGER
    @param[in]
    iu      INTEGER
            If RANGE=MagmaRangeI, the indices (in ascending order) of the
            smallest and largest eigenvalues to be returned.
            1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeV.

    @param[out]
    mout    INTEGER
            The total number of eigenvalues found.  0 <= MOUT <= N.
            If RANGE = MagmaRangeAll, MOUT = N, and if RANGE = MagmaRangeI, MOUT = IU-IL+1.

    @param[out]
    w       DOUBLE PRECISION array, dimension (N)
            If INFO = 0, the required mout eigenvalues in ascending order.

    @param
    wA      (workspace) COMPLEX_16 array, dimension (LDWA, N)

    @param[in]
    ldwa    INTEGER
            The leading dimension of the array wA.  LDWA >= max(1,N).

    @param[out]
    work    (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The length of the array WORK.
            If N <= 1,                      LWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LWORK >= N + N*NB.
            If JOBZ = MagmaVec   and N > 1, LWORK >= max( N + N*NB, 2*N + N**2 ).
            NB can be obtained through magma_get_zhetrd_nb(N).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal sizes of the WORK, RWORK and
            IWORK arrays, returns these values as the first entries of
            the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    @param[out]
    rwork   (workspace) DOUBLE PRECISION array,
                                           dimension (LRWORK)
            On exit, if INFO = 0, RWORK[0] returns the optimal LRWORK.

    @param[in]
    lrwork  INTEGER
            The dimension of the array RWORK.
            If N <= 1,                      LRWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LRWORK >= N.
            If JOBZ = MagmaVec   and N > 1, LRWORK >= 1 + 5*N + 2*N**2.
    \n
            If LRWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK, RWORK
            and IWORK arrays, returns these values as the first entries
            of the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    @param[out]
    iwork   (workspace) INTEGER array, dimension (MAX(1,LIWORK))
            On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK.

    @param[in]
    liwork  INTEGER
            The dimension of the array IWORK.
            If N <= 1,                      LIWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1.
            If JOBZ = MagmaVec   and N > 1, LIWORK >= 3 + 5*N.
    \n
            If LIWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK, RWORK
            and IWORK arrays, returns these values as the first entries
            of the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value;
                  MAGMA_ERR_HOST_ALLOC or MAGMA_ERR_DEVICE_ALLOC if an
                  internal host or device allocation failed.
      -     > 0:  if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed
                  to converge; i off-diagonal elements of an intermediate
                  tridiagonal form did not converge to zero;
                  if INFO = i and JOBZ = MagmaVec, then the algorithm failed
                  to compute an eigenvalue while working on the submatrix
                  lying in rows and columns INFO/(N+1) through
                  mod(INFO,N+1).

    Further Details
    ---------------
    Based on contributions by
       Jeff Rutter, Computer Science Division, University of California
       at Berkeley, USA

    Modified description of INFO. Sven, 16 Feb 05.

    @ingroup magma_zheev_driver
    ********************************************************************/
extern "C" magma_int_t
magma_zheevdx_gpu(
    magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo,
    magma_int_t n,
    magmaDoubleComplex_ptr dA, magma_int_t ldda,
    double vl, double vu, magma_int_t il, magma_int_t iu,
    magma_int_t *mout, double *w,
    magmaDoubleComplex *wA,  magma_int_t ldwa,
    magmaDoubleComplex *work, magma_int_t lwork,
    #ifdef COMPLEX
    double *rwork, magma_int_t lrwork,
    #endif
    magma_int_t *iwork, magma_int_t liwork,
    magma_int_t *info)
{
    const char* uplo_ = lapack_uplo_const( uplo );
    const char* jobz_ = lapack_vec_const( jobz );
    magma_int_t ione = 1;

    double d__1;

    double eps;
    magma_int_t inde;
    double anrm;
    magma_int_t imax;
    double rmin, rmax;
    double sigma;
    magma_int_t iinfo, lwmin;
    magma_int_t lower;
    magma_int_t llrwk;
    magma_int_t wantz;
    magma_int_t iscale;
    double safmin;
    double bignum;
    magma_int_t indtau;
    magma_int_t indrwk, indwrk, liwmin;
    magma_int_t lrwmin, llwork;
    double smlnum;
    magma_int_t lquery;
    magma_int_t alleig, valeig, indeig;

    magmaDouble_ptr dwork;
    magmaDoubleComplex_ptr dC;
    magma_int_t lddc = ldda;

    wantz = (jobz == MagmaVec);
    lower = (uplo == MagmaLower);

    alleig = (range == MagmaRangeAll);
    valeig = (range == MagmaRangeV);
    indeig = (range == MagmaRangeI);

    lquery = (lwork == -1 || lrwork == -1 || liwork == -1);

    /* Argument checks; -info is the 1-based position of the bad argument. */
    *info = 0;
    if (! (wantz || (jobz == MagmaNoVec))) {
        *info = -1;
    } else if (! (alleig || valeig || indeig)) {
        *info = -2;
    } else if (! (lower || (uplo == MagmaUpper))) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (ldda < max(1,n)) {
        *info = -6;
    } else if (ldwa < max(1,n)) {
        *info = -14;
    } else {
        if (valeig) {
            if (n > 0 && vu <= vl) {
                *info = -8;
            }
        } else if (indeig) {
            if (il < 1 || il > max(1,n)) {
                *info = -9;
            } else if (iu < min(n,il) || iu > n) {
                *info = -10;
            }
        }
    }

    /* Minimal workspace sizes (also the values reported by a query). */
    magma_int_t nb = magma_get_zhetrd_nb( n );
    if ( n <= 1 ) {
        lwmin  = 1;
        lrwmin = 1;
        liwmin = 1;
    }
    else if ( wantz ) {
        lwmin  = max( n + n*nb, 2*n + n*n );
        lrwmin = 1 + 5*n + 2*n*n;
        liwmin = 3 + 5*n;
    }
    else {
        lwmin  = n + n*nb;
        lrwmin = n;
        liwmin = 1;
    }

    // multiply by 1+eps (in Double!) to ensure length gets rounded up,
    // if it cannot be exactly represented in floating point.
    real_Double_t one_eps = 1. + lapackf77_dlamch("Epsilon");
    work[0]  = MAGMA_Z_MAKE( lwmin * one_eps, 0 );
    rwork[0] = lrwmin * one_eps;
    iwork[0] = liwmin;

    if ((lwork < lwmin) && !lquery) {
        *info = -16;
    } else if ((lrwork < lrwmin) && ! lquery) {
        *info = -18;
    } else if ((liwork < liwmin) && ! lquery) {
        *info = -20;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible.
       Without this, the small-matrix path below would call LAPACK with
       lda = 0, which violates its LDA >= max(1,N) requirement. */
    if (n == 0) {
        *mout = 0;
        return *info;
    }

    /* If matrix is very small, then just call LAPACK on CPU, no need for GPU */
    if (n <= 128) {
        magma_int_t lda = n;
        magmaDoubleComplex *A;
        if (MAGMA_SUCCESS != magma_zmalloc_cpu( &A, lda*n )) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        magma_zgetmatrix( n, n, dA, ldda, A, lda );
        lapackf77_zheevd( jobz_, uplo_,
                          &n, A, &lda,
                          w, work, &lwork,
                          rwork, &lrwork,
                          iwork, &liwork, info );
        magma_zsetmatrix( n, n, A, lda, dA, ldda );
        magma_free_cpu( A );
        // zheevd computes the full spectrum, whatever range was requested
        *mout = n;
        return *info;
    }

    magma_queue_t stream;
    magma_queue_create( &stream );

    // dC and dwork are never used together, so use one buffer for both;
    // unfortunately they're different types (complex and double).
    // (this is easier in dsyevd_gpu where everything is double.)
    // zhetrd2_gpu requires ldda*ceildiv(n,64) + 2*ldda*nb, in double-complex.
    // zunmtr_gpu  requires lddc*n,                         in double-complex.
    // zlanhe      requires n,                              in double.
    magma_int_t ldwork = max( ldda*ceildiv(n,64) + 2*ldda*nb, lddc*n );
    magma_int_t ldwork_real = max( ldwork*2, n );
    if ( wantz ) {
        // zstedx requires 3n^2/2, in double
        ldwork_real = max( ldwork_real, 3*n*(n/2 + 1) );
    }
    if (MAGMA_SUCCESS != magma_dmalloc( &dwork, ldwork_real )) {
        // do not leak the queue created above
        magma_queue_destroy( stream );
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    dC = (magmaDoubleComplex*) dwork;

    /* Get machine constants. */
    safmin = lapackf77_dlamch("Safe minimum");
    eps    = lapackf77_dlamch("Precision");
    smlnum = safmin / eps;
    bignum = 1. / smlnum;
    rmin = magma_dsqrt( smlnum );
    rmax = magma_dsqrt( bignum );

    /* Scale matrix to allowable range, if necessary. */
    anrm = magmablas_zlanhe( MagmaMaxNorm, uplo, n, dA, ldda, dwork );
    iscale = 0;
    sigma  = 1;
    if (anrm > 0. && anrm < rmin) {
        iscale = 1;
        sigma = rmin / anrm;
    } else if (anrm > rmax) {
        iscale = 1;
        sigma = rmax / anrm;
    }
    if (iscale == 1) {
        magmablas_zlascl( uplo, 0, 0, 1., sigma, n, n, dA, ldda, info );
    }

    /* Call ZHETRD to reduce Hermitian matrix to tridiagonal form. */
    // zhetrd rwork: e (n)
    // zstedx rwork: e (n) + llrwk (1 + 4*N + 2*N**2)  ==>  1 + 5n + 2n^2
    inde   = 0;
    indrwk = inde + n;
    llrwk  = lrwork - indrwk;

    // zhetrd work: tau (n) + llwork (n*nb)  ==>  n + n*nb
    // zstedx work: tau (n) + z (n^2)
    // zunmtr work: tau (n) + z (n^2) + llwrk2 (n or n*nb)  ==>  2n + n^2, or n + n*nb + n^2
    indtau = 0;
    indwrk = indtau + n;
    llwork = lwork - indwrk;

    magma_timer_t time=0;
    timer_start( time );

    // NOTE(review): iinfo returned by the reduction is not inspected.
#ifdef FAST_HEMV
    magma_zhetrd2_gpu( uplo, n, dA, ldda, w, &rwork[inde],
                       &work[indtau], wA, ldwa, &work[indwrk], llwork,
                       dC, ldwork, &iinfo );
#else
    magma_zhetrd_gpu ( uplo, n, dA, ldda, w, &rwork[inde],
                       &work[indtau], wA, ldwa, &work[indwrk], llwork,
                       &iinfo );
#endif

    timer_stop( time );
    timer_printf( "time zhetrd_gpu = %6.2f\n", time );

    /* For eigenvalues only, call DSTERF.  For eigenvectors, first call
       ZSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
       tridiagonal matrix, then call ZUNMTR to multiply it to the Householder
       transformations represented as Householder vectors in A. */
    if (! wantz) {
        lapackf77_dsterf( &n, w, &rwork[inde], info );

        magma_dmove_eig( range, n, w, &il, &iu, vl, vu, mout );
    }
    else {
        timer_start( time );

        magma_zstedx( range, n, vl, vu, il, iu, w, &rwork[inde],
                      &work[indwrk], n, &rwork[indrwk],
                      llrwk, iwork, liwork, dwork, info );

        timer_stop( time );
        timer_printf( "time zstedx = %6.2f\n", time );
        timer_start( time );

        magma_dmove_eig( range, n, w, &il, &iu, vl, vu, mout );

        // upload the selected eigenvector columns, back-transform them on
        // the GPU, then copy the result into dA
        magma_zsetmatrix( n, *mout, &work[indwrk + n * (il-1) ], n, dC, lddc );

        magma_zunmtr_gpu( MagmaLeft, uplo, MagmaNoTrans, n, *mout, dA, ldda, &work[indtau],
                          dC, lddc, wA, ldwa, &iinfo );

        magma_zcopymatrix( n, *mout, dC, lddc, dA, ldda );

        timer_stop( time );
        timer_printf( "time zunmtr_gpu + copy = %6.2f\n", time );
    }

    /* If matrix was scaled, then rescale eigenvalues appropriately. */
    if (iscale == 1) {
        if (*info == 0) {
            imax = n;
        } else {
            imax = *info - 1;
        }
        d__1 = 1. / sigma;
        blasf77_dscal( &imax, &d__1, w, &ione );
    }

    work[0]  = MAGMA_Z_MAKE( lwmin * one_eps, 0 );  // round up
    rwork[0] = lrwmin * one_eps;
    iwork[0] = liwmin;

    magma_queue_destroy( stream );
    magma_free( dwork );

    return *info;
} /* magma_zheevdx_gpu */
/** Purpose ------- ZHEEVD_2STAGE computes all eigenvalues and, optionally, eigenvectors of a complex Hermitian matrix A. It uses a two-stage algorithm for the tridiagonalization. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] nrgpu INTEGER Number of GPUs to use. @param[in] jobz magma_vec_t - = MagmaNoVec: Compute eigenvalues only; - = MagmaVec: Compute eigenvalues and eigenvectors. @param[in] range magma_range_t - = MagmaRangeAll: all eigenvalues will be found. - = MagmaRangeV: all eigenvalues in the half-open interval (VL,VU] will be found. - = MagmaRangeI: the IL-th through IU-th eigenvalues will be found. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A COMPLEX_16 array, dimension (LDA, N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = MagmaVec, then if INFO = 0, the first m columns of A contains the required orthonormal eigenvectors of the matrix A. If JOBZ = MagmaNoVec, then on exit the lower triangle (if UPLO=MagmaLower) or the upper triangle (if UPLO=MagmaUpper) of A, including the diagonal, is destroyed. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). 
@param[in] vl DOUBLE PRECISION @param[in] vu DOUBLE PRECISION If RANGE=MagmaRangeV, the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = MagmaRangeAll or MagmaRangeI. @param[in] il INTEGER @param[in] iu INTEGER If RANGE=MagmaRangeI, the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = MagmaRangeAll or MagmaRangeV. @param[out] m INTEGER The total number of eigenvalues found. 0 <= M <= N. If RANGE = MagmaRangeAll, M = N, and if RANGE = MagmaRangeI, M = IU-IL+1. @param[out] w DOUBLE PRECISION array, dimension (N) If INFO = 0, the required m eigenvalues in ascending order. @param[out] work (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LWORK >= LQ2 + N + N*NB. If JOBZ = MagmaVec and N > 1, LWORK >= LQ2 + 2*N + N**2. where LQ2 is the size needed to store the Q2 matrix and is returned by magma_bulge_get_lq2. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. @param[out] rwork (workspace) DOUBLE PRECISION array, dimension (LRWORK) On exit, if INFO = 0, RWORK[0] returns the optimal LRWORK. @param[in] lrwork INTEGER The dimension of the array RWORK. If N <= 1, LRWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LRWORK >= N. If JOBZ = MagmaVec and N > 1, LRWORK >= 1 + 5*N + 2*N**2. 
\n If LRWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. @param[out] iwork (workspace) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. @param[in] liwork INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1. If JOBZ = MagmaVec and N > 1, LIWORK >= 3 + 5*N. \n If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = MagmaVec, then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details --------------- Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified description of INFO. Sven, 16 Feb 05. 
@ingroup magma_zheev_driver ********************************************************************/ extern "C" magma_int_t magma_zheevdx_2stage_m(magma_int_t nrgpu, magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, double vl, double vu, magma_int_t il, magma_int_t iu, magma_int_t *m, double *w, magmaDoubleComplex *work, magma_int_t lwork, double *rwork, magma_int_t lrwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { #define A( i_,j_) (A + (i_) + (j_)*lda) #define A2(i_,j_) (A2 + (i_) + (j_)*lda2) const char* uplo_ = lapack_uplo_const( uplo ); const char* jobz_ = lapack_vec_const( jobz ); magmaDoubleComplex c_one = MAGMA_Z_ONE; double d_one = 1.; magma_int_t ione = 1; magma_int_t izero = 0; double d__1; double eps; double anrm; magma_int_t imax; double rmin, rmax; double sigma; //magma_int_t iinfo; magma_int_t lwmin, lrwmin, liwmin; magma_int_t lower; magma_int_t wantz; magma_int_t iscale; double safmin; double bignum; double smlnum; magma_int_t lquery; magma_int_t alleig, valeig, indeig; magma_int_t len; /* determine the number of threads */ magma_int_t parallel_threads = magma_get_parallel_numthreads(); wantz = (jobz == MagmaVec); lower = (uplo == MagmaLower); alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); lquery = (lwork == -1 || lrwork == -1 || liwork == -1); *info = 0; if (! (wantz || (jobz == MagmaNoVec))) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! 
(lower || (uplo == MagmaUpper))) { *info = -3; } else if (n < 0) { *info = -4; } else if (lda < max(1,n)) { *info = -6; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -8; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -9; } else if (iu < min(n,il) || iu > n) { *info = -10; } } } magma_int_t nb = magma_get_zbulge_nb(n, parallel_threads); magma_int_t Vblksiz = magma_zbulge_get_Vblksiz(n, nb, parallel_threads); magma_int_t ldt = Vblksiz; magma_int_t ldv = nb + Vblksiz; magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz); magma_int_t lq2 = magma_zbulge_get_lq2(n, parallel_threads); if (wantz) { lwmin = lq2 + 2*n + n*n; lrwmin = 1 + 5*n + 2*n*n; liwmin = 5*n + 3; } else { lwmin = lq2 + n + n*nb; lrwmin = n; liwmin = 1; } // multiply by 1+eps (in Double!) to ensure length gets rounded up, // if it cannot be exactly represented in floating point. real_Double_t one_eps = 1. + lapackf77_dlamch("Epsilon"); work[0] = MAGMA_Z_MAKE( lwmin * one_eps, 0.); // round up rwork[0] = lrwmin * one_eps; iwork[0] = liwmin; if ((lwork < lwmin) && !lquery) { *info = -14; } else if ((lrwork < lrwmin) && ! lquery) { *info = -16; } else if ((liwork < liwmin) && ! 
lquery) { *info = -18; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } if (n == 1) { w[0] = MAGMA_Z_REAL(A[0]); if (wantz) { A[0] = MAGMA_Z_ONE; } return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); timer_printf("using %d parallel_threads\n", (int) parallel_threads); /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ magma_int_t ntiles = n/nb; if ( ( ntiles < 2 ) || ( n <= 128 ) ) { #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif lapackf77_zheevd(jobz_, uplo_, &n, A, &lda, w, work, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, info); *m = n; return *info; } /* Get machine constants. */ safmin = lapackf77_dlamch("Safe minimum"); eps = lapackf77_dlamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_dsqrt(smlnum); rmax = magma_dsqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = lapackf77_zlanhe("M", uplo_, &n, A, &lda, rwork); iscale = 0; if (anrm > 0. 
&& anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { lapackf77_zlascl(uplo_, &izero, &izero, &d_one, &sigma, &n, &n, A, &lda, info); } magma_int_t indT2 = 0; magma_int_t indTAU2 = indT2 + blkcnt*ldt*Vblksiz; magma_int_t indV2 = indTAU2+ blkcnt*Vblksiz; magma_int_t indtau1 = indV2 + blkcnt*ldv*Vblksiz; magma_int_t indwrk = indtau1+ n; magma_int_t indwk2 = indwrk + n*n; magma_int_t llwork = lwork - indwrk; magma_int_t llwrk2 = lwork - indwk2; magma_int_t inde = 0; magma_int_t indrwk = inde + n; magma_int_t llrwk = lrwork - indrwk; magma_timer_t time=0, time_total=0, time_alloc=0, time_dist=0, time_band=0; timer_start( time_total ); #ifdef HE2HB_SINGLEGPU magmaDoubleComplex *dT1; if (MAGMA_SUCCESS != magma_zmalloc( &dT1, n*nb)) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } timer_start( time_band ); magma_zhetrd_he2hb(uplo, n, nb, A, lda, &work[indtau1], &work[indwrk], llwork, dT1, info); timer_stop( time_band ); timer_printf( " 1 GPU seq code time zhetrd_he2hb only = %7.4f\n", time_band ); magma_free(dT1); #else magma_int_t nstream = max(3,nrgpu+2); magma_queue_t streams[MagmaMaxGPUs][20]; magmaDoubleComplex *da[MagmaMaxGPUs], *dT1[MagmaMaxGPUs]; magma_int_t ldda = ((n+31)/32)*32; magma_int_t ver = 0; magma_int_t distblk = max(256, 4*nb); #ifdef ENABLE_DEBUG printf("voici ngpu %d distblk %d NB %d nstream %d version %d \n ", nrgpu, distblk, nb, nstream, ver); #endif timer_start( time_alloc ); for( magma_int_t dev = 0; dev < nrgpu; ++dev ) { magma_int_t mlocal = ((n / distblk) / nrgpu + 1) * distblk; magma_setdevice( dev ); // TODO check malloc magma_zmalloc(&da[dev], ldda*mlocal ); magma_zmalloc(&dT1[dev], (n*nb) ); for( int i = 0; i < nstream; ++i ) { magma_queue_create( &streams[dev][i] ); } } timer_stop( time_alloc ); timer_start( time_dist ); magma_zsetmatrix_1D_col_bcyclic( n, n, A, lda, da, ldda, nrgpu, distblk ); magma_setdevice(0); timer_stop( time_dist ); timer_start( time_band 
); if (ver == 30) { magma_zhetrd_he2hb_mgpu_spec(uplo, n, nb, A, lda, &work[indtau1], &work[indwrk], llwork, da, ldda, dT1, nb, nrgpu, distblk, streams, nstream, info); } else { magma_zhetrd_he2hb_mgpu(uplo, n, nb, A, lda, &work[indtau1], &work[indwrk], llwork, da, ldda, dT1, nb, nrgpu, distblk, streams, nstream, info); } timer_stop( time_band ); timer_printf(" time alloc %7.4f, ditribution %7.4f, zhetrd_he2hb only = %7.4f\n", time_alloc, time_dist, time_band ); for( magma_int_t dev = 0; dev < nrgpu; ++dev ) { magma_setdevice( dev ); magma_free( da[dev] ); magma_free( dT1[dev] ); for( int i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[dev][i] ); } } #endif // not HE2HB_SINGLEGPU timer_stop( time_total ); timer_printf( " time zhetrd_he2hb_mgpu = %6.2f\n", time_total ); timer_start( time_total ); timer_start( time ); /* copy the input matrix into WORK(INDWRK) with band storage */ /* PAY ATTENTION THAT work[indwrk] should be able to be of size lda2*n which it should be checked in any future modification of lwork.*/ magma_int_t lda2 = 2*nb; //nb+1+(nb-1); magmaDoubleComplex* A2 = &work[indwrk]; memset(A2, 0, n*lda2*sizeof(magmaDoubleComplex)); for (magma_int_t j = 0; j < n-nb; j++) { len = nb+1; blasf77_zcopy( &len, A(j,j), &ione, A2(0,j), &ione ); memset(A(j,j), 0, (nb+1)*sizeof(magmaDoubleComplex)); *A(nb+j,j) = c_one; } for (magma_int_t j = 0; j < nb; j++) { len = nb-j; blasf77_zcopy( &len, A(j+n-nb,j+n-nb), &ione, A2(0,j+n-nb), &ione ); memset(A(j+n-nb,j+n-nb), 0, (nb-j)*sizeof(magmaDoubleComplex)); } timer_stop( time ); timer_printf( " time zhetrd_convert = %6.2f\n", time ); timer_start( time ); magma_zhetrd_hb2st(uplo, n, nb, Vblksiz, A2, lda2, w, &rwork[inde], &work[indV2], ldv, &work[indTAU2], wantz, &work[indT2], ldt); timer_stop( time ); timer_stop( time_total ); timer_printf( " time zhetrd_hb2st = %6.2f\n", time ); timer_printf( " time zhetrd = %6.2f\n", time_total ); /* For eigenvalues only, call DSTERF. 
For eigenvectors, first call ZSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the tridiagonal matrix, then call ZUNMTR to multiply it to the Householder transformations represented as Householder vectors in A. */ if (! wantz) { timer_start( time ); lapackf77_dsterf(&n, w, &rwork[inde], info); magma_dmove_eig(range, n, w, &il, &iu, vl, vu, m); timer_stop( time ); timer_printf( " time dstedc = %6.2f\n", time ); } else { timer_start( time_total ); timer_start( time ); magma_zstedx_m(nrgpu, range, n, vl, vu, il, iu, w, &rwork[inde], &work[indwrk], n, &rwork[indrwk], llrwk, iwork, liwork, info); timer_stop( time ); timer_printf( " time zstedx_m = %6.2f\n", time ); timer_start( time ); magma_dmove_eig(range, n, w, &il, &iu, vl, vu, m); /* magmaDoubleComplex *dZ; magma_int_t lddz = n; if (MAGMA_SUCCESS != magma_zmalloc( &dZ, *m*lddz)) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_zbulge_back(uplo, n, nb, *m, Vblksiz, &work[indwrk + n * (il-1)], n, dZ, lddz, &work[indV2], ldv, &work[indTAU2], &work[indT2], ldt, info); magma_zgetmatrix( n, *m, dZ, lddz, &work[indwrk], n); magma_free(dZ); */ magma_zbulge_back_m(nrgpu, uplo, n, nb, *m, Vblksiz, &work[indwrk + n * (il-1)], n, &work[indV2], ldv, &work[indTAU2], &work[indT2], ldt, info); timer_stop( time ); timer_printf( " time zbulge_back_m = %6.2f\n", time ); timer_start( time ); magma_zunmqr_m(nrgpu, MagmaLeft, MagmaNoTrans, n-nb, *m, n-nb, A+nb, lda, &work[indtau1], &work[indwrk + n * (il-1) + nb], n, &work[indwk2], llwrk2, info); lapackf77_zlacpy("A", &n, m, &work[indwrk + n * (il-1)], &n, A, &lda); timer_stop( time ); timer_stop( time_total ); timer_printf( " time zunmqr_m + copy = %6.2f\n", time ); timer_printf( " time eigenvectors backtransf. = %6.2f\n", time_total ); } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { if (*info == 0) { imax = n; } else { imax = *info - 1; } d__1 = 1. 
/ sigma; blasf77_dscal(&imax, &d__1, w, &ione); } work[0] = MAGMA_Z_MAKE( lwmin * one_eps, 0.); // round up rwork[0] = lrwmin * one_eps; iwork[0] = liwmin; magma_setdevice( orig_dev ); return *info; } /* magma_zheevdx_2stage_m */
/* //////////////////////////////////////////////////////////////////////////// -- Testing zheevd_gpu */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time, cpu_time; magmaDoubleComplex *h_A, *h_R, *d_R, *h_work, aux_work[1]; #ifdef COMPLEX double *rwork, aux_rwork[1]; magma_int_t lrwork; #endif double *w1, *w2, result[4]={0, 0, 0, 0}, eps; magma_int_t *iwork, aux_iwork[1]; magma_int_t N, n2, info, lwork, liwork, lda, ldda; magma_int_t izero = 0; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; eps = lapackf77_dlamch( "E" ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tolulp = opts.tolerance * lapackf77_dlamch("P"); // checking NoVec requires LAPACK opts.lapack |= (opts.check && opts.jobz == MagmaNoVec); printf("using: jobz = %s, uplo = %s\n", lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo)); printf(" N CPU Time (sec) GPU Time (sec)\n"); printf("=======================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; n2 = N*N; lda = N; ldda = roundup( N, opts.roundup ); // by default, round to multiple of 32 // query for workspace sizes magma_zheevd_gpu( opts.jobz, opts.uplo, N, NULL, ldda, NULL, NULL, lda, aux_work, -1, #ifdef COMPLEX aux_rwork, -1, #endif aux_iwork, -1, &info ); lwork = (magma_int_t) MAGMA_Z_REAL( aux_work[0] ); #ifdef COMPLEX lrwork = (magma_int_t) aux_rwork[0]; #endif liwork = aux_iwork[0]; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, N*lda ); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); #ifdef COMPLEX TESTING_MALLOC_CPU( rwork, double, lrwork ); #endif TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, N*lda ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_DEV( d_R, 
magmaDoubleComplex, N*ldda ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); magma_zmake_hermitian( N, h_A, N ); magma_zsetmatrix( N, N, h_A, lda, d_R, ldda ); /* warm up run */ if ( opts.warmup ) { magma_zheevd_gpu( opts.jobz, opts.uplo, N, d_R, ldda, w1, h_R, lda, h_work, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info ); if (info != 0) printf("magma_zheevd_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_zsetmatrix( N, N, h_A, lda, d_R, ldda ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zheevd_gpu( opts.jobz, opts.uplo, N, d_R, ldda, w1, h_R, lda, h_work, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_zheevd_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check && opts.jobz != MagmaNoVec ) { /* ===================================================================== Check the results following the LAPACK's [zcds]drvst routine. A is factored as A = U S U' and the following 3 tests computed: (1) | A - U S U' | / ( |A| N ) (2) | I - U'U | / ( N ) (3) | S(with U) - S(w/o U) | / | S | =================================================================== */ magma_zgetmatrix( N, N, d_R, ldda, h_R, lda ); magmaDoubleComplex *work; TESTING_MALLOC_CPU( work, magmaDoubleComplex, 2*N*N ); // e=NULL is unused since kband=0; tau=NULL is unused since itype=1 lapackf77_zhet21( &ione, lapack_uplo_const(opts.uplo), &N, &izero, h_A, &lda, w1, NULL, h_R, &lda, h_R, &lda, NULL, work, #ifdef COMPLEX rwork, #endif &result[0] ); result[0] *= eps; result[1] *= eps; TESTING_FREE_CPU( work ); work=NULL; // Disable eigenvalue check which calls routine again -- // it obscures whether error occurs in first call above or in this call. 
// But see comparison to LAPACK below. // //magma_zsetmatrix( N, N, h_A, lda, d_R, ldda ); //magma_zheevd_gpu( MagmaNoVec, opts.uplo, // N, d_R, ldda, w2, // h_R, lda, // h_work, lwork, // #ifdef COMPLEX // rwork, lrwork, // #endif // iwork, liwork, // &info); //if (info != 0) // printf("magma_zheevd_gpu returned error %d: %s.\n", // (int) info, magma_strerror( info )); // //double maxw=0, diff=0; //for( int j=0; j < N; j++ ) { // maxw = max(maxw, fabs(w1[j])); // maxw = max(maxw, fabs(w2[j])); // diff = max(diff, fabs(w1[j] - w2[j])); //} //result[2] = diff / (N*maxw); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_zheevd( lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo), &N, h_A, &lda, w2, h_work, &lwork, #ifdef COMPLEX rwork, &lrwork, #endif iwork, &liwork, &info); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_zheevd returned error %d: %s.\n", (int) info, magma_strerror( info )); // compare eigenvalues double maxw=0, diff=0; for( int j=0; j < N; j++ ) { maxw = max(maxw, fabs(w1[j])); maxw = max(maxw, fabs(w2[j])); diff = max(diff, fabs(w1[j] - w2[j])); } result[3] = diff / (N*maxw); printf("%5d %7.2f %7.2f\n", (int) N, cpu_time, gpu_time); } else { printf("%5d --- %7.2f\n", (int) N, gpu_time); } /* ===================================================================== Print execution time =================================================================== */ if ( opts.check && opts.jobz != MagmaNoVec ) { printf("Testing the factorization A = U S U' for correctness:\n"); printf(" | A - U S U' | / (|A| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed") ); printf(" | I - U'U | / N = %8.2e %s\n", result[1], (result[1] < tol ? "ok" : "failed") ); //printf(" | S(w/ U) - S(w/o U) | / |S| = %8.2e %s\n\n", result[2], (result[2] < tolulp ? 
"ok" : "failed") ); status += ! (result[0] < tol && result[1] < tol); // && result[2] < tolulp) } if ( opts.lapack ) { printf(" | S_magma - S_lapack | / |S| = %8.2e %s\n\n", result[3], (result[3] < tolulp ? "ok" : "failed") ); status += ! (result[3] < tolulp); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); #ifdef COMPLEX TESTING_FREE_CPU( rwork ); #endif TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); TESTING_FREE_DEV( d_R ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }