extern "C" magma_int_t magma_sgeev(magma_vec_t jobvl, magma_vec_t jobvr, magma_int_t n, float *a, magma_int_t lda, float *WR, float *WI, float *vl, magma_int_t ldvl, float *vr, magma_int_t ldvr, float *work, magma_int_t lwork, magma_int_t *info, magma_queue_t queue) { /* -- clMAGMA (version 1.0.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver September 2012 Purpose ======= SGEEV computes for an N-by-N real nonsymmetric matrix A, the eigenvalues and, optionally, the left and/or right eigenvectors. The right eigenvector v(j) of A satisfies A * v(j) = lambda(j) * v(j) where lambda(j) is its eigenvalue. The left eigenvector u(j) of A satisfies u(j)**T * A = lambda(j) * u(j)**T where u(j)**T denotes the transpose of u(j). The computed eigenvectors are normalized to have Euclidean norm equal to 1 and largest component real. Arguments ========= JOBVL (input) CHARACTER*1 = 'N': left eigenvectors of A are not computed; = 'V': left eigenvectors of are computed. JOBVR (input) CHARACTER*1 = 'N': right eigenvectors of A are not computed; = 'V': right eigenvectors of A are computed. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) DOUBLE PRECISION array, dimension (LDA,N) On entry, the N-by-N matrix A. On exit, A has been overwritten. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). WR (output) DOUBLE PRECISION array, dimension (N) WI (output) DOUBLE PRECISION array, dimension (N) WR and WI contain the real and imaginary parts, respectively, of the computed eigenvalues. Complex conjugate pairs of eigenvalues appear consecutively with the eigenvalue having the positive imaginary part first. VL (output) DOUBLE PRECISION array, dimension (LDVL,N) If JOBVL = 'V', the left eigenvectors u(j) are stored one after another in the columns of VL, in the same order as their eigenvalues. If JOBVL = 'N', VL is not referenced. u(j) = VL(:,j), the j-th column of VL. LDVL (input) INTEGER The leading dimension of the array VL. LDVL >= 1; if JOBVL = 'V', LDVL >= N. VR (output) DOUBLE PRECISION array, dimension (LDVR,N) If JOBVR = 'V', the right eigenvectors v(j) are stored one after another in the columns of VR, in the same order as their eigenvalues. If JOBVR = 'N', VR is not referenced. v(j) = VR(:,j), the j-th column of VR. LDVR (input) INTEGER The leading dimension of the array VR. LDVR >= 1; if JOBVR = 'V', LDVR >= N. WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= (1+nb)*N. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value. > 0: if INFO = i, the QR algorithm failed to compute all the eigenvalues, and no eigenvectors have been computed; elements and i+1:N of W contain eigenvalues which have converged. ===================================================================== */ magma_int_t c__1 = 1; magma_int_t c__0 = 0; magma_int_t c_n1 = -1; magma_int_t a_dim1, a_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1, i__2, i__3; float d__1, d__2; magma_int_t i__, k, ihi, ilo; float r__, cs, sn, scl; float dum[1], eps; magma_int_t ibal; float anrm; magma_int_t ierr, itau, iwrk, nout; magma_int_t scalea; float cscale; float bignum; magma_int_t minwrk; magma_int_t wantvl; float smlnum; magma_int_t lquery, wantvr, select[1]; magma_int_t nb = 0; magmaFloat_ptr dT; //magma_timestr_t start, end; char side[2] = {0, 0}; magma_vec_t jobvl_ = jobvl; magma_vec_t jobvr_ = jobvr; *info = 0; lquery = lwork == -1; wantvl = lapackf77_lsame(lapack_const(jobvl_), "V"); wantvr = lapackf77_lsame(lapack_const(jobvr_), "V"); if (! wantvl && ! lapackf77_lsame(lapack_const(jobvl_), "N")) { *info = -1; } else if (! wantvr && ! lapackf77_lsame(lapack_const(jobvr_), "N")) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if ( (ldvl < 1) || (wantvl && (ldvl < n))) { *info = -9; } else if ( (ldvr < 1) || (wantvr && (ldvr < n))) { *info = -11; } /* Compute workspace */ if (*info == 0) { nb = magma_get_sgehrd_nb(n); minwrk = (2+nb)*n; work[0] = (float) minwrk; if (lwork < minwrk && ! lquery) { *info = -13; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } // if eigenvectors are needed #if defined(VERSION3) if (MAGMA_SUCCESS != magma_malloc( &dT, nb*n*sizeof(float) )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } #endif // subtract row and col for 1-based indexing a_dim1 = lda; a_offset = 1 + a_dim1; a -= a_offset; vl_dim1 = ldvl; vl_offset = 1 + vl_dim1; vl -= vl_offset; vr_dim1 = ldvr; vr_offset = 1 + vr_dim1; vr -= vr_offset; --work; /* Get machine constants */ eps = lapackf77_slamch("P"); smlnum = lapackf77_slamch("S"); bignum = 1. / smlnum; lapackf77_slabad(&smlnum, &bignum); smlnum = magma_ssqrt(smlnum) / eps; bignum = 1. / smlnum; /* Scale A if max element outside range [SMLNUM,BIGNUM] */ anrm = lapackf77_slange("M", &n, &n, &a[a_offset], &lda, dum); scalea = 0; if (anrm > 0. && anrm < smlnum) { scalea = 1; cscale = smlnum; } else if (anrm > bignum) { scalea = 1; cscale = bignum; } if (scalea) { lapackf77_slascl("G", &c__0, &c__0, &anrm, &cscale, &n, &n, &a[a_offset], &lda, &ierr); } /* Balance the matrix (Workspace: need N) */ ibal = 1; lapackf77_sgebal("B", &n, &a[a_offset], &lda, &ilo, &ihi, &work[ibal], &ierr); /* Reduce to upper Hessenberg form (Workspace: need 3*N, prefer 2*N+N*NB) */ itau = ibal + n; iwrk = itau + n; i__1 = lwork - iwrk + 1; //start = get_current_time(); #if defined(VERSION1) /* * Version 1 - LAPACK */ lapackf77_sgehrd(&n, &ilo, &ihi, &a[a_offset], &lda, &work[itau], &work[iwrk], &i__1, &ierr); #elif defined(VERSION2) /* * Version 2 - LAPACK consistent HRD */ magma_sgehrd2(n, ilo, ihi, &a[a_offset], lda, &work[itau], &work[iwrk], &i__1, &ierr); #elif defined(VERSION3) /* * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored, */ magma_sgehrd(n, ilo, ihi, &a[a_offset], lda, &work[itau], &work[iwrk], i__1, dT, 0, &ierr, queue); #endif //end = get_current_time(); //printf(" Time for sgehrd = %5.2f sec\n", GetTimerValue(start,end)/1000.); if (wantvl) { /* Want left eigenvectors Copy Householder vectors to VL */ side[0] = 'L'; lapackf77_slacpy(MagmaLowerStr, &n, &n, &a[a_offset], &lda, &vl[vl_offset], &ldvl); /* * Generate orthogonal matrix in VL * (Workspace: need 3*N-1, prefer 2*N+(N-1)*NB) */ i__1 = lwork - iwrk + 1; //start = get_current_time(); #if defined(VERSION1) || defined(VERSION2) /* * Version 1 & 2 - LAPACK */ lapackf77_sorghr(&n, &ilo, &ihi, &vl[vl_offset], &ldvl, &work[itau], &work[iwrk], &i__1, &ierr); #elif defined(VERSION3) /* * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored */ magma_sorghr(n, ilo, ihi, &vl[vl_offset], ldvl, &work[itau], dT, 0, nb, &ierr, queue); #endif //end = get_current_time(); //printf(" Time for sorghr = %5.2f sec\n", GetTimerValue(start,end)/1000.); /* * Perform QR iteration, accumulating Schur vectors in VL * (Workspace: need N+1, prefer N+HSWORK (see comments) ) */ iwrk = itau; i__1 = lwork - iwrk + 1; lapackf77_shseqr("S", "V", &n, &ilo, &ihi, &a[a_offset], &lda, WR, WI, &vl[vl_offset], &ldvl, &work[iwrk], &i__1, info); if (wantvr) { /* Want left and right eigenvectors Copy Schur vectors to VR */ side[0] = 'B'; lapackf77_slacpy("F", &n, &n, &vl[vl_offset], &ldvl, &vr[vr_offset], &ldvr); } } else if (wantvr) { /* Want right eigenvectors Copy Householder vectors to VR */ side[0] = 'R'; lapackf77_slacpy("L", &n, &n, &a[a_offset], &lda, &vr[vr_offset], &ldvr); /* * Generate orthogonal matrix in VR * (Workspace: need 3*N-1, prefer 2*N+(N-1)*NB) */ i__1 = lwork - iwrk + 1; //start = get_current_time(); #if defined(VERSION1) || defined(VERSION2) /* * Version 1 & 2 - LAPACK */ lapackf77_sorghr(&n, &ilo, &ihi, &vr[vr_offset], &ldvr, &work[itau], &work[iwrk], &i__1, &ierr); #elif defined(VERSION3) /* * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored */ magma_sorghr(n, ilo, ihi, &vr[vr_offset], ldvr, &work[itau], dT, 0, nb, &ierr, queue); #endif //end = get_current_time(); //printf(" Time for sorghr = %5.2f sec\n", GetTimerValue(start,end)/1000.); /* * Perform QR iteration, accumulating Schur vectors in VR * (Workspace: need N+1, prefer N+HSWORK (see comments) ) */ iwrk = itau; i__1 = lwork - iwrk + 1; lapackf77_shseqr("S", "V", &n, &ilo, &ihi, &a[a_offset], &lda, WR, WI, &vr[vr_offset], &ldvr, &work[iwrk], &i__1, info); } else { /* * Compute eigenvalues only * (Workspace: need N+1, prefer N+HSWORK (see comments) ) */ iwrk = itau; i__1 = lwork - iwrk + 1; lapackf77_shseqr("E", "N", &n, &ilo, &ihi, &a[a_offset], &lda, WR, WI, &vr[vr_offset], &ldvr, &work[iwrk], &i__1, info); } /* If INFO > 0 from SHSEQR, then quit */ if (*info > 0) { fprintf(stderr, "SHSEQR returned with info = %d\n", (int) *info); goto L50; } if (wantvl || wantvr) { /* * Compute left and/or right eigenvectors * (Workspace: need 4*N) */ lapackf77_strevc(side, "B", select, &n, &a[a_offset], &lda, &vl[vl_offset], &ldvl, &vr[vr_offset], &ldvr, &n, &nout, &work[iwrk], &ierr); } if (wantvl) { /* * Undo balancing of left eigenvectors * (Workspace: need N) */ lapackf77_sgebak("B", "L", &n, &ilo, &ihi, &work[ibal], &n, &vl[vl_offset], &ldvl, &ierr); /* Normalize left eigenvectors and make largest component real */ for (i__ = 1; i__ <= n; ++i__) { if ( WI[i__-1] == 0.) { scl = cblas_snrm2(n, &vl[i__ * vl_dim1 + 1], 1); scl = 1. / scl; cblas_sscal(n, (scl), &vl[i__ * vl_dim1 + 1], 1); } else if (WI[i__-1] > 0.) { d__1 = cblas_snrm2(n, &vl[ i__ * vl_dim1 + 1], 1); d__2 = cblas_snrm2(n, &vl[(i__ + 1) * vl_dim1 + 1], 1); scl = lapackf77_slapy2(&d__1, &d__2); scl = 1. / scl; cblas_sscal(n, (scl), &vl[ i__ * vl_dim1 + 1], 1); cblas_sscal(n, (scl), &vl[(i__ + 1) * vl_dim1 + 1], 1); i__2 = n; for (k = 1; k <= i__2; ++k) { /* Computing 2nd power */ d__1 = vl[k + i__ * vl_dim1]; /* Computing 2nd power */ d__2 = vl[k + (i__ + 1) * vl_dim1]; work[iwrk + k - 1] = d__1 * d__1 + d__2 * d__2; } /* Comment: Fortran BLAS does not have to add 1 C BLAS must add one to cblas_isamax */ k = cblas_isamax(n, &work[iwrk], 1)+1; lapackf77_slartg(&vl[k + i__ * vl_dim1], &vl[k + (i__ + 1) * vl_dim1], &cs, &sn, &r__); cblas_srot(n, &vl[ i__ * vl_dim1 + 1], 1, &vl[(i__ + 1) * vl_dim1 + 1], 1, cs, (sn)); vl[k + (i__ + 1) * vl_dim1] = 0.; } } } if (wantvr) { /* * Undo balancing of right eigenvectors * (Workspace: need N) */ lapackf77_sgebak("B", "R", &n, &ilo, &ihi, &work[ibal], &n, &vr[vr_offset], &ldvr, &ierr); /* Normalize right eigenvectors and make largest component real */ for (i__ = 1; i__ <= n; ++i__) { if (WI[i__-1] == 0.) { scl = 1. / cblas_snrm2(n, &vr[i__ * vr_dim1 + 1], 1); cblas_sscal(n, (scl), &vr[i__ * vr_dim1 + 1], 1); } else if (WI[i__-1] > 0.) { d__1 = cblas_snrm2(n, &vr[ i__ * vr_dim1 + 1], 1); d__2 = cblas_snrm2(n, &vr[(i__ + 1) * vr_dim1 + 1], 1); scl = lapackf77_slapy2(&d__1, &d__2); scl = 1. / scl; cblas_sscal(n, (scl), &vr[ i__ * vr_dim1 + 1], 1); cblas_sscal(n, (scl), &vr[(i__ + 1) * vr_dim1 + 1], 1); i__2 = n; for (k = 1; k <= i__2; ++k) { /* Computing 2nd power */ d__1 = vr[k + i__ * vr_dim1]; /* Computing 2nd power */ d__2 = vr[k + (i__ + 1) * vr_dim1]; work[iwrk + k - 1] = d__1 * d__1 + d__2 * d__2; } /* Comment: Fortran BLAS does not have to add 1 C BLAS must add one to cblas_isamax */ k = cblas_isamax(n, &work[iwrk], 1)+1; lapackf77_slartg(&vr[k + i__ * vr_dim1], &vr[k + (i__ + 1) * vr_dim1], &cs, &sn, &r__); cblas_srot(n, &vr[ i__ * vr_dim1 + 1], 1, &vr[(i__ + 1) * vr_dim1 + 1], 1, cs, (sn)); vr[k + (i__ + 1) * vr_dim1] = 0.; } } } /* Undo scaling if necessary */ L50: if (scalea) { i__1 = n - *info; /* Computing MAX */ i__3 = n - *info; i__2 = max(i__3,1); lapackf77_slascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, WR + (*info), &i__2, &ierr); i__1 = n - *info; /* Computing MAX */ i__3 = n - *info; i__2 = max(i__3,1); lapackf77_slascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, WI + (*info), &i__2, &ierr); if (*info > 0) { i__1 = ilo - 1; lapackf77_slascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, WR, &n, &ierr); i__1 = ilo - 1; lapackf77_slascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, WI, &n, &ierr); } } #if defined(VERSION3) magma_free( dT ); #endif return *info; } /* magma_sgeev */
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgehrd */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float *h_A, *h_R, *h_Q, *h_work, *tau, *twork, *dT; #if defined(PRECISION_z) || defined(PRECISION_c) float *rwork; #endif float eps, result[2]; magma_int_t N, n2, lda, nb, lwork, ltwork, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; eps = lapackf77_slamch( "E" ); magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) |A-QHQ'|/N|A| |I-QQ'|/N\n"); printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; nb = magma_get_sgehrd_nb(N); /* We suppose the magma nb is bigger than lapack nb */ lwork = N*nb; gflops = FLOPS_SGEHRD( N ) / 1e9; TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_CPU( tau, float, N ); TESTING_MALLOC_PIN( h_R, float, n2 ); TESTING_MALLOC_PIN( h_work, float, lwork ); TESTING_MALLOC_DEV( dT, float, nb*N ); /* Initialize the matrices */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_sgehrd( N, ione, N, h_R, lda, tau, h_work, lwork, dT, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sgehrd returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.check ) { ltwork = 2*(N*N); TESTING_MALLOC_PIN( h_Q, float, lda*N ); TESTING_MALLOC_CPU( twork, float, ltwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_CPU( rwork, float, N ); #endif lapackf77_slacpy(MagmaUpperLowerStr, &N, &N, h_R, &lda, h_Q, &lda); for( int j = 0; j < N-1; ++j ) for( int i = j+2; i < N; ++i ) h_R[i+j*lda] = MAGMA_S_ZERO; magma_sorghr(N, ione, N, h_Q, lda, tau, dT, nb, &info); if (info != 0) { printf("magma_sorghr returned error %d: %s.\n", (int) info, magma_strerror( info )); exit(1); } #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_shst01(&N, &ione, &N, h_A, &lda, h_R, &lda, h_Q, &lda, twork, <work, rwork, result); #else lapackf77_shst01(&N, &ione, &N, h_A, &lda, h_R, &lda, h_Q, &lda, twork, <work, result); #endif TESTING_FREE_PIN( h_Q ); TESTING_FREE_CPU( twork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE_CPU( rwork ); #endif } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_sgehrd(&N, &ione, &N, h_R, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sgehrd returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Print performance and error. =================================================================== */ if ( opts.lapack ) { printf("%5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d --- ( --- ) %7.2f (%7.2f)", (int) N, gpu_perf, gpu_time ); } if ( opts.check ) { printf(" %8.2e %8.2e %s\n", result[0]*eps, result[1]*eps, ( ( (result[0]*eps < tol) && (result[1]*eps < tol) ) ? "ok" : "failed") ); status += ! (result[0]*eps < tol); status += ! (result[1]*eps < tol); } else { printf(" --- ---\n"); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( tau ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/***************************************************************************//** Purpose ------- SGEHRD2 reduces a REAL general matrix A to upper Hessenberg form H by an orthogonal similarity transformation: Q' * A * Q = H . Arguments --------- @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] ilo INTEGER @param[in] ihi INTEGER It is assumed that A is already upper triangular in rows and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally set by a previous call to SGEBAL; otherwise they should be set to 1 and N respectively. See Further Details. 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the N-by-N general matrix to be reduced. On exit, the upper triangle and the first subdiagonal of A are overwritten with the upper Hessenberg matrix H, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau REAL array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to zero. @param[out] work (workspace) REAL array, dimension (LWORK) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. LWORK >= max(1,N). For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value. Further Details --------------- The matrix Q is represented as a product of (ihi-ilo) elementary reflectors Q = H(ilo) H(ilo+1) . . . H(ihi-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on exit in A(i+2:ihi,i), and tau in TAU(i). The contents of A are illustrated by the following example, with n = 7, ilo = 2 and ihi = 6: @verbatim on entry, on exit, ( a a a a a a a ) ( a a h h h h a ) ( a a a a a a ) ( a h h h h a ) ( a a a a a a ) ( h h h h h h ) ( a a a a a a ) ( v2 h h h h h ) ( a a a a a a ) ( v2 v3 h h h h ) ( a a a a a a ) ( v2 v3 v4 h h h ) ( a ) ( a ) @endverbatim where a denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. @ingroup magma_gehrd *******************************************************************************/ extern "C" magma_int_t magma_sgehrd2( magma_int_t n, magma_int_t ilo, magma_int_t ihi, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, magma_int_t *info) { #define A(i_,j_) ( A + (i_) + (j_)*lda) #ifdef HAVE_clBLAS #define dA(i_,j_) dwork, ((i_) + (j_)*ldda + nb*ldda*2) #define dT(i_,j_) dT, ((i_) + (j_)*nb + dT_offset) #define dV(i_,j_) dwork, ((i_) + (j_)*ldda + nb*ldda) #define dwork(i_) dwork, ((i_)) #else #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dT(i_,j_) (dT + (i_) + (j_)*nb) #define dV(i_,j_) (dV + (i_) + (j_)*ldda) #define dwork(i_) (dwork + (i_)) #endif // Constants const float c_one = MAGMA_S_ONE; const float c_zero = MAGMA_S_ZERO; // Local variables magma_int_t nb = magma_get_sgehrd_nb( n ); magma_int_t ldda = magma_roundup( n, 32 ); magma_int_t i, nh, iws; magma_int_t iinfo; magma_int_t lquery; *info = 0; iws = n*nb; work[0] = magma_smake_lwork( iws ); lquery = (lwork == -1); if (n < 0) { *info = -1; } else if (ilo < 1 || ilo > max(1,n)) { *info = -2; } else if (ihi < min(ilo,n) || ihi > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; // Adjust from 1-based indexing ilo -= 1; // Quick return if possible nh = ihi - ilo; if (nh <= 1) { work[0] = c_one; return *info; } // If not enough workspace, use unblocked code if ( lwork < iws ) { nb = 1; } if (nb == 1 || nb > nh) { // Use unblocked code below i = ilo; } else { // Use blocked code magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); // GPU workspace is: // nb*ldda for dwork for slahru // nb*ldda for dV // n*ldda for dA // nb*nb for dT magmaFloat_ptr dwork; if (MAGMA_SUCCESS != magma_smalloc( &dwork, 2*nb*ldda + n*ldda + nb*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } float *dV = dwork + nb*ldda; float *dA = dwork + nb*ldda*2; float *dT = dwork + nb*ldda*2 + n*ldda; float *T; magma_smalloc_cpu( &T, nb*nb ); if ( T == NULL ) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } // zero first block of V, which is lower triangular magmablas_slaset( MagmaFull, nb, nb, c_zero, c_zero, dV(0,0), ldda, queue ); // Set elements 0:ILO-1 and IHI-1:N-2 of TAU to zero for (i = 0; i < ilo; ++i) tau[i] = c_zero; for (i = max(0,ihi-1); i < n-1; ++i) tau[i] = c_zero; assert( nb % 4 == 0 ); for (i=0; i < nb*nb; i += 4) T[i] = T[i+1] = T[i+2] = T[i+3] = c_zero; // Copy the matrix to the GPU magma_ssetmatrix( n, n-ilo, A(0,ilo), lda, dA(0,0), ldda, queue ); for (i = ilo; i < ihi-1 - nb; i += nb) { // Reduce columns i:i+nb-1 to Hessenberg form, returning the // matrices V and T of the block reflector H = I - V*T*V' // which performs the reduction, and also the matrix Y = A*V*T // Get the current panel (no need for the 1st iteration) magma_sgetmatrix( ihi-i, nb, dA(i,i-ilo), ldda, A(i,i), lda, queue ); // add 1 to i for 1-based index magma_slahr2( ihi, i+1, nb, dA(0,i-ilo), ldda, dV(0,0), ldda, A(0,i), lda, &tau[i], T, nb, work, n, queue ); // Copy T from the CPU to dT on the GPU magma_ssetmatrix( nb, nb, T, nb, dT(0,0), nb, queue ); magma_slahru( n, ihi, i, nb, A(0,i), lda, dA(0,i-ilo), ldda, // dA dA(i,i-ilo), ldda, // dY, stored over current panel dV(0,0), ldda, dT(0,0), dwork, queue ); } // Copy remainder to host magma_sgetmatrix( n, n-i, dA(0,i-ilo), ldda, A(0,i), lda, queue ); magma_free( dwork ); magma_free_cpu( T ); magma_queue_destroy( queue ); } // Use unblocked code to reduce the rest of the matrix // add 1 to i for 1-based index i += 1; lapackf77_sgehd2(&n, &i, &ihi, A, &lda, tau, work, &iinfo); work[0] = magma_smake_lwork( iws ); return *info; } /* magma_sgehrd2 */
/** Purpose ------- SGEEV computes for an N-by-N real nonsymmetric matrix A, the eigenvalues and, optionally, the left and/or right eigenvectors. The right eigenvector v(j) of A satisfies A * v(j) = lambda(j) * v(j) where lambda(j) is its eigenvalue. The left eigenvector u(j) of A satisfies u(j)**T * A = lambda(j) * u(j)**T where u(j)**T denotes the transpose of u(j). The computed eigenvectors are normalized to have Euclidean norm equal to 1 and largest component real. Arguments --------- @param[in] jobvl magma_vec_t - = MagmaNoVec: left eigenvectors of A are not computed; - = MagmaVec: left eigenvectors of are computed. @param[in] jobvr magma_vec_t - = MagmaNoVec: right eigenvectors of A are not computed; - = MagmaVec: right eigenvectors of A are computed. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the N-by-N matrix A. On exit, A has been overwritten. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] wr REAL array, dimension (N) @param[out] wi REAL array, dimension (N) WR and WI contain the real and imaginary parts, respectively, of the computed eigenvalues. Complex conjugate pairs of eigenvalues appear consecutively with the eigenvalue having the positive imaginary part first. @param[out] VL REAL array, dimension (LDVL,N) If JOBVL = MagmaVec, the left eigenvectors u(j) are stored one after another in the columns of VL, in the same order as their eigenvalues. If JOBVL = MagmaNoVec, VL is not referenced. u(j) = VL(:,j), the j-th column of VL. @param[in] ldvl INTEGER The leading dimension of the array VL. LDVL >= 1; if JOBVL = MagmaVec, LDVL >= N. @param[out] VR REAL array, dimension (LDVR,N) If JOBVR = MagmaVec, the right eigenvectors v(j) are stored one after another in the columns of VR, in the same order as their eigenvalues. If JOBVR = MagmaNoVec, VR is not referenced. v(j) = VR(:,j), the j-th column of VR. @param[in] ldvr INTEGER The leading dimension of the array VR. LDVR >= 1; if JOBVR = MagmaVec, LDVR >= N. @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= (2 + nb + nb*ngpu)*N. For optimal performance, LWORK >= (2 + 2*nb + nb*ngpu)*N. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value. - > 0: if INFO = i, the QR algorithm failed to compute all the eigenvalues, and no eigenvectors have been computed; elements and i+1:N of W contain eigenvalues which have converged. @ingroup magma_sgeev_driver ********************************************************************/ extern "C" magma_int_t magma_sgeev_m( magma_vec_t jobvl, magma_vec_t jobvr, magma_int_t n, float *A, magma_int_t lda, #ifdef COMPLEX float *w, #else float *wr, float *wi, #endif float *VL, magma_int_t ldvl, float *VR, magma_int_t ldvr, float *work, magma_int_t lwork, #ifdef COMPLEX float *rwork, #endif magma_int_t *info ) { #define VL(i,j) (VL + (i) + (j)*ldvl) #define VR(i,j) (VR + (i) + (j)*ldvr) const magma_int_t ione = 1; const magma_int_t izero = 0; float d__1, d__2; float r, cs, sn, scl; float dum[1], eps; float anrm, cscale, bignum, smlnum; magma_int_t i, k, ilo, ihi; magma_int_t ibal, ierr, itau, iwrk, nout, liwrk, nb; magma_int_t scalea, minwrk, optwrk, lquery, wantvl, wantvr, select[1]; magma_side_t side = MagmaRight; magma_int_t ngpu = magma_num_gpus(); magma_timer_t time_total=0, time_gehrd=0, time_unghr=0, time_hseqr=0, time_trevc=0, time_sum=0; magma_flops_t flop_total=0, flop_gehrd=0, flop_unghr=0, flop_hseqr=0, flop_trevc=0, flop_sum=0; timer_start( time_total ); flops_start( flop_total ); *info = 0; lquery = (lwork == -1); wantvl = (jobvl == MagmaVec); wantvr = (jobvr == MagmaVec); if (! wantvl && jobvl != MagmaNoVec) { *info = -1; } else if (! wantvr && jobvr != MagmaNoVec) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if ( (ldvl < 1) || (wantvl && (ldvl < n))) { *info = -9; } else if ( (ldvr < 1) || (wantvr && (ldvr < n))) { *info = -11; } /* Compute workspace */ nb = magma_get_sgehrd_nb( n ); if (*info == 0) { minwrk = (2 + nb + nb*ngpu)*n; optwrk = (2 + 2*nb + nb*ngpu)*n; work[0] = magma_smake_lwork( optwrk ); if (lwork < minwrk && ! lquery) { *info = -13; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } #if defined(Version3) float *dT; if (MAGMA_SUCCESS != magma_smalloc( &dT, nb*n )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } #endif #if defined(Version5) float *T; if (MAGMA_SUCCESS != magma_smalloc_cpu( &T, nb*n )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } #endif /* Get machine constants */ eps = lapackf77_slamch( "P" ); smlnum = lapackf77_slamch( "S" ); bignum = 1. / smlnum; lapackf77_slabad( &smlnum, &bignum ); smlnum = magma_ssqrt( smlnum ) / eps; bignum = 1. / smlnum; /* Scale A if max element outside range [SMLNUM,BIGNUM] */ anrm = lapackf77_slange( "M", &n, &n, A, &lda, dum ); scalea = 0; if (anrm > 0. && anrm < smlnum) { scalea = 1; cscale = smlnum; } else if (anrm > bignum) { scalea = 1; cscale = bignum; } if (scalea) { lapackf77_slascl( "G", &izero, &izero, &anrm, &cscale, &n, &n, A, &lda, &ierr ); } /* Balance the matrix * (Workspace: need N) * - this space is reserved until after gebak */ ibal = 0; lapackf77_sgebal( "B", &n, A, &lda, &ilo, &ihi, &work[ibal], &ierr ); /* Reduce to upper Hessenberg form * (Workspace: need 3*N, prefer 2*N + N*NB + NB*NGPU) * - added NB*NGPU needed for multi-GPU magma_sgehrd_m * - including N reserved for gebal/gebak, unused by sgehrd */ itau = ibal + n; iwrk = itau + n; liwrk = lwork - iwrk; timer_start( time_gehrd ); flops_start( flop_gehrd ); #if defined(Version1) // Version 1 - LAPACK lapackf77_sgehrd( &n, &ilo, &ihi, A, &lda, &work[itau], &work[iwrk], &liwrk, &ierr ); #elif defined(Version2) // Version 2 - LAPACK consistent HRD magma_sgehrd2( n, ilo, ihi, A, lda, &work[itau], &work[iwrk], liwrk, &ierr ); #elif defined(Version3) // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored, magma_sgehrd( n, ilo, ihi, A, lda, &work[itau], &work[iwrk], liwrk, dT, &ierr ); #elif defined(Version5) // Version 4 - Multi-GPU, T on host magma_sgehrd_m( n, ilo, ihi, A, lda, &work[itau], &work[iwrk], liwrk, T, &ierr ); #endif time_sum += timer_stop( time_gehrd ); flop_sum += flops_stop( flop_gehrd ); if (wantvl) { /* Want left eigenvectors * Copy Householder vectors to VL */ side = MagmaLeft; lapackf77_slacpy( MagmaLowerStr, &n, &n, A, &lda, VL, &ldvl ); /* Generate orthogonal matrix in VL * (Workspace: need 3*N-1, prefer 2*N + (N-1)*NB) * - including N reserved for gebal/gebak, unused by sorghr */ timer_start( time_unghr ); flops_start( flop_unghr ); #if defined(Version1) || defined(Version2) // Version 1 & 2 - LAPACK lapackf77_sorghr( &n, &ilo, &ihi, VL, &ldvl, &work[itau], &work[iwrk], &liwrk, &ierr ); #elif defined(Version3) // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored magma_sorghr( n, ilo, ihi, VL, ldvl, &work[itau], dT, nb, &ierr ); #elif defined(Version5) // Version 5 - Multi-GPU, T on host magma_sorghr_m( n, ilo, ihi, VL, ldvl, &work[itau], T, nb, &ierr ); #endif time_sum += timer_stop( time_unghr ); flop_sum += flops_stop( flop_unghr ); timer_start( time_hseqr ); flops_start( flop_hseqr ); /* Perform QR iteration, accumulating Schur vectors in VL * (Workspace: need N+1, prefer N+HSWORK (see comments) ) * - including N reserved for gebal/gebak, unused by shseqr */ iwrk = itau; liwrk = lwork - iwrk; lapackf77_shseqr( "S", "V", &n, &ilo, &ihi, A, &lda, wr, wi, VL, &ldvl, &work[iwrk], &liwrk, info ); time_sum += timer_stop( time_hseqr ); flop_sum += flops_stop( flop_hseqr ); if (wantvr) { /* Want left and right eigenvectors * Copy Schur vectors to VR */ side = MagmaBothSides; lapackf77_slacpy( "F", &n, &n, VL, &ldvl, VR, &ldvr ); } } else if (wantvr) { /* Want right eigenvectors * Copy Householder vectors to VR */ side = MagmaRight; lapackf77_slacpy( "L", &n, &n, A, &lda, VR, &ldvr ); /* Generate orthogonal matrix in VR * (Workspace: need 3*N-1, prefer 2*N + (N-1)*NB) * - including N reserved for gebal/gebak, unused by sorghr */ timer_start( time_unghr ); flops_start( flop_unghr ); #if defined(Version1) || defined(Version2) // Version 1 & 2 - LAPACK lapackf77_sorghr( &n, &ilo, &ihi, VR, &ldvr, &work[itau], &work[iwrk], &liwrk, &ierr ); #elif defined(Version3) // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored magma_sorghr( n, ilo, ihi, VR, ldvr, &work[itau], dT, nb, &ierr ); #elif defined(Version5) // Version 5 - Multi-GPU, T on host magma_sorghr_m( n, ilo, ihi, VR, ldvr, &work[itau], T, nb, &ierr ); #endif time_sum += timer_stop( time_unghr ); flop_sum += flops_stop( flop_unghr ); /* Perform QR iteration, accumulating Schur vectors in VR * (Workspace: need N+1, prefer N+HSWORK (see comments) ) * - including N reserved for gebal/gebak, unused by shseqr */ timer_start( time_hseqr ); flops_start( flop_hseqr ); iwrk = itau; liwrk = lwork - iwrk; lapackf77_shseqr( "S", "V", &n, &ilo, &ihi, A, &lda, wr, wi, VR, &ldvr, &work[iwrk], &liwrk, info ); time_sum += timer_stop( time_hseqr ); flop_sum += flops_stop( flop_hseqr ); } else { /* Compute eigenvalues only * (Workspace: need N+1, prefer N+HSWORK (see comments) ) * - including N reserved for gebal/gebak, unused by shseqr */ timer_start( time_hseqr ); flops_start( flop_hseqr ); iwrk = itau; liwrk = lwork - iwrk; lapackf77_shseqr( "E", "N", &n, &ilo, &ihi, A, &lda, wr, wi, VR, &ldvr, &work[iwrk], &liwrk, info ); time_sum += timer_stop( time_hseqr ); flop_sum += flops_stop( flop_hseqr ); } /* If INFO > 0 from SHSEQR, then quit */ if (*info > 0) { goto CLEANUP; } timer_start( time_trevc ); flops_start( flop_trevc ); if (wantvl || wantvr) { /* Compute left and/or right eigenvectors * (Workspace: need 4*N, prefer (2 + 2*nb)*N) * - including N reserved for gebal/gebak, unused by strevc */ liwrk = lwork - iwrk; #if TREVC_VERSION == 1 lapackf77_strevc( lapack_side_const(side), "B", select, &n, A, &lda, VL, &ldvl, VR, &ldvr, &n, &nout, &work[iwrk], &ierr ); #elif TREVC_VERSION == 2 lapackf77_strevc3( lapack_side_const(side), "B", select, &n, A, &lda, VL, &ldvl, VR, &ldvr, &n, &nout, &work[iwrk], &liwrk, &ierr ); #elif TREVC_VERSION == 3 magma_strevc3( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl, VR, ldvr, n, &nout, &work[iwrk], liwrk, &ierr ); #elif TREVC_VERSION == 4 magma_strevc3_mt( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl, VR, ldvr, n, &nout, &work[iwrk], liwrk, &ierr ); #elif TREVC_VERSION == 5 magma_strevc3_mt_gpu( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl, VR, ldvr, n, &nout, &work[iwrk], liwrk, &ierr ); #else #error Unknown TREVC_VERSION #endif } time_sum += timer_stop( time_trevc ); flop_sum += flops_stop( flop_trevc ); if (wantvl) { /* Undo balancing of left eigenvectors * (Workspace: need N) */ lapackf77_sgebak( "B", "L", &n, &ilo, &ihi, &work[ibal], &n, VL, &ldvl, &ierr ); /* Normalize left eigenvectors and make largest component real */ for (i = 0; i < n; ++i) { if ( wi[i] == 0. ) { scl = 1. / magma_cblas_snrm2( n, VL(0,i), 1 ); blasf77_sscal( &n, &scl, VL(0,i), &ione ); } else if ( wi[i] > 0. ) { d__1 = magma_cblas_snrm2( n, VL(0,i), 1 ); d__2 = magma_cblas_snrm2( n, VL(0,i+1), 1 ); scl = 1. / lapackf77_slapy2( &d__1, &d__2 ); blasf77_sscal( &n, &scl, VL(0,i), &ione ); blasf77_sscal( &n, &scl, VL(0,i+1), &ione ); for (k = 0; k < n; ++k) { /* Computing 2nd power */ d__1 = *VL(k,i); d__2 = *VL(k,i+1); work[iwrk + k] = d__1*d__1 + d__2*d__2; } k = blasf77_isamax( &n, &work[iwrk], &ione ) - 1; // subtract 1; k is 0-based lapackf77_slartg( VL(k,i), VL(k,i+1), &cs, &sn, &r ); blasf77_srot( &n, VL(0,i), &ione, VL(0,i+1), &ione, &cs, &sn ); *VL(k,i+1) = 0.; } } } if (wantvr) { /* Undo balancing of right eigenvectors * (Workspace: need N) */ lapackf77_sgebak( "B", "R", &n, &ilo, &ihi, &work[ibal], &n, VR, &ldvr, &ierr ); /* Normalize right eigenvectors and make largest component real */ for (i = 0; i < n; ++i) { if ( wi[i] == 0. ) { scl = 1. / magma_cblas_snrm2( n, VR(0,i), 1 ); blasf77_sscal( &n, &scl, VR(0,i), &ione ); } else if ( wi[i] > 0. ) { d__1 = magma_cblas_snrm2( n, VR(0,i), 1 ); d__2 = magma_cblas_snrm2( n, VR(0,i+1), 1 ); scl = 1. / lapackf77_slapy2( &d__1, &d__2 ); blasf77_sscal( &n, &scl, VR(0,i), &ione ); blasf77_sscal( &n, &scl, VR(0,i+1), &ione ); for (k = 0; k < n; ++k) { /* Computing 2nd power */ d__1 = *VR(k,i); d__2 = *VR(k,i+1); work[iwrk + k] = d__1*d__1 + d__2*d__2; } k = blasf77_isamax( &n, &work[iwrk], &ione ) - 1; // subtract 1; k is 0-based lapackf77_slartg( VR(k,i), VR(k,i+1), &cs, &sn, &r ); blasf77_srot( &n, VR(0,i), &ione, VR(0,i+1), &ione, &cs, &sn ); *VR(k,i+1) = 0.; } } } CLEANUP: /* Undo scaling if necessary */ if (scalea) { // converged eigenvalues, stored in wr[i+1:n] and wi[i+1:n] for i = INFO magma_int_t nval = n - (*info); magma_int_t ld = max( nval, 1 ); lapackf77_slascl( "G", &izero, &izero, &cscale, &anrm, &nval, &ione, wr + (*info), &ld, &ierr ); lapackf77_slascl( "G", &izero, &izero, &cscale, &anrm, &nval, &ione, wi + (*info), &ld, &ierr ); if (*info > 0) { // first ilo columns were already upper triangular, // so the corresponding eigenvalues are also valid. nval = ilo - 1; lapackf77_slascl( "G", &izero, &izero, &cscale, &anrm, &nval, &ione, wr, &n, &ierr ); lapackf77_slascl( "G", &izero, &izero, &cscale, &anrm, &nval, &ione, wi, &n, &ierr ); } } #if defined(Version3) magma_free( dT ); #endif #if defined(Version5) magma_free_cpu( T ); #endif timer_stop( time_total ); flops_stop( flop_total ); timer_printf( "sgeev times n %5d, gehrd %7.3f, unghr %7.3f, hseqr %7.3f, trevc %7.3f, total %7.3f, sum %7.3f\n", (int) n, time_gehrd, time_unghr, time_hseqr, time_trevc, time_total, time_sum ); timer_printf( "sgeev flops n %5d, gehrd %7lld, unghr %7lld, hseqr %7lld, trevc %7lld, total %7lld, sum %7lld\n", (int) n, flop_gehrd, flop_unghr, flop_hseqr, flop_trevc, flop_total, flop_sum ); work[0] = magma_smake_lwork( optwrk ); return *info; } /* magma_sgeev */
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgeev */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time, cpu_time; float *h_A, *h_R, *VL, *VR, *h_work, *w1, *w2; float *w1i, *w2i; magmaFloatComplex *w1copy, *w2copy; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; float tnrm, result[9]; magma_int_t N, n2, lda, nb, lwork, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float ulp, ulpinv, error; magma_int_t status = 0; ulp = lapackf77_slamch( "P" ); ulpinv = 1./ulp; magma_opts opts; parse_opts( argc, argv, &opts ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); float tolulp = opts.tolerance * lapackf77_slamch("P"); // enable at least some minimal checks, if requested if ( opts.check && !opts.lapack && opts.jobvl == MagmaNoVec && opts.jobvr == MagmaNoVec ) { fprintf( stderr, "NOTE: Some checks require vectors to be computed;\n" " set jobvl=V (option -LV), or jobvr=V (option -RV), or both.\n" " Some checks require running lapack (-l); setting lapack.\n\n"); opts.lapack = true; } printf(" N CPU Time (sec) GPU Time (sec) |W_magma - W_lapack| / |W_lapack|\n"); printf("===========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = N; n2 = lda*N; nb = magma_get_sgehrd_nb(N); lwork = N*(2 + nb); // generous workspace - required by sget22 lwork = max( lwork, N*(5 + 2*N) ); TESTING_MALLOC_CPU( w1copy, magmaFloatComplex, N ); TESTING_MALLOC_CPU( w2copy, magmaFloatComplex, N ); TESTING_MALLOC_CPU( w1, float, N ); TESTING_MALLOC_CPU( w2, float, N ); TESTING_MALLOC_CPU( w1i, float, N ); TESTING_MALLOC_CPU( w2i, float, N ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_PIN( h_R, float, n2 ); TESTING_MALLOC_PIN( VL, float, n2 ); TESTING_MALLOC_PIN( VR, float, n2 ); TESTING_MALLOC_PIN( h_work, float, lwork ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_sgeev( opts.jobvl, opts.jobvr, N, h_R, lda, w1, w1i, VL, lda, VR, lda, h_work, lwork, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_sgeev returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result =================================================================== */ if ( opts.check ) { /* =================================================================== * Check the result following LAPACK's [zcds]drvev routine. * The following tests are performed: * (1) | A * VR - VR * W | / ( n |A| ) * * Here VR is the matrix of unit right eigenvectors. * W is a diagonal matrix with diagonal entries W(j). * * (2) | |VR(i)| - 1 | and whether largest component real * * VR(i) denotes the i-th column of VR. * * (3) | A**T * VL - VL * W**T | / ( n |A| ) * * Here VL is the matrix of unit left eigenvectors, A**T is the * transpose of A, and W is as above. * * (4) | |VL(i)| - 1 | and whether largest component real * * VL(i) denotes the i-th column of VL. * * (5) W(full) = W(partial, W only) -- currently skipped * (6) W(full) = W(partial, W and VR) * (7) W(full) = W(partial, W and VL) * * W(full) denotes the eigenvalues computed when both VR and VL * are also computed, and W(partial) denotes the eigenvalues * computed when only W, only W and VR, or only W and VL are * computed. * * (8) VR(full) = VR(partial, W and VR) * * VR(full) denotes the right eigenvectors computed when both VR * and VL are computed, and VR(partial) denotes the result * when only VR is computed. * * (9) VL(full) = VL(partial, W and VL) * * VL(full) denotes the left eigenvectors computed when both VR * and VL are also computed, and VL(partial) denotes the result * when only VL is computed. * * (1, 2) only if jobvr = V * (3, 4) only if jobvl = V * (5-9) only if check = 2 (option -c2) ================================================================= */ float vmx, vrmx, vtst; // Initialize result. -1 indicates test was not run. for( int j = 0; j < 9; ++j ) result[j] = -1.; if ( opts.jobvr == MagmaVec ) { // Do test 1: | A * VR - VR * W | / ( n |A| ) // Note this writes result[1] also lapackf77_sget22( MagmaNoTransStr, MagmaNoTransStr, MagmaNoTransStr, &N, h_A, &lda, VR, &lda, w1, w1i, h_work, &result[0] ); result[0] *= ulp; // Do test 2: | |VR(i)| - 1 | and whether largest component real result[1] = -1.; for( int j = 0; j < N; ++j ) { tnrm = 1.; if (w1i[j] == 0.) tnrm = cblas_snrm2(N, &VR[j*lda], ione); else if (w1i[j] > 0.) tnrm = magma_slapy2( cblas_snrm2(N, &VR[j *lda], ione), cblas_snrm2(N, &VR[(j+1)*lda], ione) ); result[1] = max( result[1], min( ulpinv, MAGMA_S_ABS(tnrm-1.)/ulp )); if (w1i[j] > 0.) { vmx = vrmx = 0.; for( int jj = 0; jj < N; ++jj ) { vtst = magma_slapy2( VR[jj+j*lda], VR[jj+(j+1)*lda]); if (vtst > vmx) vmx = vtst; if ( (VR[jj + (j+1)*lda])==0. && MAGMA_S_ABS( VR[jj+j*lda] ) > vrmx) { vrmx = MAGMA_S_ABS( VR[jj+j*lda] ); } } if (vrmx / vmx < 1. - ulp*2.) result[1] = ulpinv; } } result[1] *= ulp; } if ( opts.jobvl == MagmaVec ) { // Do test 3: | A**T * VL - VL * W**T | / ( n |A| ) // Note this writes result[3] also lapackf77_sget22( MagmaTransStr, MagmaNoTransStr, MagmaTransStr, &N, h_A, &lda, VL, &lda, w1, w1i, h_work, &result[2] ); result[2] *= ulp; // Do test 4: | |VL(i)| - 1 | and whether largest component real result[3] = -1.; for( int j = 0; j < N; ++j ) { tnrm = 1.; if (w1i[j] == 0.) tnrm = cblas_snrm2(N, &VL[j*lda], ione); else if (w1i[j] > 0.) tnrm = magma_slapy2( cblas_snrm2(N, &VL[j *lda], ione), cblas_snrm2(N, &VL[(j+1)*lda], ione) ); result[3] = max( result[3], min( ulpinv, MAGMA_S_ABS(tnrm-1.)/ulp )); if (w1i[j] > 0.) { vmx = vrmx = 0.; for( int jj = 0; jj < N; ++jj ) { vtst = magma_slapy2( VL[jj+j*lda], VL[jj+(j+1)*lda]); if (vtst > vmx) vmx = vtst; if ( (VL[jj + (j+1)*lda])==0. && MAGMA_S_ABS( VL[jj+j*lda]) > vrmx) { vrmx = MAGMA_S_ABS( VL[jj+j*lda] ); } } if (vrmx / vmx < 1. - ulp*2.) result[3] = ulpinv; } } result[3] *= ulp; } } if ( opts.check == 2 ) { // more extensive tests // this is really slow because it calls magma_zgeev multiple times float *LRE, DUM; TESTING_MALLOC_PIN( LRE, float, n2 ); lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); // ---------- // Compute eigenvalues, left and right eigenvectors magma_sgeev( MagmaVec, MagmaVec, N, h_R, lda, w1, w1i, VL, lda, VR, lda, h_work, lwork, &info ); if (info != 0) printf("magma_zgeev (case V, V) returned error %d: %s.\n", (int) info, magma_strerror( info )); // ---------- // Compute eigenvalues only // These are not exactly equal, and not in the same order, so skip for now. //lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); //magma_sgeev( MagmaNoVec, MagmaNoVec, // N, h_R, lda, w2, w2i, // &DUM, 1, &DUM, 1, // h_work, lwork, &info ); //if (info != 0) // printf("magma_sgeev (case N, N) returned error %d: %s.\n", // (int) info, magma_strerror( info )); // //// Do test 5: W(full) = W(partial, W only) //result[4] = 1; //for( int j = 0; j < N; ++j ) // if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) // result[4] = 0; // ---------- // Compute eigenvalues and right eigenvectors lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_sgeev( MagmaNoVec, MagmaVec, N, h_R, lda, w2, w2i, &DUM, 1, LRE, lda, h_work, lwork, &info ); if (info != 0) printf("magma_sgeev (case N, V) returned error %d: %s.\n", (int) info, magma_strerror( info )); // Do test 6: W(full) = W(partial, W and VR) result[5] = 1; for( int j = 0; j < N; ++j ) if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) result[5] = 0; // Do test 8: VR(full) = VR(partial, W and VR) result[7] = 1; for( int j = 0; j < N; ++j ) for( int jj = 0; jj < N; ++jj ) if ( ! MAGMA_S_EQUAL( VR[j+jj*lda], LRE[j+jj*lda] )) result[7] = 0; // ---------- // Compute eigenvalues and left eigenvectors lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_sgeev( MagmaVec, MagmaNoVec, N, h_R, lda, w2, w2i, LRE, lda, &DUM, 1, h_work, lwork, &info ); if (info != 0) printf("magma_sgeev (case V, N) returned error %d: %s.\n", (int) info, magma_strerror( info )); // Do test 7: W(full) = W(partial, W and VL) result[6] = 1; for( int j = 0; j < N; ++j ) if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) result[6] = 0; // Do test 9: VL(full) = VL(partial, W and VL) result[8] = 1; for( int j = 0; j < N; ++j ) for( int jj = 0; jj < N; ++jj ) if ( ! MAGMA_S_EQUAL( VL[j+jj*lda], LRE[j+jj*lda] )) result[8] = 0; TESTING_FREE_PIN( LRE ); } /* ===================================================================== Performs operation using LAPACK Do this after checks, because it overwrites VL and VR. =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_sgeev( &opts.jobvl, &opts.jobvr, &N, h_A, &lda, w2, w2i, VL, &lda, VR, &lda, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_sgeev returned error %d: %s.\n", (int) info, magma_strerror( info )); // check | W_magma - W_lapack | / | W | // need to sort eigenvalues first // copy them into complex vectors for ease for( int j=0; j < N; ++j ) { w1copy[j] = MAGMA_C_MAKE( w1[j], w1i[j] ); w2copy[j] = MAGMA_C_MAKE( w2[j], w2i[j] ); } std::sort( w1copy, &w1copy[N], compare ); std::sort( w2copy, &w2copy[N], compare ); // adjust sorting to deal with numerical inaccuracy // search down w2 for eigenvalue that matches w1's eigenvalue for( int j=0; j < N; ++j ) { for( int j2=j; j2 < N; ++j2 ) { magmaFloatComplex diff = MAGMA_C_SUB( w1copy[j], w2copy[j2] ); float diff2 = magma_szlapy2( diff ) / max( magma_szlapy2( w1copy[j] ), tol ); if ( diff2 < 100*tol ) { if ( j != j2 ) { std::swap( w2copy[j], w2copy[j2] ); } break; } } } blasf77_caxpy( &N, &c_neg_one, w2copy, &ione, w1copy, &ione ); error = cblas_scnrm2( N, w1copy, 1 ); error /= cblas_scnrm2( N, w2copy, 1 ); printf("%5d %7.2f %7.2f %.2e %s\n", (int) N, cpu_time, gpu_time, error, (error < tolulp ? " ok" : " failed")); status |= ! (error < tolulp); } else { printf("%5d --- %7.2f\n", (int) N, gpu_time); } if ( opts.check ) { // -1 indicates test was not run if ( result[0] != -1 ) { printf(" | A * VR - VR * W | / ( n |A| ) = %8.2e %s\n", result[0], (result[0] < tol ? " ok" : " failed")); } if ( result[1] != -1 ) { printf(" | |VR(i)| - 1 | = %8.2e %s\n", result[1], (result[1] < tol ? " ok" : " failed")); } if ( result[2] != -1 ) { printf(" | A'* VL - VL * W'| / ( n |A| ) = %8.2e %s\n", result[2], (result[2] < tol ? " ok" : " failed")); } if ( result[3] != -1 ) { printf(" | |VL(i)| - 1 | = %8.2e %s\n", result[3], (result[3] < tol ? " ok" : " failed")); } if ( result[4] != -1 ) { printf(" W (full) == W (partial, W only) %s\n", (result[4] == 1. ? " ok" : " failed")); } if ( result[5] != -1 ) { printf(" W (full) == W (partial, W and VR) %s\n", (result[5] == 1. ? " ok" : " failed")); } if ( result[6] != -1 ) { printf(" W (full) == W (partial, W and VL) %s\n", (result[6] == 1. ? " ok" : " failed")); } if ( result[7] != -1 ) { printf(" VR (full) == VR (partial, W and VR) %s\n", (result[7] == 1. ? " ok" : " failed")); } if ( result[8] != -1 ) { printf(" VL (full) == VL (partial, W and VL) %s\n", (result[8] == 1. ? " ok" : " failed")); } int newline = 0; if ( result[0] != -1 ) { status |= ! (result[0] < tol); newline = 1; } if ( result[1] != -1 ) { status |= ! (result[1] < tol); newline = 1; } if ( result[2] != -1 ) { status |= ! (result[2] < tol); newline = 1; } if ( result[3] != -1 ) { status |= ! (result[3] < tol); newline = 1; } if ( result[4] != -1 ) { status |= ! (result[4] == 1.); newline = 1; } if ( result[5] != -1 ) { status |= ! (result[5] == 1.); newline = 1; } if ( result[6] != -1 ) { status |= ! (result[6] == 1.); newline = 1; } if ( result[7] != -1 ) { status |= ! (result[7] == 1.); newline = 1; } if ( result[8] != -1 ) { status |= ! (result[8] == 1.); newline = 1; } if ( newline ) { printf( "\n" ); } } TESTING_FREE_CPU( w1copy ); TESTING_FREE_CPU( w2copy ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( w1i ); TESTING_FREE_CPU( w2i ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( VL ); TESTING_FREE_PIN( VR ); TESTING_FREE_PIN( h_work ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/***************************************************************************//** Purpose ------- SGEHRD reduces a REAL general matrix A to upper Hessenberg form H by an orthogonal similarity transformation: Q' * A * Q = H . This version stores the triangular matrices used in the factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Arguments --------- @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] ilo INTEGER @param[in] ihi INTEGER It is assumed that A is already upper triangular in rows and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally set by a previous call to SGEBAL; otherwise they should be set to 1 and N respectively. See Further Details. 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the N-by-N general matrix to be reduced. On exit, the upper triangle and the first subdiagonal of A are overwritten with the upper Hessenberg matrix H, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau REAL array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to zero. @param[out] work (workspace) REAL array, dimension (LWORK) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. LWORK >= N*NB. where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] T REAL array, dimension NB*N, where NB is the optimal blocksize. It stores the NB*NB blocks of the triangular T matrices used in the reduction. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value. Further Details --------------- The matrix Q is represented as a product of (ihi-ilo) elementary reflectors Q = H(ilo) H(ilo+1) . . . H(ihi-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on exit in A(i+2:ihi,i), and tau in TAU(i). The contents of A are illustrated by the following example, with n = 7, ilo = 2 and ihi = 6: @verbatim on entry, on exit, ( a a a a a a a ) ( a a h h h h a ) ( a a a a a a ) ( a h h h h a ) ( a a a a a a ) ( h h h h h h ) ( a a a a a a ) ( v2 h h h h h ) ( a a a a a a ) ( v2 v3 h h h h ) ( a a a a a a ) ( v2 v3 v4 h h h ) ( a ) ( a ) @endverbatim where a denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. This version stores the T matrices, for later use in magma_sorghr. @ingroup magma_gehrd *******************************************************************************/ extern "C" magma_int_t magma_sgehrd_m( magma_int_t n, magma_int_t ilo, magma_int_t ihi, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, float *T, magma_int_t *info) { #define A( i, j ) (A + (i) + (j)*lda) #define dA( dev, i, j ) (data.dA[dev] + (i) + (j)*ldda) float c_one = MAGMA_S_ONE; float c_zero = MAGMA_S_ZERO; magma_int_t nb = magma_get_sgehrd_nb(n); magma_int_t nh, iws, ldda, min_lblocks, max_lblocks, last_dev, dev; magma_int_t dpanel, di, nlocal, i, i2, ib, ldwork; magma_int_t iinfo; magma_int_t lquery; struct sgehrd_data data; magma_int_t ngpu = magma_num_gpus(); *info = 0; iws = n*(nb + nb*ngpu); work[0] = magma_smake_lwork( iws ); lquery = (lwork == -1); if (n < 0) { *info = -1; } else if (ilo < 1 || ilo > max(1,n)) { *info = -2; } else if (ihi < min(ilo,n) || ihi > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (lwork < iws && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; // Adjust from 1-based indexing ilo -= 1; // Quick return if possible nh = ihi - ilo; if (nh <= 1) { work[0] = c_one; return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); // Set elements 0:ILO-1 and IHI-1:N-2 of TAU to zero for (i = 0; i < ilo; ++i) tau[i] = c_zero; for (i = max(0,ihi-1); i < n-1; ++i) tau[i] = c_zero; // set T to zero lapackf77_slaset( "Full", &nb, &n, &c_zero, &c_zero, T, &nb ); // set to null, to simplify cleanup code for( dev = 0; dev < ngpu; ++dev ) { data.dA[dev] = NULL; data.queues[dev] = NULL; } // Now requires lwork >= iws; else dT won't be computed in unblocked code. // If not enough workspace, use unblocked code //if ( lwork < iws ) { // nb = 1; //} if (nb == 1 || nb >= nh) { // Use unblocked code below i = ilo; } else { // Use blocked code // allocate memory on GPUs for A and workspaces ldda = magma_roundup( n, 32 ); min_lblocks = (n / nb) / ngpu; max_lblocks = ((n-1) / nb) / ngpu + 1; last_dev = (n / nb) % ngpu; // V and Vd need to be padded for copying in mslahr2 data.ngpu = ngpu; data.ldda = ldda; data.ldv = nb*max_lblocks*ngpu; data.ldvd = nb*max_lblocks; for( dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); nlocal = min_lblocks*nb; if ( dev < last_dev ) { nlocal += nb; } else if ( dev == last_dev ) { nlocal += (n % nb); } ldwork = nlocal*ldda // A + nb*data.ldv // V + nb*data.ldvd // Vd + nb*ldda // Y + nb*ldda // W + nb*nb; // Ti if ( MAGMA_SUCCESS != magma_smalloc( &data.dA[dev], ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } data.dV [dev] = data.dA [dev] + nlocal*ldda; data.dVd[dev] = data.dV [dev] + nb*data.ldv; data.dY [dev] = data.dVd[dev] + nb*data.ldvd; data.dW [dev] = data.dY [dev] + nb*ldda; data.dTi[dev] = data.dW [dev] + nb*ldda; magma_queue_create( dev, &data.queues[dev] ); } // Copy the matrix to GPUs magma_ssetmatrix_1D_col_bcyclic( ngpu, n, n, nb, A, lda, data.dA, ldda, data.queues ); // round ilo down to block boundary ilo = (ilo/nb)*nb; for (i = ilo; i < ihi - 1 - nb; i += nb) { // Reduce columns i:i+nb-1 to Hessenberg form, returning the // matrices V and T of the block reflector H = I - V*T*V' // which performs the reduction, and also the matrix Y = A*V*T // Get the current panel (no need for the 1st iteration) dpanel = (i / nb) % ngpu; di = ((i / nb) / ngpu) * nb; if ( i > ilo ) { magma_setdevice( dpanel ); magma_sgetmatrix( ihi-i, nb, dA(dpanel, i, di), ldda, A(i,i), lda, data.queues[dpanel] ); } // add 1 to i for 1-based index magma_slahr2_m( ihi, i+1, nb, A(0,i), lda, &tau[i], &T[i*nb], nb, work, n, &data ); magma_slahru_m( n, ihi, i, nb, A, lda, &data ); // copy first i rows above panel to host magma_setdevice( dpanel ); magma_sgetmatrix_async( i, nb, dA(dpanel, 0, di), ldda, A(0,i), lda, data.queues[dpanel] ); } // Copy remainder to host, block-by-block for( i2 = i; i2 < n; i2 += nb ) { ib = min( nb, n-i2 ); dev = (i2 / nb) % ngpu; di = (i2 / nb) / ngpu * nb; magma_setdevice( dev ); magma_sgetmatrix( n, ib, dA(dev, 0, di), ldda, A(0,i2), lda, data.queues[dev] ); } } // Use unblocked code to reduce the rest of the matrix // add 1 to i for 1-based index i += 1; lapackf77_sgehd2(&n, &i, &ihi, A, &lda, tau, work, &iinfo); work[0] = magma_smake_lwork( iws ); CLEANUP: for( dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_free( data.dA[dev] ); magma_queue_destroy( data.queues[dev] ); } magma_setdevice( orig_dev ); return *info; } /* magma_sgehrd */
magma_int_t magmaf_get_sgehrd_nb( magma_int_t *m ) { return magma_get_sgehrd_nb( *m ); }
extern "C" magma_int_t magma_sgehrd(magma_int_t n, magma_int_t ilo, magma_int_t ihi, float *a, magma_int_t lda, float *tau, float *work, magma_int_t lwork, float *dT, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= SGEHRD reduces a REAL general matrix A to upper Hessenberg form H by an orthogonal similarity transformation: Q' * A * Q = H . This version stores the triangular matrices used in the factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Arguments ========= N (input) INTEGER The order of the matrix A. N >= 0. ILO (input) INTEGER IHI (input) INTEGER It is assumed that A is already upper triangular in rows and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally set by a previous call to SGEBAL; otherwise they should be set to 1 and N respectively. See Further Details. 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0. A (input/output) REAL array, dimension (LDA,N) On entry, the N-by-N general matrix to be reduced. On exit, the upper triangle and the first subdiagonal of A are overwritten with the upper Hessenberg matrix H, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). TAU (output) REAL array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to zero. WORK (workspace/output) REAL array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. LWORK >= max(1,N). For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. dT (output) REAL array on the GPU, dimension N*NB, where NB is the optimal blocksize. It stores the NB*NB blocks of the triangular T matrices, used the the reduction. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value. Further Details =============== The matrix Q is represented as a product of (ihi-ilo) elementary reflectors Q = H(ilo) H(ilo+1) . . . H(ihi-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on exit in A(i+2:ihi,i), and tau in TAU(i). The contents of A are illustrated by the following example, with n = 7, ilo = 2 and ihi = 6: on entry, on exit, ( a a a a a a a ) ( a a h h h h a ) ( a a a a a a ) ( a h h h h a ) ( a a a a a a ) ( h h h h h h ) ( a a a a a a ) ( v2 h h h h h ) ( a a a a a a ) ( v2 v3 h h h h ) ( a a a a a a ) ( v2 v3 v4 h h h ) ( a ) ( a ) where a denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. ===================================================================== */ float c_one = MAGMA_S_ONE; float c_zero = MAGMA_S_ZERO; magma_int_t nb = magma_get_sgehrd_nb(n); magma_int_t N = n, ldda = n; magma_int_t ib; magma_int_t nh, iws; magma_int_t nbmin, iinfo; magma_int_t ldwork; magma_int_t lquery; --tau; *info = 0; MAGMA_S_SET2REAL( work[0], (float) n * nb ); lquery = lwork == -1; if (n < 0) { *info = -1; } else if (ilo < 1 || ilo > max(1,n)) { *info = -2; } else if (ihi < min(ilo,n) || ihi > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Quick return if possible */ nh = ihi - ilo + 1; if (nh <= 1) { work[0] = c_one; return *info; } float *da; if (MAGMA_SUCCESS != magma_smalloc( &da, N*ldda + 2*N*nb + nb*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } float *d_A = da; float *d_work = da + (N+nb)*ldda; magma_int_t i__; float *t, *d_t; magma_smalloc_cpu( &t, nb*nb ); if ( t == NULL ) { magma_free( da ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } d_t = d_work + nb * ldda; szero_nbxnb_block(nb, d_A+N*ldda, ldda); /* Set elements 1:ILO-1 and IHI:N-1 of TAU to zero */ for (i__ = 1; i__ < ilo; ++i__) tau[i__] = c_zero; for (i__ = max(1,ihi); i__ < n; ++i__) tau[i__] = c_zero; for(i__=0; i__< nb*nb; i__+=4) t[i__] = t[i__+1] = t[i__+2] = t[i__+3] = c_zero; nbmin = 2; iws = 1; if (nb > 1 && nb < nh) { /* Determine when to cross over from blocked to unblocked code (last block is always handled by unblocked code) */ if (nb < nh) { /* Determine if workspace is large enough for blocked code */ iws = n * nb; if (lwork < iws) { /* Not enough workspace to use optimal NB: determine the minimum value of NB, and reduce NB or force use of unblocked code */ nbmin = nb; if (lwork >= n * nbmin) nb = lwork / n; else nb = 1; } } } ldwork = n; if (nb < nbmin || nb >= nh) { /* Use unblocked code below */ i__ = ilo; } else { /* Use blocked code */ /* Copy the matrix to the GPU */ magma_ssetmatrix( N, N-ilo+1, a+(ilo-1)*(lda), lda, d_A, ldda ); for (i__ = ilo; i__ < ihi - nb; i__ += nb) { /* Computing MIN */ ib = min(nb, ihi - i__); /* Reduce columns i:i+ib-1 to Hessenberg form, returning the matrices V and T of the block reflector H = I - V*T*V' which performs the reduction, and also the matrix Y = A*V*T */ /* Get the current panel (no need for the 1st iteration) */ magma_sgetmatrix( ihi-i__+1, ib, d_A + (i__ - ilo)*ldda + i__ - 1, ldda, a + (i__ - 1 )*lda + i__ - 1, lda ); magma_slahr2(ihi, i__, ib, d_A + (i__ - ilo)*ldda, d_A + N*ldda + 1, a + (i__ - 1 )*(lda) , lda, &tau[i__], t, nb, work, ldwork); /* Copy T from the CPU to D_T on the GPU */ d_t = dT + (i__ - ilo)*nb; magma_ssetmatrix( nb, nb, t, nb, d_t, nb ); magma_slahru(n, ihi, i__ - 1, ib, a + (i__ - 1 )*(lda), lda, d_A + (i__ - ilo)*ldda, d_A + (i__ - ilo)*ldda + i__ - 1, d_A + N*ldda, d_t, d_work); } } /* Use unblocked code to reduce the rest of the matrix */ if (!(nb < nbmin || nb >= nh)) magma_sgetmatrix( n, n-i__+1, d_A+ (i__-ilo)*ldda, ldda, a + (i__-1)*(lda), lda ); lapackf77_sgehd2(&n, &i__, &ihi, a, &lda, &tau[1], work, &iinfo); MAGMA_S_SET2REAL( work[0], (float) iws ); magma_free( da ); magma_free_cpu(t); return *info; } /* magma_sgehrd */
extern "C" magma_int_t magma_sgehrd(magma_int_t n, magma_int_t ilo, magma_int_t ihi, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, float *dT, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= SGEHRD reduces a REAL general matrix A to upper Hessenberg form H by an orthogonal similarity transformation: Q' * A * Q = H . This version stores the triangular matrices used in the factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Arguments ========= N (input) INTEGER The order of the matrix A. N >= 0. ILO (input) INTEGER IHI (input) INTEGER It is assumed that A is already upper triangular in rows and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally set by a previous call to SGEBAL; otherwise they should be set to 1 and N respectively. See Further Details. 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0. A (input/output) REAL array, dimension (LDA,N) On entry, the N-by-N general matrix to be reduced. On exit, the upper triangle and the first subdiagonal of A are overwritten with the upper Hessenberg matrix H, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). TAU (output) REAL array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to zero. WORK (workspace/output) REAL array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. LWORK >= max(1,N). For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. dT (output) REAL array on the GPU, dimension NB*N, where NB is the optimal blocksize. It stores the NB*NB blocks of the triangular T matrices used in the reduction. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value. Further Details =============== The matrix Q is represented as a product of (ihi-ilo) elementary reflectors Q = H(ilo) H(ilo+1) . . . H(ihi-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on exit in A(i+2:ihi,i), and tau in TAU(i). The contents of A are illustrated by the following example, with n = 7, ilo = 2 and ihi = 6: on entry, on exit, ( a a a a a a a ) ( a a h h h h a ) ( a a a a a a ) ( a h h h h a ) ( a a a a a a ) ( h h h h h h ) ( a a a a a a ) ( v2 h h h h h ) ( a a a a a a ) ( v2 v3 h h h h ) ( a a a a a a ) ( v2 v3 v4 h h h ) ( a ) ( a ) where a denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. This version stores the T matrices in dT, for later use in magma_sorghr. ===================================================================== */ #define A( i, j ) ( A + (i) + (j)*lda) #define dA( i, j ) (dA + (i) + (j-ilo)*ldda) float c_one = MAGMA_S_ONE; float c_zero = MAGMA_S_ZERO; magma_int_t nb = magma_get_sgehrd_nb(n); magma_int_t ldda = n; // assumed in slahru magma_int_t nh, iws; magma_int_t iinfo; magma_int_t ldwork; magma_int_t lquery; *info = 0; iws = n*nb; work[0] = MAGMA_S_MAKE( iws, 0 ); lquery = lwork == -1; if (n < 0) { *info = -1; } else if (ilo < 1 || ilo > max(1,n)) { *info = -2; } else if (ihi < min(ilo,n) || ihi > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; // Adjust from 1-based indexing ilo -= 1; // Quick return if possible nh = ihi - ilo; if (nh <= 1) { work[0] = c_one; return *info; } // GPU workspace is: // nb*ldda for dwork for slahru // nb*ldda for dV // n*ldda for dA float *dwork; if (MAGMA_SUCCESS != magma_smalloc( &dwork, 2*nb*ldda + n*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } float *dV = dwork + nb*ldda; float *dA = dwork + nb*ldda*2; ldwork = n; magma_int_t i; float *T, *dTi; magma_smalloc_cpu( &T, nb*nb ); if ( T == NULL ) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } // zero first block of V, which is lower triangular szero_nbxnb_block(nb, dV, ldda); // Set elements 0:ILO-1 and IHI-1:N-2 of TAU to zero for(i = 0; i < ilo; ++i) tau[i] = c_zero; for(i = max(0,ihi-1); i < n-1; ++i) tau[i] = c_zero; for(i=0; i < nb*nb; i += 4) T[i] = T[i+1] = T[i+2] = T[i+3] = c_zero; magmablas_slaset( 'F', nb, n, dT, nb ); // If not enough workspace, use unblocked code if ( lwork < iws ) { nb = 1; } if (nb == 1 || nb > nh) { // Use unblocked code below i = ilo; } else { // Use blocked code // Copy the matrix to the GPU magma_ssetmatrix( n, n-ilo, A(0,ilo), lda, dA, ldda ); for (i = ilo; i < ihi-1 - nb; i += nb) { // Reduce columns i:i+nb-1 to Hessenberg form, returning the // matrices V and T of the block reflector H = I - V*T*V' // which performs the reduction, and also the matrix Y = A*V*T // Get the current panel (no need for the 1st iteration) magma_sgetmatrix( ihi-i, nb, dA(i,i), ldda, A (i,i), lda ); // add 1 to i for 1-based index magma_slahr2( ihi, i+1, nb, dA(0,i), dV, A (0,i), lda, &tau[i], T, nb, work, ldwork); // Copy T from the CPU to dT on the GPU dTi = dT + (i - ilo)*nb; magma_ssetmatrix( nb, nb, T, nb, dTi, nb ); magma_slahru( n, ihi, i, nb, A (0,i), lda, dA(0,i), // dA dA(i,i), // dY, stored over current panel dV, dTi, dwork ); } // Copy remainder to host magma_sgetmatrix( n, n-i, dA(0,i), ldda, A (0,i), lda ); } // Use unblocked code to reduce the rest of the matrix // add 1 to i for 1-based index i += 1; lapackf77_sgehd2(&n, &i, &ihi, A, &lda, tau, work, &iinfo); work[0] = MAGMA_S_MAKE( iws, 0 ); magma_free( dwork ); magma_free_cpu( T ); return *info; } /* magma_sgehrd */